From eb9997326fa43559aa1d652f3d2477d88e2b4cb0 Mon Sep 17 00:00:00 2001 From: Eric Furst Date: Sun, 1 Mar 2026 07:39:28 -0500 Subject: [PATCH] Shell script run_retrieve.sh for non-LLM gneration queries (returns only chunks), track development notes and README. --- README.md | 4 +- deploy_public.sh | 41 +- devlog.md | 1035 ++++++++++++++++++++++++++++++++++++++++++++++ run_retrieve.sh | 29 ++ 4 files changed, 1089 insertions(+), 20 deletions(-) create mode 100644 devlog.md create mode 100755 run_retrieve.sh diff --git a/README.md b/README.md index 8e4ff30..47a127c 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ ssearch/ ├── archived/ # Superseded script versions ├── saved_output/ # Saved query results and model comparisons ├── requirements.txt # Python dependencies -├── devlog.txt # Development log and experimental findings +├── devlog.md # Development log and experimental findings └── *.ipynb # Jupyter notebooks (HyDE, metrics, sandbox) ``` @@ -198,4 +198,4 @@ Three Jupyter notebooks document exploration and analysis: - **Jan 2026**: Command-line interface, prompt improvements, model comparison (command-r7b selected). - **Feb 2026**: Cross-encoder re-ranking, hybrid BM25+vector retrieval, LlamaIndex upgrade to 0.14.14, OpenAI API backend, incremental updates, clippings search (ChromaDB), project reorganization. -See `devlog.txt` for detailed development notes and experimental findings. +See `devlog.md` for detailed development notes and experimental findings. diff --git a/deploy_public.sh b/deploy_public.sh index 9240f1b..33f2fbf 100755 --- a/deploy_public.sh +++ b/deploy_public.sh @@ -3,9 +3,11 @@ # # Usage: ./deploy_public.sh ["optional commit message"] # -# Checks out an orphan public branch, copies the public files from main, -# generates a public README (stripping private sections), commits, and -# force-pushes to origin. Then switches back to main. 
+# Checks out the public branch, updates it with public files from main, +# generates a public README (stripping private sections), commits if +# anything changed, and pushes to origin. Then switches back to main. +# +# On first run (no public branch exists), creates an orphan branch. # # E.M.F. February 2026 @@ -50,12 +52,14 @@ MAIN_HEAD=$(git rev-parse --short HEAD) echo "Deploying main ($MAIN_HEAD) -> $BRANCH..." -# Delete local public branch if it exists -git branch -D "$BRANCH" 2>/dev/null || true - -# Create fresh orphan -git checkout --orphan "$BRANCH" -git rm -rf . >/dev/null 2>&1 || true +# Check out public branch, or create orphan if it doesn't exist yet +if git show-ref --verify --quiet "refs/heads/$BRANCH"; then + git checkout "$BRANCH" +else + echo "No local $BRANCH branch — creating orphan..." + git checkout --orphan "$BRANCH" + git rm -rf . >/dev/null 2>&1 || true +fi # Copy public files from main for f in "${PUBLIC_FILES[@]}"; do @@ -74,22 +78,23 @@ awk ' skip { next } /archived\// { next } /saved_output\// { next } -/devlog\.txt/ { next } +/devlog\.md/ { next } /\*\.ipynb/ { next } { print } ' README.md > README.tmp && mv README.tmp README.md # Stage only the public files (not untracked files on disk) git add "${PUBLIC_FILES[@]}" README.md -git commit -m "$COMMIT_MSG -Co-Authored-By: Claude Opus 4.6 " - -# Push -git push --force "$REMOTE" "$BRANCH" +# Commit only if there are changes +if git diff --cached --quiet; then + echo "No changes to deploy." +else + git commit -m "$COMMIT_MSG" + git push "$REMOTE" "$BRANCH" + echo "" + echo "Done. Deployed main ($MAIN_HEAD) -> $REMOTE/$BRANCH" +fi # Switch back to main git checkout main - -echo "" -echo "Done. 
Deployed main ($MAIN_HEAD) -> $REMOTE/$BRANCH" diff --git a/devlog.md b/devlog.md new file mode 100644 index 0000000..055f620 --- /dev/null +++ b/devlog.md @@ -0,0 +1,1035 @@ +# ssearch development log + +## Active files (after Feb 27 reorganization) + +- `build_store.py` — build/update journal vector store (incremental) +- `query_hybrid.py` — hybrid BM25 + vector query with LLM synthesis +- `retrieve.py` — hybrid verbatim chunk retrieval (no LLM) +- `search_keywords.py` — keyword search via POS-based term extraction +- `run_query.sh` — shell wrapper for interactive querying +- `clippings_search/build_clippings.py` — build/update clippings vector store (ChromaDB) +- `clippings_search/retrieve_clippings.py` — verbatim clippings retrieval +- `deploy_public.sh` — deploy public files to Forgejo + +Earlier scripts moved to `archived/`: +`build.py`, `build_exp.py`, `query_topk.py`, `query_catalog.py`, `query_exp.py`, +`query_topk_prompt.py`, `query_topk_prompt_engine.py`, `query_topk_prompt_dw.py`, +`query_rewrite_hyde.py`, `query_multitool.py`, `shared/build.py`, `shared/query.py`, +`vs_metrics.py`, `claude_diagnostic.py`, `query_claude_sonnet.py`, `query_tree.py`, +`query_topk_prompt_engine_v3.py`, `retrieve_raw.py` + +## Best configuration + +- **Embedding**: BAAI/bge-large-en-v1.5, 256 token chunks, 25 token overlap +- **Re-ranker**: cross-encoder/ms-marco-MiniLM-L-12-v2 (retrieve top-30, re-rank to top-15) +- **LLM**: command-r7b via Ollama (temperature 0.3). OpenAI gpt-4o-mini available as alternative. +- **Retrieval**: hybrid BM25 + vector, cross-encoder re-ranked + +## To do + +1. [DONE] Test v3 (cross-encoder re-ranking) and compare results with v2. + Selected ms-marco-MiniLM-L-12-v2 after testing three models. + +2. [DONE] Verbatim retrieval mode (`retrieve_raw.py`). Uses + `index.as_retriever()` instead of `index.as_query_engine()` to get + chunks without LLM synthesis. 
Re-ranks with the same cross-encoder, + then outputs raw chunk text with metadata and scores. + +3. [DONE] Keyword search pipeline (`search_keywords.py`). Extracts + nouns and adjectives via NLTK POS tagging, then greps data files. + Complements vector search for exact names, places, dates. + +4. [DONE] BM25 hybrid retrieval (sparse + dense). Two scripts: + `query_hybrid.py` (with LLM synthesis) and `retrieve.py` + (verbatim chunks, no LLM). Both run BM25 (top-20) and vector (top-20) + retrievers, merge/deduplicate, then cross-encoder re-rank to top-15. + Uses llama-index-retrievers-bm25. + +5. Explore query expansion (multiple phrasings, merged retrieval) + +6. Explore different vector store strategies (database) + +7. [DONE] Test ChatGPT API for final LLM generation (instead of local Ollama) + +8. [DONE] Remove API key from this file. Moved to `~/.bashrc` as `OPENAI_API_KEY`. + + The retrieval pipeline (embedding, vector search, cross-encoder re-ranking) + stays the same. Only the final synthesis LLM changes. + + **Steps:** + 1. Install the LlamaIndex OpenAI integration: + ``` + pip install llama-index-llms-openai + ``` + 2. Set API key as environment variable: + ``` + export OPENAI_API_KEY="sk-..." + ``` + (Or store in a `.env` file and load with python-dotenv. Do NOT commit + the key to version control.) + 3. In the query script, replace the Ollama LLM with OpenAI: + ```python + # Current (local): + from llama_index.llms.ollama import Ollama + Settings.llm = Ollama( + model="command-r7b", + request_timeout=360.0, + context_window=8000, + ) + + # New (API): + from llama_index.llms.openai import OpenAI + Settings.llm = OpenAI( + model="gpt-4o-mini", # or "gpt-4o" for higher quality + temperature=0.1, + ) + ``` + 4. Run the query script as usual. Everything else (embedding model, + vector store, cross-encoder re-ranker, prompt) is unchanged. + 5. Compare output quality and response time against command-r7b. 
+ + Models to try: gpt-4o-mini (cheap, fast), gpt-4o (better quality). + The prompt should work without modification since it's model-agnostic — + just context + instructions. + + Note: This adds an external API dependency and per-query cost. + The embedding and re-ranking remain fully local/offline. + + API KEY: moved to `~/.bashrc` as `OPENAI_API_KEY` (do not store in repo) + + **Getting an OpenAI API key:** + 1. Go to https://platform.openai.com/ and sign up (or log in). + 2. Navigate to API keys: Settings > API keys (or https://platform.openai.com/api-keys). + 3. Click "Create new secret key", give it a name, and copy it. + The key starts with `sk-` and is shown only once. + 4. Add billing: Settings > Billing. Load a small amount ($5-10) + to start. API calls are pay-per-use, not a subscription. + 5. Set the key in your shell before running a query: + ``` + export OPENAI_API_KEY="sk-..." + ``` + Or add to `~/.zshrc` (or `~/.bashrc`) to persist across sessions. + Do NOT commit the key to version control or put it in scripts. + + **Approximate cost per query (Feb 2026):** + - gpt-4o-mini: ~$0.001-0.003 (15 chunks of context) + - gpt-4o: ~$0.01-0.03 + +--- + +## February 27, 2026 + +### Project reorganization + +Reorganized the project structure with Claude Code. Goals: drop legacy version +numbers from filenames, archive superseded scripts, group clippings search into +a subdirectory, and clean up storage directory names. 
+**Script renames:**
+- `build_exp_claude.py` → `build_store.py`
+- `query_hybrid_bm25_v4.py` → `query_hybrid.py`
+- `retrieve_hybrid_raw.py` → `retrieve.py`
+
+**Archived (moved to `archived/`):**
+- `query_topk_prompt_engine_v3.py` — superseded by hybrid BM25+vector query
+- `retrieve_raw.py` — superseded by hybrid retrieval
+
+**Clippings search subdirectory:**
+- `build_clippings.py` → `clippings_search/build_clippings.py`
+- `retrieve_clippings.py` → `clippings_search/retrieve_clippings.py`
+- Scripts use `./` paths relative to project root, so no path changes needed
+  when run as `python clippings_search/build_clippings.py` from root.
+
+**Storage renames:**
+- `storage_exp/` → `store/` (journal vector store)
+- `storage_clippings/` → `clippings_search/store_clippings/` (clippings vector store)
+- Deleted unused `storage/` (original August 2025 store, never updated)
+
+**Updated references** in `run_query.sh`, `.gitignore`, `CLAUDE.md`, `README.md`,
+and all Python scripts that referenced old storage paths.
+
+### Deploy script (`deploy_public.sh`)
+
+Created `deploy_public.sh` to automate publishing to Forgejo. Previously,
+maintaining the public branch required manually recreating an orphan branch,
+copying files, editing the README, and force-pushing — error-prone and tedious.
+
+The script:
+1. Checks that we're on `main` with no uncommitted changes
+2. Deletes the local public branch and creates a fresh orphan
+3. Copies listed public files from `main` (via `git checkout main -- <file>`)
+4. Generates a public README by stripping private sections (Notebooks,
+   Development history) and private file references using `awk`
+5. Stages only the listed files (not untracked files on disk)
+6. Commits with a message and force-pushes to `origin/public`
+7. Switches back to `main`
+
+Fixed a bug where `git add .` picked up untracked files (`output_test.txt`,
+`run_retrieve.sh`). Changed to `git add "${PUBLIC_FILES[@]}" README.md`.
+ +### Forgejo setup + +Set up SSH push to Forgejo instance. Required adding SSH public key to Forgejo +user settings. The remote uses a Tailscale address. + +### MIT License + +Added MIT License (Copyright (c) 2026 E. M. Furst) to both main and public branches. + +### Devlog migration + +Migrated `devlog.txt` to `devlog.md` with markdown formatting. + +--- + +## February 20, 2026 + +### Offline use: environment variables must be set before imports + +Despite setting `HF_HUB_OFFLINE=1` and `SENTENCE_TRANSFORMERS_HOME=./models` +(added Feb 16), the scripts still failed offline with a `ConnectionError` trying +to reach huggingface.co. The error came from `AutoTokenizer.from_pretrained()` +calling `list_repo_templates()`, which makes an HTTP request to the HuggingFace API. + +**Root cause:** the `huggingface_hub` library evaluates `HF_HUB_OFFLINE` at import +time, not at call time. The constant is set once in `huggingface_hub/constants.py`: + +```python +HF_HUB_OFFLINE = _is_true(os.environ.get("HF_HUB_OFFLINE") + or os.environ.get("TRANSFORMERS_OFFLINE")) +``` + +In all four scripts, the `os.environ` lines came AFTER the imports: + +```python +from llama_index.embeddings.huggingface import HuggingFaceEmbedding # triggers import of huggingface_hub +from llama_index.core.postprocessor import SentenceTransformerRerank +import os + +os.environ["HF_HUB_OFFLINE"] = "1" # too late, constant already False +``` + +By the time `os.environ` was set, `huggingface_hub` had already imported and locked +the constant to `False`. The env var existed in the process environment but the +library never re-read it. + +**Fix:** moved `import os` and all three `os.environ` calls to the top of each file, +before any llama_index or huggingface imports: + +```python +import os +os.environ["TOKENIZERS_PARALLELISM"] = "false" +os.environ["SENTENCE_TRANSFORMERS_HOME"] = "./models" +os.environ["HF_HUB_OFFLINE"] = "1" + +from llama_index.core import ... 
# now these see the env vars +from llama_index.embeddings.huggingface import HuggingFaceEmbedding +``` + +Updated scripts: `query_topk_prompt_engine_v3.py`, `retrieve_raw.py`, +`query_hybrid_bm25_v4.py`, `retrieve_hybrid_raw.py`. + +**General lesson for offline HuggingFace use:** + +The HuggingFace ecosystem has multiple libraries that check for offline mode: +- `huggingface_hub`: reads `HF_HUB_OFFLINE` (or `TRANSFORMERS_OFFLINE`) at import +- `transformers`: delegates to huggingface_hub's constant +- `sentence-transformers`: delegates to huggingface_hub's constant + +All of them evaluate the flag ONCE at module load time. This means: +1. `os.environ` must be set before ANY import that touches `huggingface_hub` +2. Setting the env var in a "Globals" section after imports does NOT work +3. Even indirect imports count — `llama_index.embeddings.huggingface` + transitively imports `huggingface_hub`, so the flag must precede it +4. Alternatively, set the env var in the shell before running Python: + ```bash + export HF_HUB_OFFLINE=1 + ``` + This always works because it's set before any Python code runs. +5. The newer `transformers` library (v4.50+) added `list_repo_templates()` in + `AutoTokenizer.from_pretrained()`, which makes network calls that weren't + present in earlier versions. This is why the Feb 16 fix worked initially + (or appeared to) but broke after a package update. + +This is a common pitfall for anyone running HuggingFace models offline (e.g., +on a laptop without network, air-gapped environments, or behind restrictive +firewalls). The models are cached locally and work fine — but the library +still tries to check for updates unless the offline flag is set correctly. + +--- + +### Incremental vector store updates + +Added incremental update mode to `build_store.py` (then `build_exp_claude.py`). +Previously the script rebuilt the entire vector store from scratch every run +(~1848 files). 
Now it defaults to incremental mode: loads the existing index, +compares against `./data`, and only processes new, modified, or deleted files. + +**Usage:** +```bash +python build_store.py # incremental update (default) +python build_store.py --rebuild # full rebuild from scratch +``` + +**How it works:** +- The LlamaIndex docstore (`store/docstore.json`) already tracks every + indexed document with metadata: `file_name`, `file_size`, `last_modified_date`. +- The script scans `./data/*.txt` and classifies each file: + - **New:** `file_name` not in docstore → insert + - **Modified:** `file_size` or `last_modified_date` differs → delete + re-insert + - **Deleted:** in docstore but not on disk → delete + - **Unchanged:** skip +- Uses `index.insert()` and `index.delete_ref_doc()` from the LlamaIndex API. +- The same `SentenceSplitter` (256 tokens, 25 overlap) is applied via + `Settings.transformations` so chunks match the original build. + +**Timing:** incremental update with nothing to do takes ~17s (loading the index). +Full rebuild takes several minutes. First incremental run after a stale index +found 8 new files and 204 modified files, completed in ~65s. + +**Important detail:** `SimpleDirectoryReader` converts file timestamps to UTC +(`datetime.fromtimestamp(mtime, tz=timezone.utc)`) before formatting as +`YYYY-MM-DD`. The comparison logic must use UTC too, or files modified late in +the day will show as "modified" due to the date rolling forward in UTC. This +caused a false-positive bug on the first attempt. + +This enables running the build as a cron job to keep the vector store current +as new journal entries are added. + +--- + +## February 18, 2026 + +### LLM comparison: gpt-4o-mini (OpenAI API) vs command-r7b (local Ollama) + +Test query: "Passages that quote Louis Menand." (hybrid BM25+vector, v4) +Retrieval was identical (same 15 chunks, same scores) — only synthesis differs. +Results saved in `tests/results_openai.txt` and `tests/results_commandr7b.txt`. 
+ +**gpt-4o-mini:** +- Cited 6 files (2025-11-04, 2025-02-14, 2022-08-14, 2025-07-27, + 2025-02-05, 2024-09-04). Drew from chunks ranked as low as #14. +- Better at distinguishing direct quotes from paraphrases and indirect + references. Provided a structured summary with numbered entries. +- 44 seconds total (most of that is local retrieval/re-ranking; the + API call itself is nearly instant). + +**command-r7b:** +- Cited 2 files (2025-11-04, 2022-08-14). Focused on the top-scored + chunks and ignored lower-ranked ones. +- Pulled out actual quotes verbatim as block quotes — more useful if + you want the exact text rather than a summary. +- 78 seconds total. + +**Summary:** gpt-4o-mini is broader (more sources, better use of the full +context window) and nearly 2x faster. command-r7b is more focused and +reproduces exact quotes. Both correctly identified the core passages. +The quality difference is noticeable but not dramatic — the retrieval +pipeline does most of the heavy lifting. + +### Temperature experiments + +The gpt-4o-mini test used temperature=0.1 (nearly deterministic). command-r7b +via Ollama defaults to temperature=0.8 — so the two models were tested at very +different temperatures, which may account for some of the stylistic difference. + +**Temperature guidance for RAG synthesis:** + +| Range | Behavior | Use case | +|-------|----------|----------| +| 0.0–0.1 | Nearly deterministic. Picks highest-probability tokens. | Factual extraction, consistency. Can "tunnel vision." | +| 0.3–0.5 | Moderate. More varied phrasing, draws connections across chunks. | Good middle ground for RAG (prompt already constrains context). | +| 0.7–1.0 | Creative/varied. Riskier for RAG — may paraphrase loosely. | Not ideal for faithfulness to source text. | + +**Follow-up: temperature=0.3 for both models (same query, same retrieval)** + +**command-r7b at 0.3 (was 0.8):** Major improvement. Cited 6 files (was 2). +Drew from lower-ranked chunks including #15. 
Used the full context window +instead of fixating on top hits. Took 94s (was 78s) due to more output. + +**gpt-4o-mini at 0.3 (was 0.1):** Nearly identical to 0.1 run. Same 6 files, +same structure. Slightly more interpretive phrasing but no meaningful +change. This model is less sensitive to temperature for RAG synthesis. + +**Key finding:** Temperature is a critical but often overlooked parameter when +evaluating the generation stage of a RAG pipeline. In our tests, a local 7B model +(command-r7b) went from citing 2 sources to 6 — a 3x improvement in context +utilization — simply by lowering temperature from 0.8 to 0.3. At the higher +temperature, the model "wandered" during generation, focusing on the most salient +chunks and producing repetitive output. At the lower temperature, it methodically +worked through the full context window. + +**Implications for RAG evaluation methodology:** +1. When comparing LLMs for RAG synthesis, temperature must be controlled + across models. Our initial comparison (gpt-4o-mini at 0.1 vs + command-r7b at 0.8 default) overstated the quality gap between models. +2. The "right" temperature for RAG is lower than for open-ended generation. + The prompt and retrieved context already constrain the task; high + temperature adds noise rather than creativity. +3. Temperature affects context utilization, not just style. A model that + appears to "ignore" lower-ranked chunks may simply need a lower + temperature to attend to them. +4. At temperature=0.3, a local 7B model and a cloud API model converged + on similar quality (6 files cited, good coverage, mix of quotes and + paraphrase). The retrieval pipeline does most of the heavy lifting; + the generation model's job is to faithfully synthesize what was retrieved. + +**Testing method:** Hold retrieval constant (same query, same vector store, +same re-ranker, same top-15 chunks). Vary only the LLM and temperature. 
+Compare on: number of source files cited, whether lower-ranked chunks
+are used, faithfulness to source text, and total query time. Results
+saved in `tests/` with naming convention `results_<model>_t<temp>.txt`.
+
+---
+
+### LlamaIndex upgrade to 0.14.14
+
+Upgraded LlamaIndex from 0.13.1 to 0.14.14 to add OpenAI API support.
+
+Installing `llama-index-llms-openai` pulled in `llama-index-core` 0.14.14, which
+was incompatible with the existing companion packages (all pinned to <0.14).
+Fixed by upgrading all companion packages together:
+
+```bash
+pip install --upgrade llama-index-embeddings-huggingface \
+    llama-index-readers-file llama-index-llms-ollama \
+    llama-index-retrievers-bm25
+```
+
+**Final package versions:**
+
+| Package | Version | Was |
+|---------|---------|-----|
+| llama-index-core | 0.14.14 | 0.13.1 |
+| llama-index-embeddings-huggingface | 0.6.1 | 0.6.0 |
+| llama-index-llms-ollama | 0.9.1 | 0.7.0 |
+| llama-index-llms-openai | 0.6.18 | new |
+| llama-index-readers-file | 0.5.6 | 0.5.0 |
+| llama-index-retrievers-bm25 | 0.6.5 | unchanged |
+| llama-index-workflows | 2.14.2 | 1.3.0 |
+
+Smoke test: `retrieve_raw.py "mining towns"` — works, same results as before.
+No vector store rebuild needed. The existing store loaded fine with 0.14.
+
+---
+
+### Paragraph separator validation
+
+Checked whether `paragraph_separator="\n\n"` in `build_store.py` makes sense
+for the journal data.
+
+Results from scanning all 1,846 files in `./data/`:
+- 1,796 files (97%) use `\n\n` as paragraph boundaries
+- 28 files use single newlines only
+- 22 files have no newlines at all
+- Average paragraphs per file: 10.8 (median 7, range 0–206)
+- 900 files (49%) also use `---` as a topic/section separator
+
+The `\n\n` setting is correct. `SentenceSplitter` tries to break at
+`paragraph_separator` boundaries first, then falls back to sentence boundaries,
+then words. With 256-token chunks, this keeps semantically related sentences
+together within a paragraph.
+ +The `---` separators are already surrounded by `\n\n` (e.g., `\n\n---\n\n`), so +they naturally act as break points too. No special handling needed. + +Note: `"\n\n"` is actually the default value for `paragraph_separator` in +LlamaIndex's `SentenceSplitter`. The explicit setting documents intent but is +functionally redundant. + +List-style entries with single newlines between items (e.g., `2001-09-14.txt`) +stay together within a chunk, which is desirable — lists shouldn't be split +line by line. + +--- + +## February 16, 2026 + +### Cross-encoder model caching for offline use + +Cached the cross-encoder model (`cross-encoder/ms-marco-MiniLM-L-12-v2`) in +`./models/` for offline use. Previously, `HuggingFaceEmbedding` already used +`cache_folder="./models"` with `local_files_only=True` for the embedding model, +but the cross-encoder (loaded via `SentenceTransformerRerank` → `CrossEncoder`) +had no `cache_folder` parameter and would fail offline when it tried to phone +home for updates. + +**Fix:** all scripts that use the cross-encoder now set two environment variables +before model initialization: +```python +os.environ["SENTENCE_TRANSFORMERS_HOME"] = "./models" +os.environ["HF_HUB_OFFLINE"] = "1" +``` + +`SENTENCE_TRANSFORMERS_HOME` directs the `CrossEncoder` to look in `./models/` +for cached weights. `HF_HUB_OFFLINE` prevents any network access attempt. 
+ +The model was cached using `huggingface_hub.snapshot_download()`: +```python +from huggingface_hub import snapshot_download +snapshot_download('cross-encoder/ms-marco-MiniLM-L-12-v2', cache_dir='./models') +``` + +**Models now in `./models/`:** +- `models--BAAI--bge-large-en-v1.5` (embedding, bi-encoder) +- `models--cross-encoder--ms-marco-MiniLM-L-12-v2` (re-ranker, cross-encoder) +- `models--sentence-transformers--all-mpnet-base-v2` (old embedding, kept) + +--- + +## February 15, 2026 + +### Design note on `search_keywords.py` + +The POS tagger has a fundamental limitation: it was trained on declarative +prose, not imperative queries. A query like "Find passages that mention Louis +Menand" causes the tagger to classify "find" and "mention" as nouns (NN) +rather than verbs, because the imperative sentence structure is unusual in +its training data. This floods results with false positives (304 matches +across 218 files instead of the handful mentioning Menand). + +More fundamentally: for term-based searches, the POS tagging layer adds +minimal value over bare grep. If the input is "Louis Menand", POS tagging +extracts "louis menand" — identical to what grep would match. The tool's +real value is not the NLP layer but the convenience wrapper: searching all +files at once, joining multi-word proper nouns, sorting by match count, and +showing context around matches. It's essentially a formatted multi-file grep. + +Possible future direction: merge keyword search results with semantic search +results. The keyword pipeline catches exact names, places, and dates that +embeddings miss, while the semantic pipeline catches thematic relevance that +keywords miss. A hybrid approach could combine both result sets, using keyword +matches to boost or supplement vector retrieval. This connects to the BM25 +hybrid retrieval idea (to-do item 4). + +### New scripts: `query_hybrid_bm25_v4.py` and `retrieve_hybrid_raw.py` + +Implemented BM25 hybrid retrieval (to-do item 4). 
Both scripts run two +retrievers in parallel on the same query: +- **Vector retriever:** top-20 by cosine similarity (semantic meaning) +- **BM25 retriever:** top-20 by term frequency (exact lexical matching) + +Results are merged and deduplicated by node ID, then passed to the +cross-encoder re-ranker (`ms-marco-MiniLM-L-12-v2`) → top-15. + +`query_hybrid_bm25_v4.py` feeds the re-ranked chunks to the LLM (same v3 +prompt and command-r7b model). `retrieve_hybrid_raw.py` outputs the raw +chunks with source annotations: `[vector-only]`, `[bm25-only]`, or +`[vector+bm25]`, showing which retriever nominated each result. + +The BM25 retriever uses `BM25Retriever.from_defaults(index=index)` from +`llama-index-retrievers-bm25` (v0.6.5). It indexes the nodes already +stored in the persisted vector store — no separate build step needed. + +**Key idea:** BM25's job is only to nominate candidates that vector similarity +might miss (exact names, dates, specific terms). The cross-encoder decides +final relevance regardless of where candidates came from. + +--- + +## February 12, 2026 + +Updated vector store, now 4816 chunks. + +Scope of a language model based search: LLMs can summarize, but lack the +ability to critically read and compare information. ChatGPT can summarize +the literature that I've cited, but it cannot critique it. (It could +generate from published critiques.) Our ability to critically read and +synthesize from literature is an important skill. (Most reviews fall far +short, simply aggregating "advances" without asking why, how, or whether +they are real or not.) 
+ +--- + +## February 11, 2026 + +### Project tidy-up and cross-encoder re-ranking (v3) + +Tidied up the project with Claude Code: +- Generated `README.md` and `CLAUDE.md` documentation +- Archived superseded scripts (v1 query engines, old build scripts, `shared/`, + `experimental/query_multitool.py`) +- Removed stale `storage_exp` copy (Aug 2025 backup, ~105 MB) +- Removed empty `shared/` and `experimental/` directories + +Created `query_topk_prompt_engine_v3.py`: adds cross-encoder re-ranking. + +**The idea:** the current pipeline (v2) uses a bi-encoder (`BAAI/bge-large-en-v1.5`) +that encodes query and chunks independently, then compares via cosine similarity. +This is fast but approximate — the query and chunk never "see" each other. + +A cross-encoder takes the query and chunk as a single concatenated input, with +full attention between all tokens. It scores the pair jointly, which captures +nuance that dot-product similarity misses (paraphrase, negation, indirect +relevance). The tradeoff is speed: you can't pre-compute scores. + +**v3 uses a two-stage approach:** +1. Retrieve top-30 via bi-encoder (fast, approximate) +2. Re-rank to top-15 with cross-encoder (slow, precise) +3. Pass re-ranked chunks to LLM for synthesis + +Cross-encoder model: `cross-encoder/ms-marco-MiniLM-L-6-v2` (~80 MB, 6 layers). +Trained on MS MARCO passage ranking. Should add only a few seconds to query time +for 30 candidates. + +### Bi-encoder vs cross-encoder + +**Bi-encoder (what the pipeline had):** +The embedding model (`BAAI/bge-large-en-v1.5`) encodes the query and each chunk +independently into vectors. Similarity is a dot product between two vectors that +were computed separately. This is fast — you can pre-compute all chunk vectors +once at build time and just compare against the query vector at search time. But +because query and chunk never "see" each other during encoding, the model can +miss subtle relevance signals. 
+ +**Cross-encoder (what v3 adds):** +A cross-encoder takes the query and a chunk as a single input pair: +`[query, chunk]` concatenated together. It reads both simultaneously through the +transformer, with full attention between every token in the query and every token +in the chunk. It outputs a single relevance score. This is much more accurate +because the model can reason about the specific relationship between your question +and the passage — word overlap, paraphrase, negation, context. + +The tradeoff: it's slow. You can't pre-compute anything because the score depends +on the specific query. Scoring 4,692 chunks this way would take too long. + +**Why the two-stage approach works:** +``` +4,692 chunks → bi-encoder (fast, approximate) → top 30 + top 30 → cross-encoder (slow, precise) → top 15 + top 15 → LLM synthesis → response +``` + +**Concrete example:** If you search "times the author felt conflicted about career +choices," the bi-encoder might rank a chunk about "job satisfaction" highly because +the vectors are close. But a chunk that says "I couldn't decide whether to stay or +leave" — without using the word "career" — might score lower in vector space. The +cross-encoder, reading both query and chunk together, would recognize that "couldn't +decide whether to stay or leave" is highly relevant to "felt conflicted about career +choices." + +### Prompt update for v3 + +Updated the v3 prompt to account for re-ranked context. Changes: +- Tells the LLM the context is from a "personal journal collection" and has been + "selected and ranked for relevance" +- "Examine ALL provided excerpts, not just the top few" — counters single-file + collapse seen in initial testing +- "When multiple files touch on the query, note what each one contributes" — + encourages breadth across sources +- "End with a list of all files that contributed" — stronger than v2's vague + "list all relevant source files" + +Also updated `run_query.sh` to point to v3. 
+ +### v3 test results + +**Query: "Passages that describe mining towns."** +- Response cited 2 passages from `2023-03-15.txt` (coal mining, great-grandfather) +- Source documents included 7 distinct files across 15 chunks +- Top cross-encoder score: -1.177 (`2025-09-14.txt`) +- LLM focused on `2023-03-15.txt` which had the most explicit mining content +- Query time: 76 seconds +- Note: cross-encoder scores are raw logits (negative), not 0–1 cosine similarity + +**Query: "I am looking for entries that discuss memes and cognition."** +- Response cited 6 distinct files with specific content from each: + `2025-07-14` (Dennett/Blackmore on memes), `2023-09-20` (Hurley model), + `2024-03-24` (multiple drafts model), `2021-04-25` (consciousness discussion), + `2026-01-08` (epistemological frameworks), `2025-03-10` (Extended Mind Theory) +- Top cross-encoder score: 4.499 (`2026-01-08.txt`) — clear separation from rest +- LLM drew from chunks ranked 3rd, 4th, 5th, 12th, and 15th — confirming it + examines the full context, not just top hits +- Query time: 71 seconds + +**Observations:** +- The v3 prompt produces much better multi-source synthesis than v2's prompt +- Cross-encoder scores show clear separation between strong and weak matches +- The re-ranker + new prompt together encourage breadth across files +- Query time comparable to v2 (~70–80 seconds) + +### Cross-encoder model comparison + +Tested three cross-encoder models on the same query ("Discussions of Kondiaronk +and the Wendats") to compare re-ranking behavior. + +**1. cross-encoder/ms-marco-MiniLM-L-12-v2 (baseline)** +- Scores: raw logits, wide spread (top score 3.702) +- Clear separation between strong and weak matches +- Balanced ranking: `2025-06-07.txt` #1, `2025-07-28.txt` #2, `2024-12-25.txt` #3 +- Query time: ~70–80 seconds +- Trained on MS MARCO passage ranking (query → relevant passage) + +**2. 
cross-encoder/stsb-roberta-base** +- Scores: 0.308 to 0.507 — very compressed range (0.199 spread) +- Poor differentiation: model can't clearly separate relevant from irrelevant +- Pulled in `2019-07-03.txt` at #2 (not in L-12 results), dropped `2024-12-25.txt` +- Query time: 92 seconds +- Trained on STS Benchmark (semantic similarity, not passage ranking) — + wrong task for re-ranking. Measures "are these texts about the same thing?" + rather than "is this passage a good answer to this query?" + +**3. BAAI/bge-reranker-v2-m3** +- Scores: calibrated probabilities (0–1). Sharp top (0.812), then 0.313, 0.262… + Bottom 6 chunks at 0.001 (model says: not relevant at all) +- Very confident about #1 (`2025-07-28.txt` at 0.812), but long zero tail +- 5 of 15 chunks from `2025-07-28.txt` — heavy concentration on one file +- Query time: 125 seconds (50% slower than L-12) +- Multilingual model, larger than ms-marco MiniLM variants + +**Summary:** + +| Model | Score spread | Speed | Differentiation | +|-------|-------------|-------|-----------------| +| ms-marco-MiniLM-L-12-v2 | Wide (logits) | ~70–80s | Good, balanced | +| BAAI/bge-reranker-v2-m3 | Sharp top/zeros | ~125s | Confident #1, weak tail | +| stsb-roberta-base | Compressed | ~92s | Poor | + +**Decision:** ms-marco-MiniLM-L-12-v2 is the best fit. Purpose-built for passage +ranking, fastest of the three, and produces balanced rankings with good score +separation. The BAAI model's zero-tail problem means 6 of 15 chunks are dead +weight in the context window (could be mitigated by lowering `RERANK_TOP_N` or +adding a score cutoff, but adds complexity for marginal gain). The stsb model +is simply wrong for this task — semantic similarity ≠ passage relevance. + +### New scripts: `retrieve_raw.py` and `search_keywords.py` + +**`retrieve_raw.py`** — Verbatim chunk retrieval, no LLM. 
Uses the LlamaIndex +retriever API instead of the query engine: + +```python +# v3 uses as_query_engine() — full pipeline including LLM synthesis +query_engine = index.as_query_engine( + similarity_top_k=30, + text_qa_template=PROMPT, + node_postprocessors=[reranker], +) +response = query_engine.query(q) # returns LLM-generated text + +# retrieve_raw.py uses as_retriever() — stops after retrieval +retriever = index.as_retriever(similarity_top_k=30) +nodes = retriever.retrieve(q) # returns raw NodeWithScore objects +reranked = reranker.postprocess_nodes(nodes, query_str=q) +``` + +The key distinction: `as_query_engine()` wraps retrieval + synthesis into one +call (retriever → node postprocessors → response synthesizer → LLM). +`as_retriever()` returns just the retriever component, giving back the raw +nodes with their text and metadata. The re-ranker's `postprocess_nodes()` +method can still be called manually on the retrieved nodes. + +Each node has: +- `node.get_content()` — the chunk text +- `node.metadata` — dict with `file_name`, `file_path`, etc. +- `node.score` — similarity or re-ranker score + +This separation is useful for inspecting what the pipeline retrieves before +the LLM processes it, and for building alternative output formats. + +**`search_keywords.py`** — Keyword search via NLTK POS tagging. Completely +separate from the vector store pipeline. Extracts nouns (NN, NNS, NNP, NNPS) +and adjectives (JJ, JJR, JJS) from the query using `nltk.pos_tag()`, then +searches `./data/*.txt` with regex. Catches exact terms that embeddings miss. +NLTK data (`punkt_tab`, `averaged_perceptron_tagger_eng`) is auto-downloaded on +first run. + +--- + +## January 12, 2026 + +### Best practices for query rewriting + +1. **Understand the original intent:** Clarify the core intent behind the query. + Sometimes that means expanding a terse question into a more descriptive one, + or breaking a complex query into smaller, more focused sub-queries. + +2. 
**Leverage LlamaIndex's built-in rewriting tools:** LlamaIndex has query + transformation utilities that can help automatically rephrase or enrich + queries. Use them as a starting point and tweak the results. + +3. **Using a model to generate rewrites:** Have a language model generate a + "clarified" version of the query. Feed the model the initial query and + ask it to rephrase or add context. + +**Step-by-step approach:** +- **Initial query expansion:** Take the raw user query and expand it with + natural language context. +- **Model-assisted rewriting:** Use a model to generate alternate phrasings. + Prompt with something like, "Please rewrite this query in a more detailed + form for better retrieval results." +- **Testing and iteration:** Test rewritten versions and see which yield + the best matches. + +--- + +## January 1, 2026 + +Updated `storage_exp` by running `build_exp.py`. + +--- + +## September 6, 2025 + +Rebuilt `storage_exp`: 2048 embeddings. Took about 4 minutes. + +Need to experiment more with query rewrites. Save the query but match on +extracted terms? You can imagine an agent that decides between a search like +grep and a more semantic search. The search is not good at finding dates +("What did the author say on DATE") or when searching for certain terms +("What did the author say about libraries?"). + +--- + +## August 28, 2025 + +### Email embedding experiment + +Idea: given a strong (or top) hit, use this node to find similar chunks. + +Working with demo. Saved 294 emails from `president@udel.edu`. Embedding +these took nearly 45 minutes. The resulting vector store is larger than the +journals. The search is ok, but could be optimized by stripping the headers. + +To make the text files: +```bash +textutil -convert txt *.eml +``` + +The resulting text: 145,204 lines, 335,690 words, 9,425,696 characters total +(~9.4 MB of text). + +``` +$ python build.py +Parsing nodes: 100%|████████| 294/294 [00:31<00:00, 9.28it/s] +Generating embeddings: ... 
(19 batches of 2048) + +Total = 2,571 seconds = 42 minutes 51 seconds. +``` + +Vector store size: +``` +$ ls -lh storage/ +-rw-r--r-- 867M default__vector_store.json +-rw-r--r-- 100M docstore.json +-rw-r--r-- 18B graph_store.json +-rw-r--r-- 72B image__vector_store.json +-rw-r--r-- 3.1M index_store.json +``` + +That's a big vector store! The journals have a vector store that is only 90M +(an order of magnitude smaller) from a body of texts that is ~3 MB. + +After extracting just the text/html from the eml files: 21,313 lines, +130,901 words, 946,474 characters total — much smaller. Build time dropped +to ~1:15. Store size dropped to ~25 MB. + +--- + +## August 27, 2025 + +The wrapped query works great on the decwriter! Queries take about 83 seconds, +and sometimes up to 95 seconds if the model needs to be loaded. Longest query +so far (had to load all models) is 98 seconds. + +--- + +## August 26, 2025 + +- Started an "experimental" folder for combining semantic + LLM-guided regex search. +- Created an "archive" folder for older versions. +- Wrote a shell script wrapper and a version that takes input on the command line. + +Timed the retrieval (backup was running, so probably longer): +``` +real 1m20.971s +user 0m13.074s +sys 0m1.429s +``` + +--- + +## August 25, 2025 + +- Build a bash wrapper around the python query engine. The bash wrapper would + handle input and output. +- Expand the search to extract keywords and do a regex search on those. Can you + search the real text chunks and sort by a similarity calc? +- What if you returned more results and sorted these by a cluster grouping? + +--- + +## August 21, 2025 + +### HyDE experiments + +HyDE stands for Hypothetical Document Embeddings. + +Took out HyDE to test generation. Not sure HyDE is doing anything. Indeed, it is +not generating results that are any better or different than just using the +`BAAI/bge-large-en-v1.5` embedding model and a custom prompt. The BAAI/bge model +gives very good results! 
+
+**Compared llama3.1:8B with command-r7b.** Both are about the same size and give
+similar results. ChatGPT is pretty adamant that command-r7b will stick more to
+the retrieved content. This is reinforced by the following exercise:
+
+**command-r7b output** (RAG faithfulness test):
+> The last day you can file your 2023 taxes without incurring any penalties is
+> April 15th, 2024. This is the official filing deadline for the 2023 tax year.
+> Filing after this date will result in a late fee, with a 5% penalty per month
+> up to a maximum of 25%.
+
+**llama3.1:8B output:**
+>
+> April 15th, 2024.
+>
+> Note: The context only mentions the filing deadline and late fees, not any
+> possible extensions or exceptions.
+
+ChatGPT says: LLaMA 3 8B might answer correctly but add a guess like "extensions
+are available." Command R 7B is more likely to stay within the context boundaries.
+This is what we see.
+
+---
+
+## August 20, 2025
+
+### Prompt engineering
+
+Tried doing a query rewrite, but this is difficult. Reverted back. Got a pretty
+good result with this question:
+
+> "What would the author say about art vs. engineering?"
+
+A prompt that starts with "What would the author say..." or "What does the author
+say..." leads to higher similarity scores.
+
+Implemented the HyDE rewrite of the prompt and that seems to lead to better
+results, too.
+
+### Prompt comparison
+
+First prompt (research assistant, bulleted list):
+```
+"""You are a research assistant. You're given journal snippets (CONTEXT) and
+a user query. Your job is NOT to write an essay but to list the best-matching
+journal files with a 1–2 sentence rationale. ..."""
+```
+
+Second prompt (expert research assistant, theme + 10 files):
+```
+"""You are an expert research assistant. You are given top-ranked journal
+excerpts (CONTEXT) and a user's QUERY. ... Format your answer in two parts:
+1. Summary Theme 2. Matching Files (bullet list of 10)..."""
+```
+
+The second prompt provides better responses.
+ +### Chunk size experiments + +Experimenting with chunking. Using 512 and 10 overlap: 2412 vectors. Tried +512 tokens and 0 overlap. Changed the paragraph separator to `"\n\n"`. The +default is `"\n\n\n"` for some reason. + +Reduced chunks to 256 tokens to see if higher similarity scores result. It +decreased them a bit. Tried 384 tokens and 40 overlap. The 256 and 25 worked +better — restored. Will work on semantic gap with the query. + +### Embedding model switch + +Switched the embedding model to `BAAI/bge-large-en-v1.5`. It seems to do +better, although it requires more time to embed the vector store. +Interestingly, the variance of the embedding values is much lower. The +distribution is narrower, although the values skew in a different way. There +is a broader distribution of clusters in the vectors. + +--- + +## August 17, 2025 + +Working on the Jupyter notebook to measure stats of the vector store. + +Links: +- [Summarization](https://docs.llamaindex.ai/en/stable/understanding/putting_it_all_together/q_and_a/#summarization) +- [Querying](https://docs.llamaindex.ai/en/stable/understanding/querying/querying/) +- [Indexing](https://docs.llamaindex.ai/en/stable/understanding/indexing/indexing/) +- [API Reference](https://docs.llamaindex.ai/en/stable/api_reference/) + +--- + +## August 14, 2025 + +Ideas for the document search pipeline: +- Search by cosine similarity for semantic properties +- Generate search terms and search by regex — names, specific topics or words + +**Problem:** HuggingFace requires internet connection. +**Solution:** download locally. + +HuggingFace caches models at `~/.cache/huggingface/hub/`. It will redownload +them if forced to or if there is a model update. + +**Solution:** ran first (online), which downloaded to the local directory. 
+Then used `local_files_only=True` to run offline: +```python +embed_model = HuggingFaceEmbedding( + cache_folder="./models", + model_name="all-mpnet-base-v2", + local_files_only=True, +) +``` + +### LlamaIndex concepts + +- **Nodes:** chunks of text (paragraphs, sentences) extracted from documents. + Stored in the document store (e.g., `SimpleDocumentStore`), which keeps + track of the original text and metadata. +- **Vector store:** stores embeddings of nodes. Each entry corresponds to a + node's embedding vector. Query results include node IDs (or metadata) + that link back to the original nodes in the document store. +- Vector store entries are linked to their full content via metadata (e.g., node ID). + +--- + +## August 12, 2025 + +Want to understand the vector store better: +- Is it effective? Are queries effective? +- How many entries are there? +- Why doesn't it find Katie Hafner, but it does find Jimmy Soni? + +Query results are improved with a better prompt. Increased top-k to 50 to +give the model more text to draw from. But it hallucinates at the end of +longer responses. + +The `SimilarityPostprocessor` with `similarity_cutoff=0.78` returned nothing. +The similarity scores must be very low. + +Performance is difficult to tune. Sometimes the models work and sometimes +they don't. Multiple models loaded simultaneously causes issues — use +`ollama ps` and `ollama stop MODEL_NAME`. + +--- + +## August 10, 2025 + +### Project start + +Files made today: `build.py`, `query_topk.py`, `query.py`. + +Build a semantic search of journal texts: +- Ingest all texts and metadata +- Search and return relevant text and file information + +Created `.venv` environment: +```bash +python3 -m venv .venv +pip install llama-index-core llama-index-readers-file \ + llama-index-llms-ollama llama-index-embeddings-huggingface +``` + +Ran `build.py` successfully and generated store. `SimpleDirectoryReader` +stores the filename and file path as metadata. 
+
+**Model comparison (initial):** llama3.1:8B, deepseek-r1:8B, gemma3:1b.
+Can't get past a fairly trivial query engine right now. These aren't very
+powerful models. Need to keep testing and see what happens.
diff --git a/run_retrieve.sh b/run_retrieve.sh
new file mode 100755
index 0000000..c56d584
--- /dev/null
+++ b/run_retrieve.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+# Interactive wrapper around the python retrieval engine: reads queries from
+# stdin, runs retrieve.py, and formats the results for the terminal.
+
+# E.M.F. August 2025
+
+# Usage: ./run_retrieve.sh
+
+QUERY_SCRIPT="retrieve.py"
+
+echo -e "$QUERY_SCRIPT -- retrieve vector store chunks based on similarity + BM25 with reranking.\n"
+
+# Prompt repeatedly; an empty query, "exit", or "quit" ends the session
+while true; do
+    read -r -p "Enter your query (or type 'exit' to quit): " query
+    if [[ "$query" == "exit" || "$query" == "quit" || -z "$query" ]]; then
+        echo "Exiting..."
+        break
+    fi
+    time_start=$(date +%s)
+
+    # Run the retriever; expand tabs, shorten data paths to ./data, soft-wrap
+    python3 "$QUERY_SCRIPT" --query "$query" | \
+        expand | sed -E 's|(.* )(.*/data)|\1./data|' | fold -s -w 131
+
+    time_end=$(date +%s)
+    elapsed=$((time_end - time_start))
+    echo -e "Query processed in $elapsed seconds.\n"
+done