From eb9997326fa43559aa1d652f3d2477d88e2b4cb0 Mon Sep 17 00:00:00 2001 From: Eric Furst Date: Sun, 1 Mar 2026 07:39:28 -0500 Subject: [PATCH] Shell script run_retrieve.sh for non-LLM gneration queries (returns only chunks), track development notes and README. --- README.md | 4 +- deploy_public.sh | 41 +- devlog.md | 1035 ++++++++++++++++++++++++++++++++++++++++++++++ run_retrieve.sh | 29 ++ 4 files changed, 1089 insertions(+), 20 deletions(-) create mode 100644 devlog.md create mode 100755 run_retrieve.sh diff --git a/README.md b/README.md index 8e4ff30..47a127c 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ ssearch/ ├── archived/ # Superseded script versions ├── saved_output/ # Saved query results and model comparisons ├── requirements.txt # Python dependencies -├── devlog.txt # Development log and experimental findings +├── devlog.md # Development log and experimental findings └── *.ipynb # Jupyter notebooks (HyDE, metrics, sandbox) ``` @@ -198,4 +198,4 @@ Three Jupyter notebooks document exploration and analysis: - **Jan 2026**: Command-line interface, prompt improvements, model comparison (command-r7b selected). - **Feb 2026**: Cross-encoder re-ranking, hybrid BM25+vector retrieval, LlamaIndex upgrade to 0.14.14, OpenAI API backend, incremental updates, clippings search (ChromaDB), project reorganization. -See `devlog.txt` for detailed development notes and experimental findings. +See `devlog.md` for detailed development notes and experimental findings. diff --git a/deploy_public.sh b/deploy_public.sh index 9240f1b..33f2fbf 100755 --- a/deploy_public.sh +++ b/deploy_public.sh @@ -3,9 +3,11 @@ # # Usage: ./deploy_public.sh ["optional commit message"] # -# Checks out an orphan public branch, copies the public files from main, -# generates a public README (stripping private sections), commits, and -# force-pushes to origin. Then switches back to main. 
+# Checks out the public branch, updates it with public files from main, +# generates a public README (stripping private sections), commits if +# anything changed, and pushes to origin. Then switches back to main. +# +# On first run (no public branch exists), creates an orphan branch. # # E.M.F. February 2026 @@ -50,12 +52,14 @@ MAIN_HEAD=$(git rev-parse --short HEAD) echo "Deploying main ($MAIN_HEAD) -> $BRANCH..." -# Delete local public branch if it exists -git branch -D "$BRANCH" 2>/dev/null || true - -# Create fresh orphan -git checkout --orphan "$BRANCH" -git rm -rf . >/dev/null 2>&1 || true +# Check out public branch, or create orphan if it doesn't exist yet +if git show-ref --verify --quiet "refs/heads/$BRANCH"; then + git checkout "$BRANCH" +else + echo "No local $BRANCH branch — creating orphan..." + git checkout --orphan "$BRANCH" + git rm -rf . >/dev/null 2>&1 || true +fi # Copy public files from main for f in "${PUBLIC_FILES[@]}"; do @@ -74,22 +78,23 @@ awk ' skip { next } /archived\// { next } /saved_output\// { next } -/devlog\.txt/ { next } +/devlog\.md/ { next } /\*\.ipynb/ { next } { print } ' README.md > README.tmp && mv README.tmp README.md # Stage only the public files (not untracked files on disk) git add "${PUBLIC_FILES[@]}" README.md -git commit -m "$COMMIT_MSG -Co-Authored-By: Claude Opus 4.6 " - -# Push -git push --force "$REMOTE" "$BRANCH" +# Commit only if there are changes +if git diff --cached --quiet; then + echo "No changes to deploy." +else + git commit -m "$COMMIT_MSG" + git push "$REMOTE" "$BRANCH" + echo "" + echo "Done. Deployed main ($MAIN_HEAD) -> $REMOTE/$BRANCH" +fi # Switch back to main git checkout main - -echo "" -echo "Done. 
Deployed main ($MAIN_HEAD) -> $REMOTE/$BRANCH" diff --git a/devlog.md b/devlog.md new file mode 100644 index 0000000..055f620 --- /dev/null +++ b/devlog.md @@ -0,0 +1,1035 @@ +# ssearch development log + +## Active files (after Feb 27 reorganization) + +- `build_store.py` — build/update journal vector store (incremental) +- `query_hybrid.py` — hybrid BM25 + vector query with LLM synthesis +- `retrieve.py` — hybrid verbatim chunk retrieval (no LLM) +- `search_keywords.py` — keyword search via POS-based term extraction +- `run_query.sh` — shell wrapper for interactive querying +- `clippings_search/build_clippings.py` — build/update clippings vector store (ChromaDB) +- `clippings_search/retrieve_clippings.py` — verbatim clippings retrieval +- `deploy_public.sh` — deploy public files to Forgejo + +Earlier scripts moved to `archived/`: +`build.py`, `build_exp.py`, `query_topk.py`, `query_catalog.py`, `query_exp.py`, +`query_topk_prompt.py`, `query_topk_prompt_engine.py`, `query_topk_prompt_dw.py`, +`query_rewrite_hyde.py`, `query_multitool.py`, `shared/build.py`, `shared/query.py`, +`vs_metrics.py`, `claude_diagnostic.py`, `query_claude_sonnet.py`, `query_tree.py`, +`query_topk_prompt_engine_v3.py`, `retrieve_raw.py` + +## Best configuration + +- **Embedding**: BAAI/bge-large-en-v1.5, 256 token chunks, 25 token overlap +- **Re-ranker**: cross-encoder/ms-marco-MiniLM-L-12-v2 (retrieve top-30, re-rank to top-15) +- **LLM**: command-r7b via Ollama (temperature 0.3). OpenAI gpt-4o-mini available as alternative. +- **Retrieval**: hybrid BM25 + vector, cross-encoder re-ranked + +## To do + +1. [DONE] Test v3 (cross-encoder re-ranking) and compare results with v2. + Selected ms-marco-MiniLM-L-12-v2 after testing three models. + +2. [DONE] Verbatim retrieval mode (`retrieve_raw.py`). Uses + `index.as_retriever()` instead of `index.as_query_engine()` to get + chunks without LLM synthesis. 
Re-ranks with the same cross-encoder, + then outputs raw chunk text with metadata and scores. + +3. [DONE] Keyword search pipeline (`search_keywords.py`). Extracts + nouns and adjectives via NLTK POS tagging, then greps data files. + Complements vector search for exact names, places, dates. + +4. [DONE] BM25 hybrid retrieval (sparse + dense). Two scripts: + `query_hybrid.py` (with LLM synthesis) and `retrieve.py` + (verbatim chunks, no LLM). Both run BM25 (top-20) and vector (top-20) + retrievers, merge/deduplicate, then cross-encoder re-rank to top-15. + Uses llama-index-retrievers-bm25. + +5. Explore query expansion (multiple phrasings, merged retrieval) + +6. Explore different vector store strategies (database) + +7. [DONE] Test ChatGPT API for final LLM generation (instead of local Ollama) + +8. [DONE] Remove API key from this file. Moved to `~/.bashrc` as `OPENAI_API_KEY`. + + The retrieval pipeline (embedding, vector search, cross-encoder re-ranking) + stays the same. Only the final synthesis LLM changes. + + **Steps:** + 1. Install the LlamaIndex OpenAI integration: + ``` + pip install llama-index-llms-openai + ``` + 2. Set API key as environment variable: + ``` + export OPENAI_API_KEY="sk-..." + ``` + (Or store in a `.env` file and load with python-dotenv. Do NOT commit + the key to version control.) + 3. In the query script, replace the Ollama LLM with OpenAI: + ```python + # Current (local): + from llama_index.llms.ollama import Ollama + Settings.llm = Ollama( + model="command-r7b", + request_timeout=360.0, + context_window=8000, + ) + + # New (API): + from llama_index.llms.openai import OpenAI + Settings.llm = OpenAI( + model="gpt-4o-mini", # or "gpt-4o" for higher quality + temperature=0.1, + ) + ``` + 4. Run the query script as usual. Everything else (embedding model, + vector store, cross-encoder re-ranker, prompt) is unchanged. + 5. Compare output quality and response time against command-r7b. 
+ + Models to try: gpt-4o-mini (cheap, fast), gpt-4o (better quality). + The prompt should work without modification since it's model-agnostic — + just context + instructions. + + Note: This adds an external API dependency and per-query cost. + The embedding and re-ranking remain fully local/offline. + + API KEY: moved to `~/.bashrc` as `OPENAI_API_KEY` (do not store in repo) + + **Getting an OpenAI API key:** + 1. Go to https://platform.openai.com/ and sign up (or log in). + 2. Navigate to API keys: Settings > API keys (or https://platform.openai.com/api-keys). + 3. Click "Create new secret key", give it a name, and copy it. + The key starts with `sk-` and is shown only once. + 4. Add billing: Settings > Billing. Load a small amount ($5-10) + to start. API calls are pay-per-use, not a subscription. + 5. Set the key in your shell before running a query: + ``` + export OPENAI_API_KEY="sk-..." + ``` + Or add to `~/.zshrc` (or `~/.bashrc`) to persist across sessions. + Do NOT commit the key to version control or put it in scripts. + + **Approximate cost per query (Feb 2026):** + - gpt-4o-mini: ~$0.001-0.003 (15 chunks of context) + - gpt-4o: ~$0.01-0.03 + +--- + +## February 27, 2026 + +### Project reorganization + +Reorganized the project structure with Claude Code. Goals: drop legacy version +numbers from filenames, archive superseded scripts, group clippings search into +a subdirectory, and clean up storage directory names. 
+**Script renames:**
+- `build_exp_claude.py` → `build_store.py`
+- `query_hybrid_bm25_v4.py` → `query_hybrid.py`
+- `retrieve_hybrid_raw.py` → `retrieve.py`
+
+**Archived (moved to `archived/`):**
+- `query_topk_prompt_engine_v3.py` — superseded by hybrid BM25+vector query
+- `retrieve_raw.py` — superseded by hybrid retrieval
+
+**Clippings search subdirectory:**
+- `build_clippings.py` → `clippings_search/build_clippings.py`
+- `retrieve_clippings.py` → `clippings_search/retrieve_clippings.py`
+- Scripts use `./` paths relative to project root, so no path changes needed
+  when run as `python clippings_search/build_clippings.py` from root.
+
+**Storage renames:**
+- `storage_exp/` → `store/` (journal vector store)
+- `storage_clippings/` → `clippings_search/store_clippings/` (clippings vector store)
+- Deleted unused `storage/` (original August 2025 store, never updated)
+
+**Updated references** in `run_query.sh`, `.gitignore`, `CLAUDE.md`, `README.md`,
+and all Python scripts that referenced old storage paths.
+
+### Deploy script (`deploy_public.sh`)
+
+Created `deploy_public.sh` to automate publishing to Forgejo. Previously,
+maintaining the public branch required manually recreating an orphan branch,
+copying files, editing the README, and force-pushing — error-prone and tedious.
+
+The script:
+1. Checks that we're on `main` with no uncommitted changes
+2. Deletes the local public branch and creates a fresh orphan
+3. Copies listed public files from `main` (via `git checkout main -- <file>`)
+4. Generates a public README by stripping private sections (Notebooks,
+   Development history) and private file references using `awk`
+5. Stages only the listed files (not untracked files on disk)
+6. Commits with a message and force-pushes to `origin/public`
+7. Switches back to `main`
+
+Fixed a bug where `git add .` picked up untracked files (`output_test.txt`,
+`run_retrieve.sh`). Changed to `git add "${PUBLIC_FILES[@]}" README.md`.
+ +### Forgejo setup + +Set up SSH push to Forgejo instance. Required adding SSH public key to Forgejo +user settings. The remote uses a Tailscale address. + +### MIT License + +Added MIT License (Copyright (c) 2026 E. M. Furst) to both main and public branches. + +### Devlog migration + +Migrated `devlog.txt` to `devlog.md` with markdown formatting. + +--- + +## February 20, 2026 + +### Offline use: environment variables must be set before imports + +Despite setting `HF_HUB_OFFLINE=1` and `SENTENCE_TRANSFORMERS_HOME=./models` +(added Feb 16), the scripts still failed offline with a `ConnectionError` trying +to reach huggingface.co. The error came from `AutoTokenizer.from_pretrained()` +calling `list_repo_templates()`, which makes an HTTP request to the HuggingFace API. + +**Root cause:** the `huggingface_hub` library evaluates `HF_HUB_OFFLINE` at import +time, not at call time. The constant is set once in `huggingface_hub/constants.py`: + +```python +HF_HUB_OFFLINE = _is_true(os.environ.get("HF_HUB_OFFLINE") + or os.environ.get("TRANSFORMERS_OFFLINE")) +``` + +In all four scripts, the `os.environ` lines came AFTER the imports: + +```python +from llama_index.embeddings.huggingface import HuggingFaceEmbedding # triggers import of huggingface_hub +from llama_index.core.postprocessor import SentenceTransformerRerank +import os + +os.environ["HF_HUB_OFFLINE"] = "1" # too late, constant already False +``` + +By the time `os.environ` was set, `huggingface_hub` had already imported and locked +the constant to `False`. The env var existed in the process environment but the +library never re-read it. + +**Fix:** moved `import os` and all three `os.environ` calls to the top of each file, +before any llama_index or huggingface imports: + +```python +import os +os.environ["TOKENIZERS_PARALLELISM"] = "false" +os.environ["SENTENCE_TRANSFORMERS_HOME"] = "./models" +os.environ["HF_HUB_OFFLINE"] = "1" + +from llama_index.core import ... 
# now these see the env vars +from llama_index.embeddings.huggingface import HuggingFaceEmbedding +``` + +Updated scripts: `query_topk_prompt_engine_v3.py`, `retrieve_raw.py`, +`query_hybrid_bm25_v4.py`, `retrieve_hybrid_raw.py`. + +**General lesson for offline HuggingFace use:** + +The HuggingFace ecosystem has multiple libraries that check for offline mode: +- `huggingface_hub`: reads `HF_HUB_OFFLINE` (or `TRANSFORMERS_OFFLINE`) at import +- `transformers`: delegates to huggingface_hub's constant +- `sentence-transformers`: delegates to huggingface_hub's constant + +All of them evaluate the flag ONCE at module load time. This means: +1. `os.environ` must be set before ANY import that touches `huggingface_hub` +2. Setting the env var in a "Globals" section after imports does NOT work +3. Even indirect imports count — `llama_index.embeddings.huggingface` + transitively imports `huggingface_hub`, so the flag must precede it +4. Alternatively, set the env var in the shell before running Python: + ```bash + export HF_HUB_OFFLINE=1 + ``` + This always works because it's set before any Python code runs. +5. The newer `transformers` library (v4.50+) added `list_repo_templates()` in + `AutoTokenizer.from_pretrained()`, which makes network calls that weren't + present in earlier versions. This is why the Feb 16 fix worked initially + (or appeared to) but broke after a package update. + +This is a common pitfall for anyone running HuggingFace models offline (e.g., +on a laptop without network, air-gapped environments, or behind restrictive +firewalls). The models are cached locally and work fine — but the library +still tries to check for updates unless the offline flag is set correctly. + +--- + +### Incremental vector store updates + +Added incremental update mode to `build_store.py` (then `build_exp_claude.py`). +Previously the script rebuilt the entire vector store from scratch every run +(~1848 files). 
Now it defaults to incremental mode: loads the existing index, +compares against `./data`, and only processes new, modified, or deleted files. + +**Usage:** +```bash +python build_store.py # incremental update (default) +python build_store.py --rebuild # full rebuild from scratch +``` + +**How it works:** +- The LlamaIndex docstore (`store/docstore.json`) already tracks every + indexed document with metadata: `file_name`, `file_size`, `last_modified_date`. +- The script scans `./data/*.txt` and classifies each file: + - **New:** `file_name` not in docstore → insert + - **Modified:** `file_size` or `last_modified_date` differs → delete + re-insert + - **Deleted:** in docstore but not on disk → delete + - **Unchanged:** skip +- Uses `index.insert()` and `index.delete_ref_doc()` from the LlamaIndex API. +- The same `SentenceSplitter` (256 tokens, 25 overlap) is applied via + `Settings.transformations` so chunks match the original build. + +**Timing:** incremental update with nothing to do takes ~17s (loading the index). +Full rebuild takes several minutes. First incremental run after a stale index +found 8 new files and 204 modified files, completed in ~65s. + +**Important detail:** `SimpleDirectoryReader` converts file timestamps to UTC +(`datetime.fromtimestamp(mtime, tz=timezone.utc)`) before formatting as +`YYYY-MM-DD`. The comparison logic must use UTC too, or files modified late in +the day will show as "modified" due to the date rolling forward in UTC. This +caused a false-positive bug on the first attempt. + +This enables running the build as a cron job to keep the vector store current +as new journal entries are added. + +--- + +## February 18, 2026 + +### LLM comparison: gpt-4o-mini (OpenAI API) vs command-r7b (local Ollama) + +Test query: "Passages that quote Louis Menand." (hybrid BM25+vector, v4) +Retrieval was identical (same 15 chunks, same scores) — only synthesis differs. +Results saved in `tests/results_openai.txt` and `tests/results_commandr7b.txt`. 
+ +**gpt-4o-mini:** +- Cited 6 files (2025-11-04, 2025-02-14, 2022-08-14, 2025-07-27, + 2025-02-05, 2024-09-04). Drew from chunks ranked as low as #14. +- Better at distinguishing direct quotes from paraphrases and indirect + references. Provided a structured summary with numbered entries. +- 44 seconds total (most of that is local retrieval/re-ranking; the + API call itself is nearly instant). + +**command-r7b:** +- Cited 2 files (2025-11-04, 2022-08-14). Focused on the top-scored + chunks and ignored lower-ranked ones. +- Pulled out actual quotes verbatim as block quotes — more useful if + you want the exact text rather than a summary. +- 78 seconds total. + +**Summary:** gpt-4o-mini is broader (more sources, better use of the full +context window) and nearly 2x faster. command-r7b is more focused and +reproduces exact quotes. Both correctly identified the core passages. +The quality difference is noticeable but not dramatic — the retrieval +pipeline does most of the heavy lifting. + +### Temperature experiments + +The gpt-4o-mini test used temperature=0.1 (nearly deterministic). command-r7b +via Ollama defaults to temperature=0.8 — so the two models were tested at very +different temperatures, which may account for some of the stylistic difference. + +**Temperature guidance for RAG synthesis:** + +| Range | Behavior | Use case | +|-------|----------|----------| +| 0.0–0.1 | Nearly deterministic. Picks highest-probability tokens. | Factual extraction, consistency. Can "tunnel vision." | +| 0.3–0.5 | Moderate. More varied phrasing, draws connections across chunks. | Good middle ground for RAG (prompt already constrains context). | +| 0.7–1.0 | Creative/varied. Riskier for RAG — may paraphrase loosely. | Not ideal for faithfulness to source text. | + +**Follow-up: temperature=0.3 for both models (same query, same retrieval)** + +**command-r7b at 0.3 (was 0.8):** Major improvement. Cited 6 files (was 2). +Drew from lower-ranked chunks including #15. 
Used the full context window +instead of fixating on top hits. Took 94s (was 78s) due to more output. + +**gpt-4o-mini at 0.3 (was 0.1):** Nearly identical to 0.1 run. Same 6 files, +same structure. Slightly more interpretive phrasing but no meaningful +change. This model is less sensitive to temperature for RAG synthesis. + +**Key finding:** Temperature is a critical but often overlooked parameter when +evaluating the generation stage of a RAG pipeline. In our tests, a local 7B model +(command-r7b) went from citing 2 sources to 6 — a 3x improvement in context +utilization — simply by lowering temperature from 0.8 to 0.3. At the higher +temperature, the model "wandered" during generation, focusing on the most salient +chunks and producing repetitive output. At the lower temperature, it methodically +worked through the full context window. + +**Implications for RAG evaluation methodology:** +1. When comparing LLMs for RAG synthesis, temperature must be controlled + across models. Our initial comparison (gpt-4o-mini at 0.1 vs + command-r7b at 0.8 default) overstated the quality gap between models. +2. The "right" temperature for RAG is lower than for open-ended generation. + The prompt and retrieved context already constrain the task; high + temperature adds noise rather than creativity. +3. Temperature affects context utilization, not just style. A model that + appears to "ignore" lower-ranked chunks may simply need a lower + temperature to attend to them. +4. At temperature=0.3, a local 7B model and a cloud API model converged + on similar quality (6 files cited, good coverage, mix of quotes and + paraphrase). The retrieval pipeline does most of the heavy lifting; + the generation model's job is to faithfully synthesize what was retrieved. + +**Testing method:** Hold retrieval constant (same query, same vector store, +same re-ranker, same top-15 chunks). Vary only the LLM and temperature. 
+Compare on: number of source files cited, whether lower-ranked chunks
+are used, faithfulness to source text, and total query time. Results
+saved in `tests/` with naming convention `results_<model>_t<temp>.txt`.
+
+---
+
+### LlamaIndex upgrade to 0.14.14
+
+Upgraded LlamaIndex from 0.13.1 to 0.14.14 to add OpenAI API support.
+
+Installing `llama-index-llms-openai` pulled in `llama-index-core` 0.14.14, which
+was incompatible with the existing companion packages (all pinned to <0.14).
+Fixed by upgrading all companion packages together:
+
+```bash
+pip install --upgrade llama-index-embeddings-huggingface \
+    llama-index-readers-file llama-index-llms-ollama \
+    llama-index-retrievers-bm25
+```
+
+**Final package versions:**
+
+| Package | Version | Was |
+|---------|---------|-----|
+| llama-index-core | 0.14.14 | 0.13.1 |
+| llama-index-embeddings-huggingface | 0.6.1 | 0.6.0 |
+| llama-index-llms-ollama | 0.9.1 | 0.7.0 |
+| llama-index-llms-openai | 0.6.18 | new |
+| llama-index-readers-file | 0.5.6 | 0.5.0 |
+| llama-index-retrievers-bm25 | 0.6.5 | unchanged |
+| llama-index-workflows | 2.14.2 | 1.3.0 |
+
+Smoke test: `retrieve_raw.py "mining towns"` — works, same results as before.
+No vector store rebuild needed. The existing store loaded fine with 0.14.
+
+---
+
+### Paragraph separator validation
+
+Checked whether `paragraph_separator="\n\n"` in `build_store.py` makes sense
+for the journal data.
+
+Results from scanning all 1,846 files in `./data/`:
+- 1,796 files (97%) use `\n\n` as paragraph boundaries
+- 28 files use single newlines only
+- 22 files have no newlines at all
+- Average paragraphs per file: 10.8 (median 7, range 0–206)
+- 900 files (49%) also use `---` as a topic/section separator
+
+The `\n\n` setting is correct. `SentenceSplitter` tries to break at
+`paragraph_separator` boundaries first, then falls back to sentence boundaries,
+then words. With 256-token chunks, this keeps semantically related sentences
+together within a paragraph.
+ +The `---` separators are already surrounded by `\n\n` (e.g., `\n\n---\n\n`), so +they naturally act as break points too. No special handling needed. + +Note: `"\n\n"` is actually the default value for `paragraph_separator` in +LlamaIndex's `SentenceSplitter`. The explicit setting documents intent but is +functionally redundant. + +List-style entries with single newlines between items (e.g., `2001-09-14.txt`) +stay together within a chunk, which is desirable — lists shouldn't be split +line by line. + +--- + +## February 16, 2026 + +### Cross-encoder model caching for offline use + +Cached the cross-encoder model (`cross-encoder/ms-marco-MiniLM-L-12-v2`) in +`./models/` for offline use. Previously, `HuggingFaceEmbedding` already used +`cache_folder="./models"` with `local_files_only=True` for the embedding model, +but the cross-encoder (loaded via `SentenceTransformerRerank` → `CrossEncoder`) +had no `cache_folder` parameter and would fail offline when it tried to phone +home for updates. + +**Fix:** all scripts that use the cross-encoder now set two environment variables +before model initialization: +```python +os.environ["SENTENCE_TRANSFORMERS_HOME"] = "./models" +os.environ["HF_HUB_OFFLINE"] = "1" +``` + +`SENTENCE_TRANSFORMERS_HOME` directs the `CrossEncoder` to look in `./models/` +for cached weights. `HF_HUB_OFFLINE` prevents any network access attempt. 
+ +The model was cached using `huggingface_hub.snapshot_download()`: +```python +from huggingface_hub import snapshot_download +snapshot_download('cross-encoder/ms-marco-MiniLM-L-12-v2', cache_dir='./models') +``` + +**Models now in `./models/`:** +- `models--BAAI--bge-large-en-v1.5` (embedding, bi-encoder) +- `models--cross-encoder--ms-marco-MiniLM-L-12-v2` (re-ranker, cross-encoder) +- `models--sentence-transformers--all-mpnet-base-v2` (old embedding, kept) + +--- + +## February 15, 2026 + +### Design note on `search_keywords.py` + +The POS tagger has a fundamental limitation: it was trained on declarative +prose, not imperative queries. A query like "Find passages that mention Louis +Menand" causes the tagger to classify "find" and "mention" as nouns (NN) +rather than verbs, because the imperative sentence structure is unusual in +its training data. This floods results with false positives (304 matches +across 218 files instead of the handful mentioning Menand). + +More fundamentally: for term-based searches, the POS tagging layer adds +minimal value over bare grep. If the input is "Louis Menand", POS tagging +extracts "louis menand" — identical to what grep would match. The tool's +real value is not the NLP layer but the convenience wrapper: searching all +files at once, joining multi-word proper nouns, sorting by match count, and +showing context around matches. It's essentially a formatted multi-file grep. + +Possible future direction: merge keyword search results with semantic search +results. The keyword pipeline catches exact names, places, and dates that +embeddings miss, while the semantic pipeline catches thematic relevance that +keywords miss. A hybrid approach could combine both result sets, using keyword +matches to boost or supplement vector retrieval. This connects to the BM25 +hybrid retrieval idea (to-do item 4). + +### New scripts: `query_hybrid_bm25_v4.py` and `retrieve_hybrid_raw.py` + +Implemented BM25 hybrid retrieval (to-do item 4). 
Both scripts run two +retrievers in parallel on the same query: +- **Vector retriever:** top-20 by cosine similarity (semantic meaning) +- **BM25 retriever:** top-20 by term frequency (exact lexical matching) + +Results are merged and deduplicated by node ID, then passed to the +cross-encoder re-ranker (`ms-marco-MiniLM-L-12-v2`) → top-15. + +`query_hybrid_bm25_v4.py` feeds the re-ranked chunks to the LLM (same v3 +prompt and command-r7b model). `retrieve_hybrid_raw.py` outputs the raw +chunks with source annotations: `[vector-only]`, `[bm25-only]`, or +`[vector+bm25]`, showing which retriever nominated each result. + +The BM25 retriever uses `BM25Retriever.from_defaults(index=index)` from +`llama-index-retrievers-bm25` (v0.6.5). It indexes the nodes already +stored in the persisted vector store — no separate build step needed. + +**Key idea:** BM25's job is only to nominate candidates that vector similarity +might miss (exact names, dates, specific terms). The cross-encoder decides +final relevance regardless of where candidates came from. + +--- + +## February 12, 2026 + +Updated vector store, now 4816 chunks. + +Scope of a language model based search: LLMs can summarize, but lack the +ability to critically read and compare information. ChatGPT can summarize +the literature that I've cited, but it cannot critique it. (It could +generate from published critiques.) Our ability to critically read and +synthesize from literature is an important skill. (Most reviews fall far +short, simply aggregating "advances" without asking why, how, or whether +they are real or not.) 
+ +--- + +## February 11, 2026 + +### Project tidy-up and cross-encoder re-ranking (v3) + +Tidied up the project with Claude Code: +- Generated `README.md` and `CLAUDE.md` documentation +- Archived superseded scripts (v1 query engines, old build scripts, `shared/`, + `experimental/query_multitool.py`) +- Removed stale `storage_exp` copy (Aug 2025 backup, ~105 MB) +- Removed empty `shared/` and `experimental/` directories + +Created `query_topk_prompt_engine_v3.py`: adds cross-encoder re-ranking. + +**The idea:** the current pipeline (v2) uses a bi-encoder (`BAAI/bge-large-en-v1.5`) +that encodes query and chunks independently, then compares via cosine similarity. +This is fast but approximate — the query and chunk never "see" each other. + +A cross-encoder takes the query and chunk as a single concatenated input, with +full attention between all tokens. It scores the pair jointly, which captures +nuance that dot-product similarity misses (paraphrase, negation, indirect +relevance). The tradeoff is speed: you can't pre-compute scores. + +**v3 uses a two-stage approach:** +1. Retrieve top-30 via bi-encoder (fast, approximate) +2. Re-rank to top-15 with cross-encoder (slow, precise) +3. Pass re-ranked chunks to LLM for synthesis + +Cross-encoder model: `cross-encoder/ms-marco-MiniLM-L-6-v2` (~80 MB, 6 layers). +Trained on MS MARCO passage ranking. Should add only a few seconds to query time +for 30 candidates. + +### Bi-encoder vs cross-encoder + +**Bi-encoder (what the pipeline had):** +The embedding model (`BAAI/bge-large-en-v1.5`) encodes the query and each chunk +independently into vectors. Similarity is a dot product between two vectors that +were computed separately. This is fast — you can pre-compute all chunk vectors +once at build time and just compare against the query vector at search time. But +because query and chunk never "see" each other during encoding, the model can +miss subtle relevance signals. 
+ +**Cross-encoder (what v3 adds):** +A cross-encoder takes the query and a chunk as a single input pair: +`[query, chunk]` concatenated together. It reads both simultaneously through the +transformer, with full attention between every token in the query and every token +in the chunk. It outputs a single relevance score. This is much more accurate +because the model can reason about the specific relationship between your question +and the passage — word overlap, paraphrase, negation, context. + +The tradeoff: it's slow. You can't pre-compute anything because the score depends +on the specific query. Scoring 4,692 chunks this way would take too long. + +**Why the two-stage approach works:** +``` +4,692 chunks → bi-encoder (fast, approximate) → top 30 + top 30 → cross-encoder (slow, precise) → top 15 + top 15 → LLM synthesis → response +``` + +**Concrete example:** If you search "times the author felt conflicted about career +choices," the bi-encoder might rank a chunk about "job satisfaction" highly because +the vectors are close. But a chunk that says "I couldn't decide whether to stay or +leave" — without using the word "career" — might score lower in vector space. The +cross-encoder, reading both query and chunk together, would recognize that "couldn't +decide whether to stay or leave" is highly relevant to "felt conflicted about career +choices." + +### Prompt update for v3 + +Updated the v3 prompt to account for re-ranked context. Changes: +- Tells the LLM the context is from a "personal journal collection" and has been + "selected and ranked for relevance" +- "Examine ALL provided excerpts, not just the top few" — counters single-file + collapse seen in initial testing +- "When multiple files touch on the query, note what each one contributes" — + encourages breadth across sources +- "End with a list of all files that contributed" — stronger than v2's vague + "list all relevant source files" + +Also updated `run_query.sh` to point to v3. 
+ +### v3 test results + +**Query: "Passages that describe mining towns."** +- Response cited 2 passages from `2023-03-15.txt` (coal mining, great-grandfather) +- Source documents included 7 distinct files across 15 chunks +- Top cross-encoder score: -1.177 (`2025-09-14.txt`) +- LLM focused on `2023-03-15.txt` which had the most explicit mining content +- Query time: 76 seconds +- Note: cross-encoder scores are raw logits (negative), not 0–1 cosine similarity + +**Query: "I am looking for entries that discuss memes and cognition."** +- Response cited 6 distinct files with specific content from each: + `2025-07-14` (Dennett/Blackmore on memes), `2023-09-20` (Hurley model), + `2024-03-24` (multiple drafts model), `2021-04-25` (consciousness discussion), + `2026-01-08` (epistemological frameworks), `2025-03-10` (Extended Mind Theory) +- Top cross-encoder score: 4.499 (`2026-01-08.txt`) — clear separation from rest +- LLM drew from chunks ranked 3rd, 4th, 5th, 12th, and 15th — confirming it + examines the full context, not just top hits +- Query time: 71 seconds + +**Observations:** +- The v3 prompt produces much better multi-source synthesis than v2's prompt +- Cross-encoder scores show clear separation between strong and weak matches +- The re-ranker + new prompt together encourage breadth across files +- Query time comparable to v2 (~70–80 seconds) + +### Cross-encoder model comparison + +Tested three cross-encoder models on the same query ("Discussions of Kondiaronk +and the Wendats") to compare re-ranking behavior. + +**1. cross-encoder/ms-marco-MiniLM-L-12-v2 (baseline)** +- Scores: raw logits, wide spread (top score 3.702) +- Clear separation between strong and weak matches +- Balanced ranking: `2025-06-07.txt` #1, `2025-07-28.txt` #2, `2024-12-25.txt` #3 +- Query time: ~70–80 seconds +- Trained on MS MARCO passage ranking (query → relevant passage) + +**2. 
cross-encoder/stsb-roberta-base** +- Scores: 0.308 to 0.507 — very compressed range (0.199 spread) +- Poor differentiation: model can't clearly separate relevant from irrelevant +- Pulled in `2019-07-03.txt` at #2 (not in L-12 results), dropped `2024-12-25.txt` +- Query time: 92 seconds +- Trained on STS Benchmark (semantic similarity, not passage ranking) — + wrong task for re-ranking. Measures "are these texts about the same thing?" + rather than "is this passage a good answer to this query?" + +**3. BAAI/bge-reranker-v2-m3** +- Scores: calibrated probabilities (0–1). Sharp top (0.812), then 0.313, 0.262… + Bottom 6 chunks at 0.001 (model says: not relevant at all) +- Very confident about #1 (`2025-07-28.txt` at 0.812), but long zero tail +- 5 of 15 chunks from `2025-07-28.txt` — heavy concentration on one file +- Query time: 125 seconds (50% slower than L-12) +- Multilingual model, larger than ms-marco MiniLM variants + +**Summary:** + +| Model | Score spread | Speed | Differentiation | +|-------|-------------|-------|-----------------| +| ms-marco-MiniLM-L-12-v2 | Wide (logits) | ~70–80s | Good, balanced | +| BAAI/bge-reranker-v2-m3 | Sharp top/zeros | ~125s | Confident #1, weak tail | +| stsb-roberta-base | Compressed | ~92s | Poor | + +**Decision:** ms-marco-MiniLM-L-12-v2 is the best fit. Purpose-built for passage +ranking, fastest of the three, and produces balanced rankings with good score +separation. The BAAI model's zero-tail problem means 6 of 15 chunks are dead +weight in the context window (could be mitigated by lowering `RERANK_TOP_N` or +adding a score cutoff, but adds complexity for marginal gain). The stsb model +is simply wrong for this task — semantic similarity ≠ passage relevance. + +### New scripts: `retrieve_raw.py` and `search_keywords.py` + +**`retrieve_raw.py`** — Verbatim chunk retrieval, no LLM. 
Uses the LlamaIndex +retriever API instead of the query engine: + +```python +# v3 uses as_query_engine() — full pipeline including LLM synthesis +query_engine = index.as_query_engine( + similarity_top_k=30, + text_qa_template=PROMPT, + node_postprocessors=[reranker], +) +response = query_engine.query(q) # returns LLM-generated text + +# retrieve_raw.py uses as_retriever() — stops after retrieval +retriever = index.as_retriever(similarity_top_k=30) +nodes = retriever.retrieve(q) # returns raw NodeWithScore objects +reranked = reranker.postprocess_nodes(nodes, query_str=q) +``` + +The key distinction: `as_query_engine()` wraps retrieval + synthesis into one +call (retriever → node postprocessors → response synthesizer → LLM). +`as_retriever()` returns just the retriever component, giving back the raw +nodes with their text and metadata. The re-ranker's `postprocess_nodes()` +method can still be called manually on the retrieved nodes. + +Each node has: +- `node.get_content()` — the chunk text +- `node.metadata` — dict with `file_name`, `file_path`, etc. +- `node.score` — similarity or re-ranker score + +This separation is useful for inspecting what the pipeline retrieves before +the LLM processes it, and for building alternative output formats. + +**`search_keywords.py`** — Keyword search via NLTK POS tagging. Completely +separate from the vector store pipeline. Extracts nouns (NN, NNS, NNP, NNPS) +and adjectives (JJ, JJR, JJS) from the query using `nltk.pos_tag()`, then +searches `./data/*.txt` with regex. Catches exact terms that embeddings miss. +NLTK data (`punkt_tab`, `averaged_perceptron_tagger_eng`) is auto-downloaded on +first run. + +--- + +## January 12, 2026 + +### Best practices for query rewriting + +1. **Understand the original intent:** Clarify the core intent behind the query. + Sometimes that means expanding a terse question into a more descriptive one, + or breaking a complex query into smaller, more focused sub-queries. + +2. 
**Leverage LlamaIndex's built-in rewriting tools:** LlamaIndex has query + transformation utilities that can help automatically rephrase or enrich + queries. Use them as a starting point and tweak the results. + +3. **Using a model to generate rewrites:** Have a language model generate a + "clarified" version of the query. Feed the model the initial query and + ask it to rephrase or add context. + +**Step-by-step approach:** +- **Initial query expansion:** Take the raw user query and expand it with + natural language context. +- **Model-assisted rewriting:** Use a model to generate alternate phrasings. + Prompt with something like, "Please rewrite this query in a more detailed + form for better retrieval results." +- **Testing and iteration:** Test rewritten versions and see which yield + the best matches. + +--- + +## January 1, 2026 + +Updated `storage_exp` by running `build_exp.py`. + +--- + +## September 6, 2025 + +Rebuilt `storage_exp`: 2048 embeddings. Took about 4 minutes. + +Need to experiment more with query rewrites. Save the query but match on +extracted terms? You can imagine an agent that decides between a search like +grep and a more semantic search. The search is not good at finding dates +("What did the author say on DATE") or when searching for certain terms +("What did the author say about libraries?"). + +--- + +## August 28, 2025 + +### Email embedding experiment + +Idea: given a strong (or top) hit, use this node to find similar chunks. + +Working with demo. Saved 294 emails from `president@udel.edu`. Embedding +these took nearly 45 minutes. The resulting vector store is larger than the +journals. The search is ok, but could be optimized by stripping the headers. + +To make the text files: +```bash +textutil -convert txt *.eml +``` + +The resulting text: 145,204 lines, 335,690 words, 9,425,696 characters total +(~9.4 MB of text). + +``` +$ python build.py +Parsing nodes: 100%|████████| 294/294 [00:31<00:00, 9.28it/s] +Generating embeddings: ... 
(19 batches of 2048) + +Total = 2,571 seconds = 42 minutes 51 seconds. +``` + +Vector store size: +``` +$ ls -lh storage/ +-rw-r--r-- 867M default__vector_store.json +-rw-r--r-- 100M docstore.json +-rw-r--r-- 18B graph_store.json +-rw-r--r-- 72B image__vector_store.json +-rw-r--r-- 3.1M index_store.json +``` + +That's a big vector store! The journals have a vector store that is only 90M +(an order of magnitude smaller) from a body of texts that is ~3 MB. + +After extracting just the text/html from the eml files: 21,313 lines, +130,901 words, 946,474 characters total — much smaller. Build time dropped +to ~1:15. Store size dropped to ~25 MB. + +--- + +## August 27, 2025 + +The wrapped query works great on the decwriter! Queries take about 83 seconds, +and sometimes up to 95 seconds if the model needs to be loaded. Longest query +so far (had to load all models) is 98 seconds. + +--- + +## August 26, 2025 + +- Started an "experimental" folder for combining semantic + LLM-guided regex search. +- Created an "archive" folder for older versions. +- Wrote a shell script wrapper and a version that takes input on the command line. + +Timed the retrieval (backup was running, so probably longer): +``` +real 1m20.971s +user 0m13.074s +sys 0m1.429s +``` + +--- + +## August 25, 2025 + +- Build a bash wrapper around the python query engine. The bash wrapper would + handle input and output. +- Expand the search to extract keywords and do a regex search on those. Can you + search the real text chunks and sort by a similarity calc? +- What if you returned more results and sorted these by a cluster grouping? + +--- + +## August 21, 2025 + +### HyDE experiments + +HyDE stands for Hypothetical Document Embeddings. + +Took out HyDE to test generation. Not sure HyDE is doing anything. Indeed, it is +not generating results that are any better or different than just using the +`BAAI/bge-large-en-v1.5` embedding model and a custom prompt. The BAAI/bge model +gives very good results! 
+
+**Compared llama3.1:8B with command-r7b.** Both are about the same size and give
+similar results. ChatGPT is pretty adamant that command-r7b will stick more to
+the retrieved content. This is reinforced by the following exercise:
+
+**command-r7b output** (RAG faithfulness test):
+> The last day you can file your 2023 taxes without incurring any penalties is
+> April 15th, 2024. This is the official filing deadline for the 2023 tax year.
+> Filing after this date will result in a late fee, with a 5% penalty per month
+> up to a maximum of 25%.
+
+**llama3.1:8B output:**
+>
+> April 15th, 2024.
+>
+> Note: The context only mentions the filing deadline and late fees, not any
+> possible extensions or exceptions.
+
+ChatGPT says: LLaMA 3 8B might answer correctly but add a guess like "extensions
+are available." Command R 7B is more likely to stay within the context boundaries.
+This is what we see.
+
+---
+
+## August 20, 2025
+
+### Prompt engineering
+
+Tried doing a query rewrite, but this is difficult. Reverted back. Got a pretty
+good result with this question:
+
+> "What would the author say about art vs. engineering?"
+
+A prompt that starts with "What would the author say..." or "What does the author
+say..." leads to higher similarity scores.
+
+Implemented the HyDE rewrite of the prompt and that seems to lead to better
+results, too.
+
+### Prompt comparison
+
+First prompt (research assistant, bulleted list):
+```
+"""You are a research assistant. You're given journal snippets (CONTEXT) and
+a user query. Your job is NOT to write an essay but to list the best-matching
+journal files with a 1–2 sentence rationale. ..."""
+```
+
+Second prompt (expert research assistant, theme + 10 files):
+```
+"""You are an expert research assistant. You are given top-ranked journal
+excerpts (CONTEXT) and a user's QUERY. ... Format your answer in two parts:
+1. Summary Theme 2. Matching Files (bullet list of 10)..."""
+```
+
+The second prompt provides better responses.
+ +### Chunk size experiments + +Experimenting with chunking. Using 512 and 10 overlap: 2412 vectors. Tried +512 tokens and 0 overlap. Changed the paragraph separator to `"\n\n"`. The +default is `"\n\n\n"` for some reason. + +Reduced chunks to 256 tokens to see if higher similarity scores result. It +decreased them a bit. Tried 384 tokens and 40 overlap. The 256 and 25 worked +better — restored. Will work on semantic gap with the query. + +### Embedding model switch + +Switched the embedding model to `BAAI/bge-large-en-v1.5`. It seems to do +better, although it requires more time to embed the vector store. +Interestingly, the variance of the embedding values is much lower. The +distribution is narrower, although the values skew in a different way. There +is a broader distribution of clusters in the vectors. + +--- + +## August 17, 2025 + +Working on the Jupyter notebook to measure stats of the vector store. + +Links: +- [Summarization](https://docs.llamaindex.ai/en/stable/understanding/putting_it_all_together/q_and_a/#summarization) +- [Querying](https://docs.llamaindex.ai/en/stable/understanding/querying/querying/) +- [Indexing](https://docs.llamaindex.ai/en/stable/understanding/indexing/indexing/) +- [API Reference](https://docs.llamaindex.ai/en/stable/api_reference/) + +--- + +## August 14, 2025 + +Ideas for the document search pipeline: +- Search by cosine similarity for semantic properties +- Generate search terms and search by regex — names, specific topics or words + +**Problem:** HuggingFace requires internet connection. +**Solution:** download locally. + +HuggingFace caches models at `~/.cache/huggingface/hub/`. It will redownload +them if forced to or if there is a model update. + +**Solution:** ran first (online), which downloaded to the local directory. 
+Then used `local_files_only=True` to run offline: +```python +embed_model = HuggingFaceEmbedding( + cache_folder="./models", + model_name="all-mpnet-base-v2", + local_files_only=True, +) +``` + +### LlamaIndex concepts + +- **Nodes:** chunks of text (paragraphs, sentences) extracted from documents. + Stored in the document store (e.g., `SimpleDocumentStore`), which keeps + track of the original text and metadata. +- **Vector store:** stores embeddings of nodes. Each entry corresponds to a + node's embedding vector. Query results include node IDs (or metadata) + that link back to the original nodes in the document store. +- Vector store entries are linked to their full content via metadata (e.g., node ID). + +--- + +## August 12, 2025 + +Want to understand the vector store better: +- Is it effective? Are queries effective? +- How many entries are there? +- Why doesn't it find Katie Hafner, but it does find Jimmy Soni? + +Query results are improved with a better prompt. Increased top-k to 50 to +give the model more text to draw from. But it hallucinates at the end of +longer responses. + +The `SimilarityPostprocessor` with `similarity_cutoff=0.78` returned nothing. +The similarity scores must be very low. + +Performance is difficult to tune. Sometimes the models work and sometimes +they don't. Multiple models loaded simultaneously causes issues — use +`ollama ps` and `ollama stop MODEL_NAME`. + +--- + +## August 10, 2025 + +### Project start + +Files made today: `build.py`, `query_topk.py`, `query.py`. + +Build a semantic search of journal texts: +- Ingest all texts and metadata +- Search and return relevant text and file information + +Created `.venv` environment: +```bash +python3 -m venv .venv +pip install llama-index-core llama-index-readers-file \ + llama-index-llms-ollama llama-index-embeddings-huggingface +``` + +Ran `build.py` successfully and generated store. `SimpleDirectoryReader` +stores the filename and file path as metadata. 
+
+**Model comparison (initial):** llama3.1:8B, deepseek-r1:8B, gemma3:1b.
+Can't get past a fairly trivial query engine right now. These aren't very
+powerful models. Need to keep testing and see what happens.
diff --git a/run_retrieve.sh b/run_retrieve.sh
new file mode 100755
index 0000000..c56d584
--- /dev/null
+++ b/run_retrieve.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+# Interactive wrapper around the python retrieval engine: reads queries from
+# stdin, runs retrieve.py, and formats the results for the terminal.
+
+# E.M.F. August 2025
+
+# Usage: ./run_retrieve.sh
+
+QUERY_SCRIPT="retrieve.py"
+
+echo -e "$QUERY_SCRIPT -- retrieve vector store chunks based on similarity + BM25 with reranking.\n"
+
+# Prompt repeatedly; an empty query, "exit", or "quit" ends the session
+while true; do
+    read -r -p "Enter your query (or type 'exit' to quit): " query
+    if [[ "$query" == "exit" || "$query" == "quit" || -z "$query" ]]; then
+        echo "Exiting..."
+        break
+    fi
+    time_start=$(date +%s)
+
+    # Run the retriever; expand tabs, shorten data paths to ./data, soft-wrap
+    python3 "$QUERY_SCRIPT" --query "$query" | \
+        expand | sed -E 's|(.* )(.*/data)|\1./data|' | fold -s -w 131
+
+    time_end=$(date +%s)
+    elapsed=$((time_end - time_start))
+    echo -e "Query processed in $elapsed seconds.\n"
+done