From 8310553f890280ca9f553795c5b280a909fd0f38 Mon Sep 17 00:00:00 2001
From: Eric <emfurst@users.noreply.github.com>
Date: Sun, 22 Feb 2026 09:03:03 -0500
Subject: [PATCH] RAG pipeline for semantic search over personal archives

Journal and clippings search with LlamaIndex, HuggingFace embeddings,
cross-encoder re-ranking, and local LLM inference via Ollama. Clippings
index uses ChromaDB for persistent vector storage.
---
 .gitignore                     |  37 +++
 README.md                      | 208 +++++++++++++++
 build_clippings.py             | 471 +++++++++++++++++++++++++++++++++
 build_exp_claude.py            | 193 ++++++++++++++
 query_hybrid_bm25_v4.py        | 176 ++++++++++++
 query_topk_prompt_engine_v3.py | 136 ++++++++++
 requirements.txt               | 216 +++++++++++++++
 retrieve_clippings.py          | 138 ++++++++++
 retrieve_hybrid_raw.py         | 140 ++++++++++
 retrieve_raw.py                |  97 +++++++
 run_query.sh                   |  30 +++
 search_keywords.py             | 189 +++++++++++++
 12 files changed, 2031 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100644 build_clippings.py
 create mode 100644 build_exp_claude.py
 create mode 100644 query_hybrid_bm25_v4.py
 create mode 100644 query_topk_prompt_engine_v3.py
 create mode 100644 requirements.txt
 create mode 100644 retrieve_clippings.py
 create mode 100644 retrieve_hybrid_raw.py
 create mode 100644 retrieve_raw.py
 create mode 100755 run_query.sh
 create mode 100644 search_keywords.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..7e4dde6
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,37 @@
+# Python
+.venv/
+__pycache__/
+*.pyc
+
+# HuggingFace cached models (large, ~2 GB)
+models/
+
+# Vector stores (large, rebuild with build scripts)
+storage_exp/
+storage/
+storage_clippings/
+
+# Data (symlinks to private files)
+data
+clippings
+
+# Generated file lists
+ocr_needed.txt
+
+# IDE and OS
+.DS_Store
+.vscode/
+.idea/
+
+# Jupyter checkpoints
+.ipynb_checkpoints/
+
+# Secrets
+.env
+API_key_temp
+
+# Query log
+query.log
+
+# Duplicate of CLAUDE.md
+claude.md
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..823c7a4
--- /dev/null
+++ b/README.md
@@ -0,0 +1,208 @@
+# ssearch
+
+Semantic search over a personal journal archive. Uses vector embeddings and a local LLM to find and synthesize information across 1800+ dated text entries spanning 2000-2025.
+
+## How it works
+
+```
+Query → Embed (BAAI/bge-large-en-v1.5) → Vector similarity (top-30) → Cross-encoder re-rank (top-15) → LLM synthesis (command-r7b via Ollama, or OpenAI API) → Response + sources
+```
+
+1. **Build**: Journal entries in `./data` are chunked (256 tokens, 25-token overlap) and embedded into a vector store using LlamaIndex. Supports incremental updates (new/modified files only) or full rebuilds.
+2. **Retrieve**: A user query is embedded with the same model and matched against stored vectors by cosine similarity, returning the top 30 candidate chunks.
+3. **Re-rank**: A cross-encoder (`cross-encoder/ms-marco-MiniLM-L-12-v2`) scores each (query, chunk) pair jointly and keeps the top 15.
+4. **Synthesize**: The re-ranked chunks are passed to a local LLM with a custom prompt that encourages multi-source synthesis, producing a grounded answer with file citations.
+
+## Project structure
+
+```
+ssearch/
+├── build_exp_claude.py             # Build/update vector store (incremental by default)
+├── query_topk_prompt_engine_v3.py  # Main query engine (cross-encoder re-ranking)
+├── query_topk_prompt_engine_v2.py  # Previous query engine (no re-ranking)
+├── retrieve_raw.py                 # Verbatim chunk retrieval (no LLM)
+├── query_hybrid_bm25_v4.py         # Hybrid BM25 + vector query (v4)
+├── retrieve_hybrid_raw.py          # Hybrid verbatim retrieval (no LLM)
+├── search_keywords.py              # Keyword search via POS-based term extraction
+├── run_query.sh                    # Shell wrapper with timing and logging
+├── data/                           # Symlink to ../text/ (journal .txt files)
+├── storage_exp/                    # Persisted vector store (~242 MB)
+├── models/                         # Cached HuggingFace models (embedding + cross-encoder, offline)
+├── archived/                       # Earlier iterations and prototypes
+├── saved_output/                   # Saved query results and model comparisons
+├── requirements.txt                # Python dependencies (pip freeze)
+├── NOTES.md                        # Similarity metric reference
+├── devlog.txt                      # Development log and experimental findings
+└── *.ipynb                         # Jupyter notebooks (HyDE, metrics, sandbox)
+```
+
+## Setup
+
+**Prerequisites**: Python 3.12, [Ollama](https://ollama.com) with `command-r7b` pulled.
+
+```bash
+cd ssearch
+python3 -m venv .venv
+source .venv/bin/activate
+pip install -r requirements.txt
+```
+
+The `data/` symlink should point to `../text/` (the journal archive). The embedding model (`BAAI/bge-large-en-v1.5`) and cross-encoder (`cross-encoder/ms-marco-MiniLM-L-12-v2`) are cached in `./models/` for offline use.
+
+### Offline model loading
+
+All query scripts set three environment variables before importing anything; together they disable tokenizer parallelism, point HuggingFace at the local `./models` cache, and block network requests:
+
+```python
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+os.environ["SENTENCE_TRANSFORMERS_HOME"] = "./models"
+os.environ["HF_HUB_OFFLINE"] = "1"
+```
+
+**These must appear before any imports that touch HuggingFace libraries.** The `huggingface_hub` library evaluates `HF_HUB_OFFLINE` once at import time (in `huggingface_hub/constants.py`). If the env var is set after imports, the library will still attempt network access and fail offline. This is a common pitfall -- `llama_index.embeddings.huggingface` transitively imports `huggingface_hub`, so even indirect imports trigger the evaluation.
+
+Alternatively, set the variable in your shell before running Python:
+```bash
+export HF_HUB_OFFLINE=1
+python query_hybrid_bm25_v4.py "your query"
+```
+
+## Usage
+
+### Build the vector store
+
+```bash
+# Incremental update (default): only processes new, modified, or deleted files
+python build_exp_claude.py
+
+# Full rebuild from scratch
+python build_exp_claude.py --rebuild
+```
+
+The default incremental mode loads the existing index, compares file sizes and modification dates against the docstore, and only re-indexes what changed. A full rebuild (`--rebuild`) is only needed when chunk parameters or the embedding model change.
+
+### Search
+
+Three categories of search are available, from heaviest (semantic + LLM) to lightest (grep).
+
+#### Semantic search with LLM synthesis
+
+These scripts embed the query, retrieve candidate chunks from the vector store, re-rank with a cross-encoder, and pass the top results to a local LLM that synthesizes a grounded answer with file citations. **Requires Ollama running with `command-r7b`.**
+
+**Vector-only** (`query_topk_prompt_engine_v3.py`): Retrieves the top 30 chunks by cosine similarity, re-ranks to top 15, synthesizes.
+```bash
+python query_topk_prompt_engine_v3.py "What does the author say about creativity?"
+```
+
+**Hybrid BM25 + vector** (`query_hybrid_bm25_v4.py`): Retrieves top 20 by vector similarity and top 20 by BM25 term frequency, merges and deduplicates, re-ranks the union to top 15, synthesizes. Catches exact name/term matches that vector-only retrieval misses.
+```bash
+python query_hybrid_bm25_v4.py "Louis Menand"
+```
+
+**Interactive wrapper** (`run_query.sh`): Repeatedly prompts for a query, runs it with the v3 engine, displays timing, and appends queries to `query.log`.
+```bash
+./run_query.sh
+```
+
+#### Verbatim chunk retrieval (no LLM)
+
+These scripts run the same retrieval and re-ranking pipeline but output the raw chunk text instead of passing it to an LLM. Useful for inspecting what the retrieval pipeline finds, or when Ollama is not available. **No Ollama needed.**
+
+**Vector-only** (`retrieve_raw.py`): Top-30 vector retrieval, cross-encoder re-rank to top 15, raw output.
+```bash
+python retrieve_raw.py "Kondiaronk and the Wendats"
+```
+
+**Hybrid BM25 + vector** (`retrieve_hybrid_raw.py`): Same hybrid retrieval as v4 but outputs raw chunks. Each chunk is annotated with its source: `[vector-only]`, `[bm25-only]`, or `[vector+bm25]`.
+```bash
+python retrieve_hybrid_raw.py "Louis Menand"
+```
+
+Pipe either to `less` for browsing.
+
+#### Keyword search (no vector store, no LLM)
+
+**`search_keywords.py`**: Extracts nouns and adjectives from the query using NLTK POS tagging, then greps `./data/*.txt` for matches with surrounding context. A lightweight fallback when you want exact string matching without the vector store. **No vector store or Ollama needed.**
+```bash
+python search_keywords.py "Discussions of Kondiaronk and the Wendats"
+```
+
+### Output format
+
+```
+Response:
+<LLM-synthesized answer citing specific files>
+
+Source documents:
+2024-03-15.txt  ./data/2024-03-15.txt  0.683
+2023-11-02.txt  ./data/2023-11-02.txt  0.651
+...
+```
+
+## Configuration
+
+Key parameters (set in source files):
+
+| Parameter | Value | Location |
+|-----------|-------|----------|
+| Embedding model | `BAAI/bge-large-en-v1.5` | `build_exp_claude.py`, `query_topk_prompt_engine_v3.py` |
+| Chunk size | 256 tokens | `build_exp_claude.py` |
+| Chunk overlap | 25 tokens | `build_exp_claude.py` |
+| Paragraph separator | `\n\n` | `build_exp_claude.py` |
+| Initial retrieval | 30 chunks | `query_topk_prompt_engine_v3.py` |
+| Re-rank model | `cross-encoder/ms-marco-MiniLM-L-12-v2` | `query_topk_prompt_engine_v3.py` |
+| Re-rank top-n | 15 | `query_topk_prompt_engine_v3.py` |
+| LLM | `command-r7b` (Ollama) or `gpt-4o-mini` (OpenAI API) | `query_topk_prompt_engine_v3.py`, `query_hybrid_bm25_v4.py` |
+| Temperature | 0.3 (recommended for both local and API models) | `query_topk_prompt_engine_v3.py`, `query_hybrid_bm25_v4.py` |
+| Context window | 8000 tokens | `query_topk_prompt_engine_v3.py` |
+| Request timeout | 360 seconds | `query_topk_prompt_engine_v3.py` |
+
+## Key dependencies
+
+- **llama-index-core** (0.14.14) -- RAG framework
+- **llama-index-embeddings-huggingface** (0.6.1) -- embedding integration
+- **llama-index-llms-ollama** (0.9.1) -- local LLM via Ollama
+- **llama-index-llms-openai** (0.6.18) -- OpenAI API LLM (optional, for API-based synthesis)
+- **llama-index-readers-file** (0.5.6) -- file readers
+- **llama-index-retrievers-bm25** (0.6.5) -- BM25 sparse retrieval for hybrid search
+- **sentence-transformers** (5.1.0) -- embedding model support
+- **torch** (2.8.0) -- ML runtime
+
+## Notebooks
+
+Three Jupyter notebooks document exploration and analysis:
+
+- **`hyde.ipynb`** -- Experiments with HyDE (Hypothetical Document Embeddings) query rewriting. Tests whether generating a hypothetical answer to a query and embedding that instead improves retrieval. Uses LlamaIndex's `HyDEQueryTransform` with `llama3.1:8B`. Finding: the default HyDE prompt produced a rich hypothetical passage, but the technique did not improve retrieval quality over direct prompt engineering. This informed the decision to drop HyDE from the pipeline.
+
+- **`sandbox.ipynb`** -- Exploratory notebook for learning the LlamaIndex API. Inspects the `llama_index.core` module (104 objects), lists available classes and methods, and reads the source of `VectorStoreIndex`. Useful as a quick reference for what LlamaIndex exposes.
+
+- **`vs_metrics.ipynb`** -- Quantitative analysis of the vector store. Loads the persisted index (4,692 vectors, 1024 dimensions each from `BAAI/bge-large-en-v1.5`) and produces:
+  - Distribution of embedding values (histogram)
+  - Heatmap of the full embedding matrix
+  - Embedding vector magnitude distribution
+  - Per-dimension variance (which dimensions carry more signal)
+  - Pairwise cosine similarity distribution and heatmap (subset)
+  - Hierarchical clustering dendrogram (Ward linkage)
+  - PCA and t-SNE 2D projections of the embedding space
+
+## Design decisions
+
+- **BAAI/bge-large-en-v1.5 over all-mpnet-base-v2**: Better semantic matching quality for journal text despite slower embedding.
+- **256-token chunks**: Tested 512 and 384; 256 with 25-token overlap produced the highest quality matches.
+- **command-r7b over llama3.1:8B**: Sticks closer to provided context with less hallucination at comparable speed.
+- **Top-k=15**: Wide enough to capture diverse perspectives, narrow enough to fit the context window.
+- **Cross-encoder re-ranking (v3)**: Retrieve top-30 via bi-encoder, re-rank to top-15 with a cross-encoder that scores each (query, chunk) pair jointly. More accurate than bi-encoder similarity alone. Tested three models; `ms-marco-MiniLM-L-12-v2` selected over `stsb-roberta-base` (wrong task -- semantic similarity, not passage ranking) and `BAAI/bge-reranker-v2-m3` (50% slower, weak score tail).
+- **HyDE query rewriting tested and dropped**: Did not improve results over direct prompt engineering.
+- **V3 prompt**: Adapted for re-ranked context -- tells the LLM all excerpts have been curated, encourages examining every chunk and noting what each file contributes. Produces better multi-source synthesis than v2's prompt.
+- **V2 prompt**: More flexible and query-adaptive than v1, which forced rigid structure (exactly 10 files, mandatory theme).
+- **Verbatim retrieval (`retrieve_raw.py`)**: Uses LlamaIndex's `index.as_retriever()` instead of `index.as_query_engine()`. The retriever returns raw `NodeWithScore` objects (chunk text, metadata, scores) without invoking the LLM. The re-ranker is applied manually via `reranker.postprocess_nodes()`. This separation lets you inspect what the pipeline retrieves before synthesis.
+- **Keyword search (`search_keywords.py`)**: NLTK POS tagging extracts nouns and adjectives from the query -- a middle ground between naive stopword removal and LLM-based term extraction. Catches exact names, places, and dates that vector similarity misses.
+- **Hybrid BM25 + vector retrieval (v4)**: Runs two retrievers in parallel -- BM25 (top-20 by term frequency) and vector similarity (top-20 by cosine) -- merges and deduplicates candidates, then lets the cross-encoder re-rank the union to top-15. BM25 nominates candidates with exact term matches that embeddings miss; the cross-encoder decides final relevance. Uses `BM25Retriever.from_defaults(index=index)` from `llama-index-retrievers-bm25`, which indexes the nodes already stored in the persisted vector store.
+
+## Development history
+
+- **Aug 2025**: Initial implementation -- build pipeline, embedding model comparison, chunk size experiments, HyDE testing, prompt v1.
+- **Jan 2026**: Command-line interface, v2 prompt, error handling improvements, model comparison (command-r7b selected).
+- **Feb 2026**: Project tidy-up, cross-encoder re-ranking (v3), v3 prompt for multi-source synthesis, cross-encoder model comparison (L-12 selected), archived superseded scripts. Hybrid BM25 + vector retrieval (v4). Upgraded LlamaIndex from 0.13.1 to 0.14.14; added OpenAI API as optional LLM backend (`llama-index-llms-openai`). Incremental vector store updates (default mode in `build_exp_claude.py`). Fixed offline HuggingFace model loading (env vars must precede imports).
+
+See `devlog.txt` for detailed development notes and experimental findings.
diff --git a/build_clippings.py b/build_clippings.py
new file mode 100644
index 0000000..b808e92
--- /dev/null
+++ b/build_clippings.py
@@ -0,0 +1,471 @@
+# build_clippings.py
+#
+# Build or update the ChromaDB vector store from clippings in ./clippings.
+#
+# Default mode (incremental): loads the existing index, adds new or modified
+# files, and removes deleted ones.  Use --rebuild for a full rebuild from scratch.
+#
+# Handles PDFs, TXT, webarchive, and RTF files. Skips non-extractable PDFs
+# and writes them to ocr_needed.txt for later OCR processing.
+#
+# February 2026
+# E. M. Furst
+
+# Environment vars must be set before importing huggingface/transformers
+# libraries, because huggingface_hub.constants evaluates HF_HUB_OFFLINE
+# at import time.
+import os
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+os.environ["SENTENCE_TRANSFORMERS_HOME"] = "./models"
+os.environ["HF_HUB_OFFLINE"] = "1"
+
+import chromadb
+from llama_index.core import (
+    SimpleDirectoryReader,
+    StorageContext,
+    VectorStoreIndex,
+    Settings,
+    Document,
+)
+from llama_index.vector_stores.chroma import ChromaVectorStore
+from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+from llama_index.core.node_parser import SentenceSplitter
+from pathlib import Path
+import argparse
+import datetime
+import time
+
+# Shared constants
+DATA_DIR = Path("./clippings")  # root directory scanned recursively by scan_clippings()
+PERSIST_DIR = "./storage_clippings"  # path handed to chromadb.PersistentClient
+COLLECTION_NAME = "clippings"  # ChromaDB collection that holds the chunk vectors
+EMBED_MODEL_NAME = "BAAI/bge-large-en-v1.5"  # loaded from the ./models cache in main()
+CHUNK_SIZE = 256  # passed to SentenceSplitter as chunk_size
+CHUNK_OVERLAP = 25  # passed to SentenceSplitter as chunk_overlap
+
+# File types handled by SimpleDirectoryReader (PDF + TXT)
+READER_EXTS = {".pdf", ".txt"}
+# File types handled by the custom loaders load_webarchive() and load_rtf()
+CUSTOM_EXTS = {".webarchive", ".rtf"}
+# All supported extensions; anything else is reported as skipped
+SUPPORTED_EXTS = READER_EXTS | CUSTOM_EXTS
+
+# Minimum extracted text length (characters); applied to PDF, webarchive, and RTF
+MIN_TEXT_LENGTH = 100
+
+
+def get_text_splitter():  # build the SentenceSplitter used by rebuild() and update()
+    return SentenceSplitter(
+        chunk_size=CHUNK_SIZE,
+        chunk_overlap=CHUNK_OVERLAP,
+        paragraph_separator="\n\n",  # a blank line is treated as a paragraph break
+    )
+
+
+def validate_pdf(file_path):
+    """Check if a PDF has extractable text.
+
+    Returns (is_valid, reason) where reason explains why it was skipped.
+    """
+    import pypdf  # imported lazily, only when a PDF is actually validated
+    try:
+        reader = pypdf.PdfReader(str(file_path))
+        page_count = len(reader.pages)
+        total_chars = 0
+        printable_chars = 0
+        for page in reader.pages:
+            text = page.extract_text() or ""  # a None/empty extraction counts as no text
+            total_chars += len(text)
+            printable_chars += sum(
+                1 for c in text if c.isprintable() or c in "\n\r\t"  # common whitespace counts as printable
+            )
+
+        if total_chars < MIN_TEXT_LENGTH:  # summed over all pages
+            return False, f"too little text ({total_chars} chars, {page_count} pages)"
+
+        ratio = printable_chars / total_chars if total_chars > 0 else 0
+        if ratio < 0.5:  # mostly non-printable output is treated as a failed extraction
+            return False, f"low printable ratio ({ratio:.2f}, {page_count} pages)"
+
+        return True, None  # reason is None for a valid PDF
+    except Exception as e:  # any open/parse failure marks the PDF invalid; the error text is the reason
+        return False, str(e)
+
+
+def load_webarchive(file_path):  # file_path must be a pathlib.Path (.stat() and .name are used)
+    """Extract text from a macOS .webarchive file.
+
+    Returns a LlamaIndex Document, or None if extraction fails.
+    """
+    import plistlib  # the archive is read as a property list
+    from bs4 import BeautifulSoup
+
+    try:
+        with open(file_path, "rb") as f:
+            plist = plistlib.load(f)
+
+        resource = plist.get("WebMainResource", {})  # only the main resource is read
+        html_bytes = resource.get("WebResourceData", b"")
+        if not html_bytes:
+            return None
+
+        html = html_bytes.decode("utf-8", errors="replace")  # NOTE(review): assumes UTF-8; no declared encoding is consulted
+        soup = BeautifulSoup(html, "html.parser")
+        text = soup.get_text(separator="\n", strip=True)
+
+        if len(text) < MIN_TEXT_LENGTH:  # too little text: return None without printing a warning
+            return None
+
+        stat = file_path.stat()
+        mdate = datetime.datetime.fromtimestamp(
+            stat.st_mtime, tz=datetime.timezone.utc
+        ).strftime("%Y-%m-%d")  # UTC date string; update() compares it for change detection
+
+        return Document(
+            text=text,
+            metadata={  # update() reads file_path, file_size and last_modified_date back from the store
+                "file_name": file_path.name,
+                "file_path": str(file_path),
+                "file_size": stat.st_size,
+                "last_modified_date": mdate,
+                "file_type": "webarchive",
+            },
+        )
+    except Exception as e:  # best effort: warn and let the caller record the file as skipped
+        print(f"  Warning: could not read webarchive {file_path.name}: {e}")
+        return None
+
+
+def load_rtf(file_path):  # file_path must be a pathlib.Path (.stat() and .name are used)
+    """Extract text from an RTF file.
+
+    Returns a LlamaIndex Document, or None if extraction fails.
+    """
+    from striprtf.striprtf import rtf_to_text  # imported lazily, only when an RTF is loaded
+
+    try:
+        with open(file_path, "r", errors="replace") as f:  # NOTE(review): no explicit encoding; platform default applies
+            rtf_content = f.read()
+
+        text = rtf_to_text(rtf_content)
+
+        if len(text) < MIN_TEXT_LENGTH:  # too little text: return None without printing a warning
+            return None
+
+        stat = file_path.stat()
+        mdate = datetime.datetime.fromtimestamp(
+            stat.st_mtime, tz=datetime.timezone.utc
+        ).strftime("%Y-%m-%d")  # UTC date string; update() compares it for change detection
+
+        return Document(
+            text=text,
+            metadata={  # same keys as load_webarchive(), with file_type "rtf"
+                "file_name": file_path.name,
+                "file_path": str(file_path),
+                "file_size": stat.st_size,
+                "last_modified_date": mdate,
+                "file_type": "rtf",
+            },
+        )
+    except Exception as e:  # best effort: warn and let the caller record the file as skipped
+        print(f"  Warning: could not read RTF {file_path.name}: {e}")
+        return None
+
+
+def scan_clippings():
+    """Scan the clippings directory and classify files.
+
+    Returns (reader_files, custom_docs, skipped, ocr_needed) where:
+    - reader_files: list of Paths for SimpleDirectoryReader (PDF + TXT)
+    - custom_docs: list of Document objects from custom loaders
+    - skipped: list of (Path, reason) tuples
+    - ocr_needed: list of Paths for PDFs that need OCR
+    """
+    reader_files = []
+    custom_docs = []
+    skipped = []
+    ocr_needed = []
+
+    for fpath in sorted(DATA_DIR.rglob("*")):  # recursive walk, in sorted path order
+        if not fpath.is_file():
+            continue
+        if fpath.name.startswith("."):  # hidden files (e.g. .DS_Store) are ignored, not reported
+            continue
+
+        ext = fpath.suffix.lower()  # extension match is case-insensitive
+
+        if ext not in SUPPORTED_EXTS:
+            skipped.append((fpath, f"unsupported type: {ext}"))
+            continue
+
+        if ext == ".pdf":
+            is_valid, reason = validate_pdf(fpath)
+            if not is_valid:
+                skipped.append((fpath, f"no extractable text: {reason}"))
+                ocr_needed.append(fpath)  # also includes PDFs that failed to open, not only image-only ones
+                continue
+            reader_files.append(fpath)
+
+        elif ext == ".txt":  # text files are accepted without validation
+            reader_files.append(fpath)
+
+        elif ext == ".webarchive":
+            doc = load_webarchive(fpath)  # parsed here, during the scan; None means nothing usable
+            if doc:
+                custom_docs.append(doc)
+            else:
+                skipped.append((fpath, "no extractable text from webarchive"))
+
+        elif ext == ".rtf":
+            doc = load_rtf(fpath)  # parsed here, during the scan; None means nothing usable
+            if doc:
+                custom_docs.append(doc)
+            else:
+                skipped.append((fpath, "no extractable text from RTF"))
+
+    return reader_files, custom_docs, skipped, ocr_needed
+
+
+def write_ocr_list(ocr_needed):
+    """Write the list of PDFs needing OCR to ocr_needed.txt."""
+    with open("ocr_needed.txt", "w") as f:
+        for fpath in ocr_needed:
+            f.write(f"{fpath}\n")
+    print(f"Wrote {len(ocr_needed)} file(s) to ocr_needed.txt")
+
+
+def load_all_documents(reader_files, custom_docs):
+    """Load documents from SimpleDirectoryReader and merge with custom docs."""
+    documents = []
+
+    if reader_files:  # skip the reader entirely when there are no PDF/TXT paths
+        print(f"Loading {len(reader_files)} PDF/TXT files...")
+        reader_docs = SimpleDirectoryReader(
+            input_files=[str(f) for f in reader_files],  # reader is given plain string paths
+            filename_as_id=True,  # NOTE(review): presumably derives document ids from file paths -- confirm
+        ).load_data()
+        documents.extend(reader_docs)
+
+    if custom_docs:  # already Document objects; nothing to read from disk here
+        print(f"Adding {len(custom_docs)} webarchive/RTF documents...")
+        documents.extend(custom_docs)
+
+    return documents  # reader documents first, then custom documents
+
+
+def rebuild(reader_files, custom_docs):
+    """Full rebuild: delete existing collection and recreate from scratch."""
+    client = chromadb.PersistentClient(path=PERSIST_DIR)
+    # Delete existing collection if present
+    try:
+        client.delete_collection(COLLECTION_NAME)
+        print(f"Deleted existing collection '{COLLECTION_NAME}'")
+    except Exception:
+        pass
+
+    collection = client.get_or_create_collection(COLLECTION_NAME)
+    vector_store = ChromaVectorStore(chroma_collection=collection)
+    storage_context = StorageContext.from_defaults(vector_store=vector_store)
+
+    documents = load_all_documents(reader_files, custom_docs)
+    if not documents:
+        raise ValueError("No documents loaded")
+
+    print(f"Loaded {len(documents)} document(s) total")
+    print("Building vector index...")
+
+    index = VectorStoreIndex.from_documents(
+        documents,
+        storage_context=storage_context,
+        transformations=[get_text_splitter()],
+        show_progress=True,
+    )
+
+    print(f"Index built. Collection has {collection.count()} vectors.")
+    return index
+
+
+def update(reader_files, custom_docs):
+    """Incremental update: add new, re-index modified, remove deleted files."""
+    client = chromadb.PersistentClient(path=PERSIST_DIR)
+    collection = client.get_collection(COLLECTION_NAME)
+    count = collection.count()
+    print(f"Existing collection has {count} vectors")
+
+    # Get all stored metadata to find what's indexed
+    # Key on file_path (not file_name) to handle duplicate names across subdirs
+    indexed = {}  # file_path -> {"ids": [], "file_size": ..., "last_modified_date": ...}
+    if count > 0:
+        results = collection.get(include=["metadatas"])
+        for i, meta in enumerate(results["metadatas"]):
+            fpath = meta.get("file_path", "")
+            if fpath not in indexed:
+                indexed[fpath] = {
+                    "ids": [],
+                    "file_size": meta.get("file_size"),
+                    "last_modified_date": meta.get("last_modified_date"),
+                }
+            indexed[fpath]["ids"].append(results["ids"][i])
+
+    print(f"Index contains {len(indexed)} unique files")
+
+    # Build disk file lookup: file_path_str -> Path
+    # For reader_files, match the path format SimpleDirectoryReader would store
+    disk_files = {}
+    for f in reader_files:
+        disk_files[str(f)] = f
+    for doc in custom_docs:
+        disk_files[doc.metadata["file_path"]] = Path(doc.metadata["file_path"])
+
+    # Classify files
+    new_reader = []
+    new_custom = []
+    modified_reader = []
+    modified_custom = []
+    deleted_paths = []
+    unchanged = 0
+
+    for path_str, fpath in disk_files.items():
+        if path_str not in indexed:
+            # Check if it's a custom doc
+            if fpath.suffix.lower() in CUSTOM_EXTS:
+                matching = [d for d in custom_docs if d.metadata["file_path"] == path_str]
+                if matching:
+                    new_custom.extend(matching)
+            else:
+                new_reader.append(fpath)
+        else:
+            info = indexed[path_str]
+            stat = fpath.stat()
+            disk_mdate = datetime.datetime.fromtimestamp(
+                stat.st_mtime, tz=datetime.timezone.utc
+            ).strftime("%Y-%m-%d")
+
+            if stat.st_size != info["file_size"] or disk_mdate != info["last_modified_date"]:
+                if fpath.suffix.lower() in CUSTOM_EXTS:
+                    matching = [d for d in custom_docs if d.metadata["file_path"] == path_str]
+                    if matching:
+                        modified_custom.extend(matching)
+                else:
+                    modified_reader.append(fpath)
+            else:
+                unchanged += 1
+
+    for path_str in indexed:
+        if path_str not in disk_files:
+            deleted_paths.append(path_str)
+
+    n_new = len(new_reader) + len(new_custom)
+    n_modified = len(modified_reader) + len(modified_custom)
+    print(f"\n  New:       {n_new}")
+    print(f"  Modified:  {n_modified}")
+    print(f"  Deleted:   {len(deleted_paths)}")
+    print(f"  Unchanged: {unchanged}")
+
+    if n_new == 0 and n_modified == 0 and len(deleted_paths) == 0:
+        print("\nNothing to do.")
+        return
+
+    # Delete chunks for removed and modified files
+    for path_str in deleted_paths:
+        ids = indexed[path_str]["ids"]
+        fname = Path(path_str).name
+        print(f"  Removing {fname} ({len(ids)} chunks)")
+        collection.delete(ids=ids)
+
+    for fpath in modified_reader:
+        path_str = str(fpath)
+        ids = indexed[path_str]["ids"]
+        print(f"  Re-indexing {fpath.name} ({len(ids)} chunks)")
+        collection.delete(ids=ids)
+
+    for doc in modified_custom:
+        path_str = doc.metadata["file_path"]
+        if path_str in indexed:
+            ids = indexed[path_str]["ids"]
+            print(f"  Re-indexing {doc.metadata['file_name']} ({len(ids)} chunks)")
+            collection.delete(ids=ids)
+
+    # Add new and modified files
+    files_to_add = new_reader + modified_reader
+    docs_to_add = new_custom + modified_custom
+
+    if files_to_add or docs_to_add:
+        documents = load_all_documents(files_to_add, docs_to_add)
+        if documents:
+            print(f"Indexing {len(documents)} document(s)...")
+            vector_store = ChromaVectorStore(chroma_collection=collection)
+            storage_context = StorageContext.from_defaults(vector_store=vector_store)
+
+            VectorStoreIndex.from_documents(
+                documents,
+                storage_context=storage_context,
+                transformations=[get_text_splitter()],
+                show_progress=True,
+            )
+
+    print(f"\nIndex updated. Collection now has {collection.count()} vectors.")
+
+
+def main():  # CLI entry point: scan ./clippings, then rebuild or incrementally update the store
+    parser = argparse.ArgumentParser(
+        description="Build or update the clippings vector store (ChromaDB)."
+    )
+    parser.add_argument(
+        "--rebuild",
+        action="store_true",
+        help="Full rebuild from scratch (default: incremental update)",
+    )
+    args = parser.parse_args()
+
+    # Configure embedding model (offline, cached in ./models)
+    embed_model = HuggingFaceEmbedding(
+        model_name=EMBED_MODEL_NAME,
+        cache_folder="./models",
+        local_files_only=True,  # never download; the model must already be cached
+    )
+    Settings.embed_model = embed_model  # registered as the global LlamaIndex embedding model
+
+    if not DATA_DIR.exists():
+        raise FileNotFoundError(
+            f"Clippings directory not found: {DATA_DIR.absolute()}\n"
+            f"Create symlink: ln -s ../clippings ./clippings"
+        )
+
+    start = time.time()  # timing covers scanning and indexing, not model loading
+
+    # Scan and classify files
+    print(f"Scanning {DATA_DIR.absolute()}...")
+    reader_files, custom_docs, skipped, ocr_needed = scan_clippings()
+
+    n_valid = len(reader_files) + len(custom_docs)
+    print(f"\nFiles to index: {n_valid}")
+    print(f"  PDF/TXT:       {len(reader_files)}")
+    print(f"  Webarchive/RTF: {len(custom_docs)}")
+    print(f"Files skipped:   {len(skipped)}")
+    for fpath, reason in skipped:
+        print(f"  SKIP: {fpath.name} -- {reason}")
+
+    if ocr_needed:  # NOTE: a stale ocr_needed.txt is left untouched when the list is empty
+        write_ocr_list(ocr_needed)
+
+    if n_valid == 0:
+        raise ValueError("No valid files found to index")
+
+    if args.rebuild:
+        print("\nMode: full rebuild")
+        rebuild(reader_files, custom_docs)
+    else:
+        print("\nMode: incremental update")
+        if not Path(PERSIST_DIR).exists():  # no store directory yet: fall back to a full build
+            print(f"No existing index at {PERSIST_DIR}, doing full rebuild.")
+            rebuild(reader_files, custom_docs)
+        else:
+            update(reader_files, custom_docs)
+
+    elapsed = time.time() - start
+    print(f"Done in {elapsed:.1f}s")
+
+
+if __name__ == "__main__":  # run only when executed as a script
+    main()
diff --git a/build_exp_claude.py b/build_exp_claude.py
new file mode 100644
index 0000000..891507b
--- /dev/null
+++ b/build_exp_claude.py
@@ -0,0 +1,193 @@
+# build_exp_claude.py
+#
+# Build or update the vector store from journal entries in ./data.
+#
+# Default mode (incremental): loads the existing index and adds only
+# new or modified files.  Use --rebuild for a full rebuild from scratch.
+#
+# January 2026
+# E. M. Furst
+# Used Sonnet 4.5 to suggest changes; Opus 4.6 for incremental update
+
+from llama_index.core import (
+    SimpleDirectoryReader,
+    StorageContext,
+    VectorStoreIndex,
+    load_index_from_storage,
+    Settings,
+)
+from pathlib import Path
+from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+from llama_index.core.node_parser import SentenceSplitter
+import argparse
+import datetime
+import os
+import time
+
+# Shared constants
+DATA_DIR = Path("./data")
+PERSIST_DIR = "./storage_exp"
+EMBED_MODEL_NAME = "BAAI/bge-large-en-v1.5"
+CHUNK_SIZE = 256
+CHUNK_OVERLAP = 25
+
+
+def get_text_splitter():
+    """Return the SentenceSplitter used by both rebuild() and update()."""
+    splitter_config = dict(
+        chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP, paragraph_separator="\n\n"
+    )
+    return SentenceSplitter(**splitter_config)
+
+
+def rebuild():
+    """Full rebuild: load DATA_DIR with SimpleDirectoryReader and persist a new index.
+
+    Raises FileNotFoundError (DATA_DIR missing) or ValueError (no documents).
+    """
+    if not DATA_DIR.exists():
+        raise FileNotFoundError(f"Data directory not found: {DATA_DIR.absolute()}")
+
+    print(f"Loading documents from {DATA_DIR.absolute()}...")
+    reader = SimpleDirectoryReader(str(DATA_DIR))
+    documents = reader.load_data()
+    if not documents:
+        raise ValueError("No documents found in data directory")
+    print(f"Loaded {len(documents)} document(s)")
+
+    print("Building vector index...")
+    splitter = get_text_splitter()
+    index = VectorStoreIndex.from_documents(
+        documents, transformations=[splitter], show_progress=True
+    )
+    index.storage_context.persist(persist_dir=PERSIST_DIR)
+    print(f"Index built and saved to {PERSIST_DIR}")
+
+
+def update():
+    """Incremental update: add new files, re-index modified files, remove deleted files."""
+    if not DATA_DIR.exists():
+        raise FileNotFoundError(f"Data directory not found: {DATA_DIR.absolute()}")
+
+    # Set transformations BEFORE loading the index: the index captures
+    # Settings.transformations at construction and insert() chunks with that copy.
+    Settings.transformations = [get_text_splitter()]
+
+    print(f"Loading existing index from {PERSIST_DIR}...")
+    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
+    index = load_index_from_storage(storage_context)
+
+    # Build lookup of indexed files: file_name -> (ref_doc_id, metadata)
+    all_ref_docs = index.docstore.get_all_ref_doc_info()
+    indexed = {}
+    for ref_id, info in all_ref_docs.items():
+        fname = info.metadata.get("file_name")
+        if fname:
+            indexed[fname] = (ref_id, info.metadata)
+
+    print(f"Index contains {len(indexed)} documents")
+
+    # Scan current files on disk.  NOTE(review): only *.txt is scanned, while
+    # rebuild() loads DATA_DIR without an extension filter -- a non-.txt file
+    # indexed by a rebuild would be classified as deleted below; confirm intent.
+    disk_files = {f.name: f for f in sorted(DATA_DIR.glob("*.txt"))}
+    print(f"Data directory contains {len(disk_files)} files")
+
+    # Classify files
+    new_files = []
+    modified_files = []
+    deleted_files = []
+    unchanged = 0
+
+    for fname, fpath in disk_files.items():
+        if fname not in indexed:
+            new_files.append(fpath)
+        else:
+            ref_id, meta = indexed[fname]
+            # Compare file size and modification date
+            stat = fpath.stat()
+            disk_size = stat.st_size
+            # Must use UTC to match SimpleDirectoryReader's date format
+            disk_mdate = datetime.datetime.fromtimestamp(
+                stat.st_mtime, tz=datetime.timezone.utc
+            ).strftime("%Y-%m-%d")
+
+            stored_size = meta.get("file_size")
+            stored_mdate = meta.get("last_modified_date")
+
+            if disk_size != stored_size or disk_mdate != stored_mdate:
+                modified_files.append((fpath, ref_id))
+            else:
+                unchanged += 1
+
+    for fname, (ref_id, meta) in indexed.items():
+        if fname not in disk_files:
+            deleted_files.append((fname, ref_id))
+
+    print(f"\n  New:       {len(new_files)}")
+    print(f"  Modified:  {len(modified_files)}")
+    print(f"  Deleted:   {len(deleted_files)}")
+    print(f"  Unchanged: {unchanged}")
+
+    if not new_files and not modified_files and not deleted_files:
+        print("\nNothing to do.")
+        return
+
+    # Process deletions (including modified files that need re-indexing)
+    for fname, ref_id in deleted_files:
+        print(f"  Removing {fname}")
+        index.delete_ref_doc(ref_id, delete_from_docstore=True)
+
+    for fpath, ref_id in modified_files:
+        print(f"  Re-indexing {fpath.name} (modified)")
+        index.delete_ref_doc(ref_id, delete_from_docstore=True)
+
+    # Process additions (new files + modified files)
+    files_to_add = new_files + [fpath for fpath, _ in modified_files]
+    if files_to_add:
+        print(f"\nIndexing {len(files_to_add)} file(s)...")
+        # Use "./" prefix to match paths from full build (pathlib strips it)
+        docs = SimpleDirectoryReader(
+            input_files=[f"./{f}" for f in files_to_add]
+        ).load_data()
+        for doc in docs:
+            index.insert(doc)
+
+    index.storage_context.persist(persist_dir=PERSIST_DIR)
+    print(f"\nIndex updated and saved to {PERSIST_DIR}")
+
+
+def main():
+    """Parse CLI flags, configure the embedding model, then rebuild or update."""
+    parser = argparse.ArgumentParser(
+        description="Build or update the vector store from journal entries."
+    )
+    parser.add_argument(
+        "--rebuild", action="store_true",
+        help="Full rebuild from scratch (default: incremental update)",
+    )
+    args = parser.parse_args()
+
+    # Same ./models cache that build_clippings.py and the query scripts use
+    Settings.embed_model = HuggingFaceEmbedding(
+        model_name=EMBED_MODEL_NAME, cache_folder="./models"
+    )
+
+    start = time.time()
+
+    if args.rebuild:
+        print("Mode: full rebuild")
+        rebuild()
+    else:
+        print("Mode: incremental update")
+        if Path(PERSIST_DIR).exists():
+            update()
+        else:
+            print(f"No existing index at {PERSIST_DIR}, doing full rebuild.")
+            rebuild()
+
+    print(f"Done in {time.time() - start:.1f}s")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/query_hybrid_bm25_v4.py b/query_hybrid_bm25_v4.py
new file mode 100644
index 0000000..277f870
--- /dev/null
+++ b/query_hybrid_bm25_v4.py
@@ -0,0 +1,176 @@
+# query_hybrid_bm25_v4.py
+# Hybrid retrieval: BM25 (sparse) + vector similarity (dense) + cross-encoder
+#
+# Combines two retrieval strategies to catch both exact term matches and
+# semantic similarity:
+#   1. Retrieve top-20 via vector similarity (bi-encoder, catches meaning)
+#   2. Retrieve top-20 via BM25 (term frequency, catches exact names/dates)
+#   3. Merge and deduplicate candidates by node ID
+#   4. Re-rank the union with a cross-encoder -> top-15
+#   5. Pass re-ranked chunks to LLM for synthesis
+#
+# The cross-encoder doesn't care where candidates came from -- it scores
+# each (query, chunk) pair on its own merits. BM25's job is just to
+# nominate candidates that vector similarity might miss.
+#
+# E.M.F. February 2026
+
+# Environment vars must be set before importing huggingface/transformers
+# libraries, because huggingface_hub.constants evaluates HF_HUB_OFFLINE
+# at import time.
+import os
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+os.environ["SENTENCE_TRANSFORMERS_HOME"] = "./models"
+os.environ["HF_HUB_OFFLINE"] = "1"
+
+from llama_index.core import (
+    StorageContext,
+    load_index_from_storage,
+    Settings,
+    get_response_synthesizer,
+)
+from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+from llama_index.llms.ollama import Ollama
+from llama_index.core.prompts import PromptTemplate
+from llama_index.core.postprocessor import SentenceTransformerRerank
+from llama_index.retrievers.bm25 import BM25Retriever
+import sys
+
+#
+# Globals
+#
+
+# Embedding model (must match build_exp_claude.py)
+EMBED_MODEL = HuggingFaceEmbedding(cache_folder="./models", model_name="BAAI/bge-large-en-v1.5", local_files_only=True)
+
+# LLM model for generation
+LLM_MODEL = "command-r7b"
+
+# Cross-encoder model for re-ranking (cached in ./models/)
+RERANK_MODEL = "cross-encoder/ms-marco-MiniLM-L-12-v2"
+RERANK_TOP_N = 15
+
+# Retrieval parameters
+VECTOR_TOP_K = 20   # candidates from vector similarity
+BM25_TOP_K = 20     # candidates from BM25 term matching
+
+#
+# Custom prompt -- same as v3
+#
+PROMPT = PromptTemplate(
+"""You are a precise research assistant analyzing excerpts from a personal journal collection.
+Every excerpt below has been selected and ranked for relevance to the query.
+
+CONTEXT (ranked by relevance):
+{context_str}
+
+QUERY:
+{query_str}
+
+Instructions:
+- Answer ONLY using information explicitly present in the CONTEXT above
+- Examine ALL provided excerpts, not just the top few -- each one was selected for relevance
+- Be specific: quote or closely paraphrase key passages and cite their file names
+- When multiple files touch on the query, note what each one contributes
+- If the context doesn't contain enough information to answer fully, say so
+
+Your response should:
+1. Directly answer the query, drawing on as many relevant excerpts as possible
+2. Reference specific files and their content (e.g., "In <filename>, ...")
+3. End with a list of all files that contributed to your answer, with a brief note on each
+
+If the context is insufficient, explain what's missing."""
+)
+
+
+def main():
+    """Answer a command-line query with hybrid retrieval and LLM synthesis.
+
+    Merges vector and BM25 candidates by node ID, re-ranks the union with the
+    cross-encoder, and prints the LLM response followed by its source files.
+    """
+    # Validate the query first so a usage error exits before the index, BM25
+    # retriever and re-ranker are built.
+    if len(sys.argv) < 2:
+        print("Usage: python query_hybrid_bm25_v4.py QUERY_TEXT")
+        sys.exit(1)
+    q = " ".join(sys.argv[1:])
+
+    # Local model via Ollama (note: Ollama temperature defaults to 0.8)
+    Settings.llm = Ollama(
+        model=LLM_MODEL,
+        temperature=0.3,
+        request_timeout=360.0,
+        context_window=8000,
+    )
+
+    # Use OpenAI API:
+    # from llama_index.llms.openai import OpenAI
+    # Settings.llm = OpenAI(
+    #     model="gpt-4o-mini",   # or "gpt-4o" for higher quality
+    #     temperature=0.3,
+    # )
+
+    Settings.embed_model = EMBED_MODEL
+
+    # Load persisted vector store
+    storage_context = StorageContext.from_defaults(persist_dir="./storage_exp")
+    index = load_index_from_storage(storage_context)
+
+    # --- Retrievers ---
+    # Vector retriever (dense: cosine similarity over embeddings)
+    vector_retriever = index.as_retriever(similarity_top_k=VECTOR_TOP_K)
+
+    # BM25 retriever (sparse: term frequency scoring)
+    bm25_retriever = BM25Retriever.from_defaults(
+        index=index,
+        similarity_top_k=BM25_TOP_K,
+    )
+
+    # Cross-encoder re-ranker
+    reranker = SentenceTransformerRerank(
+        model=RERANK_MODEL,
+        top_n=RERANK_TOP_N,
+    )
+
+    # Retrieve from both sources
+    vector_nodes = vector_retriever.retrieve(q)
+    bm25_nodes = bm25_retriever.retrieve(q)
+
+    # Merge and deduplicate by node ID
+    seen_ids = set()
+    merged = []
+    for node in vector_nodes + bm25_nodes:
+        node_id = node.node.node_id
+        if node_id not in seen_ids:
+            seen_ids.add(node_id)
+            merged.append(node)
+
+    # Re-rank the merged candidates with cross-encoder
+    reranked = reranker.postprocess_nodes(merged, query_str=q)
+
+    # Overlap = candidates dropped as duplicates during the merge
+    n_both = len(vector_nodes) + len(bm25_nodes) - len(merged)
+
+    print(f"\nQuery: {q}")
+    print(f"Vector: {len(vector_nodes)}, BM25: {len(bm25_nodes)}, "
+          f"overlap: {n_both}, merged: {len(merged)}, re-ranked to: {len(reranked)}")
+
+    # Synthesize response with LLM
+    synthesizer = get_response_synthesizer(text_qa_template=PROMPT)
+    response = synthesizer.synthesize(q, nodes=reranked)
+
+    # Output
+    print("\nResponse:\n")
+    print(response.response)
+
+    print("\nSource documents:")
+    for node in response.source_nodes:
+        meta = getattr(node, "metadata", None) or node.node.metadata
+        score = getattr(node, "score", None)
+        score_text = "n/a" if score is None else f"{score:.3f}"
+        print(f"{meta.get('file_name')}  {meta.get('file_path')}  {score_text}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/query_topk_prompt_engine_v3.py b/query_topk_prompt_engine_v3.py
new file mode 100644
index 0000000..12083b0
--- /dev/null
+++ b/query_topk_prompt_engine_v3.py
@@ -0,0 +1,136 @@
+# query_topk_prompt_engine_v3.py
+# Run a query on a vector store with cross-encoder re-ranking
+#
+# Based on v2. Adds a cross-encoder re-ranking step:
+#   1. Retrieve top-30 chunks via vector similarity (bi-encoder, fast)
+#   2. Re-rank to top-15 using a cross-encoder (slower but more accurate)
+#   3. Pass re-ranked chunks to LLM for synthesis
+#
+# The cross-encoder scores each (query, chunk) pair jointly, which captures
+# nuance that bi-encoder dot-product similarity misses.
+#
+# E.M.F. February 2026
+
+# Environment vars must be set before importing huggingface/transformers
+# libraries, because huggingface_hub.constants evaluates HF_HUB_OFFLINE
+# at import time.
+import os
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+os.environ["SENTENCE_TRANSFORMERS_HOME"] = "./models"
+os.environ["HF_HUB_OFFLINE"] = "1"
+
+from llama_index.core import (
+    StorageContext,
+    load_index_from_storage,
+    Settings,
+)
+from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+from llama_index.llms.ollama import Ollama
+from llama_index.core.prompts import PromptTemplate
+from llama_index.core.postprocessor import SentenceTransformerRerank
+import sys
+
+#
+# Globals
+#
+
+# Embedding model used in vector store (must match build_exp_claude.py)
+EMBED_MODEL = HuggingFaceEmbedding(cache_folder="./models", model_name="BAAI/bge-large-en-v1.5", local_files_only=True)
+
+# LLM model for generation
+llm = "command-r7b"
+
+# Cross-encoder model for re-ranking (cached in ./models/)
+#RERANK_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"
+RERANK_MODEL = "cross-encoder/ms-marco-MiniLM-L-12-v2"
+#RERANK_MODEL = "cross-encoder/stsb-roberta-base"
+#RERANK_MODEL = "BAAI/bge-reranker-v2-m3"
+RERANK_TOP_N = 15   # keep top 15 after re-ranking
+RETRIEVE_TOP_K = 30  # retrieve wider pool for re-ranker to work with
+
+#
+# Custom prompt for the query engine - Version 3
+#
+# Adapted for re-ranked context: every excerpt below has been scored for
+# relevance by a cross-encoder, so even lower-ranked ones are worth examining.
+# The prompt encourages the LLM to draw from all provided excerpts and to
+# note what each distinct file contributes rather than collapsing onto one.
+#
+PROMPT = PromptTemplate(
+"""You are a precise research assistant analyzing excerpts from a personal journal collection.
+Every excerpt below has been selected and ranked for relevance to the query.
+
+CONTEXT (ranked by relevance):
+{context_str}
+
+QUERY:
+{query_str}
+
+Instructions:
+- Answer ONLY using information explicitly present in the CONTEXT above
+- Examine ALL provided excerpts, not just the top few -- each one was selected for relevance
+- Be specific: quote or closely paraphrase key passages and cite their file names
+- When multiple files touch on the query, note what each one contributes
+- If the context doesn't contain enough information to answer fully, say so
+
+Your response should:
+1. Directly answer the query, drawing on as many relevant excerpts as possible
+2. Reference specific files and their content (e.g., "In <filename>, ...")
+3. End with a list of all files that contributed to your answer, with a brief note on each
+
+If the context is insufficient, explain what's missing."""
+)
+
+#
+# Main program routine
+#
+
+def main():
+    """Run one query: retrieve top-k, re-rank, synthesize, and print the sources."""
+    # Validate the query first so a usage error exits before the index and
+    # re-ranker are loaded.
+    if len(sys.argv) < 2:
+        print("Usage: python query_topk_prompt_engine_v3.py QUERY_TEXT")
+        sys.exit(1)
+    q = " ".join(sys.argv[1:])
+
+    # Use a local model to generate -- in this case using Ollama
+    Settings.llm = Ollama(model=llm, request_timeout=360.0, context_window=8000)
+
+    # Load embedding model (same as used for vector store)
+    Settings.embed_model = EMBED_MODEL
+
+    # Load persisted vector store + metadata
+    storage_context = StorageContext.from_defaults(persist_dir="./storage_exp")
+    index = load_index_from_storage(storage_context)
+
+    # Cross-encoder re-ranker
+    reranker = SentenceTransformerRerank(
+        model=RERANK_MODEL,
+        top_n=RERANK_TOP_N,
+    )
+
+    # Build query engine: retrieve wide (top-30), re-rank to top-15, then synthesize
+    query_engine = index.as_query_engine(
+        similarity_top_k=RETRIEVE_TOP_K,
+        text_qa_template=PROMPT,
+        node_postprocessors=[reranker],
+    )
+
+    # Generate the response by querying the engine
+    response = query_engine.query(q)
+
+    # Print the query response and its source documents
+    print("\nResponse:\n")
+    print(response.response)
+
+    print("\nSource documents:")
+    for node in response.source_nodes:
+        meta = getattr(node, "metadata", None) or node.node.metadata
+        score = getattr(node, "score", None)
+        score_text = "n/a" if score is None else f"{score:.3f}"
+        print(f"{meta.get('file_name')}  {meta.get('file_path')}  {score_text}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..85f1abf
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,216 @@
+aiohappyeyeballs==2.6.1
+aiohttp==3.12.15
+aiosignal==1.4.0
+aiosqlite==0.21.0
+annotated-doc==0.0.4
+annotated-types==0.7.0
+anyio==4.10.0
+appnope==0.1.4
+argon2-cffi==25.1.0
+argon2-cffi-bindings==25.1.0
+arrow==1.3.0
+asttokens==3.0.0
+async-lru==2.0.5
+attrs==25.3.0
+babel==2.17.0
+backoff==2.2.1
+banks==2.2.0
+bcrypt==5.0.0
+beautifulsoup4==4.13.4
+bleach==6.2.0
+bm25s==0.2.14
+build==1.4.0
+certifi==2025.8.3
+cffi==1.17.1
+charset-normalizer==3.4.3
+chromadb==1.5.1
+click==8.2.1
+colorama==0.4.6
+comm==0.2.3
+contourpy==1.3.3
+cycler==0.12.1
+dataclasses-json==0.6.7
+debugpy==1.8.16
+decorator==5.2.1
+defusedxml==0.7.1
+Deprecated==1.2.18
+dirtyjson==1.0.8
+distro==1.9.0
+durationpy==0.10
+executing==2.2.0
+fastjsonschema==2.21.1
+filelock==3.18.0
+filetype==1.2.0
+flatbuffers==25.12.19
+fonttools==4.59.1
+fqdn==1.5.1
+frozenlist==1.7.0
+fsspec==2025.7.0
+googleapis-common-protos==1.72.0
+greenlet==3.2.4
+griffe==1.11.0
+grpcio==1.78.1
+h11==0.16.0
+hf-xet==1.1.7
+httpcore==1.0.9
+httptools==0.7.1
+httpx==0.28.1
+huggingface-hub==0.34.4
+idna==3.10
+importlib_metadata==8.7.1
+importlib_resources==6.5.2
+ipykernel==6.30.1
+ipython==9.4.0
+ipython_pygments_lexers==1.1.1
+ipywidgets==8.1.7
+isoduration==20.11.0
+jedi==0.19.2
+Jinja2==3.1.6
+jiter==0.13.0
+joblib==1.5.1
+json5==0.12.1
+jsonpointer==3.0.0
+jsonschema==4.25.0
+jsonschema-specifications==2025.4.1
+jupyter==1.1.1
+jupyter-console==6.6.3
+jupyter-events==0.12.0
+jupyter-lsp==2.2.6
+jupyter_client==8.6.3
+jupyter_core==5.8.1
+jupyter_server==2.16.0
+jupyter_server_terminals==0.5.3
+jupyterlab==4.4.5
+jupyterlab_pygments==0.3.0
+jupyterlab_server==2.27.3
+jupyterlab_widgets==3.0.15
+kiwisolver==1.4.9
+kubernetes==35.0.0
+lark==1.2.2
+llama-index-core==0.14.14
+llama-index-embeddings-huggingface==0.6.1
+llama-index-instrumentation==0.4.0
+llama-index-llms-ollama==0.9.1
+llama-index-llms-openai==0.6.18
+llama-index-readers-file==0.5.6
+llama-index-retrievers-bm25==0.6.5
+llama-index-vector-stores-chroma==0.5.5
+llama-index-workflows==2.14.2
+markdown-it-py==4.0.0
+MarkupSafe==3.0.2
+marshmallow==3.26.1
+matplotlib==3.10.5
+matplotlib-inline==0.1.7
+mdurl==0.1.2
+mistune==3.1.3
+mmh3==5.2.0
+mpmath==1.3.0
+multidict==6.6.3
+mypy_extensions==1.1.0
+nbclient==0.10.2
+nbconvert==7.16.6
+nbformat==5.10.4
+nest-asyncio==1.6.0
+networkx==3.5
+nltk==3.9.1
+notebook==7.4.5
+notebook_shim==0.2.4
+numpy==2.3.2
+oauthlib==3.3.1
+ollama==0.5.3
+onnxruntime==1.24.2
+openai==2.21.0
+opentelemetry-api==1.39.1
+opentelemetry-exporter-otlp-proto-common==1.39.1
+opentelemetry-exporter-otlp-proto-grpc==1.39.1
+opentelemetry-proto==1.39.1
+opentelemetry-sdk==1.39.1
+opentelemetry-semantic-conventions==0.60b1
+orjson==3.11.7
+overrides==7.7.0
+packaging==25.0
+pandas==2.2.3
+pandocfilters==1.5.1
+parso==0.8.4
+pexpect==4.9.0
+pillow==11.3.0
+platformdirs==4.3.8
+posthog==5.4.0
+prometheus_client==0.22.1
+prompt_toolkit==3.0.51
+propcache==0.3.2
+protobuf==6.33.5
+psutil==7.0.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+pybase64==1.4.3
+pycparser==2.22
+pydantic==2.11.7
+pydantic_core==2.33.2
+Pygments==2.19.2
+pyparsing==3.2.3
+pypdf==6.7.1
+PyPika==0.51.1
+pyproject_hooks==1.2.0
+PyStemmer==2.2.0.3
+python-dateutil==2.9.0.post0
+python-dotenv==1.2.1
+python-json-logger==3.3.0
+pytz==2025.2
+PyYAML==6.0.2
+pyzmq==27.0.1
+referencing==0.36.2
+regex==2025.7.34
+requests==2.32.4
+requests-oauthlib==2.0.0
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rfc3987-syntax==1.1.0
+rich==14.3.3
+rpds-py==0.27.0
+safetensors==0.6.2
+scikit-learn==1.7.1
+scipy==1.16.1
+seaborn==0.13.2
+Send2Trash==1.8.3
+sentence-transformers==5.1.0
+setuptools==80.9.0
+shellingham==1.5.4
+six==1.17.0
+sniffio==1.3.1
+soupsieve==2.7
+SQLAlchemy==2.0.42
+stack-data==0.6.3
+striprtf==0.0.26
+sympy==1.14.0
+tenacity==9.1.2
+terminado==0.18.1
+threadpoolctl==3.6.0
+tiktoken==0.11.0
+tinycss2==1.4.0
+tokenizers==0.21.4
+torch==2.8.0
+tornado==6.5.2
+tqdm==4.67.1
+traitlets==5.14.3
+transformers==4.55.0
+typer==0.24.1
+types-python-dateutil==2.9.0.20250809
+typing-inspect==0.9.0
+typing-inspection==0.4.1
+typing_extensions==4.14.1
+tzdata==2025.2
+uri-template==1.3.0
+urllib3==2.5.0
+uvicorn==0.41.0
+uvloop==0.22.1
+watchfiles==1.1.1
+wcwidth==0.2.13
+webcolors==24.11.1
+webencodings==0.5.1
+websocket-client==1.8.0
+websockets==16.0
+widgetsnbextension==4.0.14
+wrapt==1.17.2
+yarl==1.20.1
+zipp==3.23.0
diff --git a/retrieve_clippings.py b/retrieve_clippings.py
new file mode 100644
index 0000000..2cb0c9e
--- /dev/null
+++ b/retrieve_clippings.py
@@ -0,0 +1,138 @@
+# retrieve_clippings.py
+# Verbatim chunk retrieval from clippings index (ChromaDB).
+# Vector search + cross-encoder re-ranking, no LLM.
+#
+# Returns the top re-ranked chunks with their full text, file metadata, and
+# scores. Includes page numbers for PDF sources when available.
+#
+# E.M.F. February 2026
+
+# Environment vars must be set before importing huggingface/transformers
+# libraries, because huggingface_hub.constants evaluates HF_HUB_OFFLINE
+# at import time.
+import os
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+os.environ["SENTENCE_TRANSFORMERS_HOME"] = "./models"
+os.environ["HF_HUB_OFFLINE"] = "1"
+
+import chromadb
+from llama_index.core import VectorStoreIndex, Settings
+from llama_index.vector_stores.chroma import ChromaVectorStore
+from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+from llama_index.core.postprocessor import SentenceTransformerRerank
+import sys
+import textwrap
+
+#
+# Globals
+#
+
+PERSIST_DIR = "./storage_clippings"
+COLLECTION_NAME = "clippings"
+
+# Embedding model (must match build_clippings.py)
+EMBED_MODEL = HuggingFaceEmbedding(
+    cache_folder="./models",
+    model_name="BAAI/bge-large-en-v1.5",
+    local_files_only=True,
+)
+
+# Cross-encoder model for re-ranking (cached in ./models/)
+RERANK_MODEL = "cross-encoder/ms-marco-MiniLM-L-12-v2"
+RERANK_TOP_N = 15
+RETRIEVE_TOP_K = 30
+
+# Output formatting
+WRAP_WIDTH = 80
+
+
+def main():
+    """Print the top re-ranked clippings chunks for the command-line query.
+
+    Runs vector search over the ChromaDB collection, re-ranks with the
+    cross-encoder, then prints a source summary followed by each chunk's text.
+    """
+    # Validate the query first so a usage error exits before anything is loaded
+    if len(sys.argv) < 2:
+        print("Usage: python retrieve_clippings.py QUERY_TEXT")
+        sys.exit(1)
+    q = " ".join(sys.argv[1:])
+
+    # No LLM needed -- set embed model only
+    Settings.embed_model = EMBED_MODEL
+
+    # Load ChromaDB collection
+    client = chromadb.PersistentClient(path=PERSIST_DIR)
+    collection = client.get_collection(COLLECTION_NAME)
+
+    # Build index from existing vector store
+    vector_store = ChromaVectorStore(chroma_collection=collection)
+    index = VectorStoreIndex.from_vector_store(vector_store)
+
+    # Build retriever (vector search only, no query engine / LLM)
+    retriever = index.as_retriever(similarity_top_k=RETRIEVE_TOP_K)
+
+    # Cross-encoder re-ranker
+    reranker = SentenceTransformerRerank(
+        model=RERANK_MODEL,
+        top_n=RERANK_TOP_N,
+    )
+
+    # Retrieve and re-rank
+    nodes = retriever.retrieve(q)
+    reranked = reranker.postprocess_nodes(nodes, query_str=q)
+
+    # Build result list with metadata: (rank, node, file_name, page_label, score)
+    results = []
+    for i, node in enumerate(reranked, 1):
+        meta = getattr(node, "metadata", None) or node.node.metadata
+        score = getattr(node, "score", None)
+        file_name = meta.get("file_name", "unknown")
+        page_label = meta.get("page_label", "")
+        results.append((i, node, file_name, page_label, score))
+
+    # --- Summary: source files and rankings ---
+    print(f"\nQuery: {q}")
+    print(f"Retrieved {len(nodes)} chunks, re-ranked to top {len(reranked)}")
+    print(f"({collection.count()} total vectors in collection)\n")
+
+    # Unique source files in rank order (dict keys keep first-seen order)
+    unique_sources = list(dict.fromkeys(entry[2] for entry in results))
+
+    print(f"Source files ({len(unique_sources)} unique):")
+    for j, fname in enumerate(unique_sources, 1):
+        print(f"  {j}. {fname}")
+
+    print("\nRankings:")
+    for i, node, file_name, page_label, score in results:
+        line = f"  [{i:2d}]  {score:+7.3f}  {file_name}"
+        if page_label:
+            line += f"  (p. {page_label})"
+        print(line)
+
+    # --- Full chunk text ---
+    print(f"\n{'=' * WRAP_WIDTH}")
+    print("CHUNKS")
+    print("=" * WRAP_WIDTH)
+
+    for i, node, file_name, page_label, score in results:
+        header = f"=== [{i}] {file_name}"
+        if page_label:
+            header += f"  (p. {page_label})"
+        header += f"  (score: {score:.3f})"
+
+        print("\n" + "=" * WRAP_WIDTH)
+        print(header)
+        print("=" * WRAP_WIDTH)
+
+        text = node.get_content()
+        for line in text.splitlines():
+            if line.strip():
+                print(textwrap.fill(line, width=WRAP_WIDTH))
+            else:
+                print()
+        print()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/retrieve_hybrid_raw.py b/retrieve_hybrid_raw.py
new file mode 100644
index 0000000..ff64140
--- /dev/null
+++ b/retrieve_hybrid_raw.py
@@ -0,0 +1,140 @@
+# retrieve_hybrid_raw.py
+# Hybrid verbatim chunk retrieval: BM25 + vector search + cross-encoder, no LLM.
+#
+# Same hybrid retrieval as query_hybrid_bm25_v4.py but outputs raw chunk text
+# instead of LLM synthesis. Useful for inspecting what the hybrid pipeline
+# retrieves and comparing against retrieve_raw.py (vector-only).
+#
+# Each chunk is annotated with its source (vector, BM25, or both) so you can
+# see which retriever nominated it.
+#
+# E.M.F. February 2026
+
+# Environment vars must be set before importing huggingface/transformers
+# libraries, because huggingface_hub.constants evaluates HF_HUB_OFFLINE
+# at import time.
+import os
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+os.environ["SENTENCE_TRANSFORMERS_HOME"] = "./models"
+os.environ["HF_HUB_OFFLINE"] = "1"
+
+from llama_index.core import (
+    StorageContext,
+    load_index_from_storage,
+    Settings,
+)
+from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+from llama_index.core.postprocessor import SentenceTransformerRerank
+from llama_index.retrievers.bm25 import BM25Retriever
+import sys
+import textwrap
+
+#
+# Globals
+#
+
+# Embedding model (must match build_exp_claude.py)
+EMBED_MODEL = HuggingFaceEmbedding(cache_folder="./models", model_name="BAAI/bge-large-en-v1.5", local_files_only=True)
+
+# Cross-encoder model for re-ranking (cached in ./models/)
+RERANK_MODEL = "cross-encoder/ms-marco-MiniLM-L-12-v2"
+RERANK_TOP_N = 15
+
+# Retrieval parameters
+VECTOR_TOP_K = 20
+BM25_TOP_K = 20
+
+# Output formatting
+WRAP_WIDTH = 80
+
+
+def main():
+    """Print hybrid (vector + BM25) re-ranked chunks, each tagged by retriever."""
+    # Validate the query first so a usage error exits before the index, BM25
+    # retriever and re-ranker are built.
+    if len(sys.argv) < 2:
+        print("Usage: python retrieve_hybrid_raw.py QUERY_TEXT")
+        sys.exit(1)
+    q = " ".join(sys.argv[1:])
+
+    # No LLM needed -- set embed model only
+    Settings.embed_model = EMBED_MODEL
+
+    # Load persisted vector store
+    storage_context = StorageContext.from_defaults(persist_dir="./storage_exp")
+    index = load_index_from_storage(storage_context)
+
+    # --- Retrievers ---
+
+    vector_retriever = index.as_retriever(similarity_top_k=VECTOR_TOP_K)
+
+    bm25_retriever = BM25Retriever.from_defaults(
+        index=index,
+        similarity_top_k=BM25_TOP_K,
+    )
+
+    # Cross-encoder re-ranker
+    reranker = SentenceTransformerRerank(
+        model=RERANK_MODEL,
+        top_n=RERANK_TOP_N,
+    )
+
+    # Retrieve from both sources
+    vector_nodes = vector_retriever.retrieve(q)
+    bm25_nodes = bm25_retriever.retrieve(q)
+
+    # Track which retriever found each node
+    vector_ids = {n.node.node_id for n in vector_nodes}
+    bm25_ids = {n.node.node_id for n in bm25_nodes}
+
+    # Merge and deduplicate by node ID
+    seen_ids = set()
+    merged = []
+    for node in vector_nodes + bm25_nodes:
+        node_id = node.node.node_id
+        if node_id not in seen_ids:
+            seen_ids.add(node_id)
+            merged.append(node)
+
+    # Re-rank merged candidates
+    reranked = reranker.postprocess_nodes(merged, query_str=q)
+
+    # Retrieval stats
+    n_both = len(vector_ids & bm25_ids)
+    n_vector_only = len(vector_ids - bm25_ids)
+    n_bm25_only = len(bm25_ids - vector_ids)
+
+    print(f"\nQuery: {q}")
+    print(f"Vector: {len(vector_nodes)}, BM25: {len(bm25_nodes)}, "
+          f"overlap: {n_both}, merged: {len(merged)}, re-ranked to: {len(reranked)}")
+    print(f"  vector-only: {n_vector_only}, bm25-only: {n_bm25_only}, both: {n_both}\n")
+
+    # Output re-ranked chunks with source annotation
+    for i, node in enumerate(reranked, 1):
+        meta = getattr(node, "metadata", None) or node.node.metadata
+        score = getattr(node, "score", None)
+        file_name = meta.get("file_name", "unknown")
+        text = node.get_content()
+        node_id = node.node.node_id
+
+        # Annotate source
+        if node_id in vector_ids and node_id in bm25_ids:
+            source = "vector+bm25"
+        elif node_id in bm25_ids:
+            source = "bm25-only"
+        else:
+            source = "vector-only"
+
+        print("=" * WRAP_WIDTH)
+        print(f"=== [{i}] {file_name}  (score: {score:.3f})  [{source}]")
+        print("=" * WRAP_WIDTH)
+        for line in text.splitlines():
+            if line.strip():
+                print(textwrap.fill(line, width=WRAP_WIDTH))
+            else:
+                print()
+        print()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/retrieve_raw.py b/retrieve_raw.py
new file mode 100644
index 0000000..de639ee
--- /dev/null
+++ b/retrieve_raw.py
@@ -0,0 +1,97 @@
+# retrieve_raw.py
+# Verbatim chunk retrieval: vector search + cross-encoder re-ranking, no LLM.
+#
+# Returns the top re-ranked chunks with their full text, file metadata, and
+# scores. Useful for browsing source material directly and verifying what
+# the RAG pipeline retrieves before LLM synthesis.
+#
+# Uses the same vector store, embedding model, and re-ranker as
+# query_topk_prompt_engine_v3.py, but skips the LLM step entirely.
+#
+# E.M.F. February 2026
+
+# Environment vars must be set before importing huggingface/transformers
+# libraries, because huggingface_hub.constants evaluates HF_HUB_OFFLINE
+# at import time.
+import os
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+os.environ["SENTENCE_TRANSFORMERS_HOME"] = "./models"
+os.environ["HF_HUB_OFFLINE"] = "1"
+
+from llama_index.core import (
+    StorageContext,
+    load_index_from_storage,
+    Settings,
+)
+from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+from llama_index.core.postprocessor import SentenceTransformerRerank
+import sys
+import textwrap
+
+#
+# Globals
+#
+
+# Embedding model (must match build_exp_claude.py); built at import time from local files in ./models
+EMBED_MODEL = HuggingFaceEmbedding(cache_folder="./models", model_name="BAAI/bge-large-en-v1.5", local_files_only=True)
+
+# Cross-encoder model for re-ranking (cached in ./models/)
+RERANK_MODEL = "cross-encoder/ms-marco-MiniLM-L-12-v2"
+RERANK_TOP_N = 15  # passed as top_n to the re-ranker: chunks kept after re-ranking
+RETRIEVE_TOP_K = 30  # passed as similarity_top_k: chunks fetched by vector search
+
+# Output formatting
+WRAP_WIDTH = 80  # width used for text wrapping and for the "=" separator lines
+
+
+def main():
+    """Print the top re-ranked chunks for the query given on the command line.
+
+    Vector search fetches RETRIEVE_TOP_K candidates from ./storage_exp, the
+    cross-encoder keeps the best RERANK_TOP_N, and each survivor is printed
+    with its file name, score, and wrapped text. Exits with status 1 and a
+    usage message when no query is supplied.
+    """
+    # Validate the arguments first so a bad invocation fails before the
+    # index and re-ranker are loaded.
+    if len(sys.argv) < 2:
+        print("Usage: python retrieve_raw.py QUERY_TEXT")
+        sys.exit(1)
+    q = " ".join(sys.argv[1:])
+
+    # No LLM needed -- set embed model only
+    Settings.embed_model = EMBED_MODEL
+
+    # Load persisted vector store
+    storage_context = StorageContext.from_defaults(persist_dir="./storage_exp")
+    index = load_index_from_storage(storage_context)
+
+    # Vector search only (no query engine / LLM), then cross-encoder re-ranking
+    retriever = index.as_retriever(similarity_top_k=RETRIEVE_TOP_K)
+    reranker = SentenceTransformerRerank(model=RERANK_MODEL, top_n=RERANK_TOP_N)
+
+    nodes = retriever.retrieve(q)
+    reranked = reranker.postprocess_nodes(nodes, query_str=q)
+
+    print(f"\nQuery: {q}")
+    print(f"Retrieved {len(nodes)} chunks, re-ranked to top {len(reranked)}\n")
+
+    rule = "=" * WRAP_WIDTH
+    for i, node in enumerate(reranked, 1):
+        meta = getattr(node, "metadata", None) or node.node.metadata
+        file_name = meta.get("file_name", "unknown")
+        # A node without a score would make the ":.3f" format raise TypeError.
+        score = getattr(node, "score", None)
+        score_text = "n/a" if score is None else f"{score:.3f}"
+
+        print(rule)
+        print(f"=== [{i}] {file_name}  (score: {score_text}) ")
+        print(rule)
+        # Wrap each non-blank line; blank lines are kept as paragraph breaks
+        for line in node.get_content().splitlines():
+            print(textwrap.fill(line, width=WRAP_WIDTH) if line.strip() else "")
+        print()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/run_query.sh b/run_query.sh
new file mode 100755
index 0000000..d34683a
--- /dev/null
+++ b/run_query.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# Interactive front end for the Python query engine: prompts for a query,
+# runs $QUERY_SCRIPT on it, and prints the reformatted output.
+
+# E.M.F. August 2025
+
+# Usage: ./run_query.sh   (then type queries; "exit", "quit", or an empty line ends the session)
+
+QUERY_SCRIPT="query_hybrid_bm25_v4.py"
+
+echo -e "Current query engine is $QUERY_SCRIPT\n"
+
+# Loop until the user types "exit" or "quit", or enters an empty line
+while true; do
+    # -r keeps backslashes in the query literal instead of treating them as escapes
+    read -r -p "Enter your query (or type 'exit' to quit): " query
+    if [ "$query" == "exit" ] || [ "$query" == "quit" ] || [ "$query" == "" ] ; then
+        echo "Exiting..."
+        break
+    fi
+    time_start=$(date +%s)
+    # Run the query: tabs -> spaces, path prefixes ending in /data -> ./data, fold at 131 columns
+    python3 "$QUERY_SCRIPT" --query "$query" | \
+        expand | sed -E 's|(.* )(.*/data)|\1./data|' | fold -s -w 131
+    time_end=$(date +%s)
+    elapsed=$((time_end - time_start))
+    echo -e "Query processed in $elapsed seconds.\n"
+    # printf with a quoted argument logs the query verbatim (no globbing, word splitting, or echo options)
+    printf '%s\n' "$query" >> query.log
+done
diff --git a/search_keywords.py b/search_keywords.py
new file mode 100644
index 0000000..cb2cd51
--- /dev/null
+++ b/search_keywords.py
@@ -0,0 +1,189 @@
+# search_keywords.py
+# Keyword search: extract terms from a query using POS tagging, then grep
+# across journal files for matches.
+#
+# Complements the vector search pipeline by catching exact names, places,
+# and dates that embeddings can miss. No vector store or LLM needed.
+#
+# Term extraction uses NLTK POS tagging to keep nouns (NN*), proper nouns
+# (NNP*), and adjectives (JJ*) -- skipping stopwords and function words
+# automatically. Consecutive proper nouns are joined into multi-word phrases
+# (e.g., "Robert Wright" stays as one search term, not "robert" + "wright").
+#
+# E.M.F. February 2026
+
+import os
+import sys
+import re
+from pathlib import Path
+
+import nltk
+
+#
+# Globals
+#
+DATA_DIR = Path("./data")  # directory whose *.txt files are searched
+CONTEXT_LINES = 2       # lines of context shown before and after each matching line
+MAX_MATCHES_PER_FILE = 3  # cap on context blocks printed per file, to avoid flooding
+
+# POS tags to keep: nouns, proper nouns, adjectives
+KEEP_TAGS = {"NN", "NNS", "NNP", "NNPS", "JJ", "JJS", "JJR"}
+
+# Proper noun tags (consecutive runs are joined into one multi-word phrase)
+PROPER_NOUN_TAGS = {"NNP", "NNPS"}
+
+# Minimum length, in characters, of a kept word or joined phrase (filters out short noise)
+MIN_WORD_LEN = 3
+
+
+def ensure_nltk_data():
+    """Download any NLTK tokenizer / POS-tagger data that is not installed yet."""
+    # package name -> resource folder that nltk.data.find searches
+    needed = {"punkt_tab": "tokenizers", "averaged_perceptron_tagger_eng": "taggers"}
+    for package, folder in needed.items():
+        try:
+            nltk.data.find(f"{folder}/{package}")
+        except LookupError:
+            # Not installed: announce it on stdout, then fetch it quietly
+            print(f"Downloading NLTK resource: {package}")
+            nltk.download(package, quiet=True)
+
+
+def extract_terms(query):
+    """Pull searchable keywords out of a free-text query.
+
+    The query is tokenized and POS-tagged with NLTK. Each unbroken run of
+    proper nouns (NNP/NNPS) becomes one space-joined phrase, so a name such
+    as "Robert Wright" is searched as a unit. Every other token whose tag
+    is in KEEP_TAGS is kept as a single-word term. Phrases and words shorter
+    than MIN_WORD_LEN characters are dropped.
+
+    Returns a list of unique lowercase terms: phrases first, then single
+    words, each group in order of first appearance.
+    """
+    phrases = []  # joined proper-noun runs
+    words = []    # stand-alone nouns / adjectives
+    run = []      # proper nouns collected since the last non-proper token
+
+    def close_run():
+        """Turn the pending proper-noun run into a phrase and reset it."""
+        if not run:
+            return
+        joined = " ".join(run).lower()
+        run.clear()
+        if len(joined) >= MIN_WORD_LEN:
+            phrases.append(joined)
+
+    for token, pos in nltk.pos_tag(nltk.word_tokenize(query)):
+        if pos in PROPER_NOUN_TAGS:
+            run.append(token)
+            continue
+        close_run()
+        if pos in KEEP_TAGS and len(token) >= MIN_WORD_LEN:
+            words.append(token.lower())
+    close_run()  # the query may end inside a proper-noun run
+
+    # dict keys keep insertion order, which de-duplicates without reordering
+    unique = {}
+    for term in phrases + words:
+        unique.setdefault(term, None)
+    return list(unique)
+
+
+def search_files(terms, data_dir, context_lines=CONTEXT_LINES):
+    """Search every .txt file directly inside data_dir for any of the terms.
+
+    Matching is case-insensitive and whole-term (a hit may not be preceded or
+    followed by a word character). Empty terms are ignored; files that cannot
+    be read or decoded as UTF-8 are skipped. context_lines is the number of
+    lines shown before and after each match.
+
+    Returns a list of (file_path, match_count, matches) sorted by match_count,
+    highest first. match_count is the number of matching lines; matches holds
+    (line_number, context_block) pairs with 1-based line numbers. A matching
+    line already shown in an earlier block does not open a block of its own.
+    """
+    terms = [t for t in terms if t]  # an empty alternative would match every line
+    if not terms:
+        return []
+
+    # Lookarounds instead of \b, so terms that start or end with punctuation
+    # (e.g. "u.s.", "c++") still match when followed by a space.
+    pattern = re.compile(
+        r"(?<!\w)(?:" + "|".join(re.escape(t) for t in terms) + r")(?!\w)",
+        re.IGNORECASE,
+    )
+
+    results = []
+    for fpath in sorted(data_dir.glob("*.txt")):
+        try:
+            lines = fpath.read_text(encoding="utf-8").splitlines()
+        except (OSError, UnicodeDecodeError):
+            continue  # best effort: one unreadable file must not abort the search
+
+        hits = [i for i, line in enumerate(lines) if pattern.search(line)]
+        if not hits:
+            continue
+        hit_rows = set(hits)
+        shown = set()  # rows already included in some context block
+        matches = []
+        for i in hits:
+            if i in shown:
+                continue
+            lo = max(0, i - context_lines)
+            window = range(lo, min(len(lines), i + context_lines + 1))
+            shown.update(window)
+            # Flag every matching row in the window, not only the one that opened it
+            block = [f"  {'>>>' if j in hit_rows else '   '} {j+1:4d}: {lines[j]}" for j in window]
+            matches.append((i + 1, "\n".join(block)))
+        results.append((fpath, len(hits), matches))
+
+    # Sort by match count (most matches first)
+    results.sort(key=lambda x: x[1], reverse=True)
+    return results
+
+
+def main():
+    """Run a keyword search for the query given on the command line.
+
+    Prints the extracted terms, a match summary, and up to
+    MAX_MATCHES_PER_FILE context blocks for each matching file.
+    """
+    args = sys.argv[1:]
+    if not args:
+        print("Usage: python search_keywords.py QUERY_TEXT")
+        sys.exit(1)
+
+    ensure_nltk_data()
+
+    q = " ".join(args)
+    terms = extract_terms(q)
+    print(f"Query: {q}")
+    if not terms:
+        print("No searchable terms extracted. Try a more specific query.")
+        sys.exit(0)
+    print(f"Extracted terms: {', '.join(terms)}\n")
+
+    results = search_files(terms, DATA_DIR)
+    if not results:
+        print("No matches found.")
+        sys.exit(0)
+
+    total_matches = sum(count for _, count, _ in results)
+    print(f"Found {total_matches} matches across {len(results)} files\n")
+
+    rule = "=" * 60
+    for fpath, match_count, matches in results:
+        print(rule)
+        print(f"--- {fpath.name}  ({match_count} matches) ---")
+        print(rule)
+        for _, block in matches[:MAX_MATCHES_PER_FILE]:
+            print(block)
+            print()
+        hidden = len(matches) - MAX_MATCHES_PER_FILE
+        if hidden > 0:
+            print(f"  ... and {hidden} more matches\n")
+
+
+if __name__ == "__main__":
+    main()