From 8310553f890280ca9f553795c5b280a909fd0f38 Mon Sep 17 00:00:00 2001 From: Eric Date: Sun, 22 Feb 2026 09:03:03 -0500 Subject: [PATCH] RAG pipeline for semantic search over personal archives Journal and clippings search with LlamaIndex, HuggingFace embeddings, cross-encoder re-ranking, and local LLM inference via Ollama. Clippings index uses ChromaDB for persistent vector storage. --- .gitignore | 37 +++ README.md | 208 +++++++++++++++ build_clippings.py | 471 +++++++++++++++++++++++++++++++++ build_exp_claude.py | 193 ++++++++++++++ query_hybrid_bm25_v4.py | 176 ++++++++++++ query_topk_prompt_engine_v3.py | 136 ++++++++++ requirements.txt | 216 +++++++++++++++ retrieve_clippings.py | 138 ++++++++++ retrieve_hybrid_raw.py | 140 ++++++++++ retrieve_raw.py | 97 +++++++ run_query.sh | 30 +++ search_keywords.py | 189 +++++++++++++ 12 files changed, 2031 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 build_clippings.py create mode 100644 build_exp_claude.py create mode 100644 query_hybrid_bm25_v4.py create mode 100644 query_topk_prompt_engine_v3.py create mode 100644 requirements.txt create mode 100644 retrieve_clippings.py create mode 100644 retrieve_hybrid_raw.py create mode 100644 retrieve_raw.py create mode 100755 run_query.sh create mode 100644 search_keywords.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7e4dde6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,37 @@ +# Python +.venv/ +__pycache__/ +*.pyc + +# HuggingFace cached models (large, ~2 GB) +models/ + +# Vector stores (large, rebuild with build scripts) +storage_exp/ +storage/ +storage_clippings/ + +# Data (symlinks to private files) +data +clippings + +# Generated file lists +ocr_needed.txt + +# IDE and OS +.DS_Store +.vscode/ +.idea/ + +# Jupyter checkpoints +.ipynb_checkpoints/ + +# Secrets +.env +API_key_temp + +# Query log +query.log + +# Duplicate of CLAUDE.md +claude.md diff --git a/README.md b/README.md new file mode 100644 
index 0000000..823c7a4 --- /dev/null +++ b/README.md @@ -0,0 +1,208 @@ +# ssearch + +Semantic search over a personal journal archive. Uses vector embeddings and a local LLM to find and synthesize information across 1800+ dated text entries spanning 2000-2025. + +## How it works + +``` +Query → Embed (BAAI/bge-large-en-v1.5) → Vector similarity (top-30) → Cross-encoder re-rank (top-15) → LLM synthesis (command-r7b via Ollama, or OpenAI API) → Response + sources +``` + +1. **Build**: Journal entries in `./data` are chunked (256 tokens, 25-token overlap) and embedded into a vector store using LlamaIndex. Supports incremental updates (new/modified files only) or full rebuilds. +2. **Retrieve**: A user query is embedded with the same model and matched against stored vectors by cosine similarity, returning the top 30 candidate chunks. +3. **Re-rank**: A cross-encoder (`cross-encoder/ms-marco-MiniLM-L-12-v2`) scores each (query, chunk) pair jointly and keeps the top 15. +4. **Synthesize**: The re-ranked chunks are passed to a local LLM with a custom prompt that encourages multi-source synthesis, producing a grounded answer with file citations. 
+ +## Project structure + +``` +ssearch/ +├── build_exp_claude.py # Build/update vector store (incremental by default) +├── query_topk_prompt_engine_v3.py # Main query engine (cross-encoder re-ranking) +├── query_topk_prompt_engine_v2.py # Previous query engine (no re-ranking) +├── retrieve_raw.py # Verbatim chunk retrieval (no LLM) +├── query_hybrid_bm25_v4.py # Hybrid BM25 + vector query (v4) +├── retrieve_hybrid_raw.py # Hybrid verbatim retrieval (no LLM) +├── search_keywords.py # Keyword search via POS-based term extraction +├── run_query.sh # Shell wrapper with timing and logging +├── data/ # Symlink to ../text/ (journal .txt files) +├── storage_exp/ # Persisted vector store (~242 MB) +├── models/ # Cached HuggingFace models (embedding + cross-encoder, offline) +├── archived/ # Earlier iterations and prototypes +├── saved_output/ # Saved query results and model comparisons +├── requirements.txt # Python dependencies (pip freeze) +├── NOTES.md # Similarity metric reference +├── devlog.txt # Development log and experimental findings +└── *.ipynb # Jupyter notebooks (HyDE, metrics, sandbox) +``` + +## Setup + +**Prerequisites**: Python 3.12, [Ollama](https://ollama.com) with `command-r7b` pulled. + +```bash +cd ssearch +python3 -m venv .venv +source .venv/bin/activate +pip install -r requirements.txt +``` + +The `data/` symlink should point to `../text/` (the journal archive). The embedding model (`BAAI/bge-large-en-v1.5`) and cross-encoder (`cross-encoder/ms-marco-MiniLM-L-12-v2`) are cached in `./models/` for offline use. 
+ +### Offline model loading + +All query scripts set three environment variables so that HuggingFace models load from the local cache without network requests (`HF_HUB_OFFLINE` blocks network access, `SENTENCE_TRANSFORMERS_HOME` points at the `./models` cache, and `TOKENIZERS_PARALLELISM` turns off tokenizer parallelism): + +```python +os.environ["TOKENIZERS_PARALLELISM"] = "false" +os.environ["SENTENCE_TRANSFORMERS_HOME"] = "./models" +os.environ["HF_HUB_OFFLINE"] = "1" +``` + +**These must appear before any imports that touch HuggingFace libraries.** The `huggingface_hub` library evaluates `HF_HUB_OFFLINE` once at import time (in `huggingface_hub/constants.py`). If the env var is set after imports, the library will still attempt network access and fail when the machine is offline. This is a common pitfall -- `llama_index.embeddings.huggingface` transitively imports `huggingface_hub`, so even indirect imports trigger the evaluation. + +Alternatively, set the variable in your shell before running Python: +```bash +export HF_HUB_OFFLINE=1 +python query_hybrid_bm25_v4.py "your query" +``` + +## Usage + +### Build the vector store + +```bash +# Incremental update (default): only processes new, modified, or deleted files +python build_exp_claude.py + +# Full rebuild from scratch +python build_exp_claude.py --rebuild +``` + +The default incremental mode loads the existing index, compares file sizes and modification dates against the docstore, and only re-indexes what changed. A full rebuild (`--rebuild`) is only needed when chunk parameters or the embedding model change. + +### Search + +Three categories of search are available, from heaviest (semantic + LLM) to lightest (grep). + +#### Semantic search with LLM synthesis + +These scripts embed the query, retrieve candidate chunks from the vector store, re-rank with a cross-encoder, and pass the top results to a local LLM that synthesizes a grounded answer with file citations. **Requires Ollama running with `command-r7b`.** + +**Vector-only** (`query_topk_prompt_engine_v3.py`): Retrieves the top 30 chunks by cosine similarity, re-ranks to top 15, synthesizes. 
+```bash +python query_topk_prompt_engine_v3.py "What does the author say about creativity?" +``` + +**Hybrid BM25 + vector** (`query_hybrid_bm25_v4.py`): Retrieves top 20 by vector similarity and top 20 by BM25 term frequency, merges and deduplicates, re-ranks the union to top 15, synthesizes. Catches exact name/term matches that vector-only retrieval misses. +```bash +python query_hybrid_bm25_v4.py "Louis Menand" +``` + +**Interactive wrapper** (`run_query.sh`): Loops for queries using the v3 engine, displays timing, and appends queries to `query.log`. +```bash +./run_query.sh +``` + +#### Verbatim chunk retrieval (no LLM) + +These scripts run the same retrieval and re-ranking pipeline but output the raw chunk text instead of passing it to an LLM. Useful for inspecting what the retrieval pipeline finds, or when Ollama is not available. **No Ollama needed.** + +**Vector-only** (`retrieve_raw.py`): Top-30 vector retrieval, cross-encoder re-rank to top 15, raw output. +```bash +python retrieve_raw.py "Kondiaronk and the Wendats" +``` + +**Hybrid BM25 + vector** (`retrieve_hybrid_raw.py`): Same hybrid retrieval as v4 but outputs raw chunks. Each chunk is annotated with its source: `[vector-only]`, `[bm25-only]`, or `[vector+bm25]`. +```bash +python retrieve_hybrid_raw.py "Louis Menand" +``` + +Pipe either to `less` for browsing. + +#### Keyword search (no vector store, no LLM) + +**`search_keywords.py`**: Extracts nouns and adjectives from the query using NLTK POS tagging, then greps `./data/*.txt` for matches with surrounding context. A lightweight fallback when you want exact string matching without the vector store. **No vector store or Ollama needed.** +```bash +python search_keywords.py "Discussions of Kondiaronk and the Wendats" +``` + +### Output format + +``` +Response: + + +Source documents: +2024-03-15.txt ./data/2024-03-15.txt 0.683 +2023-11-02.txt ./data/2023-11-02.txt 0.651 +... 
+``` + +## Configuration + +Key parameters (set in source files): + +| Parameter | Value | Location | +|-----------|-------|----------| +| Embedding model | `BAAI/bge-large-en-v1.5` | `build_exp_claude.py`, `query_topk_prompt_engine_v3.py` | +| Chunk size | 256 tokens | `build_exp_claude.py` | +| Chunk overlap | 25 tokens | `build_exp_claude.py` | +| Paragraph separator | `\n\n` | `build_exp_claude.py` | +| Initial retrieval | 30 chunks | `query_topk_prompt_engine_v3.py` | +| Re-rank model | `cross-encoder/ms-marco-MiniLM-L-12-v2` | `query_topk_prompt_engine_v3.py` | +| Re-rank top-n | 15 | `query_topk_prompt_engine_v3.py` | +| LLM | `command-r7b` (Ollama) or `gpt-4o-mini` (OpenAI API) | `query_topk_prompt_engine_v3.py`, `query_hybrid_bm25_v4.py` | +| Temperature | 0.3 (recommended for both local and API models) | `query_topk_prompt_engine_v3.py`, `query_hybrid_bm25_v4.py` | +| Context window | 8000 tokens | `query_topk_prompt_engine_v3.py` | +| Request timeout | 360 seconds | `query_topk_prompt_engine_v3.py` | + +## Key dependencies + +- **llama-index-core** (0.14.14) -- RAG framework +- **llama-index-embeddings-huggingface** (0.6.1) -- embedding integration +- **llama-index-llms-ollama** (0.9.1) -- local LLM via Ollama +- **llama-index-llms-openai** (0.6.18) -- OpenAI API LLM (optional, for API-based synthesis) +- **llama-index-readers-file** (0.5.6) -- file readers +- **llama-index-retrievers-bm25** (0.6.5) -- BM25 sparse retrieval for hybrid search +- **sentence-transformers** (5.1.0) -- embedding model support +- **torch** (2.8.0) -- ML runtime + +## Notebooks + +Three Jupyter notebooks document exploration and analysis: + +- **`hyde.ipynb`** -- Experiments with HyDE (Hypothetical Document Embeddings) query rewriting. Tests whether generating a hypothetical answer to a query and embedding that instead improves retrieval. Uses LlamaIndex's `HyDEQueryTransform` with `llama3.1:8B`. 
Finding: the default HyDE prompt produced a rich hypothetical passage, but the technique did not improve retrieval quality over direct prompt engineering. This informed the decision to drop HyDE from the pipeline. + +- **`sandbox.ipynb`** -- Exploratory notebook for learning the LlamaIndex API. Inspects the `llama_index.core` module (104 objects), lists available classes and methods, and reads the source of `VectorStoreIndex`. Useful as a quick reference for what LlamaIndex exposes. + +- **`vs_metrics.ipynb`** -- Quantitative analysis of the vector store. Loads the persisted index (4,692 vectors, 1024 dimensions each from `BAAI/bge-large-en-v1.5`) and produces: + - Distribution of embedding values (histogram) + - Heatmap of the full embedding matrix + - Embedding vector magnitude distribution + - Per-dimension variance (which dimensions carry more signal) + - Pairwise cosine similarity distribution and heatmap (subset) + - Hierarchical clustering dendrogram (Ward linkage) + - PCA and t-SNE 2D projections of the embedding space + +## Design decisions + +- **BAAI/bge-large-en-v1.5 over all-mpnet-base-v2**: Better semantic matching quality for journal text despite slower embedding. +- **256-token chunks**: Tested 512 and 384; 256 with 25-token overlap produced the highest quality matches. +- **command-r7b over llama3.1:8B**: Sticks closer to provided context with less hallucination at comparable speed. +- **Top-k=15**: Wide enough to capture diverse perspectives, narrow enough to fit the context window. +- **Cross-encoder re-ranking (v3)**: Retrieve top-30 via bi-encoder, re-rank to top-15 with a cross-encoder that scores each (query, chunk) pair jointly. More accurate than bi-encoder similarity alone. Tested three models; `ms-marco-MiniLM-L-12-v2` selected over `stsb-roberta-base` (wrong task -- semantic similarity, not passage ranking) and `BAAI/bge-reranker-v2-m3` (50% slower, weak score tail). 
+- **HyDE query rewriting tested and dropped**: Did not improve results over direct prompt engineering. +- **V3 prompt**: Adapted for re-ranked context -- tells the LLM all excerpts have been curated, encourages examining every chunk and noting what each file contributes. Produces better multi-source synthesis than v2's prompt. +- **V2 prompt**: More flexible and query-adaptive than v1, which forced rigid structure (exactly 10 files, mandatory theme). +- **Verbatim retrieval (`retrieve_raw.py`)**: Uses LlamaIndex's `index.as_retriever()` instead of `index.as_query_engine()`. The retriever returns raw `NodeWithScore` objects (chunk text, metadata, scores) without invoking the LLM. The re-ranker is applied manually via `reranker.postprocess_nodes()`. This separation lets you inspect what the pipeline retrieves before synthesis. +- **Keyword search (`search_keywords.py`)**: NLTK POS tagging extracts nouns and adjectives from the query -- a middle ground between naive stopword removal and LLM-based term extraction. Catches exact names, places, and dates that vector similarity misses. +- **Hybrid BM25 + vector retrieval (v4)**: Runs two retrievers in parallel -- BM25 (top-20 by term frequency) and vector similarity (top-20 by cosine) -- merges and deduplicates candidates, then lets the cross-encoder re-rank the union to top-15. BM25 nominates candidates with exact term matches that embeddings miss; the cross-encoder decides final relevance. Uses `BM25Retriever.from_defaults(index=index)` from `llama-index-retrievers-bm25`, which indexes the nodes already stored in the persisted vector store. + +## Development history + +- **Aug 2025**: Initial implementation -- build pipeline, embedding model comparison, chunk size experiments, HyDE testing, prompt v1. +- **Jan 2026**: Command-line interface, v2 prompt, error handling improvements, model comparison (command-r7b selected). 
+- **Feb 2026**: Project tidy-up, cross-encoder re-ranking (v3), v3 prompt for multi-source synthesis, cross-encoder model comparison (L-12 selected), archived superseded scripts. Hybrid BM25 + vector retrieval (v4). Upgraded LlamaIndex from 0.13.1 to 0.14.14; added OpenAI API as optional LLM backend (`llama-index-llms-openai`). Incremental vector store updates (default mode in `build_exp_claude.py`). Fixed offline HuggingFace model loading (env vars must precede imports). + +See `devlog.txt` for detailed development notes and experimental findings. diff --git a/build_clippings.py b/build_clippings.py new file mode 100644 index 0000000..b808e92 --- /dev/null +++ b/build_clippings.py @@ -0,0 +1,471 @@ +# build_clippings.py +# +# Build or update the ChromaDB vector store from clippings in ./clippings. +# +# Default mode (incremental): loads the existing index and adds only +# new or modified files. Use --rebuild for a full rebuild from scratch. +# +# Handles PDFs, TXT, webarchive, and RTF files. Skips non-extractable PDFs +# and writes them to ocr_needed.txt for later OCR processing. +# +# February 2026 +# E. M. Furst + +# Environment vars must be set before importing huggingface/transformers +# libraries, because huggingface_hub.constants evaluates HF_HUB_OFFLINE +# at import time. 
+import os +os.environ["TOKENIZERS_PARALLELISM"] = "false" +os.environ["SENTENCE_TRANSFORMERS_HOME"] = "./models" +os.environ["HF_HUB_OFFLINE"] = "1" + +import chromadb +from llama_index.core import ( + SimpleDirectoryReader, + StorageContext, + VectorStoreIndex, + Settings, + Document, +) +from llama_index.vector_stores.chroma import ChromaVectorStore +from llama_index.embeddings.huggingface import HuggingFaceEmbedding +from llama_index.core.node_parser import SentenceSplitter +from pathlib import Path +import argparse +import datetime +import time + +# Shared constants +DATA_DIR = Path("./clippings") +PERSIST_DIR = "./storage_clippings" +COLLECTION_NAME = "clippings" +EMBED_MODEL_NAME = "BAAI/bge-large-en-v1.5" +CHUNK_SIZE = 256 +CHUNK_OVERLAP = 25 + +# File types handled by SimpleDirectoryReader (PDF + TXT) +READER_EXTS = {".pdf", ".txt"} +# File types handled by custom loaders +CUSTOM_EXTS = {".webarchive", ".rtf"} +# All supported extensions +SUPPORTED_EXTS = READER_EXTS | CUSTOM_EXTS + +# Minimum extracted text length to consider a PDF valid (characters) +MIN_TEXT_LENGTH = 100 + + +def get_text_splitter(): + return SentenceSplitter( + chunk_size=CHUNK_SIZE, + chunk_overlap=CHUNK_OVERLAP, + paragraph_separator="\n\n", + ) + + +def validate_pdf(file_path): + """Check if a PDF has extractable text. + + Returns (is_valid, reason) where reason explains why it was skipped. 
def load_webarchive(file_path):
    """Extract the main page text from a macOS .webarchive file.

    A .webarchive is a binary property list. The page HTML is stored as raw
    bytes under ``WebMainResource`` / ``WebResourceData``; when the archive
    also records the page's text encoding (``WebResourceTextEncodingName``),
    that encoding is used to decode the bytes, falling back to UTF-8.

    Args:
        file_path: ``pathlib.Path`` of the .webarchive file.

    Returns:
        A LlamaIndex ``Document`` carrying ``file_name``, ``file_path``,
        ``file_size``, ``last_modified_date`` and ``file_type`` metadata, or
        ``None`` when the archive has no HTML payload, yields fewer than
        ``MIN_TEXT_LENGTH`` characters of text, or cannot be read at all
        (a warning is printed in that last case).
    """
    import plistlib
    from bs4 import BeautifulSoup

    try:
        with open(file_path, "rb") as f:
            plist = plistlib.load(f)

        resource = plist.get("WebMainResource", {})
        html_bytes = resource.get("WebResourceData", b"")
        if not html_bytes:
            return None

        # Honour the encoding stored in the archive instead of assuming UTF-8;
        # pages saved in another encoding would otherwise decode to
        # replacement characters.
        encoding = resource.get("WebResourceTextEncodingName")
        if not isinstance(encoding, str) or not encoding:
            encoding = "utf-8"
        try:
            html = html_bytes.decode(encoding, errors="replace")
        except LookupError:
            # Encoding name unknown to Python: fall back to UTF-8.
            html = html_bytes.decode("utf-8", errors="replace")

        soup = BeautifulSoup(html, "html.parser")
        text = soup.get_text(separator="\n", strip=True)

        if len(text) < MIN_TEXT_LENGTH:
            return None

        stat = file_path.stat()
        # UTC date string, the same format update() compares against.
        mdate = datetime.datetime.fromtimestamp(
            stat.st_mtime, tz=datetime.timezone.utc
        ).strftime("%Y-%m-%d")

        return Document(
            text=text,
            metadata={
                "file_name": file_path.name,
                "file_path": str(file_path),
                "file_size": stat.st_size,
                "last_modified_date": mdate,
                "file_type": "webarchive",
            },
        )
    except Exception as e:
        # Best effort: an unreadable clipping is reported and skipped so one
        # bad file does not abort the whole scan.
        print(f" Warning: could not read webarchive {file_path.name}: {e}")
        return None
+ """ + from striprtf.striprtf import rtf_to_text + + try: + with open(file_path, "r", errors="replace") as f: + rtf_content = f.read() + + text = rtf_to_text(rtf_content) + + if len(text) < MIN_TEXT_LENGTH: + return None + + stat = file_path.stat() + mdate = datetime.datetime.fromtimestamp( + stat.st_mtime, tz=datetime.timezone.utc + ).strftime("%Y-%m-%d") + + return Document( + text=text, + metadata={ + "file_name": file_path.name, + "file_path": str(file_path), + "file_size": stat.st_size, + "last_modified_date": mdate, + "file_type": "rtf", + }, + ) + except Exception as e: + print(f" Warning: could not read RTF {file_path.name}: {e}") + return None + + +def scan_clippings(): + """Scan the clippings directory and classify files. + + Returns (reader_files, custom_docs, skipped, ocr_needed) where: + - reader_files: list of Paths for SimpleDirectoryReader (PDF + TXT) + - custom_docs: list of Document objects from custom loaders + - skipped: list of (Path, reason) tuples + - ocr_needed: list of Paths for PDFs that need OCR + """ + reader_files = [] + custom_docs = [] + skipped = [] + ocr_needed = [] + + for fpath in sorted(DATA_DIR.rglob("*")): + if not fpath.is_file(): + continue + if fpath.name.startswith("."): + continue + + ext = fpath.suffix.lower() + + if ext not in SUPPORTED_EXTS: + skipped.append((fpath, f"unsupported type: {ext}")) + continue + + if ext == ".pdf": + is_valid, reason = validate_pdf(fpath) + if not is_valid: + skipped.append((fpath, f"no extractable text: {reason}")) + ocr_needed.append(fpath) + continue + reader_files.append(fpath) + + elif ext == ".txt": + reader_files.append(fpath) + + elif ext == ".webarchive": + doc = load_webarchive(fpath) + if doc: + custom_docs.append(doc) + else: + skipped.append((fpath, "no extractable text from webarchive")) + + elif ext == ".rtf": + doc = load_rtf(fpath) + if doc: + custom_docs.append(doc) + else: + skipped.append((fpath, "no extractable text from RTF")) + + return reader_files, custom_docs, 
def write_ocr_list(ocr_needed):
    """Write the paths of PDFs that need OCR to ocr_needed.txt, one per line.

    The file is created in the current working directory and overwritten on
    every call.

    Args:
        ocr_needed: list of paths (anything ``str()``-able) to record.
    """
    # Explicit UTF-8: file names may contain non-ASCII characters, and the
    # platform default encoding is not guaranteed to be able to encode them.
    with open("ocr_needed.txt", "w", encoding="utf-8") as f:
        f.writelines(f"{fpath}\n" for fpath in ocr_needed)
    print(f"Wrote {len(ocr_needed)} file(s) to ocr_needed.txt")
def update(reader_files, custom_docs):
    """Incrementally sync the ChromaDB collection with the clippings on disk.

    Groups the stored chunks by their ``file_path`` metadata, compares every
    file found on disk against the stored ``file_size`` and
    ``last_modified_date``, removes the chunks of deleted or modified files,
    and then indexes the new and modified files.

    Args:
        reader_files: Paths of PDF/TXT files to be loaded by
            SimpleDirectoryReader.
        custom_docs: Documents already produced by the webarchive/RTF
            loaders; each carries ``file_path`` and ``file_name`` metadata.

    Returns:
        None. Progress and the final vector count are reported with print().
    """
    client = chromadb.PersistentClient(path=PERSIST_DIR)
    # get_or_create: the storage directory can exist without the collection
    # (e.g. a rebuild that stopped after deleting it). An empty collection
    # simply makes every file on disk count as new.
    collection = client.get_or_create_collection(COLLECTION_NAME)
    count = collection.count()
    print(f"Existing collection has {count} vectors")

    # Group stored chunk ids by source file.
    # Key on file_path (not file_name) to handle duplicate names across subdirs
    indexed = {}  # file_path -> {"ids": [], "file_size": ..., "last_modified_date": ...}
    if count > 0:
        results = collection.get(include=["metadatas"])
        for chunk_id, meta in zip(results["ids"], results["metadatas"]):
            meta = meta or {}  # tolerate chunks stored without metadata
            fpath = meta.get("file_path", "")
            if fpath not in indexed:
                indexed[fpath] = {
                    "ids": [],
                    "file_size": meta.get("file_size"),
                    "last_modified_date": meta.get("last_modified_date"),
                }
            indexed[fpath]["ids"].append(chunk_id)

    print(f"Index contains {len(indexed)} unique files")

    # One-time lookup of custom documents by path (replaces a list scan per file).
    custom_by_path = {doc.metadata["file_path"]: doc for doc in custom_docs}

    # Build disk file lookup: file_path_str -> Path
    disk_files = {str(f): f for f in reader_files}
    for path_str in custom_by_path:
        disk_files[path_str] = Path(path_str)

    # Classify files
    new_reader = []
    new_custom = []
    modified_reader = []
    modified_custom = []
    unchanged = 0

    for path_str, fpath in disk_files.items():
        is_custom = fpath.suffix.lower() in CUSTOM_EXTS
        info = indexed.get(path_str)

        if info is not None:
            stat = fpath.stat()
            # Date-only (UTC) granularity: an edit made on the same day that
            # leaves the size unchanged is not detected.
            disk_mdate = datetime.datetime.fromtimestamp(
                stat.st_mtime, tz=datetime.timezone.utc
            ).strftime("%Y-%m-%d")
            if (
                stat.st_size == info["file_size"]
                and disk_mdate == info["last_modified_date"]
            ):
                unchanged += 1
                continue

        # Reaching here means the file is new (info is None) or modified.
        if is_custom:
            doc = custom_by_path.get(path_str)
            if doc is not None:
                (new_custom if info is None else modified_custom).append(doc)
        else:
            (new_reader if info is None else modified_reader).append(fpath)

    deleted_paths = [path_str for path_str in indexed if path_str not in disk_files]

    n_new = len(new_reader) + len(new_custom)
    n_modified = len(modified_reader) + len(modified_custom)
    print(f"\n New: {n_new}")
    print(f" Modified: {n_modified}")
    print(f" Deleted: {len(deleted_paths)}")
    print(f" Unchanged: {unchanged}")

    if n_new == 0 and n_modified == 0 and len(deleted_paths) == 0:
        print("\nNothing to do.")
        return

    # Delete chunks of files that no longer exist on disk.
    for path_str in deleted_paths:
        ids = indexed[path_str]["ids"]
        fname = Path(path_str).name
        print(f" Removing {fname} ({len(ids)} chunks)")
        collection.delete(ids=ids)

    # Delete the stale chunks of modified files; they are re-added below.
    for fpath in modified_reader:
        path_str = str(fpath)
        ids = indexed[path_str]["ids"]
        print(f" Re-indexing {fpath.name} ({len(ids)} chunks)")
        collection.delete(ids=ids)

    for doc in modified_custom:
        path_str = doc.metadata["file_path"]
        if path_str in indexed:
            ids = indexed[path_str]["ids"]
            print(f" Re-indexing {doc.metadata['file_name']} ({len(ids)} chunks)")
            collection.delete(ids=ids)

    # Add new and modified files
    files_to_add = new_reader + modified_reader
    docs_to_add = new_custom + modified_custom

    if files_to_add or docs_to_add:
        documents = load_all_documents(files_to_add, docs_to_add)
        if documents:
            print(f"Indexing {len(documents)} document(s)...")
            vector_store = ChromaVectorStore(chroma_collection=collection)
            storage_context = StorageContext.from_defaults(vector_store=vector_store)

            # Embeds the documents and writes the chunks into the existing
            # collection through the Chroma-backed storage context.
            VectorStoreIndex.from_documents(
                documents,
                storage_context=storage_context,
                transformations=[get_text_splitter()],
                show_progress=True,
            )

    print(f"\nIndex updated. Collection now has {collection.count()} vectors.")
def update():
    """Incrementally update the persisted journal index in PERSIST_DIR.

    Compares the ``*.txt`` files in DATA_DIR with the documents recorded in
    the index docstore (matched by ``file_name``; changes detected through
    ``file_size`` and ``last_modified_date``). Deleted files are removed,
    modified files are removed and re-inserted, new files are inserted, and
    the index is persisted again.

    Raises:
        FileNotFoundError: if DATA_DIR does not exist.
    """
    if not DATA_DIR.exists():
        raise FileNotFoundError(f"Data directory not found: {DATA_DIR.absolute()}")

    # Register the chunking configuration BEFORE the index is loaded: the
    # index takes its transformations from Settings when it is constructed,
    # so assigning them afterwards can leave index.insert() chunking with the
    # default splitter instead of CHUNK_SIZE / CHUNK_OVERLAP.
    Settings.transformations = [get_text_splitter()]

    # Load existing index
    print(f"Loading existing index from {PERSIST_DIR}...")
    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    index = load_index_from_storage(storage_context)

    # Build lookup of indexed files: file_name -> (ref_doc_id, metadata)
    all_ref_docs = index.docstore.get_all_ref_doc_info()
    indexed = {}
    for ref_id, info in all_ref_docs.items():
        fname = info.metadata.get("file_name")
        if fname:
            indexed[fname] = (ref_id, info.metadata)

    print(f"Index contains {len(indexed)} documents")

    # Scan current files on disk.
    # NOTE(review): rebuild() hands the whole directory to
    # SimpleDirectoryReader without an extension filter, while this scan only
    # sees *.txt -- a non-.txt file indexed by a rebuild would be classified
    # as deleted here. Confirm DATA_DIR only ever holds .txt files.
    disk_files = {f.name: f for f in sorted(DATA_DIR.glob("*.txt"))}
    print(f"Data directory contains {len(disk_files)} files")

    # Classify files
    new_files = []
    modified_files = []
    unchanged = 0

    for fname, fpath in disk_files.items():
        if fname not in indexed:
            new_files.append(fpath)
            continue

        ref_id, meta = indexed[fname]
        stat = fpath.stat()
        # Must use UTC to match SimpleDirectoryReader's date format.
        # Date-only granularity: a same-day edit that keeps the size
        # unchanged is not detected.
        disk_mdate = datetime.datetime.fromtimestamp(
            stat.st_mtime, tz=datetime.timezone.utc
        ).strftime("%Y-%m-%d")

        if (
            stat.st_size != meta.get("file_size")
            or disk_mdate != meta.get("last_modified_date")
        ):
            modified_files.append((fpath, ref_id))
        else:
            unchanged += 1

    deleted_files = [
        (fname, ref_id)
        for fname, (ref_id, _) in indexed.items()
        if fname not in disk_files
    ]

    # Report
    print(f"\n New: {len(new_files)}")
    print(f" Modified: {len(modified_files)}")
    print(f" Deleted: {len(deleted_files)}")
    print(f" Unchanged: {unchanged}")

    if not new_files and not modified_files and not deleted_files:
        print("\nNothing to do.")
        return

    # Process deletions (including modified files that need re-indexing)
    for fname, ref_id in deleted_files:
        print(f" Removing {fname}")
        index.delete_ref_doc(ref_id, delete_from_docstore=True)

    for fpath, ref_id in modified_files:
        print(f" Re-indexing {fpath.name} (modified)")
        index.delete_ref_doc(ref_id, delete_from_docstore=True)

    # Process additions (new files + modified files)
    files_to_add = new_files + [fpath for fpath, _ in modified_files]
    if files_to_add:
        print(f"\nIndexing {len(files_to_add)} file(s)...")
        # Use "./" prefix to match paths from full build (pathlib strips it)
        docs = SimpleDirectoryReader(
            input_files=[f"./{f}" for f in files_to_add]
        ).load_data()
        for doc in docs:
            index.insert(doc)

    # Persist
    index.storage_context.persist(persist_dir=PERSIST_DIR)
    print(f"\nIndex updated and saved to {PERSIST_DIR}")
Retrieve top-20 via BM25 (term frequency, catches exact names/dates) +# 3. Merge and deduplicate candidates by node ID +# 4. Re-rank the union with a cross-encoder -> top-15 +# 5. Pass re-ranked chunks to LLM for synthesis +# +# The cross-encoder doesn't care where candidates came from -- it scores +# each (query, chunk) pair on its own merits. BM25's job is just to +# nominate candidates that vector similarity might miss. +# +# E.M.F. February 2026 + +# Environment vars must be set before importing huggingface/transformers +# libraries, because huggingface_hub.constants evaluates HF_HUB_OFFLINE +# at import time. +import os +os.environ["TOKENIZERS_PARALLELISM"] = "false" +os.environ["SENTENCE_TRANSFORMERS_HOME"] = "./models" +os.environ["HF_HUB_OFFLINE"] = "1" + +from llama_index.core import ( + StorageContext, + load_index_from_storage, + Settings, + get_response_synthesizer, +) +from llama_index.embeddings.huggingface import HuggingFaceEmbedding +from llama_index.llms.ollama import Ollama +from llama_index.core.prompts import PromptTemplate +from llama_index.core.postprocessor import SentenceTransformerRerank +from llama_index.retrievers.bm25 import BM25Retriever +import sys + +# +# Globals +# + +# Embedding model (must match build_exp_claude.py) +EMBED_MODEL = HuggingFaceEmbedding(cache_folder="./models", model_name="BAAI/bge-large-en-v1.5", local_files_only=True) + +# LLM model for generation +LLM_MODEL = "command-r7b" + +# Cross-encoder model for re-ranking (cached in ./models/) +RERANK_MODEL = "cross-encoder/ms-marco-MiniLM-L-12-v2" +RERANK_TOP_N = 15 + +# Retrieval parameters +VECTOR_TOP_K = 20 # candidates from vector similarity +BM25_TOP_K = 20 # candidates from BM25 term matching + +# +# Custom prompt -- same as v3 +# +PROMPT = PromptTemplate( +"""You are a precise research assistant analyzing excerpts from a personal journal collection. +Every excerpt below has been selected and ranked for relevance to the query. 
def main():
    """Run one hybrid (vector + BM25) query and print the LLM's answer.

    The query text is every command-line argument joined with spaces. The
    function retrieves candidates from both retrievers, deduplicates them by
    node ID, re-ranks the union with the cross-encoder, synthesizes a response
    with the configured LLM, and prints the response followed by the source
    documents. Exits with status 1 when no query is given.
    """
    # Validate the command line first so a missing query fails immediately
    # instead of after the index and re-ranker have been loaded.
    if len(sys.argv) < 2:
        print("Usage: python query_hybrid_bm25_v4.py QUERY_TEXT")
        sys.exit(1)
    q = " ".join(sys.argv[1:])

    # Configure LLM and embedding model
    # for local model using ollama
    # Note: Ollama temperature defaults to 0.8
    Settings.llm = Ollama(
        model=LLM_MODEL,
        temperature=0.3,
        request_timeout=360.0,
        context_window=8000,
    )

    # Use OpenAI API:
    # from llama_index.llms.openai import OpenAI
    # Settings.llm = OpenAI(
    #     model="gpt-4o-mini",  # or "gpt-4o" for higher quality
    #     temperature=0.3,
    # )

    Settings.embed_model = EMBED_MODEL

    # Load persisted vector store
    storage_context = StorageContext.from_defaults(persist_dir="./storage_exp")
    index = load_index_from_storage(storage_context)

    # --- Retrievers ---

    # Vector retriever (dense: cosine similarity over embeddings)
    vector_retriever = index.as_retriever(similarity_top_k=VECTOR_TOP_K)

    # BM25 retriever (sparse: term frequency scoring)
    bm25_retriever = BM25Retriever.from_defaults(
        index=index,
        similarity_top_k=BM25_TOP_K,
    )

    # Cross-encoder re-ranker
    reranker = SentenceTransformerRerank(
        model=RERANK_MODEL,
        top_n=RERANK_TOP_N,
    )

    # --- Query ---

    # Retrieve from both sources
    vector_nodes = vector_retriever.retrieve(q)
    bm25_nodes = bm25_retriever.retrieve(q)

    # Merge and deduplicate by node ID (first occurrence wins)
    seen_ids = set()
    merged = []
    for node in vector_nodes + bm25_nodes:
        node_id = node.node.node_id
        if node_id not in seen_ids:
            seen_ids.add(node_id)
            merged.append(node)

    # Re-rank the merged candidates with cross-encoder
    reranked = reranker.postprocess_nodes(merged, query_str=q)

    # Every duplicate dropped during the merge was found by both retrievers
    n_both = len(vector_nodes) + len(bm25_nodes) - len(merged)

    print(f"\nQuery: {q}")
    print(f"Vector: {len(vector_nodes)}, BM25: {len(bm25_nodes)}, "
          f"overlap: {n_both}, merged: {len(merged)}, re-ranked to: {len(reranked)}")

    # Synthesize response with LLM
    synthesizer = get_response_synthesizer(text_qa_template=PROMPT)
    response = synthesizer.synthesize(q, nodes=reranked)

    # Output
    print("\nResponse:\n")
    print(response.response)

    print("\nSource documents:")
    for node in response.source_nodes:
        meta = getattr(node, "metadata", None) or node.node.metadata
        score = getattr(node, "score", None)
        # A node without a score must not crash the float format spec
        score_text = f"{score:.3f}" if score is not None else "n/a"
        print(f"{meta.get('file_name')} {meta.get('file_path')} {score_text}")
Pass re-ranked chunks to LLM for synthesis +# +# The cross-encoder scores each (query, chunk) pair jointly, which captures +# nuance that bi-encoder dot-product similarity misses. +# +# E.M.F. February 2026 + +# Environment vars must be set before importing huggingface/transformers +# libraries, because huggingface_hub.constants evaluates HF_HUB_OFFLINE +# at import time. +import os +os.environ["TOKENIZERS_PARALLELISM"] = "false" +os.environ["SENTENCE_TRANSFORMERS_HOME"] = "./models" +os.environ["HF_HUB_OFFLINE"] = "1" + +from llama_index.core import ( + StorageContext, + load_index_from_storage, + Settings, +) +from llama_index.embeddings.huggingface import HuggingFaceEmbedding +from llama_index.llms.ollama import Ollama +from llama_index.core.prompts import PromptTemplate +from llama_index.core.postprocessor import SentenceTransformerRerank +import sys + +# +# Globals +# + +# Embedding model used in vector store (must match build_exp_claude.py) +EMBED_MODEL = HuggingFaceEmbedding(cache_folder="./models", model_name="BAAI/bge-large-en-v1.5", local_files_only=True) + +# LLM model for generation +llm = "command-r7b" + +# Cross-encoder model for re-ranking (cached in ./models/) +#RERANK_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2" +RERANK_MODEL = "cross-encoder/ms-marco-MiniLM-L-12-v2" +#RERANK_MODEL = "cross-encoder/stsb-roberta-base" +#RERANK_MODEL = "BAAI/bge-reranker-v2-m3" +RERANK_TOP_N = 15 # keep top 15 after re-ranking +RETRIEVE_TOP_K = 30 # retrieve wider pool for re-ranker to work with + +# +# Custom prompt for the query engine - Version 3 +# +# Adapted for re-ranked context: every excerpt below has been scored for +# relevance by a cross-encoder, so even lower-ranked ones are worth examining. +# The prompt encourages the LLM to draw from all provided excerpts and to +# note what each distinct file contributes rather than collapsing onto one. 
def main():
    """Run one vector-search query with cross-encoder re-ranking and print the answer.

    The query text is every command-line argument joined with spaces. The
    query engine retrieves RETRIEVE_TOP_K chunks, re-ranks them down to
    RERANK_TOP_N, and asks the local LLM to synthesize a response using
    PROMPT. Prints the response followed by the source documents. Exits with
    status 1 when no query is given.
    """
    # Validate the command line first so a missing query fails immediately
    # instead of after the index and re-ranker have been loaded.
    if len(sys.argv) < 2:
        print("Usage: python query_topk_prompt_engine_v3.py QUERY_TEXT")
        sys.exit(1)
    q = " ".join(sys.argv[1:])

    # Use a local model to generate -- in this case using Ollama
    Settings.llm = Ollama(
        model=llm,
        request_timeout=360.0,
        context_window=8000,
    )

    # Load embedding model (same as used for vector store)
    Settings.embed_model = EMBED_MODEL

    # Load persisted vector store + metadata
    storage_context = StorageContext.from_defaults(persist_dir="./storage_exp")
    index = load_index_from_storage(storage_context)

    # Cross-encoder re-ranker
    reranker = SentenceTransformerRerank(
        model=RERANK_MODEL,
        top_n=RERANK_TOP_N,
    )

    # Build query engine: retrieve wide (top-30), re-rank to top-15, then synthesize
    query_engine = index.as_query_engine(
        similarity_top_k=RETRIEVE_TOP_K,
        text_qa_template=PROMPT,
        node_postprocessors=[reranker],
    )

    # Generate the response by querying the engine
    response = query_engine.query(q)

    # Return the query response and source documents
    print("\nResponse:\n")
    print(response.response)

    print("\nSource documents:")
    for node in response.source_nodes:
        meta = getattr(node, "metadata", None) or node.node.metadata
        score = getattr(node, "score", None)
        # A node without a score must not crash the float format spec
        score_text = f"{score:.3f}" if score is not None else "n/a"
        print(f"{meta.get('file_name')} {meta.get('file_path')} {score_text}")
+jupyter-console==6.6.3 +jupyter-events==0.12.0 +jupyter-lsp==2.2.6 +jupyter_client==8.6.3 +jupyter_core==5.8.1 +jupyter_server==2.16.0 +jupyter_server_terminals==0.5.3 +jupyterlab==4.4.5 +jupyterlab_pygments==0.3.0 +jupyterlab_server==2.27.3 +jupyterlab_widgets==3.0.15 +kiwisolver==1.4.9 +kubernetes==35.0.0 +lark==1.2.2 +llama-index-core==0.14.14 +llama-index-embeddings-huggingface==0.6.1 +llama-index-instrumentation==0.4.0 +llama-index-llms-ollama==0.9.1 +llama-index-llms-openai==0.6.18 +llama-index-readers-file==0.5.6 +llama-index-retrievers-bm25==0.6.5 +llama-index-vector-stores-chroma==0.5.5 +llama-index-workflows==2.14.2 +markdown-it-py==4.0.0 +MarkupSafe==3.0.2 +marshmallow==3.26.1 +matplotlib==3.10.5 +matplotlib-inline==0.1.7 +mdurl==0.1.2 +mistune==3.1.3 +mmh3==5.2.0 +mpmath==1.3.0 +multidict==6.6.3 +mypy_extensions==1.1.0 +nbclient==0.10.2 +nbconvert==7.16.6 +nbformat==5.10.4 +nest-asyncio==1.6.0 +networkx==3.5 +nltk==3.9.1 +notebook==7.4.5 +notebook_shim==0.2.4 +numpy==2.3.2 +oauthlib==3.3.1 +ollama==0.5.3 +onnxruntime==1.24.2 +openai==2.21.0 +opentelemetry-api==1.39.1 +opentelemetry-exporter-otlp-proto-common==1.39.1 +opentelemetry-exporter-otlp-proto-grpc==1.39.1 +opentelemetry-proto==1.39.1 +opentelemetry-sdk==1.39.1 +opentelemetry-semantic-conventions==0.60b1 +orjson==3.11.7 +overrides==7.7.0 +packaging==25.0 +pandas==2.2.3 +pandocfilters==1.5.1 +parso==0.8.4 +pexpect==4.9.0 +pillow==11.3.0 +platformdirs==4.3.8 +posthog==5.4.0 +prometheus_client==0.22.1 +prompt_toolkit==3.0.51 +propcache==0.3.2 +protobuf==6.33.5 +psutil==7.0.0 +ptyprocess==0.7.0 +pure_eval==0.2.3 +pybase64==1.4.3 +pycparser==2.22 +pydantic==2.11.7 +pydantic_core==2.33.2 +Pygments==2.19.2 +pyparsing==3.2.3 +pypdf==6.7.1 +PyPika==0.51.1 +pyproject_hooks==1.2.0 +PyStemmer==2.2.0.3 +python-dateutil==2.9.0.post0 +python-dotenv==1.2.1 +python-json-logger==3.3.0 +pytz==2025.2 +PyYAML==6.0.2 +pyzmq==27.0.1 +referencing==0.36.2 +regex==2025.7.34 +requests==2.32.4 +requests-oauthlib==2.0.0 
+rfc3339-validator==0.1.4 +rfc3986-validator==0.1.1 +rfc3987-syntax==1.1.0 +rich==14.3.3 +rpds-py==0.27.0 +safetensors==0.6.2 +scikit-learn==1.7.1 +scipy==1.16.1 +seaborn==0.13.2 +Send2Trash==1.8.3 +sentence-transformers==5.1.0 +setuptools==80.9.0 +shellingham==1.5.4 +six==1.17.0 +sniffio==1.3.1 +soupsieve==2.7 +SQLAlchemy==2.0.42 +stack-data==0.6.3 +striprtf==0.0.26 +sympy==1.14.0 +tenacity==9.1.2 +terminado==0.18.1 +threadpoolctl==3.6.0 +tiktoken==0.11.0 +tinycss2==1.4.0 +tokenizers==0.21.4 +torch==2.8.0 +tornado==6.5.2 +tqdm==4.67.1 +traitlets==5.14.3 +transformers==4.55.0 +typer==0.24.1 +types-python-dateutil==2.9.0.20250809 +typing-inspect==0.9.0 +typing-inspection==0.4.1 +typing_extensions==4.14.1 +tzdata==2025.2 +uri-template==1.3.0 +urllib3==2.5.0 +uvicorn==0.41.0 +uvloop==0.22.1 +watchfiles==1.1.1 +wcwidth==0.2.13 +webcolors==24.11.1 +webencodings==0.5.1 +websocket-client==1.8.0 +websockets==16.0 +widgetsnbextension==4.0.14 +wrapt==1.17.2 +yarl==1.20.1 +zipp==3.23.0 diff --git a/retrieve_clippings.py b/retrieve_clippings.py new file mode 100644 index 0000000..2cb0c9e --- /dev/null +++ b/retrieve_clippings.py @@ -0,0 +1,138 @@ +# retrieve_clippings.py +# Verbatim chunk retrieval from clippings index (ChromaDB). +# Vector search + cross-encoder re-ranking, no LLM. +# +# Returns the top re-ranked chunks with their full text, file metadata, and +# scores. Includes page numbers for PDF sources when available. +# +# E.M.F. February 2026 + +# Environment vars must be set before importing huggingface/transformers +# libraries, because huggingface_hub.constants evaluates HF_HUB_OFFLINE +# at import time. 
def main():
    """Retrieve and print verbatim chunks from the clippings index (no LLM).

    The query text is every command-line argument joined with spaces. The
    function runs a vector search over the ChromaDB collection, re-ranks the
    hits with the cross-encoder, then prints a summary (unique source files
    and per-chunk rankings) followed by the wrapped text of every re-ranked
    chunk. Exits with status 1 when no query is given.
    """
    # Validate the command line first so a missing query fails immediately
    # instead of after the collection and re-ranker have been loaded.
    if len(sys.argv) < 2:
        print("Usage: python retrieve_clippings.py QUERY_TEXT")
        sys.exit(1)
    q = " ".join(sys.argv[1:])

    # No LLM needed -- set embed model only
    Settings.embed_model = EMBED_MODEL

    # Load ChromaDB collection
    client = chromadb.PersistentClient(path=PERSIST_DIR)
    collection = client.get_collection(COLLECTION_NAME)

    # Build index from existing vector store
    vector_store = ChromaVectorStore(chroma_collection=collection)
    index = VectorStoreIndex.from_vector_store(vector_store)

    # Build retriever (vector search only, no query engine / LLM)
    retriever = index.as_retriever(similarity_top_k=RETRIEVE_TOP_K)

    # Cross-encoder re-ranker
    reranker = SentenceTransformerRerank(
        model=RERANK_MODEL,
        top_n=RERANK_TOP_N,
    )

    # Retrieve and re-rank
    nodes = retriever.retrieve(q)
    reranked = reranker.postprocess_nodes(nodes, query_str=q)

    # Build result list with metadata
    results = []
    for i, node in enumerate(reranked, 1):
        meta = getattr(node, "metadata", None) or node.node.metadata
        score = getattr(node, "score", None)
        file_name = meta.get("file_name", "unknown")
        page_label = meta.get("page_label", "")
        results.append((i, node, file_name, page_label, score))

    # --- Summary: source files and rankings ---
    print(f"\nQuery: {q}")
    print(f"Retrieved {len(nodes)} chunks, re-ranked to top {len(reranked)}")
    print(f"({collection.count()} total vectors in collection)\n")

    # Unique source files in rank order (dict keys keep insertion order)
    unique_sources = list(dict.fromkeys(file_name for _, _, file_name, _, _ in results))

    print(f"Source files ({len(unique_sources)} unique):")
    for j, fname in enumerate(unique_sources, 1):
        print(f" {j}. {fname}")

    print("\nRankings:")
    for i, node, file_name, page_label, score in results:
        # A node without a score must not crash the float format spec
        score_col = f"{score:+7.3f}" if score is not None else f"{'n/a':>7}"
        line = f" [{i:2d}] {score_col} {file_name}"
        if page_label:
            line += f" (p. {page_label})"
        print(line)

    # --- Full chunk text ---
    print(f"\n{'=' * WRAP_WIDTH}")
    print("CHUNKS")
    print("=" * WRAP_WIDTH)

    for i, node, file_name, page_label, score in results:
        header = f"=== [{i}] {file_name}"
        if page_label:
            header += f" (p. {page_label})"
        score_text = f"{score:.3f}" if score is not None else "n/a"
        header += f" (score: {score_text})"

        print("\n" + "=" * WRAP_WIDTH)
        print(header)
        print("=" * WRAP_WIDTH)

        text = node.get_content()
        for line in text.splitlines():
            # Wrap non-blank lines; keep blank lines as paragraph breaks
            if line.strip():
                print(textwrap.fill(line, width=WRAP_WIDTH))
            else:
                print()
        print()
def main():
    """Retrieve and print verbatim chunks using hybrid retrieval (no LLM).

    The query text is every command-line argument joined with spaces. The
    function retrieves candidates by vector similarity and by BM25,
    deduplicates them by node ID, re-ranks the union with the cross-encoder,
    and prints each re-ranked chunk annotated with the retriever(s) that
    nominated it. Exits with status 1 when no query is given.
    """
    # Validate the command line first so a missing query fails immediately
    # instead of after the index and re-ranker have been loaded.
    if len(sys.argv) < 2:
        print("Usage: python retrieve_hybrid_raw.py QUERY_TEXT")
        sys.exit(1)
    q = " ".join(sys.argv[1:])

    # No LLM needed -- set embed model only
    Settings.embed_model = EMBED_MODEL

    # Load persisted vector store
    storage_context = StorageContext.from_defaults(persist_dir="./storage_exp")
    index = load_index_from_storage(storage_context)

    # --- Retrievers ---

    vector_retriever = index.as_retriever(similarity_top_k=VECTOR_TOP_K)

    bm25_retriever = BM25Retriever.from_defaults(
        index=index,
        similarity_top_k=BM25_TOP_K,
    )

    # Cross-encoder re-ranker
    reranker = SentenceTransformerRerank(
        model=RERANK_MODEL,
        top_n=RERANK_TOP_N,
    )

    # Retrieve from both sources
    vector_nodes = vector_retriever.retrieve(q)
    bm25_nodes = bm25_retriever.retrieve(q)

    # Track which retriever found each node
    vector_ids = {n.node.node_id for n in vector_nodes}
    bm25_ids = {n.node.node_id for n in bm25_nodes}

    # Merge and deduplicate by node ID (first occurrence wins)
    seen_ids = set()
    merged = []
    for node in vector_nodes + bm25_nodes:
        node_id = node.node.node_id
        if node_id not in seen_ids:
            seen_ids.add(node_id)
            merged.append(node)

    # Re-rank merged candidates
    reranked = reranker.postprocess_nodes(merged, query_str=q)

    # Retrieval stats
    n_both = len(vector_ids & bm25_ids)
    n_vector_only = len(vector_ids - bm25_ids)
    n_bm25_only = len(bm25_ids - vector_ids)

    print(f"\nQuery: {q}")
    print(f"Vector: {len(vector_nodes)}, BM25: {len(bm25_nodes)}, "
          f"overlap: {n_both}, merged: {len(merged)}, re-ranked to: {len(reranked)}")
    print(f" vector-only: {n_vector_only}, bm25-only: {n_bm25_only}, both: {n_both}\n")

    # Output re-ranked chunks with source annotation
    for i, node in enumerate(reranked, 1):
        meta = getattr(node, "metadata", None) or node.node.metadata
        score = getattr(node, "score", None)
        file_name = meta.get("file_name", "unknown")
        text = node.get_content()
        node_id = node.node.node_id

        # Annotate source
        in_vector = node_id in vector_ids
        in_bm25 = node_id in bm25_ids
        if in_vector and in_bm25:
            source = "vector+bm25"
        elif in_bm25:
            source = "bm25-only"
        else:
            source = "vector-only"

        # A node without a score must not crash the float format spec
        score_text = f"{score:.3f}" if score is not None else "n/a"

        print("=" * WRAP_WIDTH)
        print(f"=== [{i}] {file_name} (score: {score_text}) [{source}]")
        print("=" * WRAP_WIDTH)
        for line in text.splitlines():
            # Wrap non-blank lines; keep blank lines as paragraph breaks
            if line.strip():
                print(textwrap.fill(line, width=WRAP_WIDTH))
            else:
                print()
        print()
Verbatim chunk retrieval: vector search + cross-encoder re-ranking, no LLM. +# +# Returns the top re-ranked chunks with their full text, file metadata, and +# scores. Useful for browsing source material directly and verifying what +# the RAG pipeline retrieves before LLM synthesis. +# +# Uses the same vector store, embedding model, and re-ranker as +# query_topk_prompt_engine_v3.py, but skips the LLM step entirely. +# +# E.M.F. February 2026 + +# Environment vars must be set before importing huggingface/transformers +# libraries, because huggingface_hub.constants evaluates HF_HUB_OFFLINE +# at import time. +import os +os.environ["TOKENIZERS_PARALLELISM"] = "false" +os.environ["SENTENCE_TRANSFORMERS_HOME"] = "./models" +os.environ["HF_HUB_OFFLINE"] = "1" + +from llama_index.core import ( + StorageContext, + load_index_from_storage, + Settings, +) +from llama_index.embeddings.huggingface import HuggingFaceEmbedding +from llama_index.core.postprocessor import SentenceTransformerRerank +import sys +import textwrap + +# +# Globals +# + +# Embedding model (must match build_exp_claude.py) +EMBED_MODEL = HuggingFaceEmbedding(cache_folder="./models", model_name="BAAI/bge-large-en-v1.5", local_files_only=True) + +# Cross-encoder model for re-ranking (cached in ./models/) +RERANK_MODEL = "cross-encoder/ms-marco-MiniLM-L-12-v2" +RERANK_TOP_N = 15 +RETRIEVE_TOP_K = 30 + +# Output formatting +WRAP_WIDTH = 80 + + +def main(): + # No LLM needed -- set embed model only + Settings.embed_model = EMBED_MODEL + + # Load persisted vector store + storage_context = StorageContext.from_defaults(persist_dir="./storage_exp") + index = load_index_from_storage(storage_context) + + # Build retriever (vector search only, no query engine / LLM) + retriever = index.as_retriever(similarity_top_k=RETRIEVE_TOP_K) + + # Cross-encoder re-ranker + reranker = SentenceTransformerRerank( + model=RERANK_MODEL, + top_n=RERANK_TOP_N, + ) + + # Query + if len(sys.argv) < 2: + print("Usage: python 
#!/bin/bash
# This shell script will handle I/O for the python query engine
# It will take a query and return the formatted results

# E.M.F. August 2025

# Usage: ./run_query.sh

QUERY_SCRIPT="query_hybrid_bm25_v4.py"

echo -e "Current query engine is $QUERY_SCRIPT\n"

# Loop until input is "exit", "quit", or empty
while true; do
    # -r keeps backslashes in the query literal
    read -r -p "Enter your query (or type 'exit' to quit): " query
    if [ "$query" == "exit" ] || [ "$query" == "quit" ] || [ "$query" == "" ] ; then
        echo "Exiting..."
        break
    fi
    time_start=$(date +%s)

    # Call the python script with the query and format the output.
    # The query script joins ALL of its arguments into the query text, so the
    # query is passed as a plain positional argument: a "--query" flag would
    # become part of the text that is embedded and searched.
    python3 "$QUERY_SCRIPT" "$query" | \
        expand | sed -E 's|(.* )(.*/data)|\1./data|' | fold -s -w 131

    time_end=$(date +%s)
    elapsed=$((time_end - time_start))
    echo -e "Query processed in $elapsed seconds.\n"
    # Quote the query so the log records it verbatim (no word splitting,
    # no glob expansion, no echo option parsing).
    printf '%s\n' "$query" >> query.log
done
def ensure_nltk_data():
    """Make sure the NLTK tokenizer and POS-tagger data are installed.

    Each required resource is looked up locally first; only a resource that
    NLTK cannot find is downloaded (quietly), after announcing it on stdout.
    """
    # download package name -> path NLTK uses to locate it once installed
    required = {
        "punkt_tab": "tokenizers/punkt_tab",
        "averaged_perceptron_tagger_eng": "taggers/averaged_perceptron_tagger_eng",
    }
    for package, lookup_path in required.items():
        try:
            nltk.data.find(lookup_path)
        except LookupError:
            print(f"Downloading NLTK resource: {package}")
            nltk.download(package, quiet=True)
def search_files(terms, data_dir, context_lines=CONTEXT_LINES):
    """Search all .txt files in data_dir for the given terms.

    Terms are matched case-insensitively as whole words/phrases, one line at
    a time. For every file with at least one matching line, context blocks
    are built around the matches; a matching line already shown inside an
    earlier block does not start a new block, but it is still counted and is
    still flagged with ">>>" inside the block that shows it.

    Args:
        terms: Search terms (words or multi-word phrases).
        data_dir: Directory (pathlib.Path) whose *.txt files are searched.
        context_lines: Lines of context shown before and after each match.

    Returns:
        A list of (file_path, match_count, matches) sorted by match_count,
        highest first. match_count is the number of matching lines; matches
        is a list of (line_number, context_block) tuples with 1-based line
        numbers. Empty when terms is empty or nothing matches.
    """
    if not terms:
        return []

    # Build a single regex pattern that matches any term (case-insensitive)
    pattern = re.compile(
        r"\b(" + "|".join(re.escape(t) for t in terms) + r")\b",
        re.IGNORECASE,
    )

    results = []
    for fpath in sorted(data_dir.glob("*.txt")):
        try:
            # errors="replace": a stray non-UTF-8 byte should cost one
            # character, not silently hide the whole file from the search.
            lines = fpath.read_text(encoding="utf-8", errors="replace").splitlines()
        except OSError:
            # Unreadable file: skip it (best-effort search)
            continue

        # Decide once per line whether it matches; reused for counting,
        # block building, and marking.
        is_hit = [pattern.search(line) is not None for line in lines]
        match_count = sum(is_hit)
        if match_count == 0:
            continue

        matches = []
        seen_lines = set()  # avoid overlapping context blocks

        for i, hit in enumerate(is_hit):
            if not hit or i in seen_lines:
                continue

            # Extract context window
            start = max(0, i - context_lines)
            end = min(len(lines), i + context_lines + 1)
            block = []
            for j in range(start, end):
                seen_lines.add(j)
                # Flag every matching line in the block, not only the one
                # that opened it; blank marker is padded to the same width.
                marker = ">>>" if is_hit[j] else "   "
                block.append(f" {marker} {j+1:4d}: {lines[j]}")

            matches.append((i + 1, "\n".join(block)))

        results.append((fpath, match_count, matches))

    # Sort by match count (most matches first)
    results.sort(key=lambda x: x[1], reverse=True)
    return results
Try a more specific query.") + sys.exit(0) + + print(f"Query: {q}") + print(f"Extracted terms: {', '.join(terms)}\n") + + # Search + results = search_files(terms, DATA_DIR) + + if not results: + print("No matches found.") + sys.exit(0) + + # Summary + total_matches = sum(r[1] for r in results) + print(f"Found {total_matches} matches across {len(results)} files\n") + + # Detailed output + for fpath, match_count, matches in results: + print("="*60) + print(f"--- {fpath.name} ({match_count} matches) ---") + print("="*60) + for line_num, block in matches[:MAX_MATCHES_PER_FILE]: + print(block) + print() + if len(matches) > MAX_MATCHES_PER_FILE: + print(f" ... and {len(matches) - MAX_MATCHES_PER_FILE} more matches\n") + + +if __name__ == "__main__": + main()