commit 42e5e20e17bff9994ce0e28c6dae8afd539d2b1a Author: Eric Furst Date: Fri Feb 27 05:59:01 2026 -0500 Test clean deploy Co-Authored-By: Claude Opus 4.6 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..af82761 --- /dev/null +++ b/.gitignore @@ -0,0 +1,36 @@ +# Python +.venv/ +__pycache__/ +*.pyc + +# HuggingFace cached models (large, ~2 GB) +models/ + +# Vector stores (large, rebuild with build scripts) +store/ +clippings_search/store_clippings/ + +# Data (symlinks to private files) +data +clippings + +# Generated file lists +ocr_needed.txt + +# IDE and OS +.DS_Store +.vscode/ +.idea/ + +# Jupyter checkpoints +.ipynb_checkpoints/ + +# Secrets +.env +API_key_temp + +# Query log +query.log + +# Duplicate of CLAUDE.md +claude.md diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..7ab86b1 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 E. M. Furst + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/README.md b/README.md new file mode 100644 index 0000000..0499f9a --- /dev/null +++ b/README.md @@ -0,0 +1,180 @@ +# ssearch + +Semantic search over a personal journal archive and a collection of clippings. Uses vector embeddings and a local LLM to find and synthesize information across 1800+ dated text entries spanning 2000-2025, plus a library of PDFs, articles, and web saves. + +## How it works + +``` +Query → Embed (BAAI/bge-large-en-v1.5) → Vector similarity (top-30) → Cross-encoder re-rank (top-15) → LLM synthesis (command-r7b via Ollama, or OpenAI API) → Response + sources +``` + +1. **Build**: Source files are chunked (256 tokens, 25-token overlap) and embedded into a vector store using LlamaIndex. The journal index uses LlamaIndex's JSON store; the clippings index uses ChromaDB. Both support incremental updates. +2. **Retrieve**: A user query is embedded with the same model and matched against stored vectors by cosine similarity, returning the top 30 candidate chunks. +3. **Re-rank**: A cross-encoder (`cross-encoder/ms-marco-MiniLM-L-12-v2`) scores each (query, chunk) pair jointly and keeps the top 15. +4. **Synthesize**: The re-ranked chunks are passed to a local LLM with a custom prompt that encourages multi-source synthesis, producing a grounded answer with file citations. 
+ +## Project structure + +``` +ssearch/ +├── build_store.py # Build/update journal vector store (incremental) +├── query_hybrid.py # Hybrid BM25+vector query with LLM synthesis +├── retrieve.py # Verbatim hybrid retrieval (no LLM) +├── search_keywords.py # Keyword search via POS-based term extraction +├── run_query.sh # Interactive shell wrapper with timing and logging +├── clippings_search/ +│ ├── build_clippings.py # Build/update clippings vector store (ChromaDB) +│ ├── retrieve_clippings.py # Verbatim clippings chunk retrieval +│ └── store_clippings/ # Persisted clippings vector store (ChromaDB) +├── data/ # Symlink to journal .txt files +├── clippings/ # Symlink to clippings (PDFs, TXT, webarchive, RTF) +├── store/ # Persisted journal vector store +├── models/ # Cached HuggingFace models (offline) +├── requirements.txt # Python dependencies +``` + +## Setup + +**Prerequisites**: Python 3.12, [Ollama](https://ollama.com) with `command-r7b` pulled. + +```bash +cd ssearch +python3 -m venv .venv +source .venv/bin/activate +pip install -r requirements.txt +``` + +The `data/` symlink should point to the journal archive (plain `.txt` files). The `clippings/` symlink should point to the clippings folder. The embedding model (`BAAI/bge-large-en-v1.5`) and cross-encoder (`cross-encoder/ms-marco-MiniLM-L-12-v2`) are cached in `./models/` for offline use. + +### Offline model loading + +All query scripts set three environment variables to prevent HuggingFace from making network requests: + +```python +os.environ["TOKENIZERS_PARALLELISM"] = "false" +os.environ["SENTENCE_TRANSFORMERS_HOME"] = "./models" +os.environ["HF_HUB_OFFLINE"] = "1" +``` + +**These must appear before any imports that touch HuggingFace libraries.** The `huggingface_hub` library evaluates `HF_HUB_OFFLINE` once at import time (in `huggingface_hub/constants.py`). If the env var is set after imports, the library will still attempt network access and fail offline. 
+ +Alternatively, set the variable in your shell before running Python: +```bash +export HF_HUB_OFFLINE=1 +python query_hybrid.py "your query" +``` + +## Usage + +### Build the vector stores + +```bash +# Journal index -- incremental update (default) +python build_store.py + +# Journal index -- full rebuild +python build_store.py --rebuild + +# Clippings index -- incremental update (default) +python clippings_search/build_clippings.py + +# Clippings index -- full rebuild +python clippings_search/build_clippings.py --rebuild +``` + +The default incremental mode loads the existing index, compares file sizes and modification dates, and only re-indexes what changed. A full rebuild is only needed when chunk parameters or the embedding model change. + +`build_clippings.py` handles PDFs, TXT, webarchive, and RTF files. PDFs are validated before indexing — those without extractable text are skipped and written to `ocr_needed.txt` for later OCR. + +### Search journals + +#### Semantic search with LLM synthesis + +**Requires Ollama running with `command-r7b`.** + +**Hybrid BM25 + vector** (`query_hybrid.py`): Retrieves top 20 by vector similarity and top 20 by BM25 term frequency, merges and deduplicates, re-ranks the union to top 15, synthesizes. Catches exact name/term matches that vector-only retrieval misses. +```bash +python query_hybrid.py "What does the author say about creativity?" +``` + +**Interactive wrapper** (`run_query.sh`): Loops for queries, displays timing, and appends queries to `query.log`. +```bash +./run_query.sh +``` + +#### Verbatim chunk retrieval (no LLM) + +Same hybrid retrieval and re-ranking pipeline but outputs raw chunk text. Each chunk is annotated with its source: `[vector-only]`, `[bm25-only]`, or `[vector+bm25]`. 
**No Ollama needed.** + +```bash +python retrieve.py "Kondiaronk and the Wendats" +``` + +#### Keyword search (no vector store, no LLM) + +Extracts nouns and adjectives from the query using NLTK POS tagging, then greps journal files for matches with surrounding context. +```bash +python search_keywords.py "Discussions of Kondiaronk and the Wendats" +``` + +### Search clippings + +Verbatim chunk retrieval from the clippings index. Same embedding model and cross-encoder re-ranking. Outputs a summary of source files and rankings, then full chunk text. Includes page numbers for PDF sources. **No Ollama needed.** + +```bash +python clippings_search/retrieve_clippings.py "creativity and innovation" +``` + +### Output format + +``` +Response: + + +Source documents: +2024-03-15.txt ./data/2024-03-15.txt 0.683 +2023-11-02.txt ./data/2023-11-02.txt 0.651 +... +``` + +## Configuration + +Key parameters (set in source files): + +| Parameter | Value | Location | +|-----------|-------|----------| +| Embedding model | `BAAI/bge-large-en-v1.5` | all build and query scripts | +| Chunk size | 256 tokens | `build_store.py`, `clippings_search/build_clippings.py` | +| Chunk overlap | 25 tokens | `build_store.py`, `clippings_search/build_clippings.py` | +| Paragraph separator | `\n\n` | `build_store.py` | +| Initial retrieval | 30 chunks | query and retrieve scripts | +| Re-rank model | `cross-encoder/ms-marco-MiniLM-L-12-v2` | query and retrieve scripts | +| Re-rank top-n | 15 | query and retrieve scripts | +| LLM | `command-r7b` (Ollama) or `gpt-4o-mini` (OpenAI API) | `query_hybrid.py` | +| Temperature | 0.3 | `query_hybrid.py` | +| Context window | 8000 tokens | `query_hybrid.py` | +| Request timeout | 360 seconds | `query_hybrid.py` | + +## Key dependencies + +- **llama-index-core** (0.14.14) -- RAG framework +- **llama-index-embeddings-huggingface** -- embedding integration +- **llama-index-vector-stores-chroma** -- ChromaDB vector store for clippings +- 
**llama-index-llms-ollama** -- local LLM via Ollama +- **llama-index-llms-openai** -- OpenAI API LLM (optional) +- **llama-index-retrievers-bm25** -- BM25 sparse retrieval for hybrid search +- **chromadb** -- persistent vector store for clippings index +- **sentence-transformers** -- cross-encoder re-ranking +- **torch** -- ML runtime + +## Design decisions + +- **BAAI/bge-large-en-v1.5 over all-mpnet-base-v2**: Better semantic matching quality for journal text despite slower embedding. +- **256-token chunks**: Tested 512 and 384; 256 with 25-token overlap produced the highest quality matches. +- **command-r7b over llama3.1:8B**: Sticks closer to provided context with less hallucination at comparable speed. +- **Cross-encoder re-ranking**: Retrieve top-30 via bi-encoder, re-rank to top-15 with a cross-encoder that scores each (query, chunk) pair jointly. Tested three models; `ms-marco-MiniLM-L-12-v2` selected over `stsb-roberta-base` (wrong task) and `BAAI/bge-reranker-v2-m3` (slower, weak score tail). +- **HyDE query rewriting tested and dropped**: Did not improve results over direct prompt engineering. +- **Hybrid BM25 + vector retrieval**: BM25 nominates candidates with exact term matches that embeddings miss; the cross-encoder decides final relevance. +- **ChromaDB for clippings**: Persistent SQLite-backed store. Chosen over the JSON store for its metadata filtering and direct chunk-level operations for incremental updates. +- **PDF validation before indexing**: Pre-check each PDF with pypdf — skip if text extraction yields <100 chars or low printable ratio. Skipped files written to `ocr_needed.txt`. + diff --git a/build_store.py b/build_store.py new file mode 100644 index 0000000..add3db3 --- /dev/null +++ b/build_store.py @@ -0,0 +1,193 @@ +# build_store.py +# +# Build or update the vector store from journal entries in ./data. +# +# Default mode (incremental): loads the existing index and adds only +# new or modified files. 
Use --rebuild for a full rebuild from scratch. +# +# January 2026 +# E. M. Furst +# Used Sonnet 4.5 to suggest changes; Opus 4.6 for incremental update + +from llama_index.core import ( + SimpleDirectoryReader, + StorageContext, + VectorStoreIndex, + load_index_from_storage, + Settings, +) +from pathlib import Path +from llama_index.embeddings.huggingface import HuggingFaceEmbedding +from llama_index.core.node_parser import SentenceSplitter +import argparse +import datetime +import os +import time + +# Shared constants +DATA_DIR = Path("./data") +PERSIST_DIR = "./store" +EMBED_MODEL_NAME = "BAAI/bge-large-en-v1.5" +CHUNK_SIZE = 256 +CHUNK_OVERLAP = 25 + + +def get_text_splitter(): + return SentenceSplitter( + chunk_size=CHUNK_SIZE, + chunk_overlap=CHUNK_OVERLAP, + paragraph_separator="\n\n", + ) + + +def rebuild(): + """Full rebuild: delete and recreate the vector store from scratch.""" + if not DATA_DIR.exists(): + raise FileNotFoundError(f"Data directory not found: {DATA_DIR.absolute()}") + + print(f"Loading documents from {DATA_DIR.absolute()}...") + documents = SimpleDirectoryReader(str(DATA_DIR)).load_data() + + if not documents: + raise ValueError("No documents found in data directory") + + print(f"Loaded {len(documents)} document(s)") + + print("Building vector index...") + index = VectorStoreIndex.from_documents( + documents, + transformations=[get_text_splitter()], + show_progress=True, + ) + + index.storage_context.persist(persist_dir=PERSIST_DIR) + print(f"Index built and saved to {PERSIST_DIR}") + + +def update(): + """Incremental update: add new files, re-index modified files, remove deleted files.""" + if not DATA_DIR.exists(): + raise FileNotFoundError(f"Data directory not found: {DATA_DIR.absolute()}") + + # Load existing index + print(f"Loading existing index from {PERSIST_DIR}...") + storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR) + index = load_index_from_storage(storage_context) + + # Set transformations so index.insert() 
chunks correctly + Settings.transformations = [get_text_splitter()] + + # Build lookup of indexed files: file_name -> (ref_doc_id, metadata) + all_ref_docs = index.docstore.get_all_ref_doc_info() + indexed = {} + for ref_id, info in all_ref_docs.items(): + fname = info.metadata.get("file_name") + if fname: + indexed[fname] = (ref_id, info.metadata) + + print(f"Index contains {len(indexed)} documents") + + # Scan current files on disk + disk_files = {f.name: f for f in sorted(DATA_DIR.glob("*.txt"))} + print(f"Data directory contains {len(disk_files)} files") + + # Classify files + new_files = [] + modified_files = [] + deleted_files = [] + unchanged = 0 + + for fname, fpath in disk_files.items(): + if fname not in indexed: + new_files.append(fpath) + else: + ref_id, meta = indexed[fname] + # Compare file size and modification date + stat = fpath.stat() + disk_size = stat.st_size + # Must use UTC to match SimpleDirectoryReader's date format + disk_mdate = datetime.datetime.fromtimestamp( + stat.st_mtime, tz=datetime.timezone.utc + ).strftime("%Y-%m-%d") + + stored_size = meta.get("file_size") + stored_mdate = meta.get("last_modified_date") + + if disk_size != stored_size or disk_mdate != stored_mdate: + modified_files.append((fpath, ref_id)) + else: + unchanged += 1 + + for fname, (ref_id, meta) in indexed.items(): + if fname not in disk_files: + deleted_files.append((fname, ref_id)) + + # Report + print(f"\n New: {len(new_files)}") + print(f" Modified: {len(modified_files)}") + print(f" Deleted: {len(deleted_files)}") + print(f" Unchanged: {unchanged}") + + if not new_files and not modified_files and not deleted_files: + print("\nNothing to do.") + return + + # Process deletions (including modified files that need re-indexing) + for fname, ref_id in deleted_files: + print(f" Removing {fname}") + index.delete_ref_doc(ref_id, delete_from_docstore=True) + + for fpath, ref_id in modified_files: + print(f" Re-indexing {fpath.name} (modified)") + 
index.delete_ref_doc(ref_id, delete_from_docstore=True) + + # Process additions (new files + modified files) + files_to_add = new_files + [fpath for fpath, _ in modified_files] + if files_to_add: + print(f"\nIndexing {len(files_to_add)} file(s)...") + # Use "./" prefix to match paths from full build (pathlib strips it) + docs = SimpleDirectoryReader( + input_files=[f"./{f}" for f in files_to_add] + ).load_data() + for doc in docs: + index.insert(doc) + + # Persist + index.storage_context.persist(persist_dir=PERSIST_DIR) + print(f"\nIndex updated and saved to {PERSIST_DIR}") + + +def main(): + parser = argparse.ArgumentParser( + description="Build or update the vector store from journal entries." + ) + parser.add_argument( + "--rebuild", + action="store_true", + help="Full rebuild from scratch (default: incremental update)", + ) + args = parser.parse_args() + + # Configure embedding model + embed_model = HuggingFaceEmbedding(model_name=EMBED_MODEL_NAME) + Settings.embed_model = embed_model + + start = time.time() + + if args.rebuild: + print("Mode: full rebuild") + rebuild() + else: + print("Mode: incremental update") + if not Path(PERSIST_DIR).exists(): + print(f"No existing index at {PERSIST_DIR}, doing full rebuild.") + rebuild() + else: + update() + + elapsed = time.time() - start + print(f"Done in {elapsed:.1f}s") + + +if __name__ == "__main__": + main() diff --git a/clippings_search/build_clippings.py b/clippings_search/build_clippings.py new file mode 100644 index 0000000..a12c869 --- /dev/null +++ b/clippings_search/build_clippings.py @@ -0,0 +1,471 @@ +# build_clippings.py +# +# Build or update the ChromaDB vector store from clippings in ./clippings. +# +# Default mode (incremental): loads the existing index and adds only +# new or modified files. Use --rebuild for a full rebuild from scratch. +# +# Handles PDFs, TXT, webarchive, and RTF files. Skips non-extractable PDFs +# and writes them to ocr_needed.txt for later OCR processing. 
+# +# February 2026 +# E. M. Furst + +# Environment vars must be set before importing huggingface/transformers +# libraries, because huggingface_hub.constants evaluates HF_HUB_OFFLINE +# at import time. +import os +os.environ["TOKENIZERS_PARALLELISM"] = "false" +os.environ["SENTENCE_TRANSFORMERS_HOME"] = "./models" +os.environ["HF_HUB_OFFLINE"] = "1" + +import chromadb +from llama_index.core import ( + SimpleDirectoryReader, + StorageContext, + VectorStoreIndex, + Settings, + Document, +) +from llama_index.vector_stores.chroma import ChromaVectorStore +from llama_index.embeddings.huggingface import HuggingFaceEmbedding +from llama_index.core.node_parser import SentenceSplitter +from pathlib import Path +import argparse +import datetime +import time + +# Shared constants +DATA_DIR = Path("./clippings") +PERSIST_DIR = "./clippings_search/store_clippings" +COLLECTION_NAME = "clippings" +EMBED_MODEL_NAME = "BAAI/bge-large-en-v1.5" +CHUNK_SIZE = 256 +CHUNK_OVERLAP = 25 + +# File types handled by SimpleDirectoryReader (PDF + TXT) +READER_EXTS = {".pdf", ".txt"} +# File types handled by custom loaders +CUSTOM_EXTS = {".webarchive", ".rtf"} +# All supported extensions +SUPPORTED_EXTS = READER_EXTS | CUSTOM_EXTS + +# Minimum extracted text length to consider a PDF valid (characters) +MIN_TEXT_LENGTH = 100 + + +def get_text_splitter(): + return SentenceSplitter( + chunk_size=CHUNK_SIZE, + chunk_overlap=CHUNK_OVERLAP, + paragraph_separator="\n\n", + ) + + +def validate_pdf(file_path): + """Check if a PDF has extractable text. + + Returns (is_valid, reason) where reason explains why it was skipped. 
+ """ + import pypdf + try: + reader = pypdf.PdfReader(str(file_path)) + page_count = len(reader.pages) + total_chars = 0 + printable_chars = 0 + for page in reader.pages: + text = page.extract_text() or "" + total_chars += len(text) + printable_chars += sum( + 1 for c in text if c.isprintable() or c in "\n\r\t" + ) + + if total_chars < MIN_TEXT_LENGTH: + return False, f"too little text ({total_chars} chars, {page_count} pages)" + + ratio = printable_chars / total_chars if total_chars > 0 else 0 + if ratio < 0.5: + return False, f"low printable ratio ({ratio:.2f}, {page_count} pages)" + + return True, None + except Exception as e: + return False, str(e) + + +def load_webarchive(file_path): + """Extract text from a macOS .webarchive file. + + Returns a LlamaIndex Document, or None if extraction fails. + """ + import plistlib + from bs4 import BeautifulSoup + + try: + with open(file_path, "rb") as f: + plist = plistlib.load(f) + + resource = plist.get("WebMainResource", {}) + html_bytes = resource.get("WebResourceData", b"") + if not html_bytes: + return None + + html = html_bytes.decode("utf-8", errors="replace") + soup = BeautifulSoup(html, "html.parser") + text = soup.get_text(separator="\n", strip=True) + + if len(text) < MIN_TEXT_LENGTH: + return None + + stat = file_path.stat() + mdate = datetime.datetime.fromtimestamp( + stat.st_mtime, tz=datetime.timezone.utc + ).strftime("%Y-%m-%d") + + return Document( + text=text, + metadata={ + "file_name": file_path.name, + "file_path": str(file_path), + "file_size": stat.st_size, + "last_modified_date": mdate, + "file_type": "webarchive", + }, + ) + except Exception as e: + print(f" Warning: could not read webarchive {file_path.name}: {e}") + return None + + +def load_rtf(file_path): + """Extract text from an RTF file. + + Returns a LlamaIndex Document, or None if extraction fails. 
+ """ + from striprtf.striprtf import rtf_to_text + + try: + with open(file_path, "r", errors="replace") as f: + rtf_content = f.read() + + text = rtf_to_text(rtf_content) + + if len(text) < MIN_TEXT_LENGTH: + return None + + stat = file_path.stat() + mdate = datetime.datetime.fromtimestamp( + stat.st_mtime, tz=datetime.timezone.utc + ).strftime("%Y-%m-%d") + + return Document( + text=text, + metadata={ + "file_name": file_path.name, + "file_path": str(file_path), + "file_size": stat.st_size, + "last_modified_date": mdate, + "file_type": "rtf", + }, + ) + except Exception as e: + print(f" Warning: could not read RTF {file_path.name}: {e}") + return None + + +def scan_clippings(): + """Scan the clippings directory and classify files. + + Returns (reader_files, custom_docs, skipped, ocr_needed) where: + - reader_files: list of Paths for SimpleDirectoryReader (PDF + TXT) + - custom_docs: list of Document objects from custom loaders + - skipped: list of (Path, reason) tuples + - ocr_needed: list of Paths for PDFs that need OCR + """ + reader_files = [] + custom_docs = [] + skipped = [] + ocr_needed = [] + + for fpath in sorted(DATA_DIR.rglob("*")): + if not fpath.is_file(): + continue + if fpath.name.startswith("."): + continue + + ext = fpath.suffix.lower() + + if ext not in SUPPORTED_EXTS: + skipped.append((fpath, f"unsupported type: {ext}")) + continue + + if ext == ".pdf": + is_valid, reason = validate_pdf(fpath) + if not is_valid: + skipped.append((fpath, f"no extractable text: {reason}")) + ocr_needed.append(fpath) + continue + reader_files.append(fpath) + + elif ext == ".txt": + reader_files.append(fpath) + + elif ext == ".webarchive": + doc = load_webarchive(fpath) + if doc: + custom_docs.append(doc) + else: + skipped.append((fpath, "no extractable text from webarchive")) + + elif ext == ".rtf": + doc = load_rtf(fpath) + if doc: + custom_docs.append(doc) + else: + skipped.append((fpath, "no extractable text from RTF")) + + return reader_files, custom_docs, 
skipped, ocr_needed + + +def write_ocr_list(ocr_needed): + """Write the list of PDFs needing OCR to ocr_needed.txt.""" + with open("ocr_needed.txt", "w") as f: + for fpath in ocr_needed: + f.write(f"{fpath}\n") + print(f"Wrote {len(ocr_needed)} file(s) to ocr_needed.txt") + + +def load_all_documents(reader_files, custom_docs): + """Load documents from SimpleDirectoryReader and merge with custom docs.""" + documents = [] + + if reader_files: + print(f"Loading {len(reader_files)} PDF/TXT files...") + reader_docs = SimpleDirectoryReader( + input_files=[str(f) for f in reader_files], + filename_as_id=True, + ).load_data() + documents.extend(reader_docs) + + if custom_docs: + print(f"Adding {len(custom_docs)} webarchive/RTF documents...") + documents.extend(custom_docs) + + return documents + + +def rebuild(reader_files, custom_docs): + """Full rebuild: delete existing collection and recreate from scratch.""" + client = chromadb.PersistentClient(path=PERSIST_DIR) + # Delete existing collection if present + try: + client.delete_collection(COLLECTION_NAME) + print(f"Deleted existing collection '{COLLECTION_NAME}'") + except Exception: + pass + + collection = client.get_or_create_collection(COLLECTION_NAME) + vector_store = ChromaVectorStore(chroma_collection=collection) + storage_context = StorageContext.from_defaults(vector_store=vector_store) + + documents = load_all_documents(reader_files, custom_docs) + if not documents: + raise ValueError("No documents loaded") + + print(f"Loaded {len(documents)} document(s) total") + print("Building vector index...") + + index = VectorStoreIndex.from_documents( + documents, + storage_context=storage_context, + transformations=[get_text_splitter()], + show_progress=True, + ) + + print(f"Index built. 
Collection has {collection.count()} vectors.") + return index + + +def update(reader_files, custom_docs): + """Incremental update: add new, re-index modified, remove deleted files.""" + client = chromadb.PersistentClient(path=PERSIST_DIR) + collection = client.get_collection(COLLECTION_NAME) + count = collection.count() + print(f"Existing collection has {count} vectors") + + # Get all stored metadata to find what's indexed + # Key on file_path (not file_name) to handle duplicate names across subdirs + indexed = {} # file_path -> {"ids": [], "file_size": ..., "last_modified_date": ...} + if count > 0: + results = collection.get(include=["metadatas"]) + for i, meta in enumerate(results["metadatas"]): + fpath = meta.get("file_path", "") + if fpath not in indexed: + indexed[fpath] = { + "ids": [], + "file_size": meta.get("file_size"), + "last_modified_date": meta.get("last_modified_date"), + } + indexed[fpath]["ids"].append(results["ids"][i]) + + print(f"Index contains {len(indexed)} unique files") + + # Build disk file lookup: file_path_str -> Path + # For reader_files, match the path format SimpleDirectoryReader would store + disk_files = {} + for f in reader_files: + disk_files[str(f)] = f + for doc in custom_docs: + disk_files[doc.metadata["file_path"]] = Path(doc.metadata["file_path"]) + + # Classify files + new_reader = [] + new_custom = [] + modified_reader = [] + modified_custom = [] + deleted_paths = [] + unchanged = 0 + + for path_str, fpath in disk_files.items(): + if path_str not in indexed: + # Check if it's a custom doc + if fpath.suffix.lower() in CUSTOM_EXTS: + matching = [d for d in custom_docs if d.metadata["file_path"] == path_str] + if matching: + new_custom.extend(matching) + else: + new_reader.append(fpath) + else: + info = indexed[path_str] + stat = fpath.stat() + disk_mdate = datetime.datetime.fromtimestamp( + stat.st_mtime, tz=datetime.timezone.utc + ).strftime("%Y-%m-%d") + + if stat.st_size != info["file_size"] or disk_mdate != 
info["last_modified_date"]: + if fpath.suffix.lower() in CUSTOM_EXTS: + matching = [d for d in custom_docs if d.metadata["file_path"] == path_str] + if matching: + modified_custom.extend(matching) + else: + modified_reader.append(fpath) + else: + unchanged += 1 + + for path_str in indexed: + if path_str not in disk_files: + deleted_paths.append(path_str) + + n_new = len(new_reader) + len(new_custom) + n_modified = len(modified_reader) + len(modified_custom) + print(f"\n New: {n_new}") + print(f" Modified: {n_modified}") + print(f" Deleted: {len(deleted_paths)}") + print(f" Unchanged: {unchanged}") + + if n_new == 0 and n_modified == 0 and len(deleted_paths) == 0: + print("\nNothing to do.") + return + + # Delete chunks for removed and modified files + for path_str in deleted_paths: + ids = indexed[path_str]["ids"] + fname = Path(path_str).name + print(f" Removing {fname} ({len(ids)} chunks)") + collection.delete(ids=ids) + + for fpath in modified_reader: + path_str = str(fpath) + ids = indexed[path_str]["ids"] + print(f" Re-indexing {fpath.name} ({len(ids)} chunks)") + collection.delete(ids=ids) + + for doc in modified_custom: + path_str = doc.metadata["file_path"] + if path_str in indexed: + ids = indexed[path_str]["ids"] + print(f" Re-indexing {doc.metadata['file_name']} ({len(ids)} chunks)") + collection.delete(ids=ids) + + # Add new and modified files + files_to_add = new_reader + modified_reader + docs_to_add = new_custom + modified_custom + + if files_to_add or docs_to_add: + documents = load_all_documents(files_to_add, docs_to_add) + if documents: + print(f"Indexing {len(documents)} document(s)...") + vector_store = ChromaVectorStore(chroma_collection=collection) + storage_context = StorageContext.from_defaults(vector_store=vector_store) + + VectorStoreIndex.from_documents( + documents, + storage_context=storage_context, + transformations=[get_text_splitter()], + show_progress=True, + ) + + print(f"\nIndex updated. 
Collection now has {collection.count()} vectors.") + + +def main(): + parser = argparse.ArgumentParser( + description="Build or update the clippings vector store (ChromaDB)." + ) + parser.add_argument( + "--rebuild", + action="store_true", + help="Full rebuild from scratch (default: incremental update)", + ) + args = parser.parse_args() + + # Configure embedding model (offline, cached in ./models) + embed_model = HuggingFaceEmbedding( + model_name=EMBED_MODEL_NAME, + cache_folder="./models", + local_files_only=True, + ) + Settings.embed_model = embed_model + + if not DATA_DIR.exists(): + raise FileNotFoundError( + f"Clippings directory not found: {DATA_DIR.absolute()}\n" + f"Create symlink: ln -s ../clippings ./clippings" + ) + + start = time.time() + + # Scan and classify files + print(f"Scanning {DATA_DIR.absolute()}...") + reader_files, custom_docs, skipped, ocr_needed = scan_clippings() + + n_valid = len(reader_files) + len(custom_docs) + print(f"\nFiles to index: {n_valid}") + print(f" PDF/TXT: {len(reader_files)}") + print(f" Webarchive/RTF: {len(custom_docs)}") + print(f"Files skipped: {len(skipped)}") + for fpath, reason in skipped: + print(f" SKIP: {fpath.name} -- {reason}") + + if ocr_needed: + write_ocr_list(ocr_needed) + + if n_valid == 0: + raise ValueError("No valid files found to index") + + if args.rebuild: + print("\nMode: full rebuild") + rebuild(reader_files, custom_docs) + else: + print("\nMode: incremental update") + if not Path(PERSIST_DIR).exists(): + print(f"No existing index at {PERSIST_DIR}, doing full rebuild.") + rebuild(reader_files, custom_docs) + else: + update(reader_files, custom_docs) + + elapsed = time.time() - start + print(f"Done in {elapsed:.1f}s") + + +if __name__ == "__main__": + main() diff --git a/clippings_search/retrieve_clippings.py b/clippings_search/retrieve_clippings.py new file mode 100644 index 0000000..a96995c --- /dev/null +++ b/clippings_search/retrieve_clippings.py @@ -0,0 +1,138 @@ +# retrieve_clippings.py +# 
# Verbatim chunk retrieval from clippings index (ChromaDB).
# Vector search + cross-encoder re-ranking, no LLM.
#
# Returns the top re-ranked chunks with their full text, file metadata, and
# scores. Includes page numbers for PDF sources when available.
#
# E.M.F. February 2026

# Environment vars must be set before importing huggingface/transformers
# libraries, because huggingface_hub.constants evaluates HF_HUB_OFFLINE
# at import time.
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["SENTENCE_TRANSFORMERS_HOME"] = "./models"
os.environ["HF_HUB_OFFLINE"] = "1"

import chromadb
from llama_index.core import VectorStoreIndex, Settings
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.postprocessor import SentenceTransformerRerank
import sys
import textwrap

#
# Globals
#

PERSIST_DIR = "./clippings_search/store_clippings"
COLLECTION_NAME = "clippings"

# Embedding model (must match build_clippings.py)
EMBED_MODEL = HuggingFaceEmbedding(
    cache_folder="./models",
    model_name="BAAI/bge-large-en-v1.5",
    local_files_only=True,
)

# Cross-encoder model for re-ranking (cached in ./models/)
RERANK_MODEL = "cross-encoder/ms-marco-MiniLM-L-12-v2"
RERANK_TOP_N = 15
RETRIEVE_TOP_K = 30

# Output formatting
WRAP_WIDTH = 80


def main():
    """Retrieve, re-rank, and print verbatim chunks for a CLI query.

    Usage: python retrieve_clippings.py QUERY_TEXT

    Prints a summary (unique source files, per-chunk rankings) followed by
    the full text of each re-ranked chunk, wrapped to WRAP_WIDTH.
    """
    # Fail fast on missing arguments, BEFORE loading the collection and
    # building the index (previously this check came after the heavy setup).
    if len(sys.argv) < 2:
        print("Usage: python retrieve_clippings.py QUERY_TEXT")
        sys.exit(1)
    q = " ".join(sys.argv[1:])

    # No LLM needed -- set embed model only
    Settings.embed_model = EMBED_MODEL

    # Load ChromaDB collection
    client = chromadb.PersistentClient(path=PERSIST_DIR)
    collection = client.get_collection(COLLECTION_NAME)

    # Build index from existing vector store
    vector_store = ChromaVectorStore(chroma_collection=collection)
    index = VectorStoreIndex.from_vector_store(vector_store)

    # Build retriever (vector search only, no query engine / LLM)
    retriever = index.as_retriever(similarity_top_k=RETRIEVE_TOP_K)

    # Cross-encoder re-ranker
    reranker = SentenceTransformerRerank(
        model=RERANK_MODEL,
        top_n=RERANK_TOP_N,
    )

    # Retrieve and re-rank
    nodes = retriever.retrieve(q)
    reranked = reranker.postprocess_nodes(nodes, query_str=q)

    # Build result list with metadata
    results = []
    for i, node in enumerate(reranked, 1):
        meta = getattr(node, "metadata", None) or node.node.metadata
        score = getattr(node, "score", None)
        file_name = meta.get("file_name", "unknown")
        page_label = meta.get("page_label", "")
        results.append((i, node, file_name, page_label, score))

    # --- Summary: source files and rankings ---
    print(f"\nQuery: {q}")
    print(f"Retrieved {len(nodes)} chunks, re-ranked to top {len(reranked)}")
    print(f"({collection.count()} total vectors in collection)\n")

    # Unique source files in rank order
    seen = set()
    unique_sources = []
    for i, node, file_name, page_label, score in results:
        if file_name not in seen:
            seen.add(file_name)
            unique_sources.append(file_name)

    print(f"Source files ({len(unique_sources)} unique):")
    for j, fname in enumerate(unique_sources, 1):
        print(f"  {j}. {fname}")

    print("\nRankings:")  # plain string: no placeholders, no f-string needed
    for i, node, file_name, page_label, score in results:
        # Guard: a node's score can be None, which would crash the
        # {score:+7.3f} format spec.
        score_str = f"{score:+7.3f}" if score is not None else "    n/a"
        line = f"  [{i:2d}] {score_str}  {file_name}"
        if page_label:
            line += f" (p. {page_label})"
        print(line)

    # --- Full chunk text ---
    print(f"\n{'=' * WRAP_WIDTH}")
    print("CHUNKS")
    print("=" * WRAP_WIDTH)

    for i, node, file_name, page_label, score in results:
        header = f"=== [{i}] {file_name}"
        if page_label:
            header += f" (p. {page_label})"
        # Same None guard as above for the per-chunk header.
        header += f" (score: {score:.3f})" if score is not None else " (score: n/a)"

        print("\n" + "=" * WRAP_WIDTH)
        print(header)
        print("=" * WRAP_WIDTH)

        text = node.get_content()
        for line in text.splitlines():
            if line.strip():
                print(textwrap.fill(line, width=WRAP_WIDTH))
            else:
                print()
        print()


if __name__ == "__main__":
    main()
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["SENTENCE_TRANSFORMERS_HOME"] = "./models"
os.environ["HF_HUB_OFFLINE"] = "1"

from llama_index.core import (
    StorageContext,
    load_index_from_storage,
    Settings,
    get_response_synthesizer,
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama
from llama_index.core.prompts import PromptTemplate
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.retrievers.bm25 import BM25Retriever
import sys

#
# Globals
#

# Embedding model (must match build_store.py)
EMBED_MODEL = HuggingFaceEmbedding(
    cache_folder="./models",
    model_name="BAAI/bge-large-en-v1.5",
    local_files_only=True,
)

# LLM model for generation
LLM_MODEL = "command-r7b"

# Cross-encoder model for re-ranking (cached in ./models/)
RERANK_MODEL = "cross-encoder/ms-marco-MiniLM-L-12-v2"
RERANK_TOP_N = 15

# Retrieval parameters
VECTOR_TOP_K = 20  # candidates from vector similarity
BM25_TOP_K = 20    # candidates from BM25 term matching

#
# Custom prompt -- same as v3
#
# NOTE(review): the example in item 2 previously read '(e.g., "In , ...")' --
# the placeholder was evidently lost; reconstructed as "<file_name>".
PROMPT = PromptTemplate(
"""You are a precise research assistant analyzing excerpts from a personal journal collection.
Every excerpt below has been selected and ranked for relevance to the query.

CONTEXT (ranked by relevance):
{context_str}

QUERY:
{query_str}

Instructions:
- Answer ONLY using information explicitly present in the CONTEXT above
- Examine ALL provided excerpts, not just the top few -- each one was selected for relevance
- Be specific: quote or closely paraphrase key passages and cite their file names
- When multiple files touch on the query, note what each one contributes
- If the context doesn't contain enough information to answer fully, say so

Your response should:
1. Directly answer the query, drawing on as many relevant excerpts as possible
2. Reference specific files and their content (e.g., "In <file_name>, ...")
3. End with a list of all files that contributed to your answer, with a brief note on each

If the context is insufficient, explain what's missing."""
)


def main():
    """Run a hybrid (vector + BM25) query and print an LLM-synthesized answer.

    Usage: python query_hybrid.py QUERY_TEXT
    """
    # Fail fast on missing arguments, BEFORE configuring models and loading
    # the index. Usage message fixed: it previously named an old script
    # (query_hybrid_bm25_v4.py) that no longer exists.
    if len(sys.argv) < 2:
        print("Usage: python query_hybrid.py QUERY_TEXT")
        sys.exit(1)
    q = " ".join(sys.argv[1:])

    # Configure LLM and embedding model
    # for local model using ollama
    # Note: Ollama temperature defaults to 0.8
    Settings.llm = Ollama(
        model=LLM_MODEL,
        temperature=0.3,
        request_timeout=360.0,
        context_window=8000,
    )

    # Use OpenAI API:
    # from llama_index.llms.openai import OpenAI
    # Settings.llm = OpenAI(
    #     model="gpt-4o-mini",  # or "gpt-4o" for higher quality
    #     temperature=0.3,
    # )

    Settings.embed_model = EMBED_MODEL

    # Load persisted vector store
    storage_context = StorageContext.from_defaults(persist_dir="./store")
    index = load_index_from_storage(storage_context)

    # --- Retrievers ---

    # Vector retriever (dense: cosine similarity over embeddings)
    vector_retriever = index.as_retriever(similarity_top_k=VECTOR_TOP_K)

    # BM25 retriever (sparse: term frequency scoring)
    bm25_retriever = BM25Retriever.from_defaults(
        index=index,
        similarity_top_k=BM25_TOP_K,
    )

    # Cross-encoder re-ranker
    reranker = SentenceTransformerRerank(
        model=RERANK_MODEL,
        top_n=RERANK_TOP_N,
    )

    # Retrieve from both sources
    vector_nodes = vector_retriever.retrieve(q)
    bm25_nodes = bm25_retriever.retrieve(q)

    # Build the ID sets ONCE. The previous version rebuilt a set inside a
    # list comprehension for every node (quadratic) and then never used the
    # vector-only/bm25-only counts at all.
    vector_ids = {n.node.node_id for n in vector_nodes}
    bm25_ids = {n.node.node_id for n in bm25_nodes}

    # Merge and deduplicate by node ID (vector candidates first)
    seen_ids = set()
    merged = []
    for node in vector_nodes + bm25_nodes:
        node_id = node.node.node_id
        if node_id not in seen_ids:
            seen_ids.add(node_id)
            merged.append(node)

    # Re-rank the merged candidates with cross-encoder
    reranked = reranker.postprocess_nodes(merged, query_str=q)

    # Report retrieval stats
    n_both = len(vector_ids & bm25_ids)

    print(f"\nQuery: {q}")
    print(f"Vector: {len(vector_nodes)}, BM25: {len(bm25_nodes)}, "
          f"overlap: {n_both}, merged: {len(merged)}, re-ranked to: {len(reranked)}")

    # Synthesize response with LLM
    synthesizer = get_response_synthesizer(text_qa_template=PROMPT)
    response = synthesizer.synthesize(q, nodes=reranked)

    # Output
    print("\nResponse:\n")
    print(response.response)

    print("\nSource documents:")
    for node in response.source_nodes:
        meta = getattr(node, "metadata", None) or node.node.metadata
        score = getattr(node, "score", None)
        # Guard: score can be None, which would crash the {score:.3f} spec
        score_str = f"{score:.3f}" if score is not None else "n/a"
        print(f"{meta.get('file_name')} {meta.get('file_path')} {score_str}")


if __name__ == "__main__":
    main()
+huggingface-hub==0.34.4 +idna==3.10 +importlib_metadata==8.7.1 +importlib_resources==6.5.2 +ipykernel==6.30.1 +ipython==9.4.0 +ipython_pygments_lexers==1.1.1 +ipywidgets==8.1.7 +isoduration==20.11.0 +jedi==0.19.2 +Jinja2==3.1.6 +jiter==0.13.0 +joblib==1.5.1 +json5==0.12.1 +jsonpointer==3.0.0 +jsonschema==4.25.0 +jsonschema-specifications==2025.4.1 +jupyter==1.1.1 +jupyter-console==6.6.3 +jupyter-events==0.12.0 +jupyter-lsp==2.2.6 +jupyter_client==8.6.3 +jupyter_core==5.8.1 +jupyter_server==2.16.0 +jupyter_server_terminals==0.5.3 +jupyterlab==4.4.5 +jupyterlab_pygments==0.3.0 +jupyterlab_server==2.27.3 +jupyterlab_widgets==3.0.15 +kiwisolver==1.4.9 +kubernetes==35.0.0 +lark==1.2.2 +llama-index-core==0.14.14 +llama-index-embeddings-huggingface==0.6.1 +llama-index-instrumentation==0.4.0 +llama-index-llms-ollama==0.9.1 +llama-index-llms-openai==0.6.18 +llama-index-readers-file==0.5.6 +llama-index-retrievers-bm25==0.6.5 +llama-index-vector-stores-chroma==0.5.5 +llama-index-workflows==2.14.2 +markdown-it-py==4.0.0 +MarkupSafe==3.0.2 +marshmallow==3.26.1 +matplotlib==3.10.5 +matplotlib-inline==0.1.7 +mdurl==0.1.2 +mistune==3.1.3 +mmh3==5.2.0 +mpmath==1.3.0 +multidict==6.6.3 +mypy_extensions==1.1.0 +nbclient==0.10.2 +nbconvert==7.16.6 +nbformat==5.10.4 +nest-asyncio==1.6.0 +networkx==3.5 +nltk==3.9.1 +notebook==7.4.5 +notebook_shim==0.2.4 +numpy==2.3.2 +oauthlib==3.3.1 +ollama==0.5.3 +onnxruntime==1.24.2 +openai==2.21.0 +opentelemetry-api==1.39.1 +opentelemetry-exporter-otlp-proto-common==1.39.1 +opentelemetry-exporter-otlp-proto-grpc==1.39.1 +opentelemetry-proto==1.39.1 +opentelemetry-sdk==1.39.1 +opentelemetry-semantic-conventions==0.60b1 +orjson==3.11.7 +overrides==7.7.0 +packaging==25.0 +pandas==2.2.3 +pandocfilters==1.5.1 +parso==0.8.4 +pexpect==4.9.0 +pillow==11.3.0 +platformdirs==4.3.8 +posthog==5.4.0 +prometheus_client==0.22.1 +prompt_toolkit==3.0.51 +propcache==0.3.2 +protobuf==6.33.5 +psutil==7.0.0 +ptyprocess==0.7.0 +pure_eval==0.2.3 +pybase64==1.4.3 
+pycparser==2.22 +pydantic==2.11.7 +pydantic_core==2.33.2 +Pygments==2.19.2 +pyparsing==3.2.3 +pypdf==6.7.1 +PyPika==0.51.1 +pyproject_hooks==1.2.0 +PyStemmer==2.2.0.3 +python-dateutil==2.9.0.post0 +python-dotenv==1.2.1 +python-json-logger==3.3.0 +pytz==2025.2 +PyYAML==6.0.2 +pyzmq==27.0.1 +referencing==0.36.2 +regex==2025.7.34 +requests==2.32.4 +requests-oauthlib==2.0.0 +rfc3339-validator==0.1.4 +rfc3986-validator==0.1.1 +rfc3987-syntax==1.1.0 +rich==14.3.3 +rpds-py==0.27.0 +safetensors==0.6.2 +scikit-learn==1.7.1 +scipy==1.16.1 +seaborn==0.13.2 +Send2Trash==1.8.3 +sentence-transformers==5.1.0 +setuptools==80.9.0 +shellingham==1.5.4 +six==1.17.0 +sniffio==1.3.1 +soupsieve==2.7 +SQLAlchemy==2.0.42 +stack-data==0.6.3 +striprtf==0.0.26 +sympy==1.14.0 +tenacity==9.1.2 +terminado==0.18.1 +threadpoolctl==3.6.0 +tiktoken==0.11.0 +tinycss2==1.4.0 +tokenizers==0.21.4 +torch==2.8.0 +tornado==6.5.2 +tqdm==4.67.1 +traitlets==5.14.3 +transformers==4.55.0 +typer==0.24.1 +types-python-dateutil==2.9.0.20250809 +typing-inspect==0.9.0 +typing-inspection==0.4.1 +typing_extensions==4.14.1 +tzdata==2025.2 +uri-template==1.3.0 +urllib3==2.5.0 +uvicorn==0.41.0 +uvloop==0.22.1 +watchfiles==1.1.1 +wcwidth==0.2.13 +webcolors==24.11.1 +webencodings==0.5.1 +websocket-client==1.8.0 +websockets==16.0 +widgetsnbextension==4.0.14 +wrapt==1.17.2 +yarl==1.20.1 +zipp==3.23.0 diff --git a/retrieve.py b/retrieve.py new file mode 100644 index 0000000..28f92e1 --- /dev/null +++ b/retrieve.py @@ -0,0 +1,140 @@ +# retrieve.py +# Hybrid verbatim chunk retrieval: BM25 + vector search + cross-encoder, no LLM. +# +# Same hybrid retrieval as query_hybrid.py but outputs raw chunk text +# instead of LLM synthesis. Useful for inspecting what the hybrid pipeline +# retrieves. +# +# Each chunk is annotated with its source (vector, BM25, or both) so you can +# see which retriever nominated it. +# +# E.M.F. 
# February 2026

# Environment vars must be set before importing huggingface/transformers
# libraries, because huggingface_hub.constants evaluates HF_HUB_OFFLINE
# at import time.
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["SENTENCE_TRANSFORMERS_HOME"] = "./models"
os.environ["HF_HUB_OFFLINE"] = "1"

from llama_index.core import (
    StorageContext,
    load_index_from_storage,
    Settings,
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.retrievers.bm25 import BM25Retriever
import sys
import textwrap

#
# Globals
#

# Embedding model (must match build_store.py)
EMBED_MODEL = HuggingFaceEmbedding(
    cache_folder="./models",
    model_name="BAAI/bge-large-en-v1.5",
    local_files_only=True,
)

# Cross-encoder model for re-ranking (cached in ./models/)
RERANK_MODEL = "cross-encoder/ms-marco-MiniLM-L-12-v2"
RERANK_TOP_N = 15

# Retrieval parameters
VECTOR_TOP_K = 20
BM25_TOP_K = 20

# Output formatting
WRAP_WIDTH = 80


def main():
    """Hybrid retrieval (vector + BM25 + cross-encoder), printing raw chunks.

    Usage: python retrieve.py QUERY_TEXT

    Each re-ranked chunk is annotated with the retriever(s) that nominated
    it (vector, BM25, or both).
    """
    # Fail fast on missing arguments, BEFORE loading models and the index.
    # Usage message fixed: it previously named an old script
    # (retrieve_hybrid_raw.py) instead of this file.
    if len(sys.argv) < 2:
        print("Usage: python retrieve.py QUERY_TEXT")
        sys.exit(1)
    q = " ".join(sys.argv[1:])

    # No LLM needed -- set embed model only
    Settings.embed_model = EMBED_MODEL

    # Load persisted vector store
    storage_context = StorageContext.from_defaults(persist_dir="./store")
    index = load_index_from_storage(storage_context)

    # --- Retrievers ---

    vector_retriever = index.as_retriever(similarity_top_k=VECTOR_TOP_K)

    bm25_retriever = BM25Retriever.from_defaults(
        index=index,
        similarity_top_k=BM25_TOP_K,
    )

    # Cross-encoder re-ranker
    reranker = SentenceTransformerRerank(
        model=RERANK_MODEL,
        top_n=RERANK_TOP_N,
    )

    # Retrieve from both sources
    vector_nodes = vector_retriever.retrieve(q)
    bm25_nodes = bm25_retriever.retrieve(q)

    # Track which retriever found each node
    vector_ids = {n.node.node_id for n in vector_nodes}
    bm25_ids = {n.node.node_id for n in bm25_nodes}

    # Merge and deduplicate by node ID (vector candidates first)
    seen_ids = set()
    merged = []
    for node in vector_nodes + bm25_nodes:
        node_id = node.node.node_id
        if node_id not in seen_ids:
            seen_ids.add(node_id)
            merged.append(node)

    # Re-rank merged candidates
    reranked = reranker.postprocess_nodes(merged, query_str=q)

    # Retrieval stats
    n_both = len(vector_ids & bm25_ids)
    n_vector_only = len(vector_ids - bm25_ids)
    n_bm25_only = len(bm25_ids - vector_ids)

    print(f"\nQuery: {q}")
    print(f"Vector: {len(vector_nodes)}, BM25: {len(bm25_nodes)}, "
          f"overlap: {n_both}, merged: {len(merged)}, re-ranked to: {len(reranked)}")
    print(f"  vector-only: {n_vector_only}, bm25-only: {n_bm25_only}, both: {n_both}\n")

    # Output re-ranked chunks with source annotation
    for i, node in enumerate(reranked, 1):
        meta = getattr(node, "metadata", None) or node.node.metadata
        score = getattr(node, "score", None)
        file_name = meta.get("file_name", "unknown")
        text = node.get_content()
        node_id = node.node.node_id

        # Annotate which retriever(s) nominated this chunk
        in_vector = node_id in vector_ids
        in_bm25 = node_id in bm25_ids
        if in_vector and in_bm25:
            source = "vector+bm25"
        elif in_bm25:
            source = "bm25-only"
        else:
            source = "vector-only"

        # Guard: score can be None, which would crash the {score:.3f} spec
        score_str = f"{score:.3f}" if score is not None else "n/a"

        print("=" * WRAP_WIDTH)
        print(f"=== [{i}] {file_name} (score: {score_str}) [{source}]")
        print("=" * WRAP_WIDTH)
        for line in text.splitlines():
            if line.strip():
                print(textwrap.fill(line, width=WRAP_WIDTH))
            else:
                print()
        print()


if __name__ == "__main__":
    main()
# August 2025

# Usage: ./run_query.sh

QUERY_SCRIPT="query_hybrid.py"

echo -e "Current query engine is $QUERY_SCRIPT\n"

# Loop until input is "exit"
while true; do
    # -r keeps backslashes in the query literal instead of as escapes
    read -r -p "Enter your query (or type 'exit' to quit): " query
    if [ "$query" == "exit" ] || [ "$query" == "quit" ] || [ "$query" == "" ] ; then
        echo "Exiting..."
        break
    fi
    time_start=$(date +%s)

    # Call the python script with the query and format the output.
    # BUG FIX: the query scripts join sys.argv[1:] verbatim, so the previous
    # `--query "$query"` invocation made the literal token "--query" part of
    # the search text. Pass the query alone.
    python3 "$QUERY_SCRIPT" "$query" | \
    expand | sed -E 's|(.* )(.*/data)|\1./data|' | fold -s -w 131

    time_end=$(date +%s)
    elapsed=$((time_end - time_start))
    echo -e "Query processed in $elapsed seconds.\n"
    # Quote so the query is logged exactly (no word splitting / globbing)
    echo "$query" >> query.log
done
# February 2026

import os
import sys
import re
from pathlib import Path

import nltk

#
# Globals
#
DATA_DIR = Path("./data")
CONTEXT_LINES = 2          # lines of context around each match
MAX_MATCHES_PER_FILE = 3   # cap matches shown per file to avoid flooding

# POS tags to keep: nouns, proper nouns, adjectives
KEEP_TAGS = {"NN", "NNS", "NNP", "NNPS", "JJ", "JJS", "JJR"}

# Proper noun tags (consecutive runs are joined as phrases)
PROPER_NOUN_TAGS = {"NNP", "NNPS"}

# Minimum word length to keep (filters out short noise)
MIN_WORD_LEN = 3


def ensure_nltk_data():
    """Download NLTK data if not already present.

    NOTE(review): this can hit the network, which is at odds with the
    offline (HF_HUB_OFFLINE) posture of the sibling scripts -- confirm
    this is intended for first-run convenience.
    """
    for resource, name in [
        ("tokenizers/punkt_tab", "punkt_tab"),
        ("taggers/averaged_perceptron_tagger_eng", "averaged_perceptron_tagger_eng"),
    ]:
        try:
            nltk.data.find(resource)
        except LookupError:
            print(f"Downloading NLTK resource: {name}")
            nltk.download(name, quiet=True)


def extract_terms(query):
    """Extract key terms from a query using POS tagging.

    Tokenizes the query, runs POS tagging, and keeps nouns, proper nouns,
    and adjectives. Consecutive proper nouns (NNP/NNPS) are joined into
    multi-word phrases (e.g., "Robert Wright" -> "robert wright").

    Returns a list of terms (lowercase), phrases listed first, deduplicated
    with original order preserved.
    """
    tokens = nltk.word_tokenize(query)
    tagged = nltk.pos_tag(tokens)

    phrases = []       # multi-word proper noun phrases
    single_terms = []  # individual nouns/adjectives
    proper_run = []    # accumulator for consecutive proper nouns

    for word, tag in tagged:
        if tag in PROPER_NOUN_TAGS:
            proper_run.append(word)
        else:
            # Flush any accumulated proper noun run
            if proper_run:
                phrase = " ".join(proper_run).lower()
                if len(phrase) >= MIN_WORD_LEN:
                    phrases.append(phrase)
                proper_run = []
            # Keep other nouns and adjectives as single terms
            if tag in KEEP_TAGS and len(word) >= MIN_WORD_LEN:
                single_terms.append(word.lower())

    # Flush final proper noun run
    if proper_run:
        phrase = " ".join(proper_run).lower()
        if len(phrase) >= MIN_WORD_LEN:
            phrases.append(phrase)

    # Phrases first (more specific), then single terms
    all_terms = phrases + single_terms
    return list(dict.fromkeys(all_terms))  # deduplicate, preserve order


def search_files(terms, data_dir, context_lines=None):
    """Search all .txt files in data_dir for the given terms.

    Args:
        terms: search terms, matched case-insensitively on word boundaries.
        data_dir: Path to the directory containing .txt files.
        context_lines: lines of context around each match. Defaults to the
            module-level CONTEXT_LINES, resolved at CALL time -- the old
            eager default froze the constant's value at definition time.

    Returns a list of (file_path, match_count, matches), sorted by
    match_count descending, where matches is a list of
    (line_number, context_block) tuples. match_count counts every matching
    line, including ones whose context was folded into an earlier block.
    """
    if not terms:
        return []
    if context_lines is None:
        context_lines = CONTEXT_LINES

    # Build a single regex pattern that matches any term (case-insensitive)
    pattern = re.compile(
        r"\b(" + "|".join(re.escape(t) for t in terms) + r")\b",
        re.IGNORECASE
    )

    results = []
    txt_files = sorted(data_dir.glob("*.txt"))

    for fpath in txt_files:
        try:
            lines = fpath.read_text(encoding="utf-8").splitlines()
        except (OSError, UnicodeDecodeError):
            # Best-effort search: unreadable files are skipped silently
            continue

        matches = []
        match_count = 0
        seen_lines = set()  # avoid overlapping context blocks

        for i, line in enumerate(lines):
            if pattern.search(line):
                match_count += 1
                # A match inside an already-shown context window is counted
                # but does not start a new block
                if i in seen_lines:
                    continue

                # Extract context window
                start = max(0, i - context_lines)
                end = min(len(lines), i + context_lines + 1)
                block = []
                for j in range(start, end):
                    seen_lines.add(j)
                    marker = ">>>" if j == i else "   "
                    block.append(f"  {marker} {j+1:4d}: {lines[j]}")

                matches.append((i + 1, "\n".join(block)))

        if match_count > 0:
            results.append((fpath, match_count, matches))

    # Sort by match count (most matches first)
    results.sort(key=lambda x: x[1], reverse=True)
    return results


def main():
    """CLI entry point: extract terms from the query, then grep the journal.

    Usage: python search_keywords.py QUERY_TEXT
    """
    if len(sys.argv) < 2:
        print("Usage: python search_keywords.py QUERY_TEXT")
        sys.exit(1)

    ensure_nltk_data()

    q = " ".join(sys.argv[1:])

    # Extract terms
    terms = extract_terms(q)
    if not terms:
        print(f"Query: {q}")
        print("No searchable terms extracted. Try a more specific query.")
        sys.exit(0)

    print(f"Query: {q}")
    print(f"Extracted terms: {', '.join(terms)}\n")

    # Search
    results = search_files(terms, DATA_DIR)

    if not results:
        print("No matches found.")
        sys.exit(0)

    # Summary
    total_matches = sum(r[1] for r in results)
    print(f"Found {total_matches} matches across {len(results)} files\n")

    # Detailed output
    for fpath, match_count, matches in results:
        print("=" * 60)
        print(f"--- {fpath.name} ({match_count} matches) ---")
        print("=" * 60)
        for line_num, block in matches[:MAX_MATCHES_PER_FILE]:
            print(block)
            print()
        if len(matches) > MAX_MATCHES_PER_FILE:
            print(f"  ... and {len(matches) - MAX_MATCHES_PER_FILE} more matches\n")


if __name__ == "__main__":
    main()