Test clean deploy

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Eric Furst 2026-02-27 05:59:01 -05:00
commit 42e5e20e17
11 changed files with 1790 additions and 0 deletions

View file

@ -0,0 +1,471 @@
# build_clippings.py
#
# Build or update the ChromaDB vector store from clippings in ./clippings.
#
# Default mode (incremental): loads the existing index and adds only
# new or modified files. Use --rebuild for a full rebuild from scratch.
#
# Handles PDFs, TXT, webarchive, and RTF files. Skips non-extractable PDFs
# and writes them to ocr_needed.txt for later OCR processing.
#
# February 2026
# E. M. Furst
# Environment vars must be set before importing huggingface/transformers
# libraries, because huggingface_hub.constants evaluates HF_HUB_OFFLINE
# at import time.
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # disable tokenizer thread parallelism
os.environ["SENTENCE_TRANSFORMERS_HOME"] = "./models"  # local model cache directory
os.environ["HF_HUB_OFFLINE"] = "1"  # force offline mode; models must already be cached
import chromadb
from llama_index.core import (
SimpleDirectoryReader,
StorageContext,
VectorStoreIndex,
Settings,
Document,
)
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter
from pathlib import Path
import argparse
import datetime
import time
# Shared constants
DATA_DIR = Path("./clippings")  # source documents to index
PERSIST_DIR = "./clippings_search/store_clippings"  # ChromaDB persistence path
COLLECTION_NAME = "clippings"
EMBED_MODEL_NAME = "BAAI/bge-large-en-v1.5"
CHUNK_SIZE = 256  # chunk size passed to SentenceSplitter
CHUNK_OVERLAP = 25  # overlap between consecutive chunks
# File types handled by SimpleDirectoryReader (PDF + TXT)
READER_EXTS = {".pdf", ".txt"}
# File types handled by custom loaders
CUSTOM_EXTS = {".webarchive", ".rtf"}
# All supported extensions
SUPPORTED_EXTS = READER_EXTS | CUSTOM_EXTS
# Minimum extracted text length to consider a PDF valid (characters).
# Also used by the webarchive and RTF loaders to reject near-empty files.
MIN_TEXT_LENGTH = 100
def get_text_splitter():
    """Return the sentence splitter used to chunk documents.

    Chunk size and overlap come from the module-level constants so the
    rebuild and incremental-update paths always chunk identically.
    """
    splitter = SentenceSplitter(
        paragraph_separator="\n\n",
        chunk_overlap=CHUNK_OVERLAP,
        chunk_size=CHUNK_SIZE,
    )
    return splitter
def validate_pdf(file_path):
    """Check whether a PDF contains enough extractable, printable text.

    Returns (is_valid, reason): reason is None when valid, otherwise a
    short explanation (too little text, mostly non-printable output, or
    the parser exception message).
    """
    import pypdf
    try:
        pdf = pypdf.PdfReader(str(file_path))
        n_pages = len(pdf.pages)
        # Extract every page up front; both checks below need the text.
        pages_text = [page.extract_text() or "" for page in pdf.pages]
        total_chars = sum(len(t) for t in pages_text)
        if total_chars < MIN_TEXT_LENGTH:
            return False, f"too little text ({total_chars} chars, {n_pages} pages)"
        # A scanned/garbled PDF often "extracts" mostly control bytes;
        # require at least half the characters to be printable.
        printable_chars = sum(
            1
            for t in pages_text
            for c in t
            if c.isprintable() or c in "\n\r\t"
        )
        ratio = printable_chars / total_chars if total_chars > 0 else 0
        if ratio < 0.5:
            return False, f"low printable ratio ({ratio:.2f}, {n_pages} pages)"
        return True, None
    except Exception as e:
        return False, str(e)
def load_webarchive(file_path):
    """Extract text from a macOS .webarchive file.

    Reads the archived main resource's HTML, strips markup with
    BeautifulSoup, and wraps the result in a LlamaIndex Document carrying
    the same metadata fields the other loaders produce.

    Returns a LlamaIndex Document, or None if extraction fails or the
    text is shorter than MIN_TEXT_LENGTH.
    """
    import plistlib
    from bs4 import BeautifulSoup
    try:
        with open(file_path, "rb") as f:
            plist = plistlib.load(f)
        resource = plist.get("WebMainResource", {})
        html_bytes = resource.get("WebResourceData", b"")
        if not html_bytes:
            return None
        # Fix: webarchives record the page's declared text encoding in
        # WebResourceTextEncodingName; honor it instead of assuming UTF-8.
        encoding = resource.get("WebResourceTextEncodingName") or "utf-8"
        try:
            html = html_bytes.decode(encoding, errors="replace")
        except LookupError:
            # Unknown/garbled codec name stored in the archive -> fall back.
            html = html_bytes.decode("utf-8", errors="replace")
        soup = BeautifulSoup(html, "html.parser")
        text = soup.get_text(separator="\n", strip=True)
        if len(text) < MIN_TEXT_LENGTH:
            return None
        stat = file_path.stat()
        # Day-precision UTC mtime; update() compares this exact format.
        mdate = datetime.datetime.fromtimestamp(
            stat.st_mtime, tz=datetime.timezone.utc
        ).strftime("%Y-%m-%d")
        return Document(
            text=text,
            metadata={
                "file_name": file_path.name,
                "file_path": str(file_path),
                "file_size": stat.st_size,
                "last_modified_date": mdate,
                "file_type": "webarchive",
            },
        )
    except Exception as e:
        print(f" Warning: could not read webarchive {file_path.name}: {e}")
        return None
def load_rtf(file_path):
    """Extract text from an RTF file.

    Converts the RTF source to plain text via striprtf and wraps it in a
    LlamaIndex Document with the same metadata fields as the other loaders.

    Returns a LlamaIndex Document, or None if extraction fails or the
    text is shorter than MIN_TEXT_LENGTH.
    """
    from striprtf.striprtf import rtf_to_text
    try:
        # Fix: RTF is an ASCII-based format; read with an explicit encoding
        # so behavior does not depend on the platform's locale default.
        with open(file_path, "r", encoding="utf-8", errors="replace") as f:
            rtf_content = f.read()
        text = rtf_to_text(rtf_content)
        if len(text) < MIN_TEXT_LENGTH:
            return None
        stat = file_path.stat()
        # Day-precision UTC mtime; update() compares this exact format.
        mdate = datetime.datetime.fromtimestamp(
            stat.st_mtime, tz=datetime.timezone.utc
        ).strftime("%Y-%m-%d")
        return Document(
            text=text,
            metadata={
                "file_name": file_path.name,
                "file_path": str(file_path),
                "file_size": stat.st_size,
                "last_modified_date": mdate,
                "file_type": "rtf",
            },
        )
    except Exception as e:
        print(f" Warning: could not read RTF {file_path.name}: {e}")
        return None
def scan_clippings():
    """Walk DATA_DIR recursively and sort every file into a bucket.

    Returns (reader_files, custom_docs, skipped, ocr_needed) where:
      - reader_files: Paths for SimpleDirectoryReader (PDF + TXT)
      - custom_docs: Document objects produced by the custom loaders
      - skipped: (Path, reason) tuples for files that were not indexed
      - ocr_needed: Paths of PDFs with no extractable text
    """
    reader_files, custom_docs, skipped, ocr_needed = [], [], [], []
    for entry in sorted(DATA_DIR.rglob("*")):
        # Ignore directories and dotfiles (.DS_Store and friends).
        if not entry.is_file() or entry.name.startswith("."):
            continue
        suffix = entry.suffix.lower()
        if suffix not in SUPPORTED_EXTS:
            skipped.append((entry, f"unsupported type: {suffix}"))
        elif suffix == ".pdf":
            ok, reason = validate_pdf(entry)
            if ok:
                reader_files.append(entry)
            else:
                # Image-only PDFs are recorded twice: once for the report,
                # once for the OCR worklist.
                skipped.append((entry, f"no extractable text: {reason}"))
                ocr_needed.append(entry)
        elif suffix == ".txt":
            reader_files.append(entry)
        elif suffix == ".webarchive":
            doc = load_webarchive(entry)
            if doc is not None:
                custom_docs.append(doc)
            else:
                skipped.append((entry, "no extractable text from webarchive"))
        else:
            # ".rtf" -- the only remaining supported extension.
            doc = load_rtf(entry)
            if doc is not None:
                custom_docs.append(doc)
            else:
                skipped.append((entry, "no extractable text from RTF"))
    return reader_files, custom_docs, skipped, ocr_needed
def write_ocr_list(ocr_needed):
    """Write the list of PDFs needing OCR to ocr_needed.txt.

    One path per line, overwriting any previous list so the file always
    reflects the latest scan.

    Args:
        ocr_needed: sequence of paths (Path or str) to record.
    """
    # Fix: explicit encoding -- file names may contain non-ASCII characters
    # and the platform default encoding is not guaranteed to handle them.
    with open("ocr_needed.txt", "w", encoding="utf-8") as f:
        f.writelines(f"{fpath}\n" for fpath in ocr_needed)
    print(f"Wrote {len(ocr_needed)} file(s) to ocr_needed.txt")
def load_all_documents(reader_files, custom_docs):
    """Load PDF/TXT files via SimpleDirectoryReader and append the
    pre-built webarchive/RTF Documents into a single list."""
    docs = []
    if reader_files:
        print(f"Loading {len(reader_files)} PDF/TXT files...")
        reader = SimpleDirectoryReader(
            input_files=[str(path) for path in reader_files],
            filename_as_id=True,
        )
        docs += reader.load_data()
    if custom_docs:
        print(f"Adding {len(custom_docs)} webarchive/RTF documents...")
        docs += list(custom_docs)
    return docs
def rebuild(reader_files, custom_docs):
    """Full rebuild: drop any existing collection and index from scratch.

    Raises ValueError when no documents could be loaded.
    Returns the freshly built VectorStoreIndex.
    """
    client = chromadb.PersistentClient(path=PERSIST_DIR)
    # Deleting a missing collection raises -- ignore and proceed to create.
    try:
        client.delete_collection(COLLECTION_NAME)
        print(f"Deleted existing collection '{COLLECTION_NAME}'")
    except Exception:
        pass
    collection = client.get_or_create_collection(COLLECTION_NAME)
    storage_context = StorageContext.from_defaults(
        vector_store=ChromaVectorStore(chroma_collection=collection)
    )
    documents = load_all_documents(reader_files, custom_docs)
    if not documents:
        raise ValueError("No documents loaded")
    print(f"Loaded {len(documents)} document(s) total")
    print("Building vector index...")
    index = VectorStoreIndex.from_documents(
        documents,
        storage_context=storage_context,
        transformations=[get_text_splitter()],
        show_progress=True,
    )
    print(f"Index built. Collection has {collection.count()} vectors.")
    return index
def update(reader_files, custom_docs):
    """Incremental update: add new, re-index modified, remove deleted files.

    Compares the files currently on disk against the metadata stored with
    each chunk in the ChromaDB collection. A file counts as modified when
    either its size or its last-modified date (UTC, day precision) differs
    from what was recorded at index time.

    Args:
        reader_files: Paths of PDFs/TXTs handled by SimpleDirectoryReader.
        custom_docs: pre-loaded Documents for webarchive/RTF files.
    """
    client = chromadb.PersistentClient(path=PERSIST_DIR)
    collection = client.get_collection(COLLECTION_NAME)
    count = collection.count()
    print(f"Existing collection has {count} vectors")
    # Get all stored metadata to find what's indexed
    # Key on file_path (not file_name) to handle duplicate names across subdirs
    indexed = {}  # file_path -> {"ids": [], "file_size": ..., "last_modified_date": ...}
    if count > 0:
        results = collection.get(include=["metadatas"])
        for i, meta in enumerate(results["metadatas"]):
            fpath = meta.get("file_path", "")
            if fpath not in indexed:
                # First chunk seen for this file: record its size/date
                # so later chunks only need to append their id.
                indexed[fpath] = {
                    "ids": [],
                    "file_size": meta.get("file_size"),
                    "last_modified_date": meta.get("last_modified_date"),
                }
            indexed[fpath]["ids"].append(results["ids"][i])
    print(f"Index contains {len(indexed)} unique files")
    # Build disk file lookup: file_path_str -> Path
    # For reader_files, match the path format SimpleDirectoryReader would store
    disk_files = {}
    for f in reader_files:
        disk_files[str(f)] = f
    for doc in custom_docs:
        disk_files[doc.metadata["file_path"]] = Path(doc.metadata["file_path"])
    # Classify files
    new_reader = []
    new_custom = []
    modified_reader = []
    modified_custom = []
    deleted_paths = []
    unchanged = 0
    for path_str, fpath in disk_files.items():
        if path_str not in indexed:
            # Check if it's a custom doc
            if fpath.suffix.lower() in CUSTOM_EXTS:
                matching = [d for d in custom_docs if d.metadata["file_path"] == path_str]
                if matching:
                    new_custom.extend(matching)
            else:
                new_reader.append(fpath)
        else:
            info = indexed[path_str]
            stat = fpath.stat()
            # Day-precision UTC date: must mirror the format the loaders
            # write into chunk metadata, or everything looks modified.
            disk_mdate = datetime.datetime.fromtimestamp(
                stat.st_mtime, tz=datetime.timezone.utc
            ).strftime("%Y-%m-%d")
            if stat.st_size != info["file_size"] or disk_mdate != info["last_modified_date"]:
                if fpath.suffix.lower() in CUSTOM_EXTS:
                    matching = [d for d in custom_docs if d.metadata["file_path"] == path_str]
                    if matching:
                        modified_custom.extend(matching)
                else:
                    modified_reader.append(fpath)
            else:
                unchanged += 1
    # Anything indexed but no longer on disk is slated for removal.
    for path_str in indexed:
        if path_str not in disk_files:
            deleted_paths.append(path_str)
    n_new = len(new_reader) + len(new_custom)
    n_modified = len(modified_reader) + len(modified_custom)
    print(f"\n New: {n_new}")
    print(f" Modified: {n_modified}")
    print(f" Deleted: {len(deleted_paths)}")
    print(f" Unchanged: {unchanged}")
    if n_new == 0 and n_modified == 0 and len(deleted_paths) == 0:
        print("\nNothing to do.")
        return
    # Delete chunks for removed and modified files
    # (modified files are deleted first, then re-added below).
    for path_str in deleted_paths:
        ids = indexed[path_str]["ids"]
        fname = Path(path_str).name
        print(f" Removing {fname} ({len(ids)} chunks)")
        collection.delete(ids=ids)
    for fpath in modified_reader:
        path_str = str(fpath)
        ids = indexed[path_str]["ids"]
        print(f" Re-indexing {fpath.name} ({len(ids)} chunks)")
        collection.delete(ids=ids)
    for doc in modified_custom:
        path_str = doc.metadata["file_path"]
        if path_str in indexed:
            ids = indexed[path_str]["ids"]
            print(f" Re-indexing {doc.metadata['file_name']} ({len(ids)} chunks)")
            collection.delete(ids=ids)
    # Add new and modified files
    files_to_add = new_reader + modified_reader
    docs_to_add = new_custom + modified_custom
    if files_to_add or docs_to_add:
        documents = load_all_documents(files_to_add, docs_to_add)
        if documents:
            print(f"Indexing {len(documents)} document(s)...")
            vector_store = ChromaVectorStore(chroma_collection=collection)
            storage_context = StorageContext.from_defaults(vector_store=vector_store)
            VectorStoreIndex.from_documents(
                documents,
                storage_context=storage_context,
                transformations=[get_text_splitter()],
                show_progress=True,
            )
    print(f"\nIndex updated. Collection now has {collection.count()} vectors.")
def main():
    """CLI entry point: scan ./clippings and build or update the index."""
    parser = argparse.ArgumentParser(
        description="Build or update the clippings vector store (ChromaDB)."
    )
    parser.add_argument(
        "--rebuild",
        action="store_true",
        help="Full rebuild from scratch (default: incremental update)",
    )
    args = parser.parse_args()

    # Configure embedding model (offline, cached in ./models).
    Settings.embed_model = HuggingFaceEmbedding(
        model_name=EMBED_MODEL_NAME,
        cache_folder="./models",
        local_files_only=True,
    )

    if not DATA_DIR.exists():
        raise FileNotFoundError(
            f"Clippings directory not found: {DATA_DIR.absolute()}\n"
            f"Create symlink: ln -s ../clippings ./clippings"
        )

    start = time.time()
    # Scan and classify files before touching the vector store.
    print(f"Scanning {DATA_DIR.absolute()}...")
    reader_files, custom_docs, skipped, ocr_needed = scan_clippings()

    n_valid = len(reader_files) + len(custom_docs)
    print(f"\nFiles to index: {n_valid}")
    print(f" PDF/TXT: {len(reader_files)}")
    print(f" Webarchive/RTF: {len(custom_docs)}")
    print(f"Files skipped: {len(skipped)}")
    for fpath, reason in skipped:
        print(f" SKIP: {fpath.name} -- {reason}")
    if ocr_needed:
        write_ocr_list(ocr_needed)
    if n_valid == 0:
        raise ValueError("No valid files found to index")

    if args.rebuild:
        print("\nMode: full rebuild")
        rebuild(reader_files, custom_docs)
    elif not Path(PERSIST_DIR).exists():
        # Incremental requested but nothing to update against.
        print("\nMode: incremental update")
        print(f"No existing index at {PERSIST_DIR}, doing full rebuild.")
        rebuild(reader_files, custom_docs)
    else:
        print("\nMode: incremental update")
        update(reader_files, custom_docs)

    print(f"Done in {time.time() - start:.1f}s")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,138 @@
# retrieve_clippings.py
# Verbatim chunk retrieval from clippings index (ChromaDB).
# Vector search + cross-encoder re-ranking, no LLM.
#
# Returns the top re-ranked chunks with their full text, file metadata, and
# scores. Includes page numbers for PDF sources when available.
#
# E.M.F. February 2026
# Environment vars must be set before importing huggingface/transformers
# libraries, because huggingface_hub.constants evaluates HF_HUB_OFFLINE
# at import time.
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # disable tokenizer thread parallelism
os.environ["SENTENCE_TRANSFORMERS_HOME"] = "./models"  # local model cache directory
os.environ["HF_HUB_OFFLINE"] = "1"  # force offline mode; models must already be cached
import chromadb
from llama_index.core import VectorStoreIndex, Settings
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.postprocessor import SentenceTransformerRerank
import sys
import textwrap
#
# Globals
#
PERSIST_DIR = "./clippings_search/store_clippings"  # must match build_clippings.py
COLLECTION_NAME = "clippings"
# Embedding model (must match build_clippings.py)
# NOTE(review): instantiated at import time, so importing this module loads
# the embedding model as a side effect -- confirm this is intentional.
EMBED_MODEL = HuggingFaceEmbedding(
    cache_folder="./models",
    model_name="BAAI/bge-large-en-v1.5",
    local_files_only=True,
)
# Cross-encoder model for re-ranking (cached in ./models/)
RERANK_MODEL = "cross-encoder/ms-marco-MiniLM-L-12-v2"
RERANK_TOP_N = 15  # chunks kept after re-ranking
RETRIEVE_TOP_K = 30  # chunks fetched by vector search before re-ranking
# Output formatting
WRAP_WIDTH = 80  # column width for wrapped chunk text and separators
def main():
    """Retrieve and print the top re-ranked chunks for a command-line query.

    Pipeline: vector search (top RETRIEVE_TOP_K) -> cross-encoder re-rank
    (top RERANK_TOP_N) -> print a source-file summary, the ranking table,
    and the full text of every chunk. No LLM is involved.
    """
    # Fix: validate arguments BEFORE loading the collection and models --
    # previously a missing query was reported only after the expensive
    # retriever/re-ranker setup had already run.
    if len(sys.argv) < 2:
        print("Usage: python retrieve_clippings.py QUERY_TEXT")
        sys.exit(1)
    q = " ".join(sys.argv[1:])

    # No LLM needed -- set embed model only
    Settings.embed_model = EMBED_MODEL

    # Load ChromaDB collection and wrap it as a LlamaIndex vector store.
    client = chromadb.PersistentClient(path=PERSIST_DIR)
    collection = client.get_collection(COLLECTION_NAME)
    vector_store = ChromaVectorStore(chroma_collection=collection)
    index = VectorStoreIndex.from_vector_store(vector_store)

    # Build retriever (vector search only, no query engine / LLM)
    retriever = index.as_retriever(similarity_top_k=RETRIEVE_TOP_K)
    # Cross-encoder re-ranker
    reranker = SentenceTransformerRerank(
        model=RERANK_MODEL,
        top_n=RERANK_TOP_N,
    )

    # Retrieve and re-rank
    nodes = retriever.retrieve(q)
    reranked = reranker.postprocess_nodes(nodes, query_str=q)

    # Build (rank, node, file, page, score) tuples used by all output passes.
    results = []
    for i, node in enumerate(reranked, 1):
        # NodeWithScore usually exposes metadata directly; fall back to the
        # wrapped node when it does not.
        meta = getattr(node, "metadata", None) or node.node.metadata
        score = getattr(node, "score", None)
        file_name = meta.get("file_name", "unknown")
        page_label = meta.get("page_label", "")
        results.append((i, node, file_name, page_label, score))

    # --- Summary: source files and rankings ---
    print(f"\nQuery: {q}")
    print(f"Retrieved {len(nodes)} chunks, re-ranked to top {len(reranked)}")
    print(f"({collection.count()} total vectors in collection)\n")

    # Unique source files in rank order (dict preserves insertion order).
    unique_sources = list(dict.fromkeys(fname for _, _, fname, _, _ in results))
    print(f"Source files ({len(unique_sources)} unique):")
    for j, fname in enumerate(unique_sources, 1):
        print(f" {j}. {fname}")

    print("\nRankings:")  # fix: was an f-string with no placeholders
    for i, node, file_name, page_label, score in results:
        line = f" [{i:2d}] {score:+7.3f} {file_name}"
        if page_label:
            line += f" (p. {page_label})"
        print(line)

    # --- Full chunk text ---
    print(f"\n{'=' * WRAP_WIDTH}")
    print("CHUNKS")
    print("=" * WRAP_WIDTH)
    for i, node, file_name, page_label, score in results:
        header = f"=== [{i}] {file_name}"
        if page_label:
            header += f" (p. {page_label})"
        header += f" (score: {score:.3f})"
        print("\n" + "=" * WRAP_WIDTH)
        print(header)
        print("=" * WRAP_WIDTH)
        # Re-wrap non-blank lines to WRAP_WIDTH; preserve blank lines.
        text = node.get_content()
        for line in text.splitlines():
            if line.strip():
                print(textwrap.fill(line, width=WRAP_WIDTH))
            else:
                print()
        print()


if __name__ == "__main__":
    main()