Test clean deploy

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Eric Furst 2026-02-27 05:59:01 -05:00
commit 42e5e20e17
11 changed files with 1790 additions and 0 deletions

View file

@ -0,0 +1,471 @@
# build_clippings.py
#
# Build or update the ChromaDB vector store from clippings in ./clippings.
#
# Default mode (incremental): loads the existing index and adds only
# new or modified files. Use --rebuild for a full rebuild from scratch.
#
# Handles PDFs, TXT, webarchive, and RTF files. Skips non-extractable PDFs
# and writes them to ocr_needed.txt for later OCR processing.
#
# February 2026
# E. M. Furst
# Environment vars must be set before importing huggingface/transformers
# libraries, because huggingface_hub.constants evaluates HF_HUB_OFFLINE
# at import time.
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # disable tokenizer thread parallelism
os.environ["SENTENCE_TRANSFORMERS_HOME"] = "./models"  # local model cache directory
os.environ["HF_HUB_OFFLINE"] = "1"  # force offline mode; models must already be cached
import chromadb
from llama_index.core import (
SimpleDirectoryReader,
StorageContext,
VectorStoreIndex,
Settings,
Document,
)
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter
from pathlib import Path
import argparse
import datetime
import time
# Shared constants
DATA_DIR = Path("./clippings")  # source documents to index
PERSIST_DIR = "./clippings_search/store_clippings"  # ChromaDB persistence path
COLLECTION_NAME = "clippings"
EMBED_MODEL_NAME = "BAAI/bge-large-en-v1.5"
CHUNK_SIZE = 256  # chunk size passed to SentenceSplitter
CHUNK_OVERLAP = 25  # overlap between consecutive chunks
# File types handled by SimpleDirectoryReader (PDF + TXT)
READER_EXTS = {".pdf", ".txt"}
# File types handled by custom loaders
CUSTOM_EXTS = {".webarchive", ".rtf"}
# All supported extensions
SUPPORTED_EXTS = READER_EXTS | CUSTOM_EXTS
# Minimum extracted text length to consider a PDF valid (characters).
# Also used by the webarchive and RTF loaders to reject near-empty files.
MIN_TEXT_LENGTH = 100
def get_text_splitter():
    """Return the sentence splitter used to chunk documents.

    Chunk size and overlap come from the module-level constants so the
    rebuild and incremental-update paths always chunk identically.
    """
    splitter = SentenceSplitter(
        paragraph_separator="\n\n",
        chunk_overlap=CHUNK_OVERLAP,
        chunk_size=CHUNK_SIZE,
    )
    return splitter
def validate_pdf(file_path):
    """Check whether a PDF contains enough extractable, printable text.

    Returns (is_valid, reason): reason is None when valid, otherwise a
    short explanation (too little text, mostly non-printable output, or
    the parser exception message).
    """
    import pypdf
    try:
        pdf = pypdf.PdfReader(str(file_path))
        n_pages = len(pdf.pages)
        # Extract every page up front; both checks below need the text.
        pages_text = [page.extract_text() or "" for page in pdf.pages]
        total_chars = sum(len(t) for t in pages_text)
        if total_chars < MIN_TEXT_LENGTH:
            return False, f"too little text ({total_chars} chars, {n_pages} pages)"
        # A scanned/garbled PDF often "extracts" mostly control bytes;
        # require at least half the characters to be printable.
        printable_chars = sum(
            1
            for t in pages_text
            for c in t
            if c.isprintable() or c in "\n\r\t"
        )
        ratio = printable_chars / total_chars if total_chars > 0 else 0
        if ratio < 0.5:
            return False, f"low printable ratio ({ratio:.2f}, {n_pages} pages)"
        return True, None
    except Exception as e:
        return False, str(e)
def load_webarchive(file_path):
    """Extract text from a macOS .webarchive file.

    Reads the archived main resource's HTML, strips markup with
    BeautifulSoup, and wraps the result in a LlamaIndex Document carrying
    the same metadata fields the other loaders produce.

    Returns a LlamaIndex Document, or None if extraction fails or the
    text is shorter than MIN_TEXT_LENGTH.
    """
    import plistlib
    from bs4 import BeautifulSoup
    try:
        with open(file_path, "rb") as f:
            plist = plistlib.load(f)
        resource = plist.get("WebMainResource", {})
        html_bytes = resource.get("WebResourceData", b"")
        if not html_bytes:
            return None
        # Fix: webarchives record the page's declared text encoding in
        # WebResourceTextEncodingName; honor it instead of assuming UTF-8.
        encoding = resource.get("WebResourceTextEncodingName") or "utf-8"
        try:
            html = html_bytes.decode(encoding, errors="replace")
        except LookupError:
            # Unknown/garbled codec name stored in the archive -> fall back.
            html = html_bytes.decode("utf-8", errors="replace")
        soup = BeautifulSoup(html, "html.parser")
        text = soup.get_text(separator="\n", strip=True)
        if len(text) < MIN_TEXT_LENGTH:
            return None
        stat = file_path.stat()
        # Day-precision UTC mtime; update() compares this exact format.
        mdate = datetime.datetime.fromtimestamp(
            stat.st_mtime, tz=datetime.timezone.utc
        ).strftime("%Y-%m-%d")
        return Document(
            text=text,
            metadata={
                "file_name": file_path.name,
                "file_path": str(file_path),
                "file_size": stat.st_size,
                "last_modified_date": mdate,
                "file_type": "webarchive",
            },
        )
    except Exception as e:
        print(f" Warning: could not read webarchive {file_path.name}: {e}")
        return None
def load_rtf(file_path):
    """Extract text from an RTF file.

    Converts the RTF source to plain text via striprtf and wraps it in a
    LlamaIndex Document with the same metadata fields as the other loaders.

    Returns a LlamaIndex Document, or None if extraction fails or the
    text is shorter than MIN_TEXT_LENGTH.
    """
    from striprtf.striprtf import rtf_to_text
    try:
        # Fix: RTF is an ASCII-based format; read with an explicit encoding
        # so behavior does not depend on the platform's locale default.
        with open(file_path, "r", encoding="utf-8", errors="replace") as f:
            rtf_content = f.read()
        text = rtf_to_text(rtf_content)
        if len(text) < MIN_TEXT_LENGTH:
            return None
        stat = file_path.stat()
        # Day-precision UTC mtime; update() compares this exact format.
        mdate = datetime.datetime.fromtimestamp(
            stat.st_mtime, tz=datetime.timezone.utc
        ).strftime("%Y-%m-%d")
        return Document(
            text=text,
            metadata={
                "file_name": file_path.name,
                "file_path": str(file_path),
                "file_size": stat.st_size,
                "last_modified_date": mdate,
                "file_type": "rtf",
            },
        )
    except Exception as e:
        print(f" Warning: could not read RTF {file_path.name}: {e}")
        return None
def scan_clippings():
    """Walk DATA_DIR recursively and sort every file into a bucket.

    Returns (reader_files, custom_docs, skipped, ocr_needed) where:
      - reader_files: Paths for SimpleDirectoryReader (PDF + TXT)
      - custom_docs: Document objects produced by the custom loaders
      - skipped: (Path, reason) tuples for files that were not indexed
      - ocr_needed: Paths of PDFs with no extractable text
    """
    reader_files, custom_docs, skipped, ocr_needed = [], [], [], []
    for entry in sorted(DATA_DIR.rglob("*")):
        # Ignore directories and dotfiles (.DS_Store and friends).
        if not entry.is_file() or entry.name.startswith("."):
            continue
        suffix = entry.suffix.lower()
        if suffix not in SUPPORTED_EXTS:
            skipped.append((entry, f"unsupported type: {suffix}"))
        elif suffix == ".pdf":
            ok, reason = validate_pdf(entry)
            if ok:
                reader_files.append(entry)
            else:
                # Image-only PDFs are recorded twice: once for the report,
                # once for the OCR worklist.
                skipped.append((entry, f"no extractable text: {reason}"))
                ocr_needed.append(entry)
        elif suffix == ".txt":
            reader_files.append(entry)
        elif suffix == ".webarchive":
            doc = load_webarchive(entry)
            if doc is not None:
                custom_docs.append(doc)
            else:
                skipped.append((entry, "no extractable text from webarchive"))
        else:
            # ".rtf" -- the only remaining supported extension.
            doc = load_rtf(entry)
            if doc is not None:
                custom_docs.append(doc)
            else:
                skipped.append((entry, "no extractable text from RTF"))
    return reader_files, custom_docs, skipped, ocr_needed
def write_ocr_list(ocr_needed):
    """Write the list of PDFs needing OCR to ocr_needed.txt.

    One path per line, overwriting any previous list so the file always
    reflects the latest scan.

    Args:
        ocr_needed: sequence of paths (Path or str) to record.
    """
    # Fix: explicit encoding -- file names may contain non-ASCII characters
    # and the platform default encoding is not guaranteed to handle them.
    with open("ocr_needed.txt", "w", encoding="utf-8") as f:
        f.writelines(f"{fpath}\n" for fpath in ocr_needed)
    print(f"Wrote {len(ocr_needed)} file(s) to ocr_needed.txt")
def load_all_documents(reader_files, custom_docs):
    """Load PDF/TXT files via SimpleDirectoryReader and append the
    pre-built webarchive/RTF Documents into a single list."""
    docs = []
    if reader_files:
        print(f"Loading {len(reader_files)} PDF/TXT files...")
        reader = SimpleDirectoryReader(
            input_files=[str(path) for path in reader_files],
            filename_as_id=True,
        )
        docs += reader.load_data()
    if custom_docs:
        print(f"Adding {len(custom_docs)} webarchive/RTF documents...")
        docs += list(custom_docs)
    return docs
def rebuild(reader_files, custom_docs):
    """Full rebuild: drop any existing collection and index from scratch.

    Raises ValueError when no documents could be loaded.
    Returns the freshly built VectorStoreIndex.
    """
    client = chromadb.PersistentClient(path=PERSIST_DIR)
    # Deleting a missing collection raises -- ignore and proceed to create.
    try:
        client.delete_collection(COLLECTION_NAME)
        print(f"Deleted existing collection '{COLLECTION_NAME}'")
    except Exception:
        pass
    collection = client.get_or_create_collection(COLLECTION_NAME)
    storage_context = StorageContext.from_defaults(
        vector_store=ChromaVectorStore(chroma_collection=collection)
    )
    documents = load_all_documents(reader_files, custom_docs)
    if not documents:
        raise ValueError("No documents loaded")
    print(f"Loaded {len(documents)} document(s) total")
    print("Building vector index...")
    index = VectorStoreIndex.from_documents(
        documents,
        storage_context=storage_context,
        transformations=[get_text_splitter()],
        show_progress=True,
    )
    print(f"Index built. Collection has {collection.count()} vectors.")
    return index
def update(reader_files, custom_docs):
    """Incremental update: add new, re-index modified, remove deleted files.

    Compares the files currently on disk against the metadata stored with
    each chunk in the ChromaDB collection. A file counts as modified when
    either its size or its last-modified date (UTC, day precision) differs
    from what was recorded at index time.

    Args:
        reader_files: Paths of PDFs/TXTs handled by SimpleDirectoryReader.
        custom_docs: pre-loaded Documents for webarchive/RTF files.
    """
    client = chromadb.PersistentClient(path=PERSIST_DIR)
    collection = client.get_collection(COLLECTION_NAME)
    count = collection.count()
    print(f"Existing collection has {count} vectors")
    # Get all stored metadata to find what's indexed
    # Key on file_path (not file_name) to handle duplicate names across subdirs
    indexed = {}  # file_path -> {"ids": [], "file_size": ..., "last_modified_date": ...}
    if count > 0:
        results = collection.get(include=["metadatas"])
        for i, meta in enumerate(results["metadatas"]):
            fpath = meta.get("file_path", "")
            if fpath not in indexed:
                # First chunk seen for this file: record its size/date
                # so later chunks only need to append their id.
                indexed[fpath] = {
                    "ids": [],
                    "file_size": meta.get("file_size"),
                    "last_modified_date": meta.get("last_modified_date"),
                }
            indexed[fpath]["ids"].append(results["ids"][i])
    print(f"Index contains {len(indexed)} unique files")
    # Build disk file lookup: file_path_str -> Path
    # For reader_files, match the path format SimpleDirectoryReader would store
    disk_files = {}
    for f in reader_files:
        disk_files[str(f)] = f
    for doc in custom_docs:
        disk_files[doc.metadata["file_path"]] = Path(doc.metadata["file_path"])
    # Classify files
    new_reader = []
    new_custom = []
    modified_reader = []
    modified_custom = []
    deleted_paths = []
    unchanged = 0
    for path_str, fpath in disk_files.items():
        if path_str not in indexed:
            # Check if it's a custom doc
            if fpath.suffix.lower() in CUSTOM_EXTS:
                matching = [d for d in custom_docs if d.metadata["file_path"] == path_str]
                if matching:
                    new_custom.extend(matching)
            else:
                new_reader.append(fpath)
        else:
            info = indexed[path_str]
            stat = fpath.stat()
            # Day-precision UTC date: must mirror the format the loaders
            # write into chunk metadata, or everything looks modified.
            disk_mdate = datetime.datetime.fromtimestamp(
                stat.st_mtime, tz=datetime.timezone.utc
            ).strftime("%Y-%m-%d")
            if stat.st_size != info["file_size"] or disk_mdate != info["last_modified_date"]:
                if fpath.suffix.lower() in CUSTOM_EXTS:
                    matching = [d for d in custom_docs if d.metadata["file_path"] == path_str]
                    if matching:
                        modified_custom.extend(matching)
                else:
                    modified_reader.append(fpath)
            else:
                unchanged += 1
    # Anything indexed but no longer on disk is slated for removal.
    for path_str in indexed:
        if path_str not in disk_files:
            deleted_paths.append(path_str)
    n_new = len(new_reader) + len(new_custom)
    n_modified = len(modified_reader) + len(modified_custom)
    print(f"\n New: {n_new}")
    print(f" Modified: {n_modified}")
    print(f" Deleted: {len(deleted_paths)}")
    print(f" Unchanged: {unchanged}")
    if n_new == 0 and n_modified == 0 and len(deleted_paths) == 0:
        print("\nNothing to do.")
        return
    # Delete chunks for removed and modified files
    # (modified files are deleted first, then re-added below).
    for path_str in deleted_paths:
        ids = indexed[path_str]["ids"]
        fname = Path(path_str).name
        print(f" Removing {fname} ({len(ids)} chunks)")
        collection.delete(ids=ids)
    for fpath in modified_reader:
        path_str = str(fpath)
        ids = indexed[path_str]["ids"]
        print(f" Re-indexing {fpath.name} ({len(ids)} chunks)")
        collection.delete(ids=ids)
    for doc in modified_custom:
        path_str = doc.metadata["file_path"]
        if path_str in indexed:
            ids = indexed[path_str]["ids"]
            print(f" Re-indexing {doc.metadata['file_name']} ({len(ids)} chunks)")
            collection.delete(ids=ids)
    # Add new and modified files
    files_to_add = new_reader + modified_reader
    docs_to_add = new_custom + modified_custom
    if files_to_add or docs_to_add:
        documents = load_all_documents(files_to_add, docs_to_add)
        if documents:
            print(f"Indexing {len(documents)} document(s)...")
            vector_store = ChromaVectorStore(chroma_collection=collection)
            storage_context = StorageContext.from_defaults(vector_store=vector_store)
            VectorStoreIndex.from_documents(
                documents,
                storage_context=storage_context,
                transformations=[get_text_splitter()],
                show_progress=True,
            )
    print(f"\nIndex updated. Collection now has {collection.count()} vectors.")
def main():
    """CLI entry point: scan ./clippings and build or update the index."""
    parser = argparse.ArgumentParser(
        description="Build or update the clippings vector store (ChromaDB)."
    )
    parser.add_argument(
        "--rebuild",
        action="store_true",
        help="Full rebuild from scratch (default: incremental update)",
    )
    args = parser.parse_args()

    # Configure embedding model (offline, cached in ./models).
    Settings.embed_model = HuggingFaceEmbedding(
        model_name=EMBED_MODEL_NAME,
        cache_folder="./models",
        local_files_only=True,
    )

    if not DATA_DIR.exists():
        raise FileNotFoundError(
            f"Clippings directory not found: {DATA_DIR.absolute()}\n"
            f"Create symlink: ln -s ../clippings ./clippings"
        )

    start = time.time()
    # Scan and classify files before touching the vector store.
    print(f"Scanning {DATA_DIR.absolute()}...")
    reader_files, custom_docs, skipped, ocr_needed = scan_clippings()

    n_valid = len(reader_files) + len(custom_docs)
    print(f"\nFiles to index: {n_valid}")
    print(f" PDF/TXT: {len(reader_files)}")
    print(f" Webarchive/RTF: {len(custom_docs)}")
    print(f"Files skipped: {len(skipped)}")
    for fpath, reason in skipped:
        print(f" SKIP: {fpath.name} -- {reason}")
    if ocr_needed:
        write_ocr_list(ocr_needed)
    if n_valid == 0:
        raise ValueError("No valid files found to index")

    if args.rebuild:
        print("\nMode: full rebuild")
        rebuild(reader_files, custom_docs)
    elif not Path(PERSIST_DIR).exists():
        # Incremental requested but nothing to update against.
        print("\nMode: incremental update")
        print(f"No existing index at {PERSIST_DIR}, doing full rebuild.")
        rebuild(reader_files, custom_docs)
    else:
        print("\nMode: incremental update")
        update(reader_files, custom_docs)

    print(f"Done in {time.time() - start:.1f}s")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,138 @@
# retrieve_clippings.py
# Verbatim chunk retrieval from clippings index (ChromaDB).
# Vector search + cross-encoder re-ranking, no LLM.
#
# Returns the top re-ranked chunks with their full text, file metadata, and
# scores. Includes page numbers for PDF sources when available.
#
# E.M.F. February 2026
# Environment vars must be set before importing huggingface/transformers
# libraries, because huggingface_hub.constants evaluates HF_HUB_OFFLINE
# at import time.
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # disable tokenizer thread parallelism
os.environ["SENTENCE_TRANSFORMERS_HOME"] = "./models"  # local model cache directory
os.environ["HF_HUB_OFFLINE"] = "1"  # force offline mode; models must already be cached
import chromadb
from llama_index.core import VectorStoreIndex, Settings
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.postprocessor import SentenceTransformerRerank
import sys
import textwrap
#
# Globals
#
PERSIST_DIR = "./clippings_search/store_clippings"  # must match build_clippings.py
COLLECTION_NAME = "clippings"
# Embedding model (must match build_clippings.py)
# NOTE(review): instantiated at import time, so importing this module loads
# the embedding model as a side effect -- confirm this is intentional.
EMBED_MODEL = HuggingFaceEmbedding(
    cache_folder="./models",
    model_name="BAAI/bge-large-en-v1.5",
    local_files_only=True,
)
# Cross-encoder model for re-ranking (cached in ./models/)
RERANK_MODEL = "cross-encoder/ms-marco-MiniLM-L-12-v2"
RERANK_TOP_N = 15  # chunks kept after re-ranking
RETRIEVE_TOP_K = 30  # chunks fetched by vector search before re-ranking
# Output formatting
WRAP_WIDTH = 80  # column width for wrapped chunk text and separators
def main():
    """Retrieve and print the top re-ranked chunks for a command-line query.

    Pipeline: vector search (top RETRIEVE_TOP_K) -> cross-encoder re-rank
    (top RERANK_TOP_N) -> print a source-file summary, the ranking table,
    and the full text of every chunk. No LLM is involved.
    """
    # Fix: validate arguments BEFORE loading the collection and models --
    # previously a missing query was reported only after the expensive
    # retriever/re-ranker setup had already run.
    if len(sys.argv) < 2:
        print("Usage: python retrieve_clippings.py QUERY_TEXT")
        sys.exit(1)
    q = " ".join(sys.argv[1:])

    # No LLM needed -- set embed model only
    Settings.embed_model = EMBED_MODEL

    # Load ChromaDB collection and wrap it as a LlamaIndex vector store.
    client = chromadb.PersistentClient(path=PERSIST_DIR)
    collection = client.get_collection(COLLECTION_NAME)
    vector_store = ChromaVectorStore(chroma_collection=collection)
    index = VectorStoreIndex.from_vector_store(vector_store)

    # Build retriever (vector search only, no query engine / LLM)
    retriever = index.as_retriever(similarity_top_k=RETRIEVE_TOP_K)
    # Cross-encoder re-ranker
    reranker = SentenceTransformerRerank(
        model=RERANK_MODEL,
        top_n=RERANK_TOP_N,
    )

    # Retrieve and re-rank
    nodes = retriever.retrieve(q)
    reranked = reranker.postprocess_nodes(nodes, query_str=q)

    # Build (rank, node, file, page, score) tuples used by all output passes.
    results = []
    for i, node in enumerate(reranked, 1):
        # NodeWithScore usually exposes metadata directly; fall back to the
        # wrapped node when it does not.
        meta = getattr(node, "metadata", None) or node.node.metadata
        score = getattr(node, "score", None)
        file_name = meta.get("file_name", "unknown")
        page_label = meta.get("page_label", "")
        results.append((i, node, file_name, page_label, score))

    # --- Summary: source files and rankings ---
    print(f"\nQuery: {q}")
    print(f"Retrieved {len(nodes)} chunks, re-ranked to top {len(reranked)}")
    print(f"({collection.count()} total vectors in collection)\n")

    # Unique source files in rank order (dict preserves insertion order).
    unique_sources = list(dict.fromkeys(fname for _, _, fname, _, _ in results))
    print(f"Source files ({len(unique_sources)} unique):")
    for j, fname in enumerate(unique_sources, 1):
        print(f" {j}. {fname}")

    print("\nRankings:")  # fix: was an f-string with no placeholders
    for i, node, file_name, page_label, score in results:
        line = f" [{i:2d}] {score:+7.3f} {file_name}"
        if page_label:
            line += f" (p. {page_label})"
        print(line)

    # --- Full chunk text ---
    print(f"\n{'=' * WRAP_WIDTH}")
    print("CHUNKS")
    print("=" * WRAP_WIDTH)
    for i, node, file_name, page_label, score in results:
        header = f"=== [{i}] {file_name}"
        if page_label:
            header += f" (p. {page_label})"
        header += f" (score: {score:.3f})"
        print("\n" + "=" * WRAP_WIDTH)
        print(header)
        print("=" * WRAP_WIDTH)
        # Re-wrap non-blank lines to WRAP_WIDTH; preserve blank lines.
        text = node.get_content()
        for line in text.splitlines():
            if line.strip():
                print(textwrap.fill(line, width=WRAP_WIDTH))
            else:
                print()
        print()


if __name__ == "__main__":
    main()