Built semantic search over clippings files.
Embedded the text into a ChromaDB vector store. Updated requirements to include the new dependencies.
This commit is contained in:
parent
c47c5e5c4f
commit
b4bf89ce4b
4 changed files with 667 additions and 8 deletions
9
.gitignore
vendored
9
.gitignore
vendored
|
|
@ -6,12 +6,17 @@ __pycache__/
|
|||
# HuggingFace cached models (large, ~2 GB)
|
||||
models/
|
||||
|
||||
# Vector stores (large, rebuild with build_exp_claude.py)
|
||||
# Vector stores (large, rebuild with build scripts)
|
||||
storage_exp/
|
||||
storage/
|
||||
storage_clippings/
|
||||
|
||||
# Data (symlink to private journal files)
|
||||
# Data (symlinks to private files)
|
||||
data
|
||||
clippings
|
||||
|
||||
# Generated file lists
|
||||
ocr_needed.txt
|
||||
|
||||
# IDE and OS
|
||||
.DS_Store
|
||||
|
|
|
|||
471
build_clippings.py
Normal file
471
build_clippings.py
Normal file
|
|
@ -0,0 +1,471 @@
|
|||
# build_clippings.py
|
||||
#
|
||||
# Build or update the ChromaDB vector store from clippings in ./clippings.
|
||||
#
|
||||
# Default mode (incremental): loads the existing index and adds only
|
||||
# new or modified files. Use --rebuild for a full rebuild from scratch.
|
||||
#
|
||||
# Handles PDFs, TXT, webarchive, and RTF files. Skips non-extractable PDFs
|
||||
# and writes them to ocr_needed.txt for later OCR processing.
|
||||
#
|
||||
# February 2026
|
||||
# E. M. Furst
|
||||
|
||||
# Environment vars must be set before importing huggingface/transformers
|
||||
# libraries, because huggingface_hub.constants evaluates HF_HUB_OFFLINE
|
||||
# at import time.
|
||||
import os
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||
os.environ["SENTENCE_TRANSFORMERS_HOME"] = "./models"
|
||||
os.environ["HF_HUB_OFFLINE"] = "1"
|
||||
|
||||
import chromadb
|
||||
from llama_index.core import (
|
||||
SimpleDirectoryReader,
|
||||
StorageContext,
|
||||
VectorStoreIndex,
|
||||
Settings,
|
||||
Document,
|
||||
)
|
||||
from llama_index.vector_stores.chroma import ChromaVectorStore
|
||||
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
|
||||
from llama_index.core.node_parser import SentenceSplitter
|
||||
from pathlib import Path
|
||||
import argparse
|
||||
import datetime
|
||||
import time
|
||||
|
||||
# Shared constants
|
||||
DATA_DIR = Path("./clippings")
|
||||
PERSIST_DIR = "./storage_clippings"
|
||||
COLLECTION_NAME = "clippings"
|
||||
EMBED_MODEL_NAME = "BAAI/bge-large-en-v1.5"
|
||||
CHUNK_SIZE = 256
|
||||
CHUNK_OVERLAP = 25
|
||||
|
||||
# File types handled by SimpleDirectoryReader (PDF + TXT)
|
||||
READER_EXTS = {".pdf", ".txt"}
|
||||
# File types handled by custom loaders
|
||||
CUSTOM_EXTS = {".webarchive", ".rtf"}
|
||||
# All supported extensions
|
||||
SUPPORTED_EXTS = READER_EXTS | CUSTOM_EXTS
|
||||
|
||||
# Minimum extracted text length to consider a PDF valid (characters)
|
||||
MIN_TEXT_LENGTH = 100
|
||||
|
||||
|
||||
def get_text_splitter():
|
||||
return SentenceSplitter(
|
||||
chunk_size=CHUNK_SIZE,
|
||||
chunk_overlap=CHUNK_OVERLAP,
|
||||
paragraph_separator="\n\n",
|
||||
)
|
||||
|
||||
|
||||
def validate_pdf(file_path):
|
||||
"""Check if a PDF has extractable text.
|
||||
|
||||
Returns (is_valid, reason) where reason explains why it was skipped.
|
||||
"""
|
||||
import pypdf
|
||||
try:
|
||||
reader = pypdf.PdfReader(str(file_path))
|
||||
page_count = len(reader.pages)
|
||||
total_chars = 0
|
||||
printable_chars = 0
|
||||
for page in reader.pages:
|
||||
text = page.extract_text() or ""
|
||||
total_chars += len(text)
|
||||
printable_chars += sum(
|
||||
1 for c in text if c.isprintable() or c in "\n\r\t"
|
||||
)
|
||||
|
||||
if total_chars < MIN_TEXT_LENGTH:
|
||||
return False, f"too little text ({total_chars} chars, {page_count} pages)"
|
||||
|
||||
ratio = printable_chars / total_chars if total_chars > 0 else 0
|
||||
if ratio < 0.5:
|
||||
return False, f"low printable ratio ({ratio:.2f}, {page_count} pages)"
|
||||
|
||||
return True, None
|
||||
except Exception as e:
|
||||
return False, str(e)
|
||||
|
||||
|
||||
def load_webarchive(file_path):
|
||||
"""Extract text from a macOS .webarchive file.
|
||||
|
||||
Returns a LlamaIndex Document, or None if extraction fails.
|
||||
"""
|
||||
import plistlib
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
try:
|
||||
with open(file_path, "rb") as f:
|
||||
plist = plistlib.load(f)
|
||||
|
||||
resource = plist.get("WebMainResource", {})
|
||||
html_bytes = resource.get("WebResourceData", b"")
|
||||
if not html_bytes:
|
||||
return None
|
||||
|
||||
html = html_bytes.decode("utf-8", errors="replace")
|
||||
soup = BeautifulSoup(html, "html.parser")
|
||||
text = soup.get_text(separator="\n", strip=True)
|
||||
|
||||
if len(text) < MIN_TEXT_LENGTH:
|
||||
return None
|
||||
|
||||
stat = file_path.stat()
|
||||
mdate = datetime.datetime.fromtimestamp(
|
||||
stat.st_mtime, tz=datetime.timezone.utc
|
||||
).strftime("%Y-%m-%d")
|
||||
|
||||
return Document(
|
||||
text=text,
|
||||
metadata={
|
||||
"file_name": file_path.name,
|
||||
"file_path": str(file_path),
|
||||
"file_size": stat.st_size,
|
||||
"last_modified_date": mdate,
|
||||
"file_type": "webarchive",
|
||||
},
|
||||
)
|
||||
except Exception as e:
|
||||
print(f" Warning: could not read webarchive {file_path.name}: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def load_rtf(file_path):
|
||||
"""Extract text from an RTF file.
|
||||
|
||||
Returns a LlamaIndex Document, or None if extraction fails.
|
||||
"""
|
||||
from striprtf.striprtf import rtf_to_text
|
||||
|
||||
try:
|
||||
with open(file_path, "r", errors="replace") as f:
|
||||
rtf_content = f.read()
|
||||
|
||||
text = rtf_to_text(rtf_content)
|
||||
|
||||
if len(text) < MIN_TEXT_LENGTH:
|
||||
return None
|
||||
|
||||
stat = file_path.stat()
|
||||
mdate = datetime.datetime.fromtimestamp(
|
||||
stat.st_mtime, tz=datetime.timezone.utc
|
||||
).strftime("%Y-%m-%d")
|
||||
|
||||
return Document(
|
||||
text=text,
|
||||
metadata={
|
||||
"file_name": file_path.name,
|
||||
"file_path": str(file_path),
|
||||
"file_size": stat.st_size,
|
||||
"last_modified_date": mdate,
|
||||
"file_type": "rtf",
|
||||
},
|
||||
)
|
||||
except Exception as e:
|
||||
print(f" Warning: could not read RTF {file_path.name}: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def scan_clippings():
|
||||
"""Scan the clippings directory and classify files.
|
||||
|
||||
Returns (reader_files, custom_docs, skipped, ocr_needed) where:
|
||||
- reader_files: list of Paths for SimpleDirectoryReader (PDF + TXT)
|
||||
- custom_docs: list of Document objects from custom loaders
|
||||
- skipped: list of (Path, reason) tuples
|
||||
- ocr_needed: list of Paths for PDFs that need OCR
|
||||
"""
|
||||
reader_files = []
|
||||
custom_docs = []
|
||||
skipped = []
|
||||
ocr_needed = []
|
||||
|
||||
for fpath in sorted(DATA_DIR.rglob("*")):
|
||||
if not fpath.is_file():
|
||||
continue
|
||||
if fpath.name.startswith("."):
|
||||
continue
|
||||
|
||||
ext = fpath.suffix.lower()
|
||||
|
||||
if ext not in SUPPORTED_EXTS:
|
||||
skipped.append((fpath, f"unsupported type: {ext}"))
|
||||
continue
|
||||
|
||||
if ext == ".pdf":
|
||||
is_valid, reason = validate_pdf(fpath)
|
||||
if not is_valid:
|
||||
skipped.append((fpath, f"no extractable text: {reason}"))
|
||||
ocr_needed.append(fpath)
|
||||
continue
|
||||
reader_files.append(fpath)
|
||||
|
||||
elif ext == ".txt":
|
||||
reader_files.append(fpath)
|
||||
|
||||
elif ext == ".webarchive":
|
||||
doc = load_webarchive(fpath)
|
||||
if doc:
|
||||
custom_docs.append(doc)
|
||||
else:
|
||||
skipped.append((fpath, "no extractable text from webarchive"))
|
||||
|
||||
elif ext == ".rtf":
|
||||
doc = load_rtf(fpath)
|
||||
if doc:
|
||||
custom_docs.append(doc)
|
||||
else:
|
||||
skipped.append((fpath, "no extractable text from RTF"))
|
||||
|
||||
return reader_files, custom_docs, skipped, ocr_needed
|
||||
|
||||
|
||||
def write_ocr_list(ocr_needed):
|
||||
"""Write the list of PDFs needing OCR to ocr_needed.txt."""
|
||||
with open("ocr_needed.txt", "w") as f:
|
||||
for fpath in ocr_needed:
|
||||
f.write(f"{fpath}\n")
|
||||
print(f"Wrote {len(ocr_needed)} file(s) to ocr_needed.txt")
|
||||
|
||||
|
||||
def load_all_documents(reader_files, custom_docs):
|
||||
"""Load documents from SimpleDirectoryReader and merge with custom docs."""
|
||||
documents = []
|
||||
|
||||
if reader_files:
|
||||
print(f"Loading {len(reader_files)} PDF/TXT files...")
|
||||
reader_docs = SimpleDirectoryReader(
|
||||
input_files=[str(f) for f in reader_files],
|
||||
filename_as_id=True,
|
||||
).load_data()
|
||||
documents.extend(reader_docs)
|
||||
|
||||
if custom_docs:
|
||||
print(f"Adding {len(custom_docs)} webarchive/RTF documents...")
|
||||
documents.extend(custom_docs)
|
||||
|
||||
return documents
|
||||
|
||||
|
||||
def rebuild(reader_files, custom_docs):
|
||||
"""Full rebuild: delete existing collection and recreate from scratch."""
|
||||
client = chromadb.PersistentClient(path=PERSIST_DIR)
|
||||
# Delete existing collection if present
|
||||
try:
|
||||
client.delete_collection(COLLECTION_NAME)
|
||||
print(f"Deleted existing collection '{COLLECTION_NAME}'")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
collection = client.get_or_create_collection(COLLECTION_NAME)
|
||||
vector_store = ChromaVectorStore(chroma_collection=collection)
|
||||
storage_context = StorageContext.from_defaults(vector_store=vector_store)
|
||||
|
||||
documents = load_all_documents(reader_files, custom_docs)
|
||||
if not documents:
|
||||
raise ValueError("No documents loaded")
|
||||
|
||||
print(f"Loaded {len(documents)} document(s) total")
|
||||
print("Building vector index...")
|
||||
|
||||
index = VectorStoreIndex.from_documents(
|
||||
documents,
|
||||
storage_context=storage_context,
|
||||
transformations=[get_text_splitter()],
|
||||
show_progress=True,
|
||||
)
|
||||
|
||||
print(f"Index built. Collection has {collection.count()} vectors.")
|
||||
return index
|
||||
|
||||
|
||||
def update(reader_files, custom_docs):
|
||||
"""Incremental update: add new, re-index modified, remove deleted files."""
|
||||
client = chromadb.PersistentClient(path=PERSIST_DIR)
|
||||
collection = client.get_collection(COLLECTION_NAME)
|
||||
count = collection.count()
|
||||
print(f"Existing collection has {count} vectors")
|
||||
|
||||
# Get all stored metadata to find what's indexed
|
||||
# Key on file_path (not file_name) to handle duplicate names across subdirs
|
||||
indexed = {} # file_path -> {"ids": [], "file_size": ..., "last_modified_date": ...}
|
||||
if count > 0:
|
||||
results = collection.get(include=["metadatas"])
|
||||
for i, meta in enumerate(results["metadatas"]):
|
||||
fpath = meta.get("file_path", "")
|
||||
if fpath not in indexed:
|
||||
indexed[fpath] = {
|
||||
"ids": [],
|
||||
"file_size": meta.get("file_size"),
|
||||
"last_modified_date": meta.get("last_modified_date"),
|
||||
}
|
||||
indexed[fpath]["ids"].append(results["ids"][i])
|
||||
|
||||
print(f"Index contains {len(indexed)} unique files")
|
||||
|
||||
# Build disk file lookup: file_path_str -> Path
|
||||
# For reader_files, match the path format SimpleDirectoryReader would store
|
||||
disk_files = {}
|
||||
for f in reader_files:
|
||||
disk_files[str(f)] = f
|
||||
for doc in custom_docs:
|
||||
disk_files[doc.metadata["file_path"]] = Path(doc.metadata["file_path"])
|
||||
|
||||
# Classify files
|
||||
new_reader = []
|
||||
new_custom = []
|
||||
modified_reader = []
|
||||
modified_custom = []
|
||||
deleted_paths = []
|
||||
unchanged = 0
|
||||
|
||||
for path_str, fpath in disk_files.items():
|
||||
if path_str not in indexed:
|
||||
# Check if it's a custom doc
|
||||
if fpath.suffix.lower() in CUSTOM_EXTS:
|
||||
matching = [d for d in custom_docs if d.metadata["file_path"] == path_str]
|
||||
if matching:
|
||||
new_custom.extend(matching)
|
||||
else:
|
||||
new_reader.append(fpath)
|
||||
else:
|
||||
info = indexed[path_str]
|
||||
stat = fpath.stat()
|
||||
disk_mdate = datetime.datetime.fromtimestamp(
|
||||
stat.st_mtime, tz=datetime.timezone.utc
|
||||
).strftime("%Y-%m-%d")
|
||||
|
||||
if stat.st_size != info["file_size"] or disk_mdate != info["last_modified_date"]:
|
||||
if fpath.suffix.lower() in CUSTOM_EXTS:
|
||||
matching = [d for d in custom_docs if d.metadata["file_path"] == path_str]
|
||||
if matching:
|
||||
modified_custom.extend(matching)
|
||||
else:
|
||||
modified_reader.append(fpath)
|
||||
else:
|
||||
unchanged += 1
|
||||
|
||||
for path_str in indexed:
|
||||
if path_str not in disk_files:
|
||||
deleted_paths.append(path_str)
|
||||
|
||||
n_new = len(new_reader) + len(new_custom)
|
||||
n_modified = len(modified_reader) + len(modified_custom)
|
||||
print(f"\n New: {n_new}")
|
||||
print(f" Modified: {n_modified}")
|
||||
print(f" Deleted: {len(deleted_paths)}")
|
||||
print(f" Unchanged: {unchanged}")
|
||||
|
||||
if n_new == 0 and n_modified == 0 and len(deleted_paths) == 0:
|
||||
print("\nNothing to do.")
|
||||
return
|
||||
|
||||
# Delete chunks for removed and modified files
|
||||
for path_str in deleted_paths:
|
||||
ids = indexed[path_str]["ids"]
|
||||
fname = Path(path_str).name
|
||||
print(f" Removing {fname} ({len(ids)} chunks)")
|
||||
collection.delete(ids=ids)
|
||||
|
||||
for fpath in modified_reader:
|
||||
path_str = str(fpath)
|
||||
ids = indexed[path_str]["ids"]
|
||||
print(f" Re-indexing {fpath.name} ({len(ids)} chunks)")
|
||||
collection.delete(ids=ids)
|
||||
|
||||
for doc in modified_custom:
|
||||
path_str = doc.metadata["file_path"]
|
||||
if path_str in indexed:
|
||||
ids = indexed[path_str]["ids"]
|
||||
print(f" Re-indexing {doc.metadata['file_name']} ({len(ids)} chunks)")
|
||||
collection.delete(ids=ids)
|
||||
|
||||
# Add new and modified files
|
||||
files_to_add = new_reader + modified_reader
|
||||
docs_to_add = new_custom + modified_custom
|
||||
|
||||
if files_to_add or docs_to_add:
|
||||
documents = load_all_documents(files_to_add, docs_to_add)
|
||||
if documents:
|
||||
print(f"Indexing {len(documents)} document(s)...")
|
||||
vector_store = ChromaVectorStore(chroma_collection=collection)
|
||||
storage_context = StorageContext.from_defaults(vector_store=vector_store)
|
||||
|
||||
VectorStoreIndex.from_documents(
|
||||
documents,
|
||||
storage_context=storage_context,
|
||||
transformations=[get_text_splitter()],
|
||||
show_progress=True,
|
||||
)
|
||||
|
||||
print(f"\nIndex updated. Collection now has {collection.count()} vectors.")
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Build or update the clippings vector store (ChromaDB)."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--rebuild",
|
||||
action="store_true",
|
||||
help="Full rebuild from scratch (default: incremental update)",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
# Configure embedding model (offline, cached in ./models)
|
||||
embed_model = HuggingFaceEmbedding(
|
||||
model_name=EMBED_MODEL_NAME,
|
||||
cache_folder="./models",
|
||||
local_files_only=True,
|
||||
)
|
||||
Settings.embed_model = embed_model
|
||||
|
||||
if not DATA_DIR.exists():
|
||||
raise FileNotFoundError(
|
||||
f"Clippings directory not found: {DATA_DIR.absolute()}\n"
|
||||
f"Create symlink: ln -s ../clippings ./clippings"
|
||||
)
|
||||
|
||||
start = time.time()
|
||||
|
||||
# Scan and classify files
|
||||
print(f"Scanning {DATA_DIR.absolute()}...")
|
||||
reader_files, custom_docs, skipped, ocr_needed = scan_clippings()
|
||||
|
||||
n_valid = len(reader_files) + len(custom_docs)
|
||||
print(f"\nFiles to index: {n_valid}")
|
||||
print(f" PDF/TXT: {len(reader_files)}")
|
||||
print(f" Webarchive/RTF: {len(custom_docs)}")
|
||||
print(f"Files skipped: {len(skipped)}")
|
||||
for fpath, reason in skipped:
|
||||
print(f" SKIP: {fpath.name} -- {reason}")
|
||||
|
||||
if ocr_needed:
|
||||
write_ocr_list(ocr_needed)
|
||||
|
||||
if n_valid == 0:
|
||||
raise ValueError("No valid files found to index")
|
||||
|
||||
if args.rebuild:
|
||||
print("\nMode: full rebuild")
|
||||
rebuild(reader_files, custom_docs)
|
||||
else:
|
||||
print("\nMode: incremental update")
|
||||
if not Path(PERSIST_DIR).exists():
|
||||
print(f"No existing index at {PERSIST_DIR}, doing full rebuild.")
|
||||
rebuild(reader_files, custom_docs)
|
||||
else:
|
||||
update(reader_files, custom_docs)
|
||||
|
||||
elapsed = time.time() - start
|
||||
print(f"Done in {elapsed:.1f}s")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -2,6 +2,7 @@ aiohappyeyeballs==2.6.1
|
|||
aiohttp==3.12.15
|
||||
aiosignal==1.4.0
|
||||
aiosqlite==0.21.0
|
||||
annotated-doc==0.0.4
|
||||
annotated-types==0.7.0
|
||||
anyio==4.10.0
|
||||
appnope==0.1.4
|
||||
|
|
@ -12,13 +13,17 @@ asttokens==3.0.0
|
|||
async-lru==2.0.5
|
||||
attrs==25.3.0
|
||||
babel==2.17.0
|
||||
backoff==2.2.1
|
||||
banks==2.2.0
|
||||
bcrypt==5.0.0
|
||||
beautifulsoup4==4.13.4
|
||||
bleach==6.2.0
|
||||
bm25s==0.2.14
|
||||
build==1.4.0
|
||||
certifi==2025.8.3
|
||||
cffi==1.17.1
|
||||
charset-normalizer==3.4.3
|
||||
chromadb==1.5.1
|
||||
click==8.2.1
|
||||
colorama==0.4.6
|
||||
comm==0.2.3
|
||||
|
|
@ -30,22 +35,30 @@ decorator==5.2.1
|
|||
defusedxml==0.7.1
|
||||
Deprecated==1.2.18
|
||||
dirtyjson==1.0.8
|
||||
distro==1.9.0
|
||||
durationpy==0.10
|
||||
executing==2.2.0
|
||||
fastjsonschema==2.21.1
|
||||
filelock==3.18.0
|
||||
filetype==1.2.0
|
||||
flatbuffers==25.12.19
|
||||
fonttools==4.59.1
|
||||
fqdn==1.5.1
|
||||
frozenlist==1.7.0
|
||||
fsspec==2025.7.0
|
||||
googleapis-common-protos==1.72.0
|
||||
greenlet==3.2.4
|
||||
griffe==1.11.0
|
||||
grpcio==1.78.1
|
||||
h11==0.16.0
|
||||
hf-xet==1.1.7
|
||||
httpcore==1.0.9
|
||||
httptools==0.7.1
|
||||
httpx==0.28.1
|
||||
huggingface-hub==0.34.4
|
||||
idna==3.10
|
||||
importlib_metadata==8.7.1
|
||||
importlib_resources==6.5.2
|
||||
ipykernel==6.30.1
|
||||
ipython==9.4.0
|
||||
ipython_pygments_lexers==1.1.1
|
||||
|
|
@ -53,6 +66,7 @@ ipywidgets==8.1.7
|
|||
isoduration==20.11.0
|
||||
jedi==0.19.2
|
||||
Jinja2==3.1.6
|
||||
jiter==0.13.0
|
||||
joblib==1.5.1
|
||||
json5==0.12.1
|
||||
jsonpointer==3.0.0
|
||||
|
|
@ -71,19 +85,25 @@ jupyterlab_pygments==0.3.0
|
|||
jupyterlab_server==2.27.3
|
||||
jupyterlab_widgets==3.0.15
|
||||
kiwisolver==1.4.9
|
||||
kubernetes==35.0.0
|
||||
lark==1.2.2
|
||||
llama-index-core==0.13.1
|
||||
llama-index-embeddings-huggingface==0.6.0
|
||||
llama-index-core==0.14.14
|
||||
llama-index-embeddings-huggingface==0.6.1
|
||||
llama-index-instrumentation==0.4.0
|
||||
llama-index-llms-ollama==0.7.0
|
||||
llama-index-readers-file==0.5.0
|
||||
llama-index-llms-ollama==0.9.1
|
||||
llama-index-llms-openai==0.6.18
|
||||
llama-index-readers-file==0.5.6
|
||||
llama-index-retrievers-bm25==0.6.5
|
||||
llama-index-workflows==1.3.0
|
||||
llama-index-vector-stores-chroma==0.5.5
|
||||
llama-index-workflows==2.14.2
|
||||
markdown-it-py==4.0.0
|
||||
MarkupSafe==3.0.2
|
||||
marshmallow==3.26.1
|
||||
matplotlib==3.10.5
|
||||
matplotlib-inline==0.1.7
|
||||
mdurl==0.1.2
|
||||
mistune==3.1.3
|
||||
mmh3==5.2.0
|
||||
mpmath==1.3.0
|
||||
multidict==6.6.3
|
||||
mypy_extensions==1.1.0
|
||||
|
|
@ -96,7 +116,17 @@ nltk==3.9.1
|
|||
notebook==7.4.5
|
||||
notebook_shim==0.2.4
|
||||
numpy==2.3.2
|
||||
oauthlib==3.3.1
|
||||
ollama==0.5.3
|
||||
onnxruntime==1.24.2
|
||||
openai==2.21.0
|
||||
opentelemetry-api==1.39.1
|
||||
opentelemetry-exporter-otlp-proto-common==1.39.1
|
||||
opentelemetry-exporter-otlp-proto-grpc==1.39.1
|
||||
opentelemetry-proto==1.39.1
|
||||
opentelemetry-sdk==1.39.1
|
||||
opentelemetry-semantic-conventions==0.60b1
|
||||
orjson==3.11.7
|
||||
overrides==7.7.0
|
||||
packaging==25.0
|
||||
pandas==2.2.3
|
||||
|
|
@ -105,20 +135,26 @@ parso==0.8.4
|
|||
pexpect==4.9.0
|
||||
pillow==11.3.0
|
||||
platformdirs==4.3.8
|
||||
posthog==5.4.0
|
||||
prometheus_client==0.22.1
|
||||
prompt_toolkit==3.0.51
|
||||
propcache==0.3.2
|
||||
protobuf==6.33.5
|
||||
psutil==7.0.0
|
||||
ptyprocess==0.7.0
|
||||
pure_eval==0.2.3
|
||||
pybase64==1.4.3
|
||||
pycparser==2.22
|
||||
pydantic==2.11.7
|
||||
pydantic_core==2.33.2
|
||||
Pygments==2.19.2
|
||||
pyparsing==3.2.3
|
||||
pypdf==5.9.0
|
||||
pypdf==6.7.1
|
||||
PyPika==0.51.1
|
||||
pyproject_hooks==1.2.0
|
||||
PyStemmer==2.2.0.3
|
||||
python-dateutil==2.9.0.post0
|
||||
python-dotenv==1.2.1
|
||||
python-json-logger==3.3.0
|
||||
pytz==2025.2
|
||||
PyYAML==6.0.2
|
||||
|
|
@ -126,9 +162,11 @@ pyzmq==27.0.1
|
|||
referencing==0.36.2
|
||||
regex==2025.7.34
|
||||
requests==2.32.4
|
||||
requests-oauthlib==2.0.0
|
||||
rfc3339-validator==0.1.4
|
||||
rfc3986-validator==0.1.1
|
||||
rfc3987-syntax==1.1.0
|
||||
rich==14.3.3
|
||||
rpds-py==0.27.0
|
||||
safetensors==0.6.2
|
||||
scikit-learn==1.7.1
|
||||
|
|
@ -137,6 +175,7 @@ seaborn==0.13.2
|
|||
Send2Trash==1.8.3
|
||||
sentence-transformers==5.1.0
|
||||
setuptools==80.9.0
|
||||
shellingham==1.5.4
|
||||
six==1.17.0
|
||||
sniffio==1.3.1
|
||||
soupsieve==2.7
|
||||
|
|
@ -155,6 +194,7 @@ tornado==6.5.2
|
|||
tqdm==4.67.1
|
||||
traitlets==5.14.3
|
||||
transformers==4.55.0
|
||||
typer==0.24.1
|
||||
types-python-dateutil==2.9.0.20250809
|
||||
typing-inspect==0.9.0
|
||||
typing-inspection==0.4.1
|
||||
|
|
@ -162,10 +202,15 @@ typing_extensions==4.14.1
|
|||
tzdata==2025.2
|
||||
uri-template==1.3.0
|
||||
urllib3==2.5.0
|
||||
uvicorn==0.41.0
|
||||
uvloop==0.22.1
|
||||
watchfiles==1.1.1
|
||||
wcwidth==0.2.13
|
||||
webcolors==24.11.1
|
||||
webencodings==0.5.1
|
||||
websocket-client==1.8.0
|
||||
websockets==16.0
|
||||
widgetsnbextension==4.0.14
|
||||
wrapt==1.17.2
|
||||
yarl==1.20.1
|
||||
zipp==3.23.0
|
||||
|
|
|
|||
138
retrieve_clippings.py
Normal file
138
retrieve_clippings.py
Normal file
|
|
@ -0,0 +1,138 @@
|
|||
# retrieve_clippings.py
|
||||
# Verbatim chunk retrieval from clippings index (ChromaDB).
|
||||
# Vector search + cross-encoder re-ranking, no LLM.
|
||||
#
|
||||
# Returns the top re-ranked chunks with their full text, file metadata, and
|
||||
# scores. Includes page numbers for PDF sources when available.
|
||||
#
|
||||
# E.M.F. February 2026
|
||||
|
||||
# Environment vars must be set before importing huggingface/transformers
|
||||
# libraries, because huggingface_hub.constants evaluates HF_HUB_OFFLINE
|
||||
# at import time.
|
||||
import os
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||
os.environ["SENTENCE_TRANSFORMERS_HOME"] = "./models"
|
||||
os.environ["HF_HUB_OFFLINE"] = "1"
|
||||
|
||||
import chromadb
|
||||
from llama_index.core import VectorStoreIndex, Settings
|
||||
from llama_index.vector_stores.chroma import ChromaVectorStore
|
||||
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
|
||||
from llama_index.core.postprocessor import SentenceTransformerRerank
|
||||
import sys
|
||||
import textwrap
|
||||
|
||||
#
|
||||
# Globals
|
||||
#
|
||||
|
||||
PERSIST_DIR = "./storage_clippings"
|
||||
COLLECTION_NAME = "clippings"
|
||||
|
||||
# Embedding model (must match build_clippings.py)
|
||||
EMBED_MODEL = HuggingFaceEmbedding(
|
||||
cache_folder="./models",
|
||||
model_name="BAAI/bge-large-en-v1.5",
|
||||
local_files_only=True,
|
||||
)
|
||||
|
||||
# Cross-encoder model for re-ranking (cached in ./models/)
|
||||
RERANK_MODEL = "cross-encoder/ms-marco-MiniLM-L-12-v2"
|
||||
RERANK_TOP_N = 15
|
||||
RETRIEVE_TOP_K = 30
|
||||
|
||||
# Output formatting
|
||||
WRAP_WIDTH = 80
|
||||
|
||||
|
||||
def main():
    """Retrieve and print the top re-ranked chunks for a command-line query.

    Usage: python retrieve_clippings.py QUERY_TEXT

    Vector search (top RETRIEVE_TOP_K) followed by cross-encoder re-ranking
    (top RERANK_TOP_N); prints a source-file summary, the rankings, and the
    full text of every re-ranked chunk. No LLM involved.
    """
    # Fail fast: validate the query BEFORE loading models and the index.
    # (Previously argv was checked only after the slow model/index setup,
    # so a missing query still paid the full startup cost.)
    if len(sys.argv) < 2:
        print("Usage: python retrieve_clippings.py QUERY_TEXT")
        sys.exit(1)
    q = " ".join(sys.argv[1:])

    # No LLM needed -- set embed model only
    Settings.embed_model = EMBED_MODEL

    # Load ChromaDB collection
    client = chromadb.PersistentClient(path=PERSIST_DIR)
    collection = client.get_collection(COLLECTION_NAME)

    # Build index from existing vector store
    vector_store = ChromaVectorStore(chroma_collection=collection)
    index = VectorStoreIndex.from_vector_store(vector_store)

    # Build retriever (vector search only, no query engine / LLM)
    retriever = index.as_retriever(similarity_top_k=RETRIEVE_TOP_K)

    # Cross-encoder re-ranker
    reranker = SentenceTransformerRerank(
        model=RERANK_MODEL,
        top_n=RERANK_TOP_N,
    )

    # Retrieve and re-rank
    nodes = retriever.retrieve(q)
    reranked = reranker.postprocess_nodes(nodes, query_str=q)

    # Build result list with metadata
    results = []
    for i, node in enumerate(reranked, 1):
        # NodeWithScore proxies metadata; fall back to the wrapped node.
        meta = getattr(node, "metadata", None) or node.node.metadata
        score = getattr(node, "score", None)
        file_name = meta.get("file_name", "unknown")
        page_label = meta.get("page_label", "")
        results.append((i, node, file_name, page_label, score))

    # --- Summary: source files and rankings ---
    print(f"\nQuery: {q}")
    print(f"Retrieved {len(nodes)} chunks, re-ranked to top {len(reranked)}")
    print(f"({collection.count()} total vectors in collection)\n")

    # Unique source files in rank order
    seen = set()
    unique_sources = []
    for i, node, file_name, page_label, score in results:
        if file_name not in seen:
            seen.add(file_name)
            unique_sources.append(file_name)

    print(f"Source files ({len(unique_sources)} unique):")
    for j, fname in enumerate(unique_sources, 1):
        print(f" {j}. {fname}")

    print(f"\nRankings:")
    for i, node, file_name, page_label, score in results:
        line = f" [{i:2d}] {score:+7.3f} {file_name}"
        if page_label:
            line += f" (p. {page_label})"
        print(line)

    # --- Full chunk text ---
    print(f"\n{'=' * WRAP_WIDTH}")
    print("CHUNKS")
    print("=" * WRAP_WIDTH)

    for i, node, file_name, page_label, score in results:
        header = f"=== [{i}] {file_name}"
        if page_label:
            header += f" (p. {page_label})"
        header += f" (score: {score:.3f})"

        print("\n" + "=" * WRAP_WIDTH)
        print(header)
        print("=" * WRAP_WIDTH)

        # Re-wrap each paragraph line to WRAP_WIDTH; keep blank lines.
        text = node.get_content()
        for line in text.splitlines():
            if line.strip():
                print(textwrap.fill(line, width=WRAP_WIDTH))
            else:
                print()
        print()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue