# build_clippings.py
#
# Build or update the ChromaDB vector store from clippings in ./clippings.
#
# Default mode (incremental): loads the existing index and adds only
# new or modified files. Use --rebuild for a full rebuild from scratch.
#
# Handles PDFs, TXT, webarchive, and RTF files. Skips non-extractable PDFs
# and writes them to ocr_needed.txt for later OCR processing.
#
# February 2026
# E. M. Furst

# Environment vars must be set before importing huggingface/transformers
# libraries, because huggingface_hub.constants evaluates HF_HUB_OFFLINE
# at import time.
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["SENTENCE_TRANSFORMERS_HOME"] = "./models"
os.environ["HF_HUB_OFFLINE"] = "1"

import chromadb
from llama_index.core import (
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
    Settings,
    Document,
)
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter
from pathlib import Path
import argparse
import datetime
import time

# Shared constants
DATA_DIR = Path("./clippings")
PERSIST_DIR = "./storage_clippings"
COLLECTION_NAME = "clippings"
EMBED_MODEL_NAME = "BAAI/bge-large-en-v1.5"
CHUNK_SIZE = 256
CHUNK_OVERLAP = 25

# File types handled by SimpleDirectoryReader (PDF + TXT)
READER_EXTS = {".pdf", ".txt"}
# File types handled by custom loaders
CUSTOM_EXTS = {".webarchive", ".rtf"}
# All supported extensions
SUPPORTED_EXTS = READER_EXTS | CUSTOM_EXTS

# Minimum extracted text length to consider a PDF valid (characters)
MIN_TEXT_LENGTH = 100


def get_text_splitter():
    """Return the SentenceSplitter used for chunking.

    Must stay consistent between rebuild and incremental update so that
    chunking is identical across both code paths.
    """
    return SentenceSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        paragraph_separator="\n\n",
    )


def validate_pdf(file_path):
    """Check if a PDF has extractable text.

    Returns (is_valid, reason) where reason explains why it was skipped.
    A PDF fails if it yields too little text overall, or if the extracted
    text is mostly non-printable garbage (typical of scanned/encrypted files).
    """
    import pypdf
    try:
        reader = pypdf.PdfReader(str(file_path))
        page_count = len(reader.pages)
        total_chars = 0
        printable_chars = 0
        for page in reader.pages:
            text = page.extract_text() or ""
            total_chars += len(text)
            printable_chars += sum(
                1 for c in text if c.isprintable() or c in "\n\r\t"
            )

        if total_chars < MIN_TEXT_LENGTH:
            return False, f"too little text ({total_chars} chars, {page_count} pages)"

        ratio = printable_chars / total_chars if total_chars > 0 else 0
        if ratio < 0.5:
            return False, f"low printable ratio ({ratio:.2f}, {page_count} pages)"

        return True, None
    except Exception as e:
        # Any parse failure (corrupt file, encryption, etc.) means "skip".
        return False, str(e)


def load_webarchive(file_path):
    """Extract text from a macOS .webarchive file.

    Returns a LlamaIndex Document, or None if extraction fails or the
    page yields less than MIN_TEXT_LENGTH characters of text.
    """
    import plistlib
    from bs4 import BeautifulSoup

    try:
        with open(file_path, "rb") as f:
            plist = plistlib.load(f)

        # The main HTML payload lives under WebMainResource/WebResourceData.
        resource = plist.get("WebMainResource", {})
        html_bytes = resource.get("WebResourceData", b"")
        if not html_bytes:
            return None

        html = html_bytes.decode("utf-8", errors="replace")
        soup = BeautifulSoup(html, "html.parser")
        text = soup.get_text(separator="\n", strip=True)

        if len(text) < MIN_TEXT_LENGTH:
            return None

        stat = file_path.stat()
        # Date-only resolution; must match what update() compares against.
        mdate = datetime.datetime.fromtimestamp(
            stat.st_mtime, tz=datetime.timezone.utc
        ).strftime("%Y-%m-%d")

        return Document(
            text=text,
            metadata={
                "file_name": file_path.name,
                "file_path": str(file_path),
                "file_size": stat.st_size,
                "last_modified_date": mdate,
                "file_type": "webarchive",
            },
        )
    except Exception as e:
        print(f"  Warning: could not read webarchive {file_path.name}: {e}")
        return None


def load_rtf(file_path):
    """Extract text from an RTF file.

    Returns a LlamaIndex Document, or None if extraction fails or the
    file yields less than MIN_TEXT_LENGTH characters of text.
    """
    from striprtf.striprtf import rtf_to_text

    try:
        # Explicit encoding: RTF is essentially 7-bit ASCII with escape
        # sequences, so utf-8 + errors="replace" is safe everywhere.
        # (Previously relied on the platform default encoding, which made
        # extraction nondeterministic across machines.)
        with open(file_path, "r", encoding="utf-8", errors="replace") as f:
            rtf_content = f.read()

        text = rtf_to_text(rtf_content)

        if len(text) < MIN_TEXT_LENGTH:
            return None

        stat = file_path.stat()
        # Date-only resolution; must match what update() compares against.
        mdate = datetime.datetime.fromtimestamp(
            stat.st_mtime, tz=datetime.timezone.utc
        ).strftime("%Y-%m-%d")

        return Document(
            text=text,
            metadata={
                "file_name": file_path.name,
                "file_path": str(file_path),
                "file_size": stat.st_size,
                "last_modified_date": mdate,
                "file_type": "rtf",
            },
        )
    except Exception as e:
        print(f"  Warning: could not read RTF {file_path.name}: {e}")
        return None


def scan_clippings():
    """Scan the clippings directory and classify files.

    Returns (reader_files, custom_docs, skipped, ocr_needed) where:
    - reader_files: list of Paths for SimpleDirectoryReader (PDF + TXT)
    - custom_docs: list of Document objects from custom loaders
    - skipped: list of (Path, reason) tuples
    - ocr_needed: list of Paths for PDFs that need OCR
    """
    reader_files = []
    custom_docs = []
    skipped = []
    ocr_needed = []

    for fpath in sorted(DATA_DIR.rglob("*")):
        if not fpath.is_file():
            continue
        if fpath.name.startswith("."):
            # Skip hidden files (.DS_Store etc.)
            continue

        ext = fpath.suffix.lower()

        if ext not in SUPPORTED_EXTS:
            skipped.append((fpath, f"unsupported type: {ext}"))
            continue

        if ext == ".pdf":
            is_valid, reason = validate_pdf(fpath)
            if not is_valid:
                # Non-extractable PDFs go to both lists: reported as
                # skipped now, queued for OCR later.
                skipped.append((fpath, f"no extractable text: {reason}"))
                ocr_needed.append(fpath)
                continue
            reader_files.append(fpath)

        elif ext == ".txt":
            reader_files.append(fpath)

        elif ext == ".webarchive":
            doc = load_webarchive(fpath)
            if doc:
                custom_docs.append(doc)
            else:
                skipped.append((fpath, "no extractable text from webarchive"))

        elif ext == ".rtf":
            doc = load_rtf(fpath)
            if doc:
                custom_docs.append(doc)
            else:
                skipped.append((fpath, "no extractable text from RTF"))

    return reader_files, custom_docs, skipped, ocr_needed


def write_ocr_list(ocr_needed):
    """Write the list of PDFs needing OCR to ocr_needed.txt (one path/line)."""
    with open("ocr_needed.txt", "w") as f:
        for fpath in ocr_needed:
            f.write(f"{fpath}\n")
    print(f"Wrote {len(ocr_needed)} file(s) to ocr_needed.txt")


def load_all_documents(reader_files, custom_docs):
    """Load documents from SimpleDirectoryReader and merge with custom docs."""
    documents = []

    if reader_files:
        print(f"Loading {len(reader_files)} PDF/TXT files...")
        reader_docs = SimpleDirectoryReader(
            input_files=[str(f) for f in reader_files],
            filename_as_id=True,
        ).load_data()
        documents.extend(reader_docs)

    if custom_docs:
        print(f"Adding {len(custom_docs)} webarchive/RTF documents...")
        documents.extend(custom_docs)

    return documents


def rebuild(reader_files, custom_docs):
    """Full rebuild: delete existing collection and recreate from scratch.

    Raises ValueError if no documents can be loaded.
    """
    client = chromadb.PersistentClient(path=PERSIST_DIR)
    # Delete existing collection if present; best-effort because the
    # collection may simply not exist yet.
    try:
        client.delete_collection(COLLECTION_NAME)
        print(f"Deleted existing collection '{COLLECTION_NAME}'")
    except Exception:
        pass

    collection = client.get_or_create_collection(COLLECTION_NAME)
    vector_store = ChromaVectorStore(chroma_collection=collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    documents = load_all_documents(reader_files, custom_docs)
    if not documents:
        raise ValueError("No documents loaded")

    print(f"Loaded {len(documents)} document(s) total")
    print("Building vector index...")

    index = VectorStoreIndex.from_documents(
        documents,
        storage_context=storage_context,
        transformations=[get_text_splitter()],
        show_progress=True,
    )

    print(f"Index built. Collection has {collection.count()} vectors.")
    return index


def update(reader_files, custom_docs):
    """Incremental update: add new, re-index modified, remove deleted files.

    Modification detection compares stored file_size and the date-only
    last_modified_date. NOTE(review): same-day edits that leave the byte
    size unchanged are therefore missed -- acceptable for clippings, but
    storing the full mtime would be more precise.
    """
    client = chromadb.PersistentClient(path=PERSIST_DIR)
    collection = client.get_collection(COLLECTION_NAME)
    count = collection.count()
    print(f"Existing collection has {count} vectors")

    # Get all stored metadata to find what's indexed
    # Key on file_path (not file_name) to handle duplicate names across subdirs
    indexed = {}  # file_path -> {"ids": [], "file_size": ..., "last_modified_date": ...}
    if count > 0:
        results = collection.get(include=["metadatas"])
        for i, meta in enumerate(results["metadatas"]):
            fpath = meta.get("file_path", "")
            if fpath not in indexed:
                indexed[fpath] = {
                    "ids": [],
                    "file_size": meta.get("file_size"),
                    "last_modified_date": meta.get("last_modified_date"),
                }
            indexed[fpath]["ids"].append(results["ids"][i])

    print(f"Index contains {len(indexed)} unique files")

    # Build disk file lookup: file_path_str -> Path
    # For reader_files, match the path format SimpleDirectoryReader would store
    disk_files = {}
    for f in reader_files:
        disk_files[str(f)] = f
    for doc in custom_docs:
        disk_files[doc.metadata["file_path"]] = Path(doc.metadata["file_path"])

    # Classify files
    new_reader = []
    new_custom = []
    modified_reader = []
    modified_custom = []
    deleted_paths = []
    unchanged = 0

    for path_str, fpath in disk_files.items():
        if path_str not in indexed:
            # Check if it's a custom doc
            if fpath.suffix.lower() in CUSTOM_EXTS:
                matching = [d for d in custom_docs if d.metadata["file_path"] == path_str]
                if matching:
                    new_custom.extend(matching)
            else:
                new_reader.append(fpath)
        else:
            info = indexed[path_str]
            stat = fpath.stat()
            disk_mdate = datetime.datetime.fromtimestamp(
                stat.st_mtime, tz=datetime.timezone.utc
            ).strftime("%Y-%m-%d")

            if stat.st_size != info["file_size"] or disk_mdate != info["last_modified_date"]:
                if fpath.suffix.lower() in CUSTOM_EXTS:
                    matching = [d for d in custom_docs if d.metadata["file_path"] == path_str]
                    if matching:
                        modified_custom.extend(matching)
                else:
                    modified_reader.append(fpath)
            else:
                unchanged += 1

    # Anything indexed but no longer on disk gets removed.
    for path_str in indexed:
        if path_str not in disk_files:
            deleted_paths.append(path_str)

    n_new = len(new_reader) + len(new_custom)
    n_modified = len(modified_reader) + len(modified_custom)
    print(f"\n  New:       {n_new}")
    print(f"  Modified:  {n_modified}")
    print(f"  Deleted:   {len(deleted_paths)}")
    print(f"  Unchanged: {unchanged}")

    if n_new == 0 and n_modified == 0 and len(deleted_paths) == 0:
        print("\nNothing to do.")
        return

    # Delete chunks for removed and modified files
    for path_str in deleted_paths:
        ids = indexed[path_str]["ids"]
        fname = Path(path_str).name
        print(f"  Removing {fname} ({len(ids)} chunks)")
        collection.delete(ids=ids)

    for fpath in modified_reader:
        path_str = str(fpath)
        ids = indexed[path_str]["ids"]
        print(f"  Re-indexing {fpath.name} ({len(ids)} chunks)")
        collection.delete(ids=ids)

    for doc in modified_custom:
        path_str = doc.metadata["file_path"]
        if path_str in indexed:
            ids = indexed[path_str]["ids"]
            print(f"  Re-indexing {doc.metadata['file_name']} ({len(ids)} chunks)")
            collection.delete(ids=ids)

    # Add new and modified files
    files_to_add = new_reader + modified_reader
    docs_to_add = new_custom + modified_custom

    if files_to_add or docs_to_add:
        documents = load_all_documents(files_to_add, docs_to_add)
        if documents:
            print(f"Indexing {len(documents)} document(s)...")
            vector_store = ChromaVectorStore(chroma_collection=collection)
            storage_context = StorageContext.from_defaults(vector_store=vector_store)

            VectorStoreIndex.from_documents(
                documents,
                storage_context=storage_context,
                transformations=[get_text_splitter()],
                show_progress=True,
            )

    print(f"\nIndex updated. Collection now has {collection.count()} vectors.")


def main():
    """CLI entry point: scan ./clippings, then rebuild or update the index."""
    parser = argparse.ArgumentParser(
        description="Build or update the clippings vector store (ChromaDB)."
    )
    parser.add_argument(
        "--rebuild",
        action="store_true",
        help="Full rebuild from scratch (default: incremental update)",
    )
    args = parser.parse_args()

    # Configure embedding model (offline, cached in ./models)
    embed_model = HuggingFaceEmbedding(
        model_name=EMBED_MODEL_NAME,
        cache_folder="./models",
        local_files_only=True,
    )
    Settings.embed_model = embed_model

    if not DATA_DIR.exists():
        raise FileNotFoundError(
            f"Clippings directory not found: {DATA_DIR.absolute()}\n"
            f"Create symlink: ln -s ../clippings ./clippings"
        )

    start = time.time()

    # Scan and classify files
    print(f"Scanning {DATA_DIR.absolute()}...")
    reader_files, custom_docs, skipped, ocr_needed = scan_clippings()

    n_valid = len(reader_files) + len(custom_docs)
    print(f"\nFiles to index: {n_valid}")
    print(f"  PDF/TXT:        {len(reader_files)}")
    print(f"  Webarchive/RTF: {len(custom_docs)}")
    print(f"Files skipped: {len(skipped)}")
    for fpath, reason in skipped:
        print(f"  SKIP: {fpath.name} -- {reason}")

    if ocr_needed:
        write_ocr_list(ocr_needed)

    if n_valid == 0:
        raise ValueError("No valid files found to index")

    if args.rebuild:
        print("\nMode: full rebuild")
        rebuild(reader_files, custom_docs)
    else:
        print("\nMode: incremental update")
        if not Path(PERSIST_DIR).exists():
            print(f"No existing index at {PERSIST_DIR}, doing full rebuild.")
            rebuild(reader_files, custom_docs)
        else:
            update(reader_files, custom_docs)

    elapsed = time.time() - start
    print(f"Done in {elapsed:.1f}s")


if __name__ == "__main__":
    main()
annotated-types==0.7.0 anyio==4.10.0 appnope==0.1.4 @@ -12,13 +13,17 @@ asttokens==3.0.0 async-lru==2.0.5 attrs==25.3.0 babel==2.17.0 +backoff==2.2.1 banks==2.2.0 +bcrypt==5.0.0 beautifulsoup4==4.13.4 bleach==6.2.0 bm25s==0.2.14 +build==1.4.0 certifi==2025.8.3 cffi==1.17.1 charset-normalizer==3.4.3 +chromadb==1.5.1 click==8.2.1 colorama==0.4.6 comm==0.2.3 @@ -30,22 +35,30 @@ decorator==5.2.1 defusedxml==0.7.1 Deprecated==1.2.18 dirtyjson==1.0.8 +distro==1.9.0 +durationpy==0.10 executing==2.2.0 fastjsonschema==2.21.1 filelock==3.18.0 filetype==1.2.0 +flatbuffers==25.12.19 fonttools==4.59.1 fqdn==1.5.1 frozenlist==1.7.0 fsspec==2025.7.0 +googleapis-common-protos==1.72.0 greenlet==3.2.4 griffe==1.11.0 +grpcio==1.78.1 h11==0.16.0 hf-xet==1.1.7 httpcore==1.0.9 +httptools==0.7.1 httpx==0.28.1 huggingface-hub==0.34.4 idna==3.10 +importlib_metadata==8.7.1 +importlib_resources==6.5.2 ipykernel==6.30.1 ipython==9.4.0 ipython_pygments_lexers==1.1.1 @@ -53,6 +66,7 @@ ipywidgets==8.1.7 isoduration==20.11.0 jedi==0.19.2 Jinja2==3.1.6 +jiter==0.13.0 joblib==1.5.1 json5==0.12.1 jsonpointer==3.0.0 @@ -71,19 +85,25 @@ jupyterlab_pygments==0.3.0 jupyterlab_server==2.27.3 jupyterlab_widgets==3.0.15 kiwisolver==1.4.9 +kubernetes==35.0.0 lark==1.2.2 -llama-index-core==0.13.1 -llama-index-embeddings-huggingface==0.6.0 +llama-index-core==0.14.14 +llama-index-embeddings-huggingface==0.6.1 llama-index-instrumentation==0.4.0 -llama-index-llms-ollama==0.7.0 -llama-index-readers-file==0.5.0 +llama-index-llms-ollama==0.9.1 +llama-index-llms-openai==0.6.18 +llama-index-readers-file==0.5.6 llama-index-retrievers-bm25==0.6.5 -llama-index-workflows==1.3.0 +llama-index-vector-stores-chroma==0.5.5 +llama-index-workflows==2.14.2 +markdown-it-py==4.0.0 MarkupSafe==3.0.2 marshmallow==3.26.1 matplotlib==3.10.5 matplotlib-inline==0.1.7 +mdurl==0.1.2 mistune==3.1.3 +mmh3==5.2.0 mpmath==1.3.0 multidict==6.6.3 mypy_extensions==1.1.0 @@ -96,7 +116,17 @@ nltk==3.9.1 notebook==7.4.5 notebook_shim==0.2.4 
numpy==2.3.2 +oauthlib==3.3.1 ollama==0.5.3 +onnxruntime==1.24.2 +openai==2.21.0 +opentelemetry-api==1.39.1 +opentelemetry-exporter-otlp-proto-common==1.39.1 +opentelemetry-exporter-otlp-proto-grpc==1.39.1 +opentelemetry-proto==1.39.1 +opentelemetry-sdk==1.39.1 +opentelemetry-semantic-conventions==0.60b1 +orjson==3.11.7 overrides==7.7.0 packaging==25.0 pandas==2.2.3 @@ -105,20 +135,26 @@ parso==0.8.4 pexpect==4.9.0 pillow==11.3.0 platformdirs==4.3.8 +posthog==5.4.0 prometheus_client==0.22.1 prompt_toolkit==3.0.51 propcache==0.3.2 +protobuf==6.33.5 psutil==7.0.0 ptyprocess==0.7.0 pure_eval==0.2.3 +pybase64==1.4.3 pycparser==2.22 pydantic==2.11.7 pydantic_core==2.33.2 Pygments==2.19.2 pyparsing==3.2.3 -pypdf==5.9.0 +pypdf==6.7.1 +PyPika==0.51.1 +pyproject_hooks==1.2.0 PyStemmer==2.2.0.3 python-dateutil==2.9.0.post0 +python-dotenv==1.2.1 python-json-logger==3.3.0 pytz==2025.2 PyYAML==6.0.2 @@ -126,9 +162,11 @@ pyzmq==27.0.1 referencing==0.36.2 regex==2025.7.34 requests==2.32.4 +requests-oauthlib==2.0.0 rfc3339-validator==0.1.4 rfc3986-validator==0.1.1 rfc3987-syntax==1.1.0 +rich==14.3.3 rpds-py==0.27.0 safetensors==0.6.2 scikit-learn==1.7.1 @@ -137,6 +175,7 @@ seaborn==0.13.2 Send2Trash==1.8.3 sentence-transformers==5.1.0 setuptools==80.9.0 +shellingham==1.5.4 six==1.17.0 sniffio==1.3.1 soupsieve==2.7 @@ -155,6 +194,7 @@ tornado==6.5.2 tqdm==4.67.1 traitlets==5.14.3 transformers==4.55.0 +typer==0.24.1 types-python-dateutil==2.9.0.20250809 typing-inspect==0.9.0 typing-inspection==0.4.1 @@ -162,10 +202,15 @@ typing_extensions==4.14.1 tzdata==2025.2 uri-template==1.3.0 urllib3==2.5.0 +uvicorn==0.41.0 +uvloop==0.22.1 +watchfiles==1.1.1 wcwidth==0.2.13 webcolors==24.11.1 webencodings==0.5.1 websocket-client==1.8.0 +websockets==16.0 widgetsnbextension==4.0.14 wrapt==1.17.2 yarl==1.20.1 +zipp==3.23.0 diff --git a/retrieve_clippings.py b/retrieve_clippings.py new file mode 100644 index 0000000..2cb0c9e --- /dev/null +++ b/retrieve_clippings.py @@ -0,0 +1,138 @@ +# 
# retrieve_clippings.py
# Verbatim chunk retrieval from clippings index (ChromaDB).
# Vector search + cross-encoder re-ranking, no LLM.
#
# Returns the top re-ranked chunks with their full text, file metadata, and
# scores. Includes page numbers for PDF sources when available.
#
# E.M.F. February 2026

# Environment vars must be set before importing huggingface/transformers
# libraries, because huggingface_hub.constants evaluates HF_HUB_OFFLINE
# at import time.
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["SENTENCE_TRANSFORMERS_HOME"] = "./models"
os.environ["HF_HUB_OFFLINE"] = "1"

import chromadb
from llama_index.core import VectorStoreIndex, Settings
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.postprocessor import SentenceTransformerRerank
import sys
import textwrap

#
# Globals
#

PERSIST_DIR = "./storage_clippings"
COLLECTION_NAME = "clippings"

# Embedding model (must match build_clippings.py)
EMBED_MODEL = HuggingFaceEmbedding(
    cache_folder="./models",
    model_name="BAAI/bge-large-en-v1.5",
    local_files_only=True,
)

# Cross-encoder model for re-ranking (cached in ./models/)
RERANK_MODEL = "cross-encoder/ms-marco-MiniLM-L-12-v2"
RERANK_TOP_N = 15
RETRIEVE_TOP_K = 30

# Output formatting
WRAP_WIDTH = 80


def main():
    """Retrieve, re-rank, and print the chunks matching the CLI query."""
    # Validate the command line FIRST -- previously this check ran after
    # the index and re-ranker were loaded, so a missing query only errored
    # after seconds of pointless model loading.
    if len(sys.argv) < 2:
        print("Usage: python retrieve_clippings.py QUERY_TEXT")
        sys.exit(1)
    q = " ".join(sys.argv[1:])

    # No LLM needed -- set embed model only
    Settings.embed_model = EMBED_MODEL

    # Load ChromaDB collection
    client = chromadb.PersistentClient(path=PERSIST_DIR)
    collection = client.get_collection(COLLECTION_NAME)

    # Build index from existing vector store
    vector_store = ChromaVectorStore(chroma_collection=collection)
    index = VectorStoreIndex.from_vector_store(vector_store)

    # Build retriever (vector search only, no query engine / LLM)
    retriever = index.as_retriever(similarity_top_k=RETRIEVE_TOP_K)

    # Cross-encoder re-ranker
    reranker = SentenceTransformerRerank(
        model=RERANK_MODEL,
        top_n=RERANK_TOP_N,
    )

    # Retrieve and re-rank
    nodes = retriever.retrieve(q)
    reranked = reranker.postprocess_nodes(nodes, query_str=q)

    # Build result list with metadata
    results = []
    for i, node in enumerate(reranked, 1):
        meta = getattr(node, "metadata", None) or node.node.metadata
        # score may legitimately be None on a NodeWithScore; guard below
        # before formatting (f"{None:+7.3f}" raises TypeError).
        score = getattr(node, "score", None)
        file_name = meta.get("file_name", "unknown")
        page_label = meta.get("page_label", "")
        results.append((i, node, file_name, page_label, score))

    # --- Summary: source files and rankings ---
    print(f"\nQuery: {q}")
    print(f"Retrieved {len(nodes)} chunks, re-ranked to top {len(reranked)}")
    print(f"({collection.count()} total vectors in collection)\n")

    # Unique source files in rank order
    seen = set()
    unique_sources = []
    for i, node, file_name, page_label, score in results:
        if file_name not in seen:
            seen.add(file_name)
            unique_sources.append(file_name)

    print(f"Source files ({len(unique_sources)} unique):")
    for j, fname in enumerate(unique_sources, 1):
        print(f"  {j}. {fname}")

    print(f"\nRankings:")
    for i, node, file_name, page_label, score in results:
        score_str = f"{score:+7.3f}" if score is not None else "    n/a"
        line = f"  [{i:2d}] {score_str}  {file_name}"
        if page_label:
            line += f" (p. {page_label})"
        print(line)

    # --- Full chunk text ---
    print(f"\n{'=' * WRAP_WIDTH}")
    print("CHUNKS")
    print("=" * WRAP_WIDTH)

    for i, node, file_name, page_label, score in results:
        header = f"=== [{i}] {file_name}"
        if page_label:
            header += f" (p. {page_label})"
        if score is not None:
            header += f" (score: {score:.3f})"

        print("\n" + "=" * WRAP_WIDTH)
        print(header)
        print("=" * WRAP_WIDTH)

        # Wrap each paragraph line individually, preserving blank lines.
        text = node.get_content()
        for line in text.splitlines():
            if line.strip():
                print(textwrap.fill(line, width=WRAP_WIDTH))
            else:
                print()
        print()


if __name__ == "__main__":
    main()