Test clean deploy
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
commit
42e5e20e17
11 changed files with 1790 additions and 0 deletions
193
build_store.py
Normal file
193
build_store.py
Normal file
|
|
@ -0,0 +1,193 @@
|
|||
# build_store.py
|
||||
#
|
||||
# Build or update the vector store from journal entries in ./data.
|
||||
#
|
||||
# Default mode (incremental): loads the existing index and adds only
|
||||
# new or modified files. Use --rebuild for a full rebuild from scratch.
|
||||
#
|
||||
# January 2026
|
||||
# E. M. Furst
|
||||
# Used Sonnet 4.5 to suggest changes; Opus 4.6 for incremental update
|
||||
|
||||
from llama_index.core import (
|
||||
SimpleDirectoryReader,
|
||||
StorageContext,
|
||||
VectorStoreIndex,
|
||||
load_index_from_storage,
|
||||
Settings,
|
||||
)
|
||||
from pathlib import Path
|
||||
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
|
||||
from llama_index.core.node_parser import SentenceSplitter
|
||||
import argparse
|
||||
import datetime
|
||||
import os
|
||||
import time
|
||||
|
||||
# Shared constants, used by both build modes so chunking and storage
# behave identically for a full rebuild and an incremental update.
DATA_DIR = Path("./data")  # journal entries live here; update() tracks *.txt files
PERSIST_DIR = "./store"  # directory the vector store is persisted to / loaded from
EMBED_MODEL_NAME = "BAAI/bge-large-en-v1.5"  # HuggingFace embedding model name
CHUNK_SIZE = 256  # chunk size passed to SentenceSplitter (see get_text_splitter)
CHUNK_OVERLAP = 25  # overlap between adjacent chunks passed to SentenceSplitter
|
||||
|
||||
|
||||
def get_text_splitter():
    """Build the sentence splitter used to chunk documents.

    Both the full rebuild and the incremental update call this factory so
    that documents are chunked identically in either mode.
    """
    splitter = SentenceSplitter(
        paragraph_separator="\n\n",
        chunk_overlap=CHUNK_OVERLAP,
        chunk_size=CHUNK_SIZE,
    )
    return splitter
|
||||
|
||||
|
||||
def rebuild():
    """Full rebuild: recreate the vector store from scratch.

    Loads every journal entry from DATA_DIR, chunks it with the shared
    splitter, builds a fresh VectorStoreIndex, and persists it to
    PERSIST_DIR (overwriting the previous store files).

    Raises:
        FileNotFoundError: if DATA_DIR does not exist.
        ValueError: if DATA_DIR contains no loadable documents.
    """
    if not DATA_DIR.exists():
        raise FileNotFoundError(f"Data directory not found: {DATA_DIR.absolute()}")

    print(f"Loading documents from {DATA_DIR.absolute()}...")
    # Restrict to .txt files so a full rebuild indexes exactly the set of
    # files that update() later tracks (it globs "*.txt"); without this,
    # any non-.txt file indexed here would be misclassified as "deleted"
    # on the next incremental run.
    documents = SimpleDirectoryReader(
        str(DATA_DIR), required_exts=[".txt"]
    ).load_data()

    if not documents:
        raise ValueError("No documents found in data directory")

    print(f"Loaded {len(documents)} document(s)")

    print("Building vector index...")
    index = VectorStoreIndex.from_documents(
        documents,
        transformations=[get_text_splitter()],
        show_progress=True,
    )

    index.storage_context.persist(persist_dir=PERSIST_DIR)
    print(f"Index built and saved to {PERSIST_DIR}")
|
||||
|
||||
|
||||
def update():
    """Incremental update: add new files, re-index modified files, remove deleted files.

    Compares the files currently on disk (DATA_DIR/*.txt) against the
    documents recorded in the persisted index, then applies only the
    necessary deletions and insertions before persisting again.

    Raises:
        FileNotFoundError: if DATA_DIR does not exist.
    """
    if not DATA_DIR.exists():
        raise FileNotFoundError(f"Data directory not found: {DATA_DIR.absolute()}")

    # Load existing index
    print(f"Loading existing index from {PERSIST_DIR}...")
    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    index = load_index_from_storage(storage_context)

    # Set transformations so index.insert() chunks correctly
    # (insert() uses Settings.transformations, unlike from_documents()
    # where the splitter is passed explicitly).
    Settings.transformations = [get_text_splitter()]

    # Build lookup of indexed files: file_name -> (ref_doc_id, metadata).
    # The ref_doc_id is needed later to delete a document's chunks;
    # the metadata carries the size/date snapshot taken at index time.
    all_ref_docs = index.docstore.get_all_ref_doc_info()
    indexed = {}
    for ref_id, info in all_ref_docs.items():
        fname = info.metadata.get("file_name")
        if fname:
            indexed[fname] = (ref_id, info.metadata)

    print(f"Index contains {len(indexed)} documents")

    # Scan current files on disk (only .txt entries are tracked)
    disk_files = {f.name: f for f in sorted(DATA_DIR.glob("*.txt"))}
    print(f"Data directory contains {len(disk_files)} files")

    # Classify files into new / modified / deleted / unchanged
    new_files = []
    modified_files = []
    deleted_files = []
    unchanged = 0

    for fname, fpath in disk_files.items():
        if fname not in indexed:
            new_files.append(fpath)
        else:
            ref_id, meta = indexed[fname]
            # Compare file size and modification date against the values
            # SimpleDirectoryReader stored when the file was indexed.
            # NOTE(review): mtime is compared at day granularity
            # ("%Y-%m-%d"), so a same-day edit that keeps the byte size
            # identical would go undetected — presumably acceptable for
            # journal entries; confirm.
            stat = fpath.stat()
            disk_size = stat.st_size
            # Must use UTC to match SimpleDirectoryReader's date format
            disk_mdate = datetime.datetime.fromtimestamp(
                stat.st_mtime, tz=datetime.timezone.utc
            ).strftime("%Y-%m-%d")

            stored_size = meta.get("file_size")
            stored_mdate = meta.get("last_modified_date")

            if disk_size != stored_size or disk_mdate != stored_mdate:
                modified_files.append((fpath, ref_id))
            else:
                unchanged += 1

    # Anything indexed but no longer on disk has been deleted
    for fname, (ref_id, meta) in indexed.items():
        if fname not in disk_files:
            deleted_files.append((fname, ref_id))

    # Report
    print(f"\n New: {len(new_files)}")
    print(f" Modified: {len(modified_files)}")
    print(f" Deleted: {len(deleted_files)}")
    print(f" Unchanged: {unchanged}")

    if not new_files and not modified_files and not deleted_files:
        print("\nNothing to do.")
        return

    # Process deletions (including modified files that need re-indexing).
    # Modified files are deleted first, then re-inserted below, so their
    # stale chunks never coexist with fresh ones.
    for fname, ref_id in deleted_files:
        print(f" Removing {fname}")
        index.delete_ref_doc(ref_id, delete_from_docstore=True)

    for fpath, ref_id in modified_files:
        print(f" Re-indexing {fpath.name} (modified)")
        index.delete_ref_doc(ref_id, delete_from_docstore=True)

    # Process additions (new files + modified files)
    files_to_add = new_files + [fpath for fpath, _ in modified_files]
    if files_to_add:
        print(f"\nIndexing {len(files_to_add)} file(s)...")
        # Use "./" prefix to match paths from full build (pathlib strips it)
        docs = SimpleDirectoryReader(
            input_files=[f"./{f}" for f in files_to_add]
        ).load_data()
        for doc in docs:
            index.insert(doc)

    # Persist the updated index back to disk
    index.storage_context.persist(persist_dir=PERSIST_DIR)
    print(f"\nIndex updated and saved to {PERSIST_DIR}")
|
||||
|
||||
|
||||
def main():
    """CLI entry point: parse flags, configure embeddings, run a build mode.

    With --rebuild (or when no store exists yet) a full rebuild is
    performed; otherwise the existing index is updated incrementally.
    """
    arg_parser = argparse.ArgumentParser(
        description="Build or update the vector store from journal entries."
    )
    arg_parser.add_argument(
        "--rebuild",
        action="store_true",
        help="Full rebuild from scratch (default: incremental update)",
    )
    opts = arg_parser.parse_args()

    # Configure embedding model
    Settings.embed_model = HuggingFaceEmbedding(model_name=EMBED_MODEL_NAME)

    started_at = time.time()

    if opts.rebuild:
        print("Mode: full rebuild")
        rebuild()
    else:
        print("Mode: incremental update")
        if Path(PERSIST_DIR).exists():
            update()
        else:
            # Nothing to update incrementally — fall back to a full build.
            print(f"No existing index at {PERSIST_DIR}, doing full rebuild.")
            rebuild()

    print(f"Done in {time.time() - started_at:.1f}s")
|
||||
|
||||
|
||||
# Script entry point: only run when executed directly, not when imported.
if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue