# build_store.py
#
# Build or update the vector store from journal entries in ./data.
#
# Default mode (incremental): loads the existing index and adds only
# new or modified files. Use --rebuild for a full rebuild from scratch.
#
# January 2026
# E. M. Furst
# Used Sonnet 4.5 to suggest changes; Opus 4.6 for incremental update

from llama_index.core import (
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
    load_index_from_storage,
    Settings,
)
from pathlib import Path
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter
import argparse
import datetime
import os
import time

# Shared constants
DATA_DIR = Path("./data")
PERSIST_DIR = "./store"
EMBED_MODEL_NAME = "BAAI/bge-large-en-v1.5"
CHUNK_SIZE = 256
CHUNK_OVERLAP = 25


def get_text_splitter():
    """Return the SentenceSplitter shared by both build modes.

    Both rebuild() and update() must chunk with identical parameters,
    otherwise incremental inserts would produce differently-sized nodes
    than the original full build.
    """
    return SentenceSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        paragraph_separator="\n\n",
    )


def rebuild():
    """Full rebuild: delete and recreate the vector store from scratch.

    Reads every document under DATA_DIR, chunks it with the shared
    splitter, and persists a fresh index to PERSIST_DIR (overwriting
    whatever was there).

    Raises:
        FileNotFoundError: if DATA_DIR does not exist.
        ValueError: if DATA_DIR contains no loadable documents.
    """
    if not DATA_DIR.exists():
        raise FileNotFoundError(f"Data directory not found: {DATA_DIR.absolute()}")

    print(f"Loading documents from {DATA_DIR.absolute()}...")
    documents = SimpleDirectoryReader(str(DATA_DIR)).load_data()
    if not documents:
        raise ValueError("No documents found in data directory")
    print(f"Loaded {len(documents)} document(s)")

    print("Building vector index...")
    index = VectorStoreIndex.from_documents(
        documents,
        transformations=[get_text_splitter()],
        show_progress=True,
    )
    index.storage_context.persist(persist_dir=PERSIST_DIR)
    print(f"Index built and saved to {PERSIST_DIR}")


def _indexed_files(index):
    """Map file_name -> (ref_doc_id, metadata) for every document in *index*.

    Documents whose metadata lacks a "file_name" key are skipped; if two
    ref docs share a file name the later one wins (one ref doc per source
    file is the expected shape from SimpleDirectoryReader).
    """
    indexed = {}
    for ref_id, info in index.docstore.get_all_ref_doc_info().items():
        fname = info.metadata.get("file_name")
        if fname:
            indexed[fname] = (ref_id, info.metadata)
    return indexed


def _is_modified(fpath, meta):
    """Return True if the on-disk file differs from its stored metadata.

    Compares byte size plus modification *date* (day granularity — that is
    all SimpleDirectoryReader stores), so a same-day, same-size edit can
    slip through; size usually catches it.
    """
    stat = fpath.stat()
    # Must use UTC to match SimpleDirectoryReader's date format
    disk_mdate = datetime.datetime.fromtimestamp(
        stat.st_mtime, tz=datetime.timezone.utc
    ).strftime("%Y-%m-%d")
    return (
        stat.st_size != meta.get("file_size")
        or disk_mdate != meta.get("last_modified_date")
    )


def _classify(indexed, disk_files):
    """Diff the index against the data directory.

    Args:
        indexed: file_name -> (ref_doc_id, metadata), from _indexed_files().
        disk_files: file_name -> Path for files currently on disk.

    Returns:
        (new_files, modified_files, deleted_files, unchanged) where
        new_files is a list of Paths, modified_files a list of
        (Path, ref_doc_id), deleted_files a list of (file_name, ref_doc_id),
        and unchanged a count.
    """
    new_files = []
    modified_files = []
    deleted_files = []
    unchanged = 0

    for fname, fpath in disk_files.items():
        if fname not in indexed:
            new_files.append(fpath)
        else:
            ref_id, meta = indexed[fname]
            if _is_modified(fpath, meta):
                modified_files.append((fpath, ref_id))
            else:
                unchanged += 1

    for fname, (ref_id, _meta) in indexed.items():
        if fname not in disk_files:
            deleted_files.append((fname, ref_id))

    return new_files, modified_files, deleted_files, unchanged


def update():
    """Incremental update: add new files, re-index modified files, remove deleted files.

    Loads the persisted index, diffs its ref-doc metadata against the
    files on disk, deletes stale docs, inserts new/changed ones, and
    persists the result back to PERSIST_DIR. No-op (no persist) when
    nothing changed.

    Raises:
        FileNotFoundError: if DATA_DIR does not exist.
    """
    if not DATA_DIR.exists():
        raise FileNotFoundError(f"Data directory not found: {DATA_DIR.absolute()}")

    # Load existing index
    print(f"Loading existing index from {PERSIST_DIR}...")
    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    index = load_index_from_storage(storage_context)

    # Set transformations so index.insert() chunks correctly
    Settings.transformations = [get_text_splitter()]

    indexed = _indexed_files(index)
    print(f"Index contains {len(indexed)} documents")

    # Scan current files on disk.
    # NOTE(review): rebuild() ingests *every* file in DATA_DIR, but the
    # incremental scan only tracks *.txt — confirm the data dir holds
    # only .txt journal entries, otherwise other indexed files would be
    # treated as deleted here.
    disk_files = {f.name: f for f in sorted(DATA_DIR.glob("*.txt"))}
    print(f"Data directory contains {len(disk_files)} files")

    new_files, modified_files, deleted_files, unchanged = _classify(
        indexed, disk_files
    )

    # Report
    print(f"\n New: {len(new_files)}")
    print(f" Modified: {len(modified_files)}")
    print(f" Deleted: {len(deleted_files)}")
    print(f" Unchanged: {unchanged}")
    if not new_files and not modified_files and not deleted_files:
        print("\nNothing to do.")
        return

    # Process deletions (including modified files that need re-indexing)
    for fname, ref_id in deleted_files:
        print(f" Removing {fname}")
        index.delete_ref_doc(ref_id, delete_from_docstore=True)
    for fpath, ref_id in modified_files:
        print(f" Re-indexing {fpath.name} (modified)")
        index.delete_ref_doc(ref_id, delete_from_docstore=True)

    # Process additions (new files + modified files)
    files_to_add = new_files + [fpath for fpath, _ in modified_files]
    if files_to_add:
        print(f"\nIndexing {len(files_to_add)} file(s)...")
        # Use "./" prefix to match paths from full build (pathlib strips it)
        docs = SimpleDirectoryReader(
            input_files=[f"./{f}" for f in files_to_add]
        ).load_data()
        for doc in docs:
            index.insert(doc)

    # Persist
    index.storage_context.persist(persist_dir=PERSIST_DIR)
    print(f"\nIndex updated and saved to {PERSIST_DIR}")


def main():
    """CLI entry point: configure embeddings, then rebuild or update.

    Incremental update is the default; --rebuild forces a full rebuild.
    Falls back to a full rebuild automatically when no index exists yet.
    """
    parser = argparse.ArgumentParser(
        description="Build or update the vector store from journal entries."
    )
    parser.add_argument(
        "--rebuild",
        action="store_true",
        help="Full rebuild from scratch (default: incremental update)",
    )
    args = parser.parse_args()

    # Configure embedding model (must match the model used at query time)
    embed_model = HuggingFaceEmbedding(model_name=EMBED_MODEL_NAME)
    Settings.embed_model = embed_model

    start = time.time()
    if args.rebuild:
        print("Mode: full rebuild")
        rebuild()
    else:
        print("Mode: incremental update")
        if not Path(PERSIST_DIR).exists():
            print(f"No existing index at {PERSIST_DIR}, doing full rebuild.")
            rebuild()
        else:
            update()
    elapsed = time.time() - start
    print(f"Done in {elapsed:.1f}s")


if __name__ == "__main__":
    main()