llm-workshop/04-semantic-search/build_store.py
Eric 1604671d36 Initial commit: LLM workshop materials
Five modules covering nanoGPT, Ollama, RAG, semantic search, and neural networks.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-28 07:11:01 -04:00

193 lines
5.9 KiB
Python

# build_store.py
#
# Build or update the vector store from journal entries in ./data.
#
# Default mode (incremental): loads the existing index and adds only
# new or modified files. Use --rebuild for a full rebuild from scratch.
#
# January 2026
# E. M. Furst
# Used Sonnet 4.5 to suggest changes; Opus 4.6 for incremental update
from llama_index.core import (
SimpleDirectoryReader,
StorageContext,
VectorStoreIndex,
load_index_from_storage,
Settings,
)
from pathlib import Path
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter
import argparse
import datetime
import os
import time
# Shared constants
# Source directory for the journal entries that get indexed.
DATA_DIR = Path("./data")
# Directory where the llama_index storage context is persisted.
PERSIST_DIR = "./store"
# HuggingFace embedding model used for both building and querying the store.
EMBED_MODEL_NAME = "BAAI/bge-large-en-v1.5"
# Sentence-splitter settings: chunk size and overlap are in tokens.
CHUNK_SIZE = 256
CHUNK_OVERLAP = 25
def get_text_splitter():
    """Create the sentence splitter shared by full rebuilds and incremental inserts.

    Centralizing the construction keeps chunking identical across both code
    paths, so re-indexed documents chunk the same way as the original build.
    """
    splitter_config = {
        "chunk_size": CHUNK_SIZE,
        "chunk_overlap": CHUNK_OVERLAP,
        "paragraph_separator": "\n\n",
    }
    return SentenceSplitter(**splitter_config)
def rebuild():
    """Recreate the vector store from scratch using every document in DATA_DIR.

    Raises FileNotFoundError if the data directory is missing and ValueError
    if it contains no loadable documents. Persists the finished index to
    PERSIST_DIR.
    """
    if not DATA_DIR.exists():
        raise FileNotFoundError(f"Data directory not found: {DATA_DIR.absolute()}")

    print(f"Loading documents from {DATA_DIR.absolute()}...")
    docs = SimpleDirectoryReader(str(DATA_DIR)).load_data()
    if not docs:
        raise ValueError("No documents found in data directory")
    print(f"Loaded {len(docs)} document(s)")

    print("Building vector index...")
    splitter = get_text_splitter()
    index = VectorStoreIndex.from_documents(
        docs,
        transformations=[splitter],
        show_progress=True,
    )
    index.storage_context.persist(persist_dir=PERSIST_DIR)
    print(f"Index built and saved to {PERSIST_DIR}")
def update():
"""Incremental update: add new files, re-index modified files, remove deleted files."""
if not DATA_DIR.exists():
raise FileNotFoundError(f"Data directory not found: {DATA_DIR.absolute()}")
# Load existing index
print(f"Loading existing index from {PERSIST_DIR}...")
storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
index = load_index_from_storage(storage_context)
# Set transformations so index.insert() chunks correctly
Settings.transformations = [get_text_splitter()]
# Build lookup of indexed files: file_name -> (ref_doc_id, metadata)
all_ref_docs = index.docstore.get_all_ref_doc_info()
indexed = {}
for ref_id, info in all_ref_docs.items():
fname = info.metadata.get("file_name")
if fname:
indexed[fname] = (ref_id, info.metadata)
print(f"Index contains {len(indexed)} documents")
# Scan current files on disk
disk_files = {f.name: f for f in sorted(DATA_DIR.glob("*.txt"))}
print(f"Data directory contains {len(disk_files)} files")
# Classify files
new_files = []
modified_files = []
deleted_files = []
unchanged = 0
for fname, fpath in disk_files.items():
if fname not in indexed:
new_files.append(fpath)
else:
ref_id, meta = indexed[fname]
# Compare file size and modification date
stat = fpath.stat()
disk_size = stat.st_size
# Must use UTC to match SimpleDirectoryReader's date format
disk_mdate = datetime.datetime.fromtimestamp(
stat.st_mtime, tz=datetime.timezone.utc
).strftime("%Y-%m-%d")
stored_size = meta.get("file_size")
stored_mdate = meta.get("last_modified_date")
if disk_size != stored_size or disk_mdate != stored_mdate:
modified_files.append((fpath, ref_id))
else:
unchanged += 1
for fname, (ref_id, meta) in indexed.items():
if fname not in disk_files:
deleted_files.append((fname, ref_id))
# Report
print(f"\n New: {len(new_files)}")
print(f" Modified: {len(modified_files)}")
print(f" Deleted: {len(deleted_files)}")
print(f" Unchanged: {unchanged}")
if not new_files and not modified_files and not deleted_files:
print("\nNothing to do.")
return
# Process deletions (including modified files that need re-indexing)
for fname, ref_id in deleted_files:
print(f" Removing {fname}")
index.delete_ref_doc(ref_id, delete_from_docstore=True)
for fpath, ref_id in modified_files:
print(f" Re-indexing {fpath.name} (modified)")
index.delete_ref_doc(ref_id, delete_from_docstore=True)
# Process additions (new files + modified files)
files_to_add = new_files + [fpath for fpath, _ in modified_files]
if files_to_add:
print(f"\nIndexing {len(files_to_add)} file(s)...")
# Use "./" prefix to match paths from full build (pathlib strips it)
docs = SimpleDirectoryReader(
input_files=[f"./{f}" for f in files_to_add]
).load_data()
for doc in docs:
index.insert(doc)
# Persist
index.storage_context.persist(persist_dir=PERSIST_DIR)
print(f"\nIndex updated and saved to {PERSIST_DIR}")
def main():
    """CLI entry point: configure embeddings, then rebuild or update the store.

    With --rebuild the store is recreated from scratch; otherwise an
    incremental update runs, falling back to a full rebuild when no
    persisted index exists yet.
    """
    parser = argparse.ArgumentParser(
        description="Build or update the vector store from journal entries."
    )
    parser.add_argument(
        "--rebuild",
        action="store_true",
        help="Full rebuild from scratch (default: incremental update)",
    )
    opts = parser.parse_args()

    # Configure embedding model once; both code paths embed with it.
    Settings.embed_model = HuggingFaceEmbedding(model_name=EMBED_MODEL_NAME)

    t0 = time.time()
    if opts.rebuild:
        print("Mode: full rebuild")
        rebuild()
    else:
        print("Mode: incremental update")
        if Path(PERSIST_DIR).exists():
            update()
        else:
            print(f"No existing index at {PERSIST_DIR}, doing full rebuild.")
            rebuild()
    print(f"Done in {time.time() - t0:.1f}s")


if __name__ == "__main__":
    main()