llm-workshop/04-semantic-search/build_store.py
Eric 1604671d36 Initial commit: LLM workshop materials
Five modules covering nanoGPT, Ollama, RAG, semantic search, and neural networks.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-28 07:11:01 -04:00

193 lines
5.9 KiB
Python

# build_store.py
#
# Build or update the vector store from journal entries in ./data.
#
# Default mode (incremental): loads the existing index and adds only
# new or modified files. Use --rebuild for a full rebuild from scratch.
#
# January 2026
# E. M. Furst
# Used Sonnet 4.5 to suggest changes; Opus 4.6 for incremental update
from llama_index.core import (
SimpleDirectoryReader,
StorageContext,
VectorStoreIndex,
load_index_from_storage,
Settings,
)
from pathlib import Path
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter
import argparse
import datetime
import os
import time
# Shared constants
# Source directory for the journal entries that get indexed.
DATA_DIR = Path("./data")
# Directory where the llama_index storage context is persisted.
PERSIST_DIR = "./store"
# HuggingFace embedding model used for both building and querying the store.
EMBED_MODEL_NAME = "BAAI/bge-large-en-v1.5"
# Sentence-splitter settings: chunk size and overlap are in tokens.
CHUNK_SIZE = 256
CHUNK_OVERLAP = 25
def get_text_splitter():
    """Create the sentence splitter shared by full rebuilds and incremental inserts.

    Centralizing the construction keeps chunking identical across both code
    paths, so re-indexed documents chunk the same way as the original build.
    """
    splitter_config = {
        "chunk_size": CHUNK_SIZE,
        "chunk_overlap": CHUNK_OVERLAP,
        "paragraph_separator": "\n\n",
    }
    return SentenceSplitter(**splitter_config)
def rebuild():
    """Recreate the vector store from scratch using every document in DATA_DIR.

    Raises FileNotFoundError if the data directory is missing and ValueError
    if it contains no loadable documents. Persists the finished index to
    PERSIST_DIR.
    """
    if not DATA_DIR.exists():
        raise FileNotFoundError(f"Data directory not found: {DATA_DIR.absolute()}")

    print(f"Loading documents from {DATA_DIR.absolute()}...")
    docs = SimpleDirectoryReader(str(DATA_DIR)).load_data()
    if not docs:
        raise ValueError("No documents found in data directory")
    print(f"Loaded {len(docs)} document(s)")

    print("Building vector index...")
    splitter = get_text_splitter()
    index = VectorStoreIndex.from_documents(
        docs,
        transformations=[splitter],
        show_progress=True,
    )
    index.storage_context.persist(persist_dir=PERSIST_DIR)
    print(f"Index built and saved to {PERSIST_DIR}")
def update():
"""Incremental update: add new files, re-index modified files, remove deleted files."""
if not DATA_DIR.exists():
raise FileNotFoundError(f"Data directory not found: {DATA_DIR.absolute()}")
# Load existing index
print(f"Loading existing index from {PERSIST_DIR}...")
storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
index = load_index_from_storage(storage_context)
# Set transformations so index.insert() chunks correctly
Settings.transformations = [get_text_splitter()]
# Build lookup of indexed files: file_name -> (ref_doc_id, metadata)
all_ref_docs = index.docstore.get_all_ref_doc_info()
indexed = {}
for ref_id, info in all_ref_docs.items():
fname = info.metadata.get("file_name")
if fname:
indexed[fname] = (ref_id, info.metadata)
print(f"Index contains {len(indexed)} documents")
# Scan current files on disk
disk_files = {f.name: f for f in sorted(DATA_DIR.glob("*.txt"))}
print(f"Data directory contains {len(disk_files)} files")
# Classify files
new_files = []
modified_files = []
deleted_files = []
unchanged = 0
for fname, fpath in disk_files.items():
if fname not in indexed:
new_files.append(fpath)
else:
ref_id, meta = indexed[fname]
# Compare file size and modification date
stat = fpath.stat()
disk_size = stat.st_size
# Must use UTC to match SimpleDirectoryReader's date format
disk_mdate = datetime.datetime.fromtimestamp(
stat.st_mtime, tz=datetime.timezone.utc
).strftime("%Y-%m-%d")
stored_size = meta.get("file_size")
stored_mdate = meta.get("last_modified_date")
if disk_size != stored_size or disk_mdate != stored_mdate:
modified_files.append((fpath, ref_id))
else:
unchanged += 1
for fname, (ref_id, meta) in indexed.items():
if fname not in disk_files:
deleted_files.append((fname, ref_id))
# Report
print(f"\n New: {len(new_files)}")
print(f" Modified: {len(modified_files)}")
print(f" Deleted: {len(deleted_files)}")
print(f" Unchanged: {unchanged}")
if not new_files and not modified_files and not deleted_files:
print("\nNothing to do.")
return
# Process deletions (including modified files that need re-indexing)
for fname, ref_id in deleted_files:
print(f" Removing {fname}")
index.delete_ref_doc(ref_id, delete_from_docstore=True)
for fpath, ref_id in modified_files:
print(f" Re-indexing {fpath.name} (modified)")
index.delete_ref_doc(ref_id, delete_from_docstore=True)
# Process additions (new files + modified files)
files_to_add = new_files + [fpath for fpath, _ in modified_files]
if files_to_add:
print(f"\nIndexing {len(files_to_add)} file(s)...")
# Use "./" prefix to match paths from full build (pathlib strips it)
docs = SimpleDirectoryReader(
input_files=[f"./{f}" for f in files_to_add]
).load_data()
for doc in docs:
index.insert(doc)
# Persist
index.storage_context.persist(persist_dir=PERSIST_DIR)
print(f"\nIndex updated and saved to {PERSIST_DIR}")
def main():
    """CLI entry point: configure embeddings, then rebuild or update the store.

    With --rebuild the store is recreated from scratch; otherwise an
    incremental update runs, falling back to a full rebuild when no
    persisted index exists yet.
    """
    parser = argparse.ArgumentParser(
        description="Build or update the vector store from journal entries."
    )
    parser.add_argument(
        "--rebuild",
        action="store_true",
        help="Full rebuild from scratch (default: incremental update)",
    )
    opts = parser.parse_args()

    # Configure embedding model once; both code paths embed with it.
    Settings.embed_model = HuggingFaceEmbedding(model_name=EMBED_MODEL_NAME)

    t0 = time.time()
    if opts.rebuild:
        print("Mode: full rebuild")
        rebuild()
    else:
        print("Mode: incremental update")
        if Path(PERSIST_DIR).exists():
            update()
        else:
            print(f"No existing index at {PERSIST_DIR}, doing full rebuild.")
            rebuild()
    print(f"Done in {time.time() - t0:.1f}s")


if __name__ == "__main__":
    main()