- Rename build_exp_claude.py → build_store.py
- Rename query_hybrid_bm25_v4.py → query_hybrid.py
- Rename retrieve_hybrid_raw.py → retrieve.py
- Archive query_topk_prompt_engine_v3.py (superseded by hybrid)
- Archive retrieve_raw.py (superseded by hybrid)
- Move build_clippings.py, retrieve_clippings.py → clippings_search/
- Update run_query.sh, README.md, CLAUDE.md for new names
193 lines
5.9 KiB
Python
193 lines
5.9 KiB
Python
# build_store.py (formerly build_exp_claude.py)
|
|
#
|
|
# Build or update the vector store from journal entries in ./data.
|
|
#
|
|
# Default mode (incremental): loads the existing index and adds only
|
|
# new or modified files. Use --rebuild for a full rebuild from scratch.
|
|
#
|
|
# January 2026
|
|
# E. M. Furst
|
|
# Used Sonnet 4.5 to suggest changes; Opus 4.6 for incremental update
|
|
|
|
from llama_index.core import (
|
|
SimpleDirectoryReader,
|
|
StorageContext,
|
|
VectorStoreIndex,
|
|
load_index_from_storage,
|
|
Settings,
|
|
)
|
|
from pathlib import Path
|
|
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
|
|
from llama_index.core.node_parser import SentenceSplitter
|
|
import argparse
|
|
import datetime
|
|
import os
|
|
import time
|
|
|
|
# Shared constants
|
|
DATA_DIR = Path("./data")
|
|
PERSIST_DIR = "./storage_exp"
|
|
EMBED_MODEL_NAME = "BAAI/bge-large-en-v1.5"
|
|
CHUNK_SIZE = 256
|
|
CHUNK_OVERLAP = 25
|
|
|
|
|
|
def get_text_splitter():
    """Create the SentenceSplitter shared by the build and update paths.

    Both full rebuilds and incremental inserts must chunk text identically,
    so the splitter configuration lives here and reads the module-level
    CHUNK_SIZE / CHUNK_OVERLAP constants.
    """
    splitter_config = {
        "chunk_size": CHUNK_SIZE,
        "chunk_overlap": CHUNK_OVERLAP,
        "paragraph_separator": "\n\n",
    }
    return SentenceSplitter(**splitter_config)
|
def rebuild():
    """Recreate the vector index from every document under DATA_DIR.

    Loads all files from the data directory, chunks them with the shared
    sentence splitter, embeds them into a fresh VectorStoreIndex, and
    persists the result to PERSIST_DIR.

    Raises:
        FileNotFoundError: if DATA_DIR does not exist.
        ValueError: if no documents were loaded from DATA_DIR.
    """
    if not DATA_DIR.exists():
        raise FileNotFoundError(f"Data directory not found: {DATA_DIR.absolute()}")

    print(f"Loading documents from {DATA_DIR.absolute()}...")
    docs = SimpleDirectoryReader(str(DATA_DIR)).load_data()
    if not docs:
        raise ValueError("No documents found in data directory")
    print(f"Loaded {len(docs)} document(s)")

    print("Building vector index...")
    fresh_index = VectorStoreIndex.from_documents(
        docs,
        transformations=[get_text_splitter()],
        show_progress=True,
    )

    # Persist the whole storage context (docstore, index store, vectors).
    fresh_index.storage_context.persist(persist_dir=PERSIST_DIR)
    print(f"Index built and saved to {PERSIST_DIR}")
|
def update():
    """Incremental update: add new files, re-index modified files, remove deleted files.

    Compares the files currently on disk (DATA_DIR/*.txt) against the
    documents recorded in the persisted index, then applies the minimal
    set of deletions and insertions before persisting again.

    Raises:
        FileNotFoundError: if DATA_DIR does not exist.
    """
    if not DATA_DIR.exists():
        raise FileNotFoundError(f"Data directory not found: {DATA_DIR.absolute()}")

    # Load existing index
    print(f"Loading existing index from {PERSIST_DIR}...")
    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    index = load_index_from_storage(storage_context)

    # Set transformations so index.insert() chunks correctly
    # (insert() uses the global Settings, not the transformations the
    # index was originally built with).
    Settings.transformations = [get_text_splitter()]

    # Build lookup of indexed files: file_name -> (ref_doc_id, metadata)
    # ref_doc_id identifies the source document, so deleting it removes
    # every chunk derived from that file.
    all_ref_docs = index.docstore.get_all_ref_doc_info()
    indexed = {}
    for ref_id, info in all_ref_docs.items():
        fname = info.metadata.get("file_name")
        if fname:
            indexed[fname] = (ref_id, info.metadata)

    print(f"Index contains {len(indexed)} documents")

    # Scan current files on disk
    # NOTE(review): only *.txt is scanned here, while rebuild() loads every
    # file in DATA_DIR — confirm the data directory holds only .txt files.
    disk_files = {f.name: f for f in sorted(DATA_DIR.glob("*.txt"))}
    print(f"Data directory contains {len(disk_files)} files")

    # Classify files
    new_files = []        # on disk but not in the index
    modified_files = []   # (path, ref_doc_id) pairs needing re-index
    deleted_files = []    # (file_name, ref_doc_id) pairs to remove
    unchanged = 0

    for fname, fpath in disk_files.items():
        if fname not in indexed:
            new_files.append(fpath)
        else:
            ref_id, meta = indexed[fname]
            # Compare file size and modification date
            stat = fpath.stat()
            disk_size = stat.st_size
            # Must use UTC to match SimpleDirectoryReader's date format
            # (day-granularity only, so a same-day, same-size edit is missed
            # — presumably acceptable for journal entries; verify).
            disk_mdate = datetime.datetime.fromtimestamp(
                stat.st_mtime, tz=datetime.timezone.utc
            ).strftime("%Y-%m-%d")

            # "file_size" / "last_modified_date" are the metadata keys that
            # SimpleDirectoryReader stamps on documents at build time.
            stored_size = meta.get("file_size")
            stored_mdate = meta.get("last_modified_date")

            if disk_size != stored_size or disk_mdate != stored_mdate:
                modified_files.append((fpath, ref_id))
            else:
                unchanged += 1

    # Anything indexed but no longer on disk has been deleted.
    for fname, (ref_id, meta) in indexed.items():
        if fname not in disk_files:
            deleted_files.append((fname, ref_id))

    # Report
    print(f"\n New: {len(new_files)}")
    print(f" Modified: {len(modified_files)}")
    print(f" Deleted: {len(deleted_files)}")
    print(f" Unchanged: {unchanged}")

    if not new_files and not modified_files and not deleted_files:
        print("\nNothing to do.")
        return

    # Process deletions (including modified files that need re-indexing)
    for fname, ref_id in deleted_files:
        print(f" Removing {fname}")
        index.delete_ref_doc(ref_id, delete_from_docstore=True)

    for fpath, ref_id in modified_files:
        print(f" Re-indexing {fpath.name} (modified)")
        index.delete_ref_doc(ref_id, delete_from_docstore=True)

    # Process additions (new files + modified files)
    files_to_add = new_files + [fpath for fpath, _ in modified_files]
    if files_to_add:
        print(f"\nIndexing {len(files_to_add)} file(s)...")
        # Use "./" prefix to match paths from full build (pathlib strips it)
        docs = SimpleDirectoryReader(
            input_files=[f"./{f}" for f in files_to_add]
        ).load_data()
        for doc in docs:
            index.insert(doc)

    # Persist
    index.storage_context.persist(persist_dir=PERSIST_DIR)
    print(f"\nIndex updated and saved to {PERSIST_DIR}")
|
def main():
    """CLI entry point: parse flags, configure embeddings, then build or update.

    With --rebuild, the index is recreated from scratch; otherwise an
    incremental update runs, falling back to a full rebuild when no
    persisted index exists yet.
    """
    parser = argparse.ArgumentParser(
        description="Build or update the vector store from journal entries."
    )
    parser.add_argument(
        "--rebuild",
        action="store_true",
        help="Full rebuild from scratch (default: incremental update)",
    )
    args = parser.parse_args()

    # Configure the embedding model globally so both code paths use it.
    Settings.embed_model = HuggingFaceEmbedding(model_name=EMBED_MODEL_NAME)

    start = time.time()

    if args.rebuild:
        print("Mode: full rebuild")
        rebuild()
    else:
        print("Mode: incremental update")
        if Path(PERSIST_DIR).exists():
            update()
        else:
            print(f"No existing index at {PERSIST_DIR}, doing full rebuild.")
            rebuild()

    elapsed = time.time() - start
    print(f"Done in {elapsed:.1f}s")
|
# Script entry point: run the build/update CLI when executed directly.
if __name__ == "__main__":
    main()