Test clean deploy
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
commit
42e5e20e17
11 changed files with 1790 additions and 0 deletions
193
build_store.py
Normal file
193
build_store.py
Normal file
|
|
@ -0,0 +1,193 @@
|
|||
# build_store.py
|
||||
#
|
||||
# Build or update the vector store from journal entries in ./data.
|
||||
#
|
||||
# Default mode (incremental): loads the existing index and adds only
|
||||
# new or modified files. Use --rebuild for a full rebuild from scratch.
|
||||
#
|
||||
# January 2026
|
||||
# E. M. Furst
|
||||
# Used Sonnet 4.5 to suggest changes; Opus 4.6 for incremental update
|
||||
|
||||
from llama_index.core import (
|
||||
SimpleDirectoryReader,
|
||||
StorageContext,
|
||||
VectorStoreIndex,
|
||||
load_index_from_storage,
|
||||
Settings,
|
||||
)
|
||||
from pathlib import Path
|
||||
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
|
||||
from llama_index.core.node_parser import SentenceSplitter
|
||||
import argparse
|
||||
import datetime
|
||||
import os
|
||||
import time
|
||||
|
||||
# Shared constants, used by both build modes so chunking and storage
# behave identically for a full rebuild and an incremental update.
DATA_DIR = Path("./data")  # journal entries live here; update() tracks *.txt files
PERSIST_DIR = "./store"  # directory the vector store is persisted to / loaded from
EMBED_MODEL_NAME = "BAAI/bge-large-en-v1.5"  # HuggingFace embedding model name
CHUNK_SIZE = 256  # chunk size passed to SentenceSplitter (see get_text_splitter)
CHUNK_OVERLAP = 25  # overlap between adjacent chunks passed to SentenceSplitter
|
||||
|
||||
|
||||
def get_text_splitter():
    """Build the sentence splitter used to chunk documents.

    Both the full rebuild and the incremental update call this factory so
    that documents are chunked identically in either mode.
    """
    splitter = SentenceSplitter(
        paragraph_separator="\n\n",
        chunk_overlap=CHUNK_OVERLAP,
        chunk_size=CHUNK_SIZE,
    )
    return splitter
|
||||
|
||||
|
||||
def rebuild():
    """Full rebuild: recreate the vector store from scratch.

    Loads every journal entry from DATA_DIR, chunks it with the shared
    splitter, builds a fresh VectorStoreIndex, and persists it to
    PERSIST_DIR (overwriting the previous store files).

    Raises:
        FileNotFoundError: if DATA_DIR does not exist.
        ValueError: if DATA_DIR contains no loadable documents.
    """
    if not DATA_DIR.exists():
        raise FileNotFoundError(f"Data directory not found: {DATA_DIR.absolute()}")

    print(f"Loading documents from {DATA_DIR.absolute()}...")
    # Restrict to .txt files so a full rebuild indexes exactly the set of
    # files that update() later tracks (it globs "*.txt"); without this,
    # any non-.txt file indexed here would be misclassified as "deleted"
    # on the next incremental run.
    documents = SimpleDirectoryReader(
        str(DATA_DIR), required_exts=[".txt"]
    ).load_data()

    if not documents:
        raise ValueError("No documents found in data directory")

    print(f"Loaded {len(documents)} document(s)")

    print("Building vector index...")
    index = VectorStoreIndex.from_documents(
        documents,
        transformations=[get_text_splitter()],
        show_progress=True,
    )

    index.storage_context.persist(persist_dir=PERSIST_DIR)
    print(f"Index built and saved to {PERSIST_DIR}")
|
||||
|
||||
|
||||
def update():
    """Incremental update: add new files, re-index modified files, remove deleted files.

    Compares the files currently on disk (DATA_DIR/*.txt) against the
    documents recorded in the persisted index, then applies only the
    necessary deletions and insertions before persisting again.

    Raises:
        FileNotFoundError: if DATA_DIR does not exist.
    """
    if not DATA_DIR.exists():
        raise FileNotFoundError(f"Data directory not found: {DATA_DIR.absolute()}")

    # Load existing index
    print(f"Loading existing index from {PERSIST_DIR}...")
    storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
    index = load_index_from_storage(storage_context)

    # Set transformations so index.insert() chunks correctly
    # (insert() uses Settings.transformations, unlike from_documents()
    # where the splitter is passed explicitly).
    Settings.transformations = [get_text_splitter()]

    # Build lookup of indexed files: file_name -> (ref_doc_id, metadata).
    # The ref_doc_id is needed later to delete a document's chunks;
    # the metadata carries the size/date snapshot taken at index time.
    all_ref_docs = index.docstore.get_all_ref_doc_info()
    indexed = {}
    for ref_id, info in all_ref_docs.items():
        fname = info.metadata.get("file_name")
        if fname:
            indexed[fname] = (ref_id, info.metadata)

    print(f"Index contains {len(indexed)} documents")

    # Scan current files on disk (only .txt entries are tracked)
    disk_files = {f.name: f for f in sorted(DATA_DIR.glob("*.txt"))}
    print(f"Data directory contains {len(disk_files)} files")

    # Classify files into new / modified / deleted / unchanged
    new_files = []
    modified_files = []
    deleted_files = []
    unchanged = 0

    for fname, fpath in disk_files.items():
        if fname not in indexed:
            new_files.append(fpath)
        else:
            ref_id, meta = indexed[fname]
            # Compare file size and modification date against the values
            # SimpleDirectoryReader stored when the file was indexed.
            # NOTE(review): mtime is compared at day granularity
            # ("%Y-%m-%d"), so a same-day edit that keeps the byte size
            # identical would go undetected — presumably acceptable for
            # journal entries; confirm.
            stat = fpath.stat()
            disk_size = stat.st_size
            # Must use UTC to match SimpleDirectoryReader's date format
            disk_mdate = datetime.datetime.fromtimestamp(
                stat.st_mtime, tz=datetime.timezone.utc
            ).strftime("%Y-%m-%d")

            stored_size = meta.get("file_size")
            stored_mdate = meta.get("last_modified_date")

            if disk_size != stored_size or disk_mdate != stored_mdate:
                modified_files.append((fpath, ref_id))
            else:
                unchanged += 1

    # Anything indexed but no longer on disk has been deleted
    for fname, (ref_id, meta) in indexed.items():
        if fname not in disk_files:
            deleted_files.append((fname, ref_id))

    # Report
    print(f"\n New: {len(new_files)}")
    print(f" Modified: {len(modified_files)}")
    print(f" Deleted: {len(deleted_files)}")
    print(f" Unchanged: {unchanged}")

    if not new_files and not modified_files and not deleted_files:
        print("\nNothing to do.")
        return

    # Process deletions (including modified files that need re-indexing).
    # Modified files are deleted first, then re-inserted below, so their
    # stale chunks never coexist with fresh ones.
    for fname, ref_id in deleted_files:
        print(f" Removing {fname}")
        index.delete_ref_doc(ref_id, delete_from_docstore=True)

    for fpath, ref_id in modified_files:
        print(f" Re-indexing {fpath.name} (modified)")
        index.delete_ref_doc(ref_id, delete_from_docstore=True)

    # Process additions (new files + modified files)
    files_to_add = new_files + [fpath for fpath, _ in modified_files]
    if files_to_add:
        print(f"\nIndexing {len(files_to_add)} file(s)...")
        # Use "./" prefix to match paths from full build (pathlib strips it)
        docs = SimpleDirectoryReader(
            input_files=[f"./{f}" for f in files_to_add]
        ).load_data()
        for doc in docs:
            index.insert(doc)

    # Persist the updated index back to disk
    index.storage_context.persist(persist_dir=PERSIST_DIR)
    print(f"\nIndex updated and saved to {PERSIST_DIR}")
|
||||
|
||||
|
||||
def main():
    """CLI entry point: parse flags, configure embeddings, run a build mode.

    With --rebuild (or when no store exists yet) a full rebuild is
    performed; otherwise the existing index is updated incrementally.
    """
    arg_parser = argparse.ArgumentParser(
        description="Build or update the vector store from journal entries."
    )
    arg_parser.add_argument(
        "--rebuild",
        action="store_true",
        help="Full rebuild from scratch (default: incremental update)",
    )
    opts = arg_parser.parse_args()

    # Configure embedding model
    Settings.embed_model = HuggingFaceEmbedding(model_name=EMBED_MODEL_NAME)

    started_at = time.time()

    if opts.rebuild:
        print("Mode: full rebuild")
        rebuild()
    else:
        print("Mode: incremental update")
        if Path(PERSIST_DIR).exists():
            update()
        else:
            # Nothing to update incrementally — fall back to a full build.
            print(f"No existing index at {PERSIST_DIR}, doing full rebuild.")
            rebuild()

    print(f"Done in {time.time() - started_at:.1f}s")
|
||||
|
||||
|
||||
# Script entry point: only run when executed directly, not when imported.
if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue