From 13785d667a355edf55432a903b78fad0b90e5fac Mon Sep 17 00:00:00 2001 From: Eric Furst Date: Thu, 26 Feb 2026 16:36:57 -0500 Subject: [PATCH] Rename storage_exp/ to store/, remove unused storage/ Update all active scripts, .gitignore, CLAUDE.md, and README.md. Also fix stale filename references in script header comments. --- .gitignore | 3 +-- README.md | 2 +- build_store.py | 4 ++-- query_hybrid.py | 6 +++--- retrieve.py | 10 +++++----- 5 files changed, 12 insertions(+), 13 deletions(-) diff --git a/.gitignore b/.gitignore index 7e4dde6..247d3bd 100644 --- a/.gitignore +++ b/.gitignore @@ -7,8 +7,7 @@ __pycache__/ models/ # Vector stores (large, rebuild with build scripts) -storage_exp/ -storage/ +store/ storage_clippings/ # Data (symlinks to private files) diff --git a/README.md b/README.md index a2d5ded..42ae8cd 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ ssearch/ │ └── retrieve_clippings.py # Verbatim clippings chunk retrieval ├── data/ # Symlink to journal .txt files ├── clippings/ # Symlink to clippings (PDFs, TXT, webarchive, RTF) -├── storage_exp/ # Persisted journal vector store (~242 MB) +├── store/ # Persisted journal vector store (~242 MB) ├── storage_clippings/ # Persisted clippings vector store (ChromaDB) ├── models/ # Cached HuggingFace models (offline) ├── archived/ # Superseded script versions diff --git a/build_store.py b/build_store.py index 891507b..add3db3 100644 --- a/build_store.py +++ b/build_store.py @@ -1,4 +1,4 @@ -# build_exp_claude.py +# build_store.py # # Build or update the vector store from journal entries in ./data. # @@ -26,7 +26,7 @@ import time # Shared constants DATA_DIR = Path("./data") -PERSIST_DIR = "./storage_exp" +PERSIST_DIR = "./store" EMBED_MODEL_NAME = "BAAI/bge-large-en-v1.5" CHUNK_SIZE = 256 CHUNK_OVERLAP = 25 diff --git a/query_hybrid.py b/query_hybrid.py index 277f870..e32d942 100644 --- a/query_hybrid.py +++ b/query_hybrid.py @@ -1,4 +1,4 @@ -# query_hybrid_bm25_v4.py +# query_hybrid.py # Hybrid retrieval: BM25 (sparse) + vector similarity (dense) + cross-encoder # # Combines two retrieval strategies to catch both exact term matches and @@ -40,7 +40,7 @@ import sys # Globals # -# Embedding model (must match build_exp_claude.py) +# Embedding model (must match build_store.py) EMBED_MODEL = HuggingFaceEmbedding(cache_folder="./models", model_name="BAAI/bge-large-en-v1.5", local_files_only=True) # LLM model for generation @@ -105,7 +105,7 @@ def main(): # Load persisted vector store - storage_context = StorageContext.from_defaults(persist_dir="./storage_exp") + storage_context = StorageContext.from_defaults(persist_dir="./store") index = load_index_from_storage(storage_context) # --- Retrievers --- diff --git a/retrieve.py b/retrieve.py index ff64140..28f92e1 100644 --- a/retrieve.py +++ b/retrieve.py @@ -1,9 +1,9 @@ -# retrieve_hybrid_raw.py +# retrieve.py # Hybrid verbatim chunk retrieval: BM25 + vector search + cross-encoder, no LLM. # -# Same hybrid retrieval as query_hybrid_bm25_v4.py but outputs raw chunk text +# Same hybrid retrieval as query_hybrid.py but outputs raw chunk text # instead of LLM synthesis. Useful for inspecting what the hybrid pipeline -# retrieves and comparing against retrieve_raw.py (vector-only). +# retrieves. # # Each chunk is annotated with its source (vector, BM25, or both) so you can # see which retriever nominated it. @@ -33,7 +33,7 @@ import textwrap # Globals # -# Embedding model (must match build_exp_claude.py) +# Embedding model (must match build_store.py) EMBED_MODEL = HuggingFaceEmbedding(cache_folder="./models", model_name="BAAI/bge-large-en-v1.5", local_files_only=True) # Cross-encoder model for re-ranking (cached in ./models/) @@ -53,7 +53,7 @@ def main(): Settings.embed_model = EMBED_MODEL # Load persisted vector store - storage_context = StorageContext.from_defaults(persist_dir="./storage_exp") + storage_context = StorageContext.from_defaults(persist_dir="./store") index = load_index_from_storage(storage_context) # --- Retrievers ---