Rename storage_exp/ to store/, remove unused storage/
Update all active scripts, .gitignore, CLAUDE.md, and README.md. Also fix stale filename references in script header comments.
This commit is contained in:
parent
5a3294f74c
commit
13785d667a
5 changed files with 12 additions and 13 deletions
3
.gitignore
vendored
3
.gitignore
vendored
|
|
@ -7,8 +7,7 @@ __pycache__/
|
||||||
models/
|
models/
|
||||||
|
|
||||||
# Vector stores (large, rebuild with build scripts)
|
# Vector stores (large, rebuild with build scripts)
|
||||||
storage_exp/
|
store/
|
||||||
storage/
|
|
||||||
storage_clippings/
|
storage_clippings/
|
||||||
|
|
||||||
# Data (symlinks to private files)
|
# Data (symlinks to private files)
|
||||||
|
|
|
||||||
|
|
@ -27,7 +27,7 @@ ssearch/
|
||||||
│ └── retrieve_clippings.py # Verbatim clippings chunk retrieval
|
│ └── retrieve_clippings.py # Verbatim clippings chunk retrieval
|
||||||
├── data/ # Symlink to journal .txt files
|
├── data/ # Symlink to journal .txt files
|
||||||
├── clippings/ # Symlink to clippings (PDFs, TXT, webarchive, RTF)
|
├── clippings/ # Symlink to clippings (PDFs, TXT, webarchive, RTF)
|
||||||
├── storage_exp/ # Persisted journal vector store (~242 MB)
|
├── store/ # Persisted journal vector store (~242 MB)
|
||||||
├── storage_clippings/ # Persisted clippings vector store (ChromaDB)
|
├── storage_clippings/ # Persisted clippings vector store (ChromaDB)
|
||||||
├── models/ # Cached HuggingFace models (offline)
|
├── models/ # Cached HuggingFace models (offline)
|
||||||
├── archived/ # Superseded script versions
|
├── archived/ # Superseded script versions
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
# build_exp_claude.py
|
# build_store.py
|
||||||
#
|
#
|
||||||
# Build or update the vector store from journal entries in ./data.
|
# Build or update the vector store from journal entries in ./data.
|
||||||
#
|
#
|
||||||
|
|
@ -26,7 +26,7 @@ import time
|
||||||
|
|
||||||
# Shared constants
|
# Shared constants
|
||||||
DATA_DIR = Path("./data")
|
DATA_DIR = Path("./data")
|
||||||
PERSIST_DIR = "./storage_exp"
|
PERSIST_DIR = "./store"
|
||||||
EMBED_MODEL_NAME = "BAAI/bge-large-en-v1.5"
|
EMBED_MODEL_NAME = "BAAI/bge-large-en-v1.5"
|
||||||
CHUNK_SIZE = 256
|
CHUNK_SIZE = 256
|
||||||
CHUNK_OVERLAP = 25
|
CHUNK_OVERLAP = 25
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
# query_hybrid_bm25_v4.py
|
# query_hybrid.py
|
||||||
# Hybrid retrieval: BM25 (sparse) + vector similarity (dense) + cross-encoder
|
# Hybrid retrieval: BM25 (sparse) + vector similarity (dense) + cross-encoder
|
||||||
#
|
#
|
||||||
# Combines two retrieval strategies to catch both exact term matches and
|
# Combines two retrieval strategies to catch both exact term matches and
|
||||||
|
|
@ -40,7 +40,7 @@ import sys
|
||||||
# Globals
|
# Globals
|
||||||
#
|
#
|
||||||
|
|
||||||
# Embedding model (must match build_exp_claude.py)
|
# Embedding model (must match build_store.py)
|
||||||
EMBED_MODEL = HuggingFaceEmbedding(cache_folder="./models", model_name="BAAI/bge-large-en-v1.5", local_files_only=True)
|
EMBED_MODEL = HuggingFaceEmbedding(cache_folder="./models", model_name="BAAI/bge-large-en-v1.5", local_files_only=True)
|
||||||
|
|
||||||
# LLM model for generation
|
# LLM model for generation
|
||||||
|
|
@ -105,7 +105,7 @@ def main():
|
||||||
|
|
||||||
|
|
||||||
# Load persisted vector store
|
# Load persisted vector store
|
||||||
storage_context = StorageContext.from_defaults(persist_dir="./storage_exp")
|
storage_context = StorageContext.from_defaults(persist_dir="./store")
|
||||||
index = load_index_from_storage(storage_context)
|
index = load_index_from_storage(storage_context)
|
||||||
|
|
||||||
# --- Retrievers ---
|
# --- Retrievers ---
|
||||||
|
|
|
||||||
10
retrieve.py
10
retrieve.py
|
|
@ -1,9 +1,9 @@
|
||||||
# retrieve_hybrid_raw.py
|
# retrieve.py
|
||||||
# Hybrid verbatim chunk retrieval: BM25 + vector search + cross-encoder, no LLM.
|
# Hybrid verbatim chunk retrieval: BM25 + vector search + cross-encoder, no LLM.
|
||||||
#
|
#
|
||||||
# Same hybrid retrieval as query_hybrid_bm25_v4.py but outputs raw chunk text
|
# Same hybrid retrieval as query_hybrid.py but outputs raw chunk text
|
||||||
# instead of LLM synthesis. Useful for inspecting what the hybrid pipeline
|
# instead of LLM synthesis. Useful for inspecting what the hybrid pipeline
|
||||||
# retrieves and comparing against retrieve_raw.py (vector-only).
|
# retrieves.
|
||||||
#
|
#
|
||||||
# Each chunk is annotated with its source (vector, BM25, or both) so you can
|
# Each chunk is annotated with its source (vector, BM25, or both) so you can
|
||||||
# see which retriever nominated it.
|
# see which retriever nominated it.
|
||||||
|
|
@ -33,7 +33,7 @@ import textwrap
|
||||||
# Globals
|
# Globals
|
||||||
#
|
#
|
||||||
|
|
||||||
# Embedding model (must match build_exp_claude.py)
|
# Embedding model (must match build_store.py)
|
||||||
EMBED_MODEL = HuggingFaceEmbedding(cache_folder="./models", model_name="BAAI/bge-large-en-v1.5", local_files_only=True)
|
EMBED_MODEL = HuggingFaceEmbedding(cache_folder="./models", model_name="BAAI/bge-large-en-v1.5", local_files_only=True)
|
||||||
|
|
||||||
# Cross-encoder model for re-ranking (cached in ./models/)
|
# Cross-encoder model for re-ranking (cached in ./models/)
|
||||||
|
|
@ -53,7 +53,7 @@ def main():
|
||||||
Settings.embed_model = EMBED_MODEL
|
Settings.embed_model = EMBED_MODEL
|
||||||
|
|
||||||
# Load persisted vector store
|
# Load persisted vector store
|
||||||
storage_context = StorageContext.from_defaults(persist_dir="./storage_exp")
|
storage_context = StorageContext.from_defaults(persist_dir="./store")
|
||||||
index = load_index_from_storage(storage_context)
|
index = load_index_from_storage(storage_context)
|
||||||
|
|
||||||
# --- Retrievers ---
|
# --- Retrievers ---
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue