Initial commit: RAG pipeline for semantic search over personal journal archive

Vector search with cross-encoder re-ranking, hybrid BM25+vector retrieval,
incremental index updates, and multiple LLM backends (Ollama local, OpenAI API).
Eric 2026-02-20 06:02:28 -05:00
commit e9fc99ddc6
43 changed files with 7349 additions and 0 deletions
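For orientation, a minimal sketch (not part of this commit) of the hybrid BM25+vector retrieval with cross-encoder re-ranking that the message describes, using recent LlamaIndex APIs; the reranker model, top-k values, and sample query are illustrative assumptions:

from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import QueryFusionRetriever
from llama_index.retrievers.bm25 import BM25Retriever

# `index` is a loaded VectorStoreIndex, as in archived/vs_metrics.py below.
vector_retriever = index.as_retriever(similarity_top_k=10)
bm25_retriever = BM25Retriever.from_defaults(
    docstore=index.docstore, similarity_top_k=10
)
hybrid = QueryFusionRetriever(
    [vector_retriever, bm25_retriever],
    similarity_top_k=10,
    num_queries=1,              # no LLM query expansion
    mode="reciprocal_rerank",   # fuse the two ranked lists by reciprocal rank
)
reranker = SentenceTransformerRerank(
    model="cross-encoder/ms-marco-MiniLM-L-6-v2", top_n=5
)
engine = RetrieverQueryEngine.from_args(hybrid, node_postprocessors=[reranker])
print(engine.query("what did I write about sourdough last winter?"))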

archived/vs_metrics.py Normal file

@@ -0,0 +1,27 @@
# vs_metrics.py
# Quantify vector store properties and performance
#
# E.M.F. August 2025
# Read in vector store
# What are properties of the vector store?
# - number of vectors
# - distribution of distances
# - clustering?
from llama_index.core import (
    Settings,
    StorageContext,
    load_index_from_storage,
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
# Load embedding model (must match the one used to build the vector store)
embed_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/all-mpnet-base-v2"
)
Settings.embed_model = embed_model
# Load persisted vector store + metadata
storage_context = StorageContext.from_defaults(persist_dir="./storage")
index = load_index_from_storage(storage_context)
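
The file stops after loading the index. A sketch of how the properties listed in the header could be computed, continuing from the code above; it assumes the default SimpleVectorStore (whose raw embeddings are exposed through vector_store.data.embedding_dict), and the sample size and k below are arbitrary illustrative choices:

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Pull raw embeddings out of the default (simple) vector store.
vector_store = storage_context.vector_store
embeddings = np.asarray(list(vector_store.data.embedding_dict.values()))

# 1. Number of vectors (and their dimensionality).
print(f"{embeddings.shape[0]} vectors, dim {embeddings.shape[1]}")

# 2. Distribution of pairwise cosine distances, on a random sample so the
#    cost is quadratic in the sample size rather than the corpus size.
rng = np.random.default_rng(0)
n = min(500, len(embeddings))
sample = embeddings[rng.choice(len(embeddings), size=n, replace=False)]
unit = sample / np.linalg.norm(sample, axis=1, keepdims=True)
dists = 1.0 - (unit @ unit.T)[np.triu_indices(n, k=1)]
print(f"cosine distance: mean {dists.mean():.3f}, std {dists.std():.3f}")

# 3. Rough clustering check: k-means silhouette score (k=8 is arbitrary).
labels = KMeans(n_clusters=8, n_init=10, random_state=0).fit_predict(unit)
print(f"silhouette @ k=8: {silhouette_score(unit, labels):.3f}")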