Initial commit: RAG pipeline for semantic search over personal journal archive
Vector search with cross-encoder re-ranking, hybrid BM25+vector retrieval, incremental index updates, and multiple LLM backends (Ollama local, OpenAI API).
This commit is contained in:
commit
e9fc99ddc6
43 changed files with 7349 additions and 0 deletions
68
archived/build_exp.py
Normal file
68
archived/build_exp.py
Normal file
|
|
@ -0,0 +1,68 @@
|
|||
# build_exp.py
|
||||
#
|
||||
# Import document from data, generate embedded vector store
|
||||
# and save to disk
|
||||
#
|
||||
# Experiment to include text chunking with a textsplitter
|
||||
#
|
||||
# August 2025
|
||||
# E. M. Furst
|
||||
|
||||
from llama_index.core import (
|
||||
SimpleDirectoryReader,
|
||||
VectorStoreIndex,
|
||||
Settings,
|
||||
)
|
||||
|
||||
from pathlib import Path
|
||||
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
|
||||
from llama_index.core.node_parser import SentenceSplitter
|
||||
|
||||
def main():
    """Build a vector index over ./data with sentence-level chunking and persist it.

    Experimental variant of the index builder: documents are chunked with a
    SentenceSplitter before embedding, and the resulting index is written to
    ./storage_exp on disk.
    """
    # Embedding model for this experiment. BAAI/bge-large-en-v1.5 embeds more
    # slowly than all-mpnet-base-v2 but was preferred here for quality.
    embedding = HuggingFaceEmbedding(model_name="BAAI/bge-large-en-v1.5")

    # Register the model globally so LlamaIndex uses it during the build.
    Settings.embed_model = embedding

    # Read every document out of the local data directory.
    corpus = SimpleDirectoryReader("./data").load_data()

    # Sentence-aware chunker: 256-token chunks with a 25-token overlap,
    # treating blank lines as paragraph boundaries. See:
    # https://docs.llamaindex.ai/en/stable/api_reference/node_parsers/sentence_splitter/#llama_index.core.node_parser.SentenceSplitter
    splitter = SentenceSplitter(
        chunk_size=256,
        chunk_overlap=25,
        paragraph_separator="\n\n",
    )

    # NOTE(review): the splitter is passed per-build via `transformations`
    # rather than set globally on Settings.text_splitter — this keeps the
    # chunking choice local to this experiment.
    index = VectorStoreIndex.from_documents(
        corpus,
        transformations=[splitter],
        show_progress=True,
    )

    # Persist both the vector store and the index metadata together.
    index.storage_context.persist(persist_dir="./storage_exp")

    print("Index built and saved to ./storage_exp")
|
||||
|
||||
# Script entry point: build and persist the experimental index when run
# directly (no effect when this module is imported).
if __name__ == "__main__":
    main()
|
||||
|
||||
Loading…
Add table
Add a link
Reference in a new issue