Initial commit: RAG demo with build and query scripts

2026-02-22 12:41:55 -05:00 · 2026-02-22 12:41:55 -05:00 · 39f1f73e2a
commit 39f1f73e2a
6 changed files with 214 additions and 0 deletions
--- a/build.py
+++ b/build.py
@ -0,0 +1,49 @@
+# build.py
+#
+# Import documents from data, generate embedded vector store
+# and save to disk in directory ./storage
+#
+# August 2025
+# E. M. Furst
+
+from llama_index.core import (
+    SimpleDirectoryReader,
+    VectorStoreIndex,
+    Settings,
+)
+
+from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+from llama_index.core.node_parser import SentenceSplitter
+
+def main():
+    # Choose your embedding model
+    embed_model = HuggingFaceEmbedding(cache_folder="./models",
+        model_name="BAAI/bge-large-en-v1.5")
+
+    # Configure global settings for LlamaIndex
+    Settings.embed_model = embed_model
+
+    # Load documents
+    documents = SimpleDirectoryReader("./data").load_data()
+
+    # Create the custom text splitter
+    # Set chunk size and overlap (here: chunk_size=500, chunk_overlap=50)
+    text_splitter = SentenceSplitter(
+        chunk_size=500,
+        chunk_overlap=50,
+    )
+    Settings.text_splitter = text_splitter
+
+    # Build the index
+    index = VectorStoreIndex.from_documents(
+        documents, transformations=[text_splitter],
+        show_progress=True,
+    )
+
+    # Persist both vector store and index metadata
+    index.storage_context.persist(persist_dir="./storage")
+
+    print("Index built and saved to ./storage")
+
+if __name__ == "__main__":
+    main()