ssearch/archived/build.py
Eric e9fc99ddc6 Initial commit: RAG pipeline for semantic search over personal journal archive
Vector search with cross-encoder re-ranking, hybrid BM25+vector retrieval,
incremental index updates, and multiple LLM backends (Ollama local, OpenAI API).
2026-02-20 06:02:28 -05:00
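The query-side pipeline named in the commit message (hybrid BM25+vector retrieval with cross-encoder re-ranking) is not part of this file, which only builds the index. A minimal sketch of what that stage might look like in LlamaIndex, assuming the index persisted by build.py below, the llama-index-retrievers-bm25 extra, and an illustrative cross-encoder model; the file name, top-k settings, and model names are assumptions, not taken from this repo:

# query_sketch.py -- hypothetical companion script, not part of this commit.
from llama_index.core import Settings, StorageContext, load_index_from_storage
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.retrievers.bm25 import BM25Retriever

# The query embedder must match the model used at build time
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-large-en-v1.5")

# Load the index persisted by build.py
storage_context = StorageContext.from_defaults(persist_dir="./storage")
index = load_index_from_storage(storage_context)

query = "what did I write about sailing last summer?"

# Hybrid retrieval: union of dense (vector) and sparse (BM25) candidates
vector_nodes = index.as_retriever(similarity_top_k=10).retrieve(query)
bm25_nodes = BM25Retriever.from_defaults(
    docstore=index.docstore, similarity_top_k=10
).retrieve(query)

seen, candidates = set(), []
for n in vector_nodes + bm25_nodes:
    if n.node.node_id not in seen:  # dedupe nodes found by both retrievers
        seen.add(n.node.node_id)
        candidates.append(n)

# Cross-encoder re-ranking of the pooled candidates
reranker = SentenceTransformerRerank(
    model="cross-encoder/ms-marco-MiniLM-L-6-v2", top_n=5
)
for n in reranker.postprocess_nodes(candidates, query_str=query):
    print(f"{n.score:.3f}  {n.node.get_content()[:80]}")

Here the hybrid step is a simple deduplicated union of the two candidate pools; a score-fusion scheme such as reciprocal rank fusion would slot in at the same point before re-ranking.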


# build.py
#
# Load documents from ./data, build an embedded vector store,
# and persist it to disk in the ./storage directory
#
# August 2025
# E. M. Furst
from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    Settings,
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter


def main():
    # Choose your embedding model
    embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-large-en-v1.5")

    # Configure global settings for LlamaIndex
    Settings.embed_model = embed_model

    # Load documents
    documents = SimpleDirectoryReader("./data").load_data()

    # Create the custom text splitter
    # Set chunk size and overlap (here 256 tokens, 25 tokens overlap); see
    # https://docs.llamaindex.ai/en/stable/api_reference/node_parsers/sentence_splitter/#llama_index.core.node_parser.SentenceSplitter
    text_splitter = SentenceSplitter(
        chunk_size=256,
        chunk_overlap=25,
        paragraph_separator="\n\n",  # use double newlines to separate paragraphs
    )
    Settings.text_splitter = text_splitter

    # Build the index
    index = VectorStoreIndex.from_documents(
        documents,
        transformations=[text_splitter],
        show_progress=True,
    )

    # Persist both vector store and index metadata
    index.storage_context.persist(persist_dir="./storage")
    print("Index built and saved to ./storage")


if __name__ == "__main__":
    main()
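
build.py re-embeds the full ./data directory on every run. The commit message also mentions incremental index updates; a sketch of one way to do that with LlamaIndex's refresh_ref_docs(), under the assumption that both build time and update time load documents with filename_as_id=True so document ids are stable (the script above does not set that flag), and with a hypothetical file name:

# update_sketch.py -- hypothetical incremental-update companion, not in this commit.
from llama_index.core import (
    Settings,
    SimpleDirectoryReader,
    StorageContext,
    load_index_from_storage,
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Must match the embedding model used by build.py
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-large-en-v1.5")

# Load the existing index from ./storage
storage_context = StorageContext.from_defaults(persist_dir="./storage")
index = load_index_from_storage(storage_context)

# filename_as_id=True gives documents stable ids, so refresh_ref_docs()
# can compare content hashes and re-embed only new or changed files
documents = SimpleDirectoryReader("./data", filename_as_id=True).load_data()
refreshed = index.refresh_ref_docs(documents)
print(f"{sum(refreshed)} of {len(refreshed)} documents inserted or updated")

# Persist the updated index back to ./storage
index.storage_context.persist(persist_dir="./storage")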