Initial commit: RAG pipeline for semantic search over personal journal archive
Vector search with cross-encoder re-ranking, hybrid BM25+vector retrieval, incremental index updates, and multiple LLM backends (Ollama local, OpenAI API).
This commit is contained in:
commit
e9fc99ddc6
43 changed files with 7349 additions and 0 deletions
68
archived/build_exp.py
Normal file
68
archived/build_exp.py
Normal file
|
|
@ -0,0 +1,68 @@
|
|||
# build_exp.py
|
||||
#
|
||||
# Import document from data, generate embedded vector store
|
||||
# and save to disk
|
||||
#
|
||||
# Experiment to include text chunking with a textsplitter
|
||||
#
|
||||
# August 2025
|
||||
# E. M. Furst
|
||||
|
||||
from llama_index.core import (
|
||||
SimpleDirectoryReader,
|
||||
VectorStoreIndex,
|
||||
Settings,
|
||||
)
|
||||
|
||||
from pathlib import Path
|
||||
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
|
||||
from llama_index.core.node_parser import SentenceSplitter
|
||||
|
||||
def main():
    """Build a vector index over ./data with sentence-level chunking and persist it.

    Experimental variant of the index builder: documents are chunked with a
    SentenceSplitter before embedding, and the resulting index is written to
    ./storage_exp on disk.
    """
    # Embedding model for this experiment. BAAI/bge-large-en-v1.5 embeds more
    # slowly than all-mpnet-base-v2 but was preferred here for quality.
    embedding = HuggingFaceEmbedding(model_name="BAAI/bge-large-en-v1.5")

    # Register the model globally so LlamaIndex uses it during the build.
    Settings.embed_model = embedding

    # Read every document out of the local data directory.
    corpus = SimpleDirectoryReader("./data").load_data()

    # Sentence-aware chunker: 256-token chunks with a 25-token overlap,
    # treating blank lines as paragraph boundaries. See:
    # https://docs.llamaindex.ai/en/stable/api_reference/node_parsers/sentence_splitter/#llama_index.core.node_parser.SentenceSplitter
    splitter = SentenceSplitter(
        chunk_size=256,
        chunk_overlap=25,
        paragraph_separator="\n\n",
    )

    # NOTE(review): the splitter is passed per-build via `transformations`
    # rather than set globally on Settings.text_splitter — this keeps the
    # chunking choice local to this experiment.
    index = VectorStoreIndex.from_documents(
        corpus,
        transformations=[splitter],
        show_progress=True,
    )

    # Persist both the vector store and the index metadata together.
    index.storage_context.persist(persist_dir="./storage_exp")

    print("Index built and saved to ./storage_exp")
|
||||
|
||||
# Script entry point: build and persist the experimental index when run
# directly (no effect when this module is imported).
if __name__ == "__main__":
    main()
|
||||
|
||||
Loading…
Add table
Add a link
Reference in a new issue