ssearch/archived/build_exp.py

# build_exp.py
#
# Import documents from ./data, generate an embedded vector store,
# and save it to disk
#
# Experiment to include text chunking with a textsplitter
#
# August 2025
# E. M. Furst

from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    Settings,
)

from pathlib import Path
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter

def main(
    *,
    data_dir: str = "./data",
    persist_dir: str = "./storage_exp",
    embed_model_name: str = "BAAI/bge-large-en-v1.5",
    chunk_size: int = 256,
    chunk_overlap: int = 25,
) -> None:
    """Build a sentence-chunked vector index from a directory and persist it.

    Loads every document found under ``data_dir``, splits the text with a
    ``SentenceSplitter``, embeds the chunks with a HuggingFace embedding
    model, and writes the resulting index (vector store plus index
    metadata) to ``persist_dir``.

    All parameters are keyword-only and default to the values used by the
    original experiment, so calling ``main()`` with no arguments reproduces
    the original run.

    Args:
        data_dir: Directory the documents are read from.
        persist_dir: Directory the index storage context is persisted to.
        embed_model_name: HuggingFace model name used for embeddings.
            ``all-mpnet-base-v2`` was the alternative tried in this
            experiment; embedding is slower with the default
            ``BAAI/bge-large-en-v1.5``.
        chunk_size: ``chunk_size`` passed to ``SentenceSplitter``.
        chunk_overlap: ``chunk_overlap`` passed to ``SentenceSplitter``.

    Raises:
        ValueError: If ``data_dir`` is not an existing directory.
    """
    # Fail fast: validate the input directory before the (slow) embedding
    # model is loaded, so a bad path is reported immediately.
    if not Path(data_dir).is_dir():
        raise ValueError(f"Directory {data_dir} does not exist.")

    # Register the embedding model in LlamaIndex's global settings.
    Settings.embed_model = HuggingFaceEmbedding(model_name=embed_model_name)

    # Load documents.
    # NOTE: the experiment left a `file_metadata` callable (attaching the
    # file name and resolved absolute path to each document) disabled here.
    documents = SimpleDirectoryReader(data_dir).load_data()

    # Custom text splitter; paragraphs are separated by double newlines.
    # see https://docs.llamaindex.ai/en/stable/api_reference/node_parsers/sentence_splitter/#llama_index.core.node_parser.SentenceSplitter
    text_splitter = SentenceSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        paragraph_separator="\n\n",
    )

    # The splitter is handed to the index build explicitly via
    # `transformations`; Settings.text_splitter is deliberately NOT set as
    # well, because doing both was suspected of causing problems.
    index = VectorStoreIndex.from_documents(
        documents,
        transformations=[text_splitter],
        show_progress=True,
    )

    # Persist both vector store and index metadata.
    index.storage_context.persist(persist_dir=persist_dir)

    print(f"Index built and saved to {persist_dir}")

# Script entry point: build and persist the index only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()