# build.py
#
# Import documents from data, generate embedded vector store
# and save to disk in directory ./storage
#
# August 2025
# E. M. Furst

from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    Settings,
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter

# Default locations and model used when main() is called without arguments.
DATA_DIR = "./data"
PERSIST_DIR = "./storage"
MODEL_CACHE_DIR = "./models"
EMBED_MODEL_NAME = "BAAI/bge-large-en-v1.5"

# Default splitter settings passed to SentenceSplitter.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50


def main(
    data_dir: str = DATA_DIR,
    persist_dir: str = PERSIST_DIR,
    *,
    model_name: str = EMBED_MODEL_NAME,
    cache_folder: str = MODEL_CACHE_DIR,
    chunk_size: int = CHUNK_SIZE,
    chunk_overlap: int = CHUNK_OVERLAP,
) -> None:
    """Build a vector index from the documents in a directory and save it.

    Loads documents with SimpleDirectoryReader, splits them with a
    SentenceSplitter, builds a VectorStoreIndex using a HuggingFace
    embedding model, and persists the index's storage context to disk.

    Calling ``main()`` with no arguments uses the module-level defaults.

    Args:
        data_dir: Directory passed to SimpleDirectoryReader.
        persist_dir: Directory the storage context is persisted to.
        model_name: Model name passed to HuggingFaceEmbedding.
        cache_folder: Cache folder passed to HuggingFaceEmbedding.
        chunk_size: ``chunk_size`` passed to SentenceSplitter.
        chunk_overlap: ``chunk_overlap`` passed to SentenceSplitter.
    """
    # Choose the embedding model and register it in the global settings.
    embed_model = HuggingFaceEmbedding(
        cache_folder=cache_folder,
        model_name=model_name,
    )
    Settings.embed_model = embed_model

    # Load documents
    documents = SimpleDirectoryReader(data_dir).load_data()

    # Create the custom text splitter with the configured chunk size and
    # overlap (defaults: 500 and 50).
    text_splitter = SentenceSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    # Also registered globally, in addition to being passed explicitly to
    # the index build below.
    Settings.text_splitter = text_splitter

    # Build the index
    index = VectorStoreIndex.from_documents(
        documents,
        transformations=[text_splitter],
        show_progress=True,
    )

    # Persist both vector store and index metadata
    index.storage_context.persist(persist_dir=persist_dir)

    print(f"Index built and saved to {persist_dir}")


if __name__ == "__main__":
    main()