- 03-rag, 04-semantic-search: env-var-before-imports fix in build/query scripts - 03-rag: new libraries section, fetch_arxiv.py, exercises for larger corpus and finding current SOTA models, formal references (Lewis, Booth) - 04-semantic-search: libraries pointer back to Part III, larger corpus subsection, model-update exercise, formal references - 06-neural-networks: add Nielsen reference (recommended by student) - README: vocab.md link, agentic systems in description, Ollama prereq for 02-05 - New: vocab.md (glossary organized by section) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
57 lines
1.6 KiB
Python
57 lines
1.6 KiB
Python
# build.py
|
|
#
|
|
# Import documents from data, generate embedded vector store
|
|
# and save to disk in directory ./storage
|
|
#
|
|
# August 2025
|
|
# E. M. Furst
|
|
|
|
# Environment vars must be set before importing huggingface/transformers
|
|
# libraries, because huggingface_hub.constants evaluates HF_HUB_OFFLINE
|
|
# at import time.
|
|
import os

# Values every run needs in place before the first huggingface import:
#   TOKENIZERS_PARALLELISM   -> "false": disable tokenizer parallelism
#   SENTENCE_TRANSFORMERS_HOME -> local ./models cache directory
#   HF_HUB_OFFLINE           -> "1": do not contact the Hub at runtime
_HF_ENV = {
    "TOKENIZERS_PARALLELISM": "false",
    "SENTENCE_TRANSFORMERS_HOME": "./models",
    "HF_HUB_OFFLINE": "1",
}
os.environ.update(_HF_ENV)
|
|
|
|
from llama_index.core import (
|
|
SimpleDirectoryReader,
|
|
VectorStoreIndex,
|
|
Settings,
|
|
)
|
|
|
|
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
|
|
from llama_index.core.node_parser import SentenceSplitter
|
|
|
|
def main(
    data_dir="./data",
    persist_dir="./storage",
    model_name="BAAI/bge-large-en-v1.5",
    cache_folder="./models",
    chunk_size=500,
    chunk_overlap=50,
):
    """Build a vector index from the documents in *data_dir* and persist it.

    Loads every document under ``data_dir``, embeds it with the given
    HuggingFace sentence-embedding model, splits the text into
    overlapping chunks, and writes the resulting vector store and index
    metadata to ``persist_dir``.

    Parameters
    ----------
    data_dir : str
        Directory containing the source documents to index.
    persist_dir : str
        Directory where the vector store and index metadata are saved.
    model_name : str
        HuggingFace model id for the embedding model.
    cache_folder : str
        Local cache directory for the embedding model weights.
    chunk_size : int
        Target chunk size, in tokens, for the sentence splitter.
    chunk_overlap : int
        Number of tokens of overlap between consecutive chunks.
    """
    # Choose the embedding model. Weights come from the local cache —
    # HF_HUB_OFFLINE is set at the top of this file, so nothing is
    # downloaded at run time.
    embed_model = HuggingFaceEmbedding(cache_folder=cache_folder,
                                       model_name=model_name)

    # Configure global settings for LlamaIndex
    Settings.embed_model = embed_model

    # Load documents
    documents = SimpleDirectoryReader(data_dir).load_data()

    # Custom text splitter. chunk_size/chunk_overlap are measured in
    # tokens (defaults: 500-token chunks with a 50-token overlap).
    # NOTE: a previous comment here said "256 tokens, 25 tokens overlap",
    # which did not match the actual values — fixed.
    text_splitter = SentenceSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    Settings.text_splitter = text_splitter

    # Build the index (shows a progress bar while embedding)
    index = VectorStoreIndex.from_documents(
        documents, transformations=[text_splitter],
        show_progress=True,
    )

    # Persist both vector store and index metadata
    index.storage_context.persist(persist_dir=persist_dir)

    print(f"Index built and saved to {persist_dir}")


if __name__ == "__main__":
    main()
|