Initial commit: RAG pipeline for semantic search over personal journal archive

Vector search with cross-encoder re-ranking, hybrid BM25+vector retrieval,
incremental index updates, and multiple LLM backends (Ollama local, OpenAI API).
This commit is contained in:
Eric 2026-02-20 06:02:28 -05:00
commit e9fc99ddc6
43 changed files with 7349 additions and 0 deletions

51
archived/build.py Normal file
View file

@ -0,0 +1,51 @@
# build.py
#
# Import documents from data, generate embedded vector store
# and save to disk in directory ./storage
#
# August 2025
# E. M. Furst
from llama_index.core import (
SimpleDirectoryReader,
VectorStoreIndex,
Settings,
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter
def main():
    """Build a vector index from ./data and persist it to ./storage.

    Embeds with BAAI/bge-large-en-v1.5 and chunks documents with a
    sentence-aware splitter (256-token chunks, 25-token overlap).
    """
    # Register the embedding model globally so the index build uses it.
    Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-large-en-v1.5")

    # Sentence-aware chunking; double newlines mark paragraph boundaries.
    # see https://docs.llamaindex.ai/en/stable/api_reference/node_parsers/sentence_splitter/#llama_index.core.node_parser.SentenceSplitter
    splitter = SentenceSplitter(
        chunk_size=256,
        chunk_overlap=25,
        paragraph_separator="\n\n",
    )
    Settings.text_splitter = splitter

    # Read every file under ./data and embed it into a vector index.
    docs = SimpleDirectoryReader("./data").load_data()
    index = VectorStoreIndex.from_documents(
        docs,
        transformations=[splitter],
        show_progress=True,
    )

    # Persist both vector store and index metadata for later querying.
    index.storage_context.persist(persist_dir="./storage")
    print("Index built and saved to ./storage")


if __name__ == "__main__":
    main()

68
archived/build_exp.py Normal file
View file

@ -0,0 +1,68 @@
# build_exp.py
#
# Import document from data, generate embedded vector store
# and save to disk
#
# Experiment to include text chunking with a textsplitter
#
# August 2025
# E. M. Furst
from llama_index.core import (
SimpleDirectoryReader,
VectorStoreIndex,
Settings,
)
from pathlib import Path
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter
def main():
    """Experimental index build.

    Same pipeline as build.py, but persists to ./storage_exp and passes the
    splitter only via ``transformations`` (Settings.text_splitter is
    deliberately left unset -- see the note below).
    """
    # Choose your embedding model
    #embed_model = HuggingFaceEmbedding(model_name="all-mpnet-base-v2")
    # embedding is slower with BAAI/bge-large-en-v1.5
    embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-large-en-v1.5")

    # Configure global settings for LlamaIndex
    Settings.embed_model = embed_model

    # Load documents (capabilities?)
    documents = SimpleDirectoryReader(
        "./data",
        # # p is a string path
        # file_metadata=lambda p: {
        #     "filename": Path(p).name,  # just the file name
        #     "filepath": str(Path(p).resolve()),  # absolute path (handy for tracing)
        # },
    ).load_data()

    # Create the custom textsplitter
    # Set chunk size and overlap (256 tokens, 25 tokens overlap)
    # see https://docs.llamaindex.ai/en/stable/api_reference/node_parsers/sentence_splitter/#llama_index.core.node_parser.SentenceSplitter
    text_splitter = SentenceSplitter(
        chunk_size=256,
        chunk_overlap=25,
        paragraph_separator="\n\n",  # use double newlines to separate paragraphs
    )
    # b/c passing text_splitter in the index build, this may cause problems
    # test with it commented out...
    # Settings.text_splitter = text_splitter

    # Build the index
    index = VectorStoreIndex.from_documents(
        documents, transformations=[text_splitter],
        show_progress=True,
    )

    # Persist both vector store and index metadata
    index.storage_context.persist(persist_dir="./storage_exp")
    # storage_context = StorageContext.from_defaults(vector_store=index.vector_store)
    # storage_context.persist(persist_dir="./storage")
    print("Index built and saved to ./storage_exp")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,164 @@
# Better HyDE debugging with targeted tests
#
# Stand-alone diagnostic script: probes why HyDEQueryTransform may return
# the original query unchanged.  Every numbered section talks to a live
# local Ollama server, so the script performs network I/O at import time
# and the sections must run in order.
from llama_index.core.indices.query.query_transform import HyDEQueryTransform
from llama_index.core import PromptTemplate
from llama_index.core import Settings
from llama_index.core.base.base_query_engine import BaseQueryEngine  # unused here; kept from earlier iterations
from llama_index.llms.ollama import Ollama

# Ollama model tag used for every probe below.
llm="llama3.1:8B"

# Use a local model to generate
Settings.llm = Ollama(
    model=llm,  # First model tested
    request_timeout=360.0,
    context_window=8000,
    temperature=0.7,
)

# Test queries that should produce very different hypothetical documents
test_queries = [
    "What is the capital of France?",
    "How do you make chocolate chip cookies?",
    "Explain quantum physics",
    "Write a love letter",
    "Describe symptoms of the common cold"
]

print("=== DEBUGGING HYDE STEP BY STEP ===\n")

# 1. Test the LLM with HyDE-style prompts directly
print("1. Testing LLM directly with HyDE-style prompts:")
print("-" * 50)
for query in test_queries[:2]:  # Just test 2 to keep output manageable
    direct_prompt = f"""Generate a hypothetical document that would contain the answer to this query.
Query: {query}
Hypothetical document:"""
    response = Settings.llm.complete(direct_prompt)
    print(f"Query: {query}")
    print(f"Direct LLM Response: {response.text[:100]}...")
    print()

# 2. Check HyDE internals - let's see what's actually happening
print("\n2. Examining HyDE internal behavior:")
print("-" * 50)

# Create a custom HyDE that shows us everything
class VerboseHyDETransform(HyDEQueryTransform):
    # NOTE(review): these overrides assume HyDEQueryTransform dispatches
    # through _get_prompts / _run_component -- verify the hook names against
    # the installed llama_index version; if they differ the overrides are
    # silently ignored.
    def _get_prompts(self):
        """Show what prompts are being used"""
        prompts = super()._get_prompts()
        print(f"HyDE prompts: {prompts}")
        return prompts

    def _run_component(self, **kwargs):
        """Show what's being passed to the LLM"""
        print(f"HyDE _run_component kwargs: {kwargs}")
        result = super()._run_component(**kwargs)
        print(f"HyDE _run_component result: {result}")
        return result

# Test with verbose HyDE
verbose_hyde = VerboseHyDETransform(llm=Settings.llm)
test_result = verbose_hyde.run("What is machine learning?")
print(f"Final verbose result: {test_result}")

# 3. Try the most basic possible test
print("\n3. Most basic HyDE test:")
print("-" * 50)
basic_hyde = HyDEQueryTransform(llm=Settings.llm)
basic_result = basic_hyde.run("Paris")
print(f"Input: 'Paris'")
print(f"Output: '{basic_result}'")
print(f"Same as input? {basic_result.strip() == 'Paris'}")

# 4. Check if it's a version issue - try alternative approach
print("\n4. Alternative HyDE approach:")
print("-" * 50)
try:
    # Some versions might need different initialization
    from llama_index.core.query_engine import TransformQueryEngine
    from llama_index.core.indices.query.query_transform import HyDEQueryTransform

    # Try with explicit prompt template
    hyde_prompt_template = PromptTemplate(
        "Please write a passage to answer the question\n"
        "Try to include as many key details as possible\n"
        "\n"
        "\n"
        "Passage:{query_str}\n"
        "\n"
        "\n"
        "Passage:"
    )
    alt_hyde = HyDEQueryTransform(
        llm=Settings.llm,
        hyde_prompt=hyde_prompt_template
    )
    alt_result = alt_hyde.run("What causes rain?")
    print(f"Alternative approach result: {alt_result}")
except Exception as e:
    print(f"Alternative approach failed: {e}")

# 5. Check what happens with different query formats
print("\n5. Testing different input formats:")
print("-" * 50)
from llama_index.core.schema import QueryBundle

# Test with QueryBundle vs string
hyde_test = HyDEQueryTransform(llm=Settings.llm)
string_result = hyde_test.run("test query")
print(f"String input result: '{string_result}'")
query_bundle = QueryBundle(query_str="test query")
bundle_result = hyde_test.run(query_bundle)
print(f"QueryBundle input result: '{bundle_result}'")

# 6. Version and import check
print("\n6. Environment check:")
print("-" * 50)
import llama_index
print(f"LlamaIndex version: {llama_index.__version__}")
# Check what LLM you're actually using
print(f"LLM type: {type(Settings.llm)}")
print(f"LLM model name: {getattr(Settings.llm, 'model', 'Unknown')}")

# 7. Try the nuclear option - completely manual implementation
print("\n7. Manual HyDE implementation:")
print("-" * 50)
def manual_hyde(query: str, llm):
    """Completely manual HyDE: ask *llm* to draft a hypothetical document
    containing the answer, and return the raw generated text."""
    hyde_prompt = f"""You are an expert writer. Generate a realistic document excerpt that would contain the answer to this question.
Question: {query}
Document excerpt:"""
    completion = llm.complete(hyde_prompt)
    return completion.text
# Run the manual implementation against the live model; if this produces a
# real passage while HyDEQueryTransform does not, the problem is in the
# transform wiring, not in the LLM.
manual_result = manual_hyde("What is photosynthesis?", Settings.llm)
print(f"Manual HyDE result: {manual_result[:150]}...")

# 8. Final diagnostic
print("\n8. Final diagnostic questions:")
print("-" * 50)
print("If all the above show the LLM generating proper responses but HyDE still returns original:")
print("- What LLM are you using? (OpenAI, Anthropic, local model, etc.)")
print("- What's your LlamaIndex version?")
print("- Are there any error messages in the logs?")
print("- Does the LLM have any special configuration or wrappers?")

BIN
archived/output.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 785 KiB

110
archived/query.py Normal file
View file

@ -0,0 +1,110 @@
# query_topk_prompt.py
# Run a query on a vector store
#
# E. M. Furst August 2025
from llama_index.core import (
load_index_from_storage,
StorageContext,
Settings,
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama
from llama_index.core.prompts import PromptTemplate
import os

#
# Globals
#
# Silence HuggingFace tokenizer fork warnings in the interactive loop.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Embedding model used in vector store (this should match the one in build.py or equivalent)
embed_model = HuggingFaceEmbedding(cache_folder="./models", model_name="BAAI/bge-large-en-v1.5")

# LLM model to use in query transform and generation
llm = "command-r7b"

#
# Custom prompt for the query engine
#
# FIX: the em dash after "relevant" was lost in an encoding round-trip
# ("relevantprioritize"); restored below so the instruction reads correctly.
PROMPT = PromptTemplate(
    """You are an expert research assistant. You are given top-ranked writing excerpts (CONTEXT) and a user's QUERY.
Instructions:
- Base your response *only* on the CONTEXT.
- The snippets are ordered from most to least relevant — prioritize insights from earlier (higher-ranked) snippets.
- Aim to reference *as many distinct* relevant files as possible (up to 10).
- Do not invent or generalize; refer to specific passages or facts only.
- If a passage only loosely matches, deprioritize it.
Format your answer in two parts:
1. **Summary Theme**
Summarize the dominant theme from the relevant context in a few sentences.
2. **Matching Files**
Make a list of 10 matching files. The format for each should be:
<filename> -
<rationale tied to content. Include date or section hints if available.>
CONTEXT:
{context_str}
QUERY:
{query_str}
Now provide the theme and list of matching files."""
)
#
# Main program routine
#
def main():
    """Interactive query loop over the persisted ./storage_exp index.

    Configures the global LLM and embedding model, builds the query engine
    with the custom PROMPT, then answers questions until the user types
    'exit' or 'quit'.
    """
    # Generation model served by the local Ollama instance.
    Settings.llm = Ollama(
        model=llm,
        request_timeout=360.0,
        context_window=8000,
    )
    # Must be the same embedding model the store was built with.
    Settings.embed_model = embed_model

    # Rehydrate the persisted vector store + index metadata.
    ctx = StorageContext.from_defaults(persist_dir="./storage_exp")
    idx = load_index_from_storage(ctx)

    # Wide top-k retrieval, synthesized through the custom prompt.
    engine = idx.as_query_engine(
        similarity_top_k=15,
        text_qa_template=PROMPT,
    )

    while True:
        question = input("\nEnter a search topic or question (or 'exit'): ").strip()
        if question.lower() in ("exit", "quit"):
            break
        print()
        # Similarity search + prompt-based synthesis in one call.
        result = engine.query(question)
        print(result.response)
        print("\nSource documents:")
        for hit in result.source_nodes:
            info = getattr(hit, "metadata", None) or hit.node.metadata
            print(f"{info.get('file_name')} {info.get('file_path')} {getattr(hit, 'score', None)}")


if __name__ == "__main__":
    main()

90
archived/query_catalog.py Normal file
View file

@ -0,0 +1,90 @@
# query.py
# Run a query on a vector store
# This version implements a CATALOG prompt
#
# E.M.F. July 2025
# August 2025 - updated for 2nd search
from llama_index.core import (
StorageContext,
load_index_from_storage,
ServiceContext,
Settings,
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama
from llama_index.core.postprocessor import SimilarityPostprocessor
from llama_index.core.prompts import PromptTemplate
import logging
logging.basicConfig(level=logging.DEBUG)
# Catalog-style prompt: ask for a file listing with rationales instead of
# an essay.
# FIX: punctuation lost to an encoding round-trip is restored below:
# "You're", "1–2 sentence", and the dash after **filename**.
CATALOG_PROMPT = PromptTemplate(
    """You are a research assistant. You're given journal snippets (CONTEXT) and a user query.
Your job is NOT to write an essay but to list the best-matching journal files with a 1–2 sentence rationale.
Rules:
- Use only the CONTEXT; do not invent content.
- Prefer precise references to passages over generalities.
- Output exactly:
1) A brief one-line summary of the overall theme you detect.
2) A bulleted list: **filename** — brief rationale. If available in the snippet, include date or section hints.
CONTEXT:
{context_str}
QUERY: {query_str}
Now produce the summary line and the bulleted list of matching files."""
)

# Use a local model to generate (set at module level, before main() runs)
Settings.llm = Ollama(
    # model="llama3.1:8B",      # First model tested
    # model="deepseek-r1:8B",   # This model shows its reasoning
    model="gemma3:1b",
    request_timeout=360.0,
    context_window=8000
)
def main():
    """Interactive catalog-style query loop over the persisted ./storage index."""
    # Load embedding model (same as used for vector store)
    # NOTE(review): build.py in this archive embeds ./storage with
    # BAAI/bge-large-en-v1.5; querying it with all-mpnet-base-v2 would give
    # meaningless similarities -- confirm which model actually built ./storage.
    embed_model = HuggingFaceEmbedding(model_name="all-mpnet-base-v2")
    Settings.embed_model = embed_model

    # Load persisted vector store + metadata
    storage_context = StorageContext.from_defaults(persist_dir="./storage")
    index = load_index_from_storage(storage_context)

    query_engine = index.as_query_engine(
        similarity_top_k=10,  # pull wide (tune to taste)
        #response_mode="compact",  # concise synthesis
        text_qa_template=CATALOG_PROMPT,  # <- custom prompt
        # node_postprocessors=[
        #     SimilarityPostprocessor(similarity_cutoff=0.75)  # keep strong hits; makes result count flexible
        # ],
    )

    # Query loop: similarity search + prompt-based synthesis per question.
    while True:
        q = input("\nEnter your question (or 'exit'): ").strip()
        if q.lower() in ("exit", "quit"):
            break
        print()
        response = query_engine.query(q)
        # Return the query response and source documents
        print(response.response)
        print("\nSource documents:")
        for sn in response.source_nodes:
            # Score-bearing wrappers expose .metadata directly; fall back to
            # the wrapped node's metadata otherwise.
            meta = getattr(sn, "metadata", None) or sn.node.metadata
            print(meta.get("file_name"), "---", meta.get("file_path"), getattr(sn, "score", None))


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,223 @@
#!/usr/bin/env python3
"""
query_topk_prompt_engine.py
Query a vector store with a custom prompt for research assistance.
Uses BAAI/bge-large-en-v1.5 embeddings and Ollama for generation.
E.M.F. January 2026
Using Claude Sonnet 4.5 to suggest changes
"""
import argparse
import os
import sys
from pathlib import Path
from llama_index.core import (
Settings,
StorageContext,
load_index_from_storage,
)
from llama_index.core.prompts import PromptTemplate
from llama_index.core.postprocessor import SimilarityPostprocessor
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama
# Suppress tokenizer parallelism warnings
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Configuration defaults
DEFAULT_LLM = "command-r7b"
DEFAULT_EMBED_MODEL = "BAAI/bge-large-en-v1.5"
DEFAULT_STORAGE_DIR = "./storage_exp"
DEFAULT_TOP_K = 15
DEFAULT_SIMILARITY_CUTOFF = 0.7 # Set to None to disable
def get_prompt_template(max_files: int = 10) -> PromptTemplate:
    """Return the custom research-assistant prompt template.

    Args:
        max_files: Maximum number of matching files the model is asked to list.

    Returns:
        PromptTemplate with ``{context_str}`` / ``{query_str}`` placeholders.
    """
    # The f-string substitutes max_files now; the doubled braces survive as
    # template placeholders.
    # FIX: the em dash after "relevant" was lost in an encoding round-trip
    # ("relevantprioritize"); restored here.
    return PromptTemplate(
        f"""You are an expert research assistant. You are given top-ranked writing excerpts (CONTEXT) and a user's QUERY.
Instructions:
- Base your response *only* on the CONTEXT.
- The snippets are ordered from most to least relevant — prioritize insights from earlier (higher-ranked) snippets.
- Aim to reference *as many distinct* relevant files as possible (up to {max_files}).
- Do not invent or generalize; refer to specific passages or facts only.
- If a passage only loosely matches, deprioritize it.
Format your answer in two parts:
1. **Summary Theme**
Summarize the dominant theme from the relevant context in a few sentences.
2. **Matching Files**
List up to {max_files} matching files. Format each as:
<filename> - <rationale tied to content. Include date or section hints if available.>
CONTEXT:
{{context_str}}
QUERY:
{{query_str}}
Now provide the theme and list of matching files."""
    )
def load_models(
    llm_name: str = DEFAULT_LLM,
    embed_model_name: str = DEFAULT_EMBED_MODEL,
    cache_folder: str = "./models",
    request_timeout: float = 360.0,
    context_window: int = 8000,
) -> None:
    """Initialize and configure the LLM and embedding models.

    Mutates the global llama_index ``Settings`` as a side effect; call once
    before building a query engine.

    Args:
        llm_name: Ollama model tag used for generation.
        embed_model_name: HuggingFace embedding model; must match the model
            that built the vector store being queried.
        cache_folder: Local directory holding the embedding model weights.
        request_timeout: Seconds to wait on each Ollama request.
        context_window: Token context window passed to Ollama.
    """
    Settings.llm = Ollama(
        model=llm_name,
        request_timeout=request_timeout,
        context_window=context_window,
    )
    # local_files_only: never hit the network -- the embedding model must
    # already be downloaded into cache_folder, otherwise this raises.
    Settings.embed_model = HuggingFaceEmbedding(
        cache_folder=cache_folder,
        model_name=embed_model_name,
        local_files_only=True,
    )
def load_query_engine(
    storage_dir: str = DEFAULT_STORAGE_DIR,
    top_k: int = DEFAULT_TOP_K,
    similarity_cutoff: float | None = DEFAULT_SIMILARITY_CUTOFF,
    max_files: int = 10,
):
    """Load the persisted vector store and wrap it in a query engine.

    Raises:
        FileNotFoundError: If *storage_dir* does not exist.
    """
    store_path = Path(storage_dir)
    if not store_path.exists():
        raise FileNotFoundError(f"Storage directory not found: {storage_dir}")

    index = load_index_from_storage(
        StorageContext.from_defaults(persist_dir=str(store_path))
    )

    # Optionally drop weak hits before synthesis.
    filters = (
        [SimilarityPostprocessor(similarity_cutoff=similarity_cutoff)]
        if similarity_cutoff is not None
        else []
    )

    return index.as_query_engine(
        similarity_top_k=top_k,
        text_qa_template=get_prompt_template(max_files),
        node_postprocessors=filters or None,
    )
def get_node_metadata(node) -> dict:
    """Safely extract metadata from a source node.

    Handles both wrapper objects that carry ``metadata`` directly and
    NodeWithScore-style objects that nest it under ``node.metadata``.
    Returns an empty dict when neither is present.
    """
    direct = getattr(node, "metadata", None)
    if direct:
        return direct
    inner = getattr(node, "node", None)
    if inner is not None and hasattr(inner, "metadata"):
        return inner.metadata
    return {}
def print_results(response) -> None:
    """Pretty-print the LLM response followed by its ranked source documents."""
    divider = "=" * 60

    print("\n" + divider)
    print("RESPONSE")
    print(divider + "\n")
    print(response.response)

    print("\n" + divider)
    print("SOURCE DOCUMENTS")
    print(divider + "\n")
    for rank, src in enumerate(response.source_nodes, 1):
        meta = get_node_metadata(src)
        score = getattr(src, "score", None)
        # Scores may be absent on some node types.
        shown = "N/A" if score is None else f"{score:.3f}"
        print(f"{rank:2}. [{shown}] {meta.get('file_name', 'Unknown')}")
        print(f" Path: {meta.get('file_path', 'Unknown')}")
def parse_args():
    """Build the CLI parser and parse sys.argv."""
    epilog_text = """
Examples:
python query_topk_prompt_engine.py "What themes appear in the documents?"
python query_topk_prompt_engine.py --top-k 20 --llm llama3.1:8B "Find references to machine learning"
"""
    parser = argparse.ArgumentParser(
        description="Query a vector store with a custom research assistant prompt.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=epilog_text,
    )

    # Positional: the query itself (joined into one string by main()).
    parser.add_argument("query", nargs="+", help="The query text")

    # Model / storage selection.
    parser.add_argument(
        "--llm",
        default=DEFAULT_LLM,
        help=f"Ollama model to use for generation (default: {DEFAULT_LLM})",
    )
    parser.add_argument(
        "--storage-dir",
        default=DEFAULT_STORAGE_DIR,
        help=f"Path to the vector store (default: {DEFAULT_STORAGE_DIR})",
    )

    # Retrieval tuning.
    parser.add_argument(
        "--top-k",
        type=int,
        default=DEFAULT_TOP_K,
        help=f"Number of similar documents to retrieve (default: {DEFAULT_TOP_K})",
    )
    parser.add_argument(
        "--similarity-cutoff",
        type=float,
        default=DEFAULT_SIMILARITY_CUTOFF,
        help=f"Minimum similarity score (default: {DEFAULT_SIMILARITY_CUTOFF}, use 0 to disable)",
    )
    parser.add_argument(
        "--max-files",
        type=int,
        default=10,
        help="Maximum files to list in response (default: 10)",
    )

    return parser.parse_args()
def main():
    """CLI entry point: parse args, load models and index, run one query."""
    args = parse_args()

    # Handle similarity cutoff of 0 as "disabled"
    similarity_cutoff = args.similarity_cutoff if args.similarity_cutoff > 0 else None

    try:
        print(f"Loading models (LLM: {args.llm})...")
        load_models(llm_name=args.llm)

        print(f"Loading index from {args.storage_dir}...")
        query_engine = load_query_engine(
            storage_dir=args.storage_dir,
            top_k=args.top_k,
            similarity_cutoff=similarity_cutoff,
            max_files=args.max_files,
        )

        # Positional args are joined into a single query string.
        query_text = " ".join(args.query)
        print(f"Querying: {query_text[:100]}{'...' if len(query_text) > 100 else ''}")
        response = query_engine.query(query_text)
        print_results(response)
    except FileNotFoundError as e:
        # Missing storage directory: exit cleanly with a short message.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # Anything else: report, then re-raise for the full traceback.
        print(f"Error during query: {e}", file=sys.stderr)
        raise


if __name__ == "__main__":
    main()

106
archived/query_exp.py Normal file
View file

@ -0,0 +1,106 @@
# query_topk.py
# Run a query on a vector store
#
# This version implements a prompt and uses the build_exp.py vector store
# It is based on query_topk.py
# It uses 10 top-k results and a custom prompt
# The next version after this is query_rewrite.py
# build_exp.py modifies the chunk size and overlap from the original build.py
#
# E.M.F. August 2025
from llama_index.core import (
StorageContext,
load_index_from_storage,
ServiceContext,
Settings,
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama
from llama_index.core.prompts import PromptTemplate
# LLM model to use in query transform and generation
llm = "llama3.1:8B"
# Other models tried:
# llm="deepseek-r1:8B"
# llm="gemma3:1b"

# Custom prompt for the query engine.
# FIX: punctuation lost in an encoding round-trip is restored below --
# "user's", the em dash after "relevant", and the dash after **<filename>**.
PROMPT = PromptTemplate(
    """You are an expert research assistant. You are given top-ranked journal excerpts (CONTEXT) and a user's QUERY.
Instructions:
- Base your response *only* on the CONTEXT.
- The snippets are ordered from most to least relevant — prioritize insights from earlier (higher-ranked) snippets.
- Aim to reference *as many distinct* relevant files as possible (up to 10).
- Do not invent or generalize; refer to specific passages or facts only.
- If a passage only loosely matches, deprioritize it.
Format your answer in two parts:
1. **Summary Theme**
Summarize the dominant theme from the relevant context.
2. **Matching Files**
Make a bullet list of 10. The format for each should be:
**<filename>** — <rationale tied to content. Include date or section hints if available.>
CONTEXT:
{context_str}
QUERY:
{query_str}
Now provide the theme and list of matching files."""
)
#
# Main program routine
#
def main():
    """Interactive custom-prompt query loop over the ./storage_exp index."""
    # Use a local model to generate
    Settings.llm = Ollama(
        model=llm,
        request_timeout=360.0,
        context_window=8000
    )

    # Load embedding model -- it MUST match the model that built the store.
    # BUG FIX: ./storage_exp is produced by build_exp.py with
    # BAAI/bge-large-en-v1.5 (1024-dim); the previous all-mpnet-base-v2
    # (768-dim) embeddings cannot be meaningfully compared against it.
    embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-large-en-v1.5")
    Settings.embed_model = embed_model

    # Load persisted vector store + metadata
    storage_context = StorageContext.from_defaults(persist_dir="./storage_exp")
    index = load_index_from_storage(storage_context)

    # Build regular query engine with custom prompt
    query_engine = index.as_query_engine(
        similarity_top_k=10,  # pull wide
        text_qa_template=PROMPT,  # custom prompt
    )

    # Query loop: similarity search + prompt-based synthesis per question.
    while True:
        q = input("\nEnter your question (or 'exit'): ").strip()
        if q.lower() in ("exit", "quit"):
            break
        print()
        response = query_engine.query(q)
        # Show the synthesized answer, then the retrieved source documents.
        print(response.response)
        print("\nSource documents:")
        for node in response.source_nodes:
            meta = getattr(node, "metadata", None) or node.node.metadata
            print(meta.get("file_name"), "---", meta.get("file_path"), getattr(node, "score", None))


if __name__ == "__main__":
    main()

106
archived/query_multitool.py Normal file
View file

@ -0,0 +1,106 @@
"""
This is output generated by ChatG to implement a new regex + vector search engine
"""
from __future__ import annotations
from typing import List, Iterable
import json, re
from llama_index.core import VectorStoreIndex, Settings
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import NodeWithScore, QueryBundle
from llama_index.core.retrievers import BaseRetriever, EnsembleRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core import Document
# 0) Configure your LLM + embeddings up front
# Example: Settings.llm = <your Command-R wrapper> ; Settings.embed_model = <your embeddings>
# (You can also pass an llm explicitly into the retriever if you prefer.)
# Settings.llm.complete("hello") should work in v0.10+
# 1) Prepare nodes once (so regex + vector share the same chunks)
def build_nodes(docs: List[Document], chunk_size: int = 1024, overlap: int = 100):
    """Chunk *docs* into sentence-aware nodes shared by both retrievers."""
    parser = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
    return parser.get_nodes_from_documents(docs)
# 2) LLM-guided regex retriever
class RegexRetriever(BaseRetriever):
    """LLM-guided keyword/regex retriever.

    Asks the LLM to extract up to six distinctive terms from the query,
    compiles each into a word-bounded regex, and scores nodes by the number
    of distinct terms they contain.
    """

    def __init__(self, nodes: Iterable, llm=None, top_k: int = 5, flags=re.IGNORECASE):
        super().__init__()
        self._nodes = list(nodes)
        self._llm = llm or Settings.llm
        self._top_k = top_k
        self._flags = flags

    def _extract_terms(self, query: str) -> List[str]:
        """Ask the LLM for up to ~6 distinctive keywords/short phrases. Return a list of strings."""
        prompt = f"""
You extract search terms for a boolean/regex search.
Query: {query}
Rules:
- Return ONLY a JSON array of strings.
- Use up to 6 concise keywords/short phrases.
- Keep phrases short (<= 3 words).
- Avoid stopwords, punctuation, and generic terms.
- No explanations, no extra text.
"""
        raw = self._llm.complete(prompt).text.strip()
        try:
            terms = json.loads(raw)
            # basic sanitize: keep only non-empty strings
            terms = [t for t in terms if isinstance(t, str) and t.strip()]
        except Exception:
            # simple fall-back if JSON parse fails: longest words from the query
            terms = [w for w in re.findall(r"\w+", query) if len(w) > 2][:6]
        return terms[:6]

    def _compile_patterns(self, terms: List[str]) -> List[re.Pattern]:
        """Compile each term into a word-bounded, whitespace-tolerant pattern.

        BUG FIX: the previous version called ``re.escape(t)`` and then replaced
        the escaped-space sequence ``"\\ "`` with ``"\\s+"``.  Since Python 3.7,
        ``re.escape`` no longer escapes spaces, so that replacement never
        matched and multi-word phrases could only match a single literal
        space.  Escaping each word separately and joining with ``\\s+`` is
        correct on all versions.
        """
        pats = []
        for t in terms:
            body = r"\s+".join(re.escape(w) for w in t.split())
            pats.append(re.compile(rf"\b{body}\b", self._flags))
        return pats

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        terms = self._extract_terms(query_bundle.query_str)
        patterns = self._compile_patterns(terms)
        scored: List[tuple] = []
        for n in self._nodes:
            txt = n.get_content(metadata_mode="all")
            # score = number of distinct terms that appear at least once
            hits = sum(1 for p in patterns if p.search(txt))
            if hits:
                scored.append((n, float(hits)))
        scored.sort(key=lambda x: x[1], reverse=True)
        return [NodeWithScore(node=n, score=s) for n, s in scored[: self._top_k]]
# 3) Wire it all together
def build_query_engine(docs: List[Document], k_vec=5, k_regex=5, weights=(0.7, 0.3)):
    """Build a hybrid (vector + LLM-guided regex) query engine over *docs*.

    NOTE(review): ``EnsembleRetriever`` is imported above from
    ``llama_index.core.retrievers``, but that name is a LangChain API;
    llama_index exposes ``QueryFusionRetriever`` for rank-fusion of multiple
    retrievers -- confirm the import actually resolves before relying on
    this ChatGPT-generated module.
    """
    nodes = build_nodes(docs)
    # Vector index over the SAME nodes
    vindex = VectorStoreIndex(nodes)
    vector_ret = vindex.as_retriever(similarity_top_k=k_vec)
    regex_ret = RegexRetriever(nodes, top_k=k_regex)
    ensemble = EnsembleRetriever(
        retrievers=[vector_ret, regex_ret],
        weights=list(weights),  # tune this: more recall from regex? bump weight on regex
        # uses Reciprocal Rank Fusion by default
    )
    return RetrieverQueryEngine(retriever=ensemble)
# 4) Use it
# docs = SimpleDirectoryReader("data").load_data()
# qe = build_query_engine(docs)
# print(qe.query("Find entries with strong feelings of depression."))

View file

@ -0,0 +1,126 @@
# query_rewrite_hyde.py
# Run a query on a vector store
#
# Latest experiment to include query rewriting using HyDE (Hypothetical Document Embeddings)
# The goal is to reduce the semantic gap between the query and the indexed documents
# This version implements a prompt and uses the build_exp.py vector store
# Based on query_exp.py
#
# E.M.F. July 2025
from llama_index.core import (
StorageContext,
load_index_from_storage,
ServiceContext,
Settings,
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama
from llama_index.core.prompts import PromptTemplate
from llama_index.core.indices.query.query_transform import HyDEQueryTransform
from llama_index.core.query_engine.transform_query_engine import TransformQueryEngine
import os
# Globals
#
# FIX: silence HuggingFace tokenizer fork warnings *before* the embedding
# model (and its tokenizer) is constructed; previously this was set after.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Embedding model used in vector store (this should match the one in build_exp.py or equivalent)
# embed_model = HuggingFaceEmbedding(model_name="all-mpnet-base-v2")
embed_model = HuggingFaceEmbedding(cache_folder="./models", model_name="BAAI/bge-large-en-v1.5")

# LLM model to use in query transform and generation
llm = "llama3.1:8B"
# Other models tried:
# llm="deepseek-r1:8B"
# llm="gemma3:1b"

# Custom prompt for the query engine.
# FIX: dashes lost in an encoding round-trip are restored below.
PROMPT = PromptTemplate(
    """You are an expert research assistant. You are given top-ranked writing excerpts (CONTEXT) and a user's QUERY.
Instructions:
- Base your response *only* on the CONTEXT.
- The snippets are ordered from most to least relevant — prioritize insights from earlier (higher-ranked) snippets.
- Aim to reference *as many distinct* relevant files as possible (up to 10).
- Do not invent or generalize; refer to specific passages or facts only.
- If a passage only loosely matches, deprioritize it.
Format your answer in two parts:
1. **Summary Theme**
Summarize the dominant theme from the relevant context in a few sentences.
2. **Matching Files**
Make a list of 10 matching files. The format for each should be:
<filename> — <rationale tied to content. Include date or section hints if available.>
CONTEXT:
{context_str}
QUERY:
{query_str}
Now provide the theme and list of matching files."""
)
#
# Main program routine
#
def main():
    """Interactive HyDE-augmented query loop over the ./storage_exp index."""
    # Use a local model to generate
    Settings.llm = Ollama(
        model=llm,
        request_timeout=360.0,
        context_window=8000
    )
    # Load embedding model (same as used for vector store)
    Settings.embed_model = embed_model

    # Load persisted vector store + metadata
    storage_context = StorageContext.from_defaults(persist_dir="./storage_exp")
    index = load_index_from_storage(storage_context)

    # Build regular query engine with custom prompt
    base_query_engine = index.as_query_engine(
        similarity_top_k=15,  # pull wide
        text_qa_template=PROMPT,  # custom prompt
    )

    # HyDE ("Hypothetical Document Embeddings") drafts a hypothetical answer
    # document and embeds that instead of the raw query, narrowing the
    # semantic gap with the indexed text.  include_original=True keeps the
    # original query in the mix (empirically gives better similarity values).
    hyde_transform = HyDEQueryTransform(llm=Settings.llm, include_original=True)

    # FIX: the transform engine is loop-invariant; previously it was
    # re-created on every iteration of the input loop.  Build it once.
    query_engine = TransformQueryEngine(base_query_engine, query_transform=hyde_transform)

    # Query loop
    while True:
        q = input("\nEnter a search topic or question (or 'exit'): ").strip()
        if q.lower() in ("exit", "quit"):
            break
        print()
        # HyDE rewrite, similarity search, then prompt-based synthesis.
        response = query_engine.query(q)
        print(response.response)
        print("\nSource documents:")
        for node in response.source_nodes:
            meta = getattr(node, "metadata", None) or node.node.metadata
            print(meta.get("file_name"), "---", meta.get("file_path"), getattr(node, "score", None))


if __name__ == "__main__":
    main()

58
archived/query_topk.py Normal file
View file

@ -0,0 +1,58 @@
# query_topk.py
# Run a query on a vector store
#
# E.M.F. July 2025
# August 2025 - updated for 2nd search
# this version uses top-k similarity
from llama_index.core import (
StorageContext,
load_index_from_storage,
ServiceContext,
Settings,
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama
# Configure the LLM used for answer synthesis. Generation runs locally
# through Ollama, so no API key or network access is required.
Settings.llm = Ollama(
    model="llama3.1:8B", # First model tested
    # model="deepseek-r1:8B", # This model shows its reasoning
    # model="gemma3:1b",
    request_timeout=360.0,  # generous timeout: local generation can be slow
    context_window=8000  # NOTE(review): assumes the model supports 8k context -- confirm
)
def main():
    """Interactive top-k retrieval loop over the persisted ./storage index."""
    # The embedding model must be the same one the index was built with.
    Settings.embed_model = HuggingFaceEmbedding(model_name="all-mpnet-base-v2")

    # Rehydrate the persisted index and expose it as a 5-result query engine.
    ctx = StorageContext.from_defaults(persist_dir="./storage")
    engine = load_index_from_storage(ctx).as_query_engine(similarity_top_k=5)

    # Read questions until the user types 'exit' or 'quit'.
    while (question := input("\nEnter your question (or 'exit'): ").strip()).lower() not in ("exit", "quit"):
        print()
        answer = engine.query(question)
        # Show the synthesized answer followed by the supporting sources.
        print(answer.response)
        print("\nSource documents:")
        for hit in answer.source_nodes:
            info = getattr(hit, "metadata", None) or hit.node.metadata
            print(info.get("file_name"), "---", info.get("file_path"), getattr(hit, "score", None))


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,123 @@
# query_topk_prompt.py
# Run a query on a vector store
#
# This version from query_rewrite_hyde.py, but removing hyde and using a custom prompt
# This version implements a prompt and uses the build_exp.py vector store with BAAI/bge-large-en-v1.5
# Based on query_exp.py->query_topk.py->query_rewrite_hyde.py
# The results are as good as with HyDE.
#
# E.M.F. August 2025
from llama_index.core import (
StorageContext,
load_index_from_storage,
ServiceContext,
Settings,
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama
from llama_index.core.prompts import PromptTemplate
import os
#
# Globals
#
# Silence the HuggingFace tokenizers fork warning when the process forks.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Embedding model used in vector store (this should match the one in build_exp.py or equivalent)
# embed_model = HuggingFaceEmbedding(model_name="all-mpnet-base-v2")
embed_model = HuggingFaceEmbedding(cache_folder="./models",model_name="BAAI/bge-large-en-v1.5")
# LLM model to use in query transform and generation
# command-r7b generates about as quickly as llama3.1:8B, but provides results that stick better
# to the provided context
llm="command-r7b"
# Other models tried:
#llm="llama3.1:8B"
#llm="deepseek-r1:8B"
#llm="gemma3:1b"
#
# Custom prompt for the query engine
#
# {context_str} receives the retrieved snippets and {query_str} the user's
# question; both are filled in by LlamaIndex's text_qa_template mechanism.
# FIX: restored the separator lost in "relevantprioritize" (a dash was
# dropped by an encoding round-trip), which garbled the instruction text.
PROMPT = PromptTemplate(
"""You are an expert research assistant. You are given top-ranked writing excerpts (CONTEXT) and a user's QUERY.
Instructions:
- Base your response *only* on the CONTEXT.
- The snippets are ordered from most to least relevant - prioritize insights from earlier (higher-ranked) snippets.
- Aim to reference *as many distinct* relevant files as possible (up to 10).
- Do not invent or generalize; refer to specific passages or facts only.
- If a passage only loosely matches, deprioritize it.
Format your answer in two parts:
1. **Summary Theme**
Summarize the dominant theme from the relevant context in a few sentences.
2. **Matching Files**
Make a list of 10 matching files. The format for each should be:
<filename> -
<rationale tied to content. Include date or section hints if available.>
CONTEXT:
{context_str}
QUERY:
{query_str}
Now provide the theme and list of matching files."""
)
#
# Main program routine
#
def main():
    """Interactive search loop: custom-prompt RAG over the persisted index."""
    # Generation runs on a local Ollama model; the long timeout covers
    # slow first-token latency on CPU-only machines.
    Settings.llm = Ollama(model=llm, request_timeout=360.0, context_window=8000)
    # Embeddings must match the model the store was built with (build_exp.py).
    Settings.embed_model = embed_model

    # Rehydrate the persisted index and wrap it in a query engine that
    # retrieves a wide top-k and synthesizes with the custom PROMPT.
    store = StorageContext.from_defaults(persist_dir="./storage_exp")
    engine = load_index_from_storage(store).as_query_engine(
        similarity_top_k=15,
        text_qa_template=PROMPT,
    )

    # REPL: each question runs a similarity search plus prompt-based synthesis.
    while (q := input("\nEnter a search topic or question (or 'exit'): ").strip()).lower() not in ("exit", "quit"):
        print()
        answer = engine.query(q)
        print(answer.response)
        print("\nSource documents:")
        for hit in answer.source_nodes:
            info = getattr(hit, "metadata", None) or hit.node.metadata
            print(f"{info.get('file_name')} {info.get('file_path')} {getattr(hit, 'score', None)}")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,134 @@
# query_topk_prompt_dw.py
# Run a query on a vector store
#
# This version from query_rewrite_hyde.py, but removing hyde and using a custom prompt
# This version implements a prompt and uses the build_exp.py vector store with BAAI/bge-large-en-v1.5
# Based on query_exp.py->query_topk.py->query_rewrite_hyde.py
# The results are as good as with HyDE.
# Modified for terminal output (132 columns)
#
# E.M.F. August 2025
from llama_index.core import (
StorageContext,
load_index_from_storage,
ServiceContext,
Settings,
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama
from llama_index.core.prompts import PromptTemplate
import os
import sys
import textwrap
# Print wrapping for terminal output.
# Installed as sys.stdout so every print() is re-wrapped before reaching
# the real terminal. NOTE(review): the class name says 80 but the wrap
# width used is 131 (header mentions a 132-column terminal) -- confirm intent.
class Wrap80:
    # Wrap each line of the written text to 131 columns and forward it to
    # the real stdout. NOTE(review): a "\n" is appended per line regardless
    # of the caller's `end=` argument, so writes that carry no newline still
    # come out newline-terminated (callers below rely on this via end="").
    def write(self, text):
        for line in text.splitlines():
            sys.__stdout__.write(textwrap.fill(line, width=131) + "\n")
    # Delegate flushes straight to the real stdout.
    def flush(self):
        sys.__stdout__.flush()
sys.stdout = Wrap80()
#
# Globals
#
# Embedding model used in vector store (this should match the one in build_exp.py or equivalent)
# embed_model = HuggingFaceEmbedding(model_name="all-mpnet-base-v2")
embed_model = HuggingFaceEmbedding(cache_folder="./models",model_name="BAAI/bge-large-en-v1.5")
# Silence the HuggingFace tokenizers fork warning.
# NOTE(review): set after the embedding model is constructed above -- the
# tokenizer may already be loaded by then; consider moving before it.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# LLM model to use in query transform and generation
# command-r7b generates about as quickly as llama3.1:8B, but provides results that stick better
# to the provided context
llm="command-r7b"
# Other models tried:
#llm="llama3.1:8B"
# llm="deepseek-r1:8B"
# llm="gemma3:1b"
# Custom prompt for the query engine.
# {context_str} receives the retrieved snippets and {query_str} the user's
# question; both are filled in by LlamaIndex's text_qa_template mechanism.
# FIX: restored the separator lost in "relevantprioritize" (a dash was
# dropped by an encoding round-trip), which garbled the instruction text.
PROMPT = PromptTemplate(
"""You are an expert research assistant. You are given top-ranked writing excerpts (CONTEXT) and a user's QUERY.
Instructions:
- Base your response *only* on the CONTEXT.
- The snippets are ordered from most to least relevant - prioritize insights from earlier (higher-ranked) snippets.
- Aim to reference *as many distinct* relevant files as possible (up to 10).
- Do not invent or generalize; refer to specific passages or facts only.
- If a passage only loosely matches, deprioritize it.
Format your answer in two parts:
1. **Summary Theme**
Summarize the dominant theme from the relevant context in a few sentences.
2. **Matching Files**
Make a list of 10 matching files. The format for each should be:
<filename> -
<rationale tied to content. Include date or section hints if available.>
CONTEXT:
{context_str}
QUERY:
{query_str}
Now provide the theme and list of matching files."""
)
#
# Main program routine
#
def main():
    """Interactive custom-prompt search with 131-column wrapped output."""
    # Local generation through Ollama; long timeout for slow hardware.
    Settings.llm = Ollama(model=llm, request_timeout=360.0, context_window=8000)
    # Must match the embedding model the store was built with (build_exp.py).
    Settings.embed_model = embed_model

    # Rehydrate the persisted index; retrieve wide (top 15) and synthesize
    # with the custom PROMPT template.
    store = StorageContext.from_defaults(persist_dir="./storage_exp")
    engine = load_index_from_storage(store).as_query_engine(
        similarity_top_k=15,
        text_qa_template=PROMPT,
    )

    # REPL until 'exit'/'quit'.
    while True:
        question = input("\nEnter a search topic or question (or 'exit'): ").strip()
        if question.lower() in ("exit", "quit"):
            break
        print()
        answer = engine.query(question)
        print(answer.response)
        print("\nSource documents:")
        for hit in answer.source_nodes:
            info = getattr(hit, "metadata", None) or hit.node.metadata
            # end="" because the Wrap80 stdout shim appends the newline itself.
            print(f"{info.get('file_name')} {info.get('file_path')} {getattr(hit, 'score', None)}", end="")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,123 @@
# query_topk_prompt_engine.py
# Run a query on a vector store
#
# This version is query_topk_prompt.py but the query is passed though the command line.
#
# Implements a prompt and uses the build_exp.py vector store with BAAI/bge-large-en-v1.5
# Based on query_exp.py->query_topk.py
#
# E.M.F. August 2025
from llama_index.core import (
StorageContext,
load_index_from_storage,
ServiceContext,
Settings,
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama
from llama_index.core.prompts import PromptTemplate
import os
import sys
#
# Globals
#
# Silence the HuggingFace tokenizers fork warning.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Embedding model used in vector store (this should match the one in build_exp.py or equivalent)
# embed_model = HuggingFaceEmbedding(model_name="all-mpnet-base-v2")
# local_files_only=True: never hit the network; the model must already be
# cached under ./models or construction will fail.
embed_model = HuggingFaceEmbedding(cache_folder="./models",model_name="BAAI/bge-large-en-v1.5",local_files_only=True)
# LLM model to use in query transform and generation
# command-r7b generates about as quickly as llama3.1:8B, but provides results that stick better
# to the provided context
llm="command-r7b"
# Other models tried:
#llm="llama3.1:8B"
#llm="deepseek-r1:8B"
#llm="gemma3:1b"
#
# Custom prompt for the query engine
#
# {context_str} receives the retrieved snippets and {query_str} the user's
# question; both are filled in by LlamaIndex's text_qa_template mechanism.
# FIX: restored the separator lost in "relevantprioritize" (a dash was
# dropped by an encoding round-trip), which garbled the instruction text.
PROMPT = PromptTemplate(
"""You are an expert research assistant. You are given top-ranked writing excerpts (CONTEXT) and a user's QUERY.
Instructions:
- Base your response *only* on the CONTEXT.
- The snippets are ordered from most to least relevant - prioritize insights from earlier (higher-ranked) snippets.
- Aim to reference *as many distinct* relevant files as possible (up to 10).
- Do not invent or generalize; refer to specific passages or facts only.
- If a passage only loosely matches, deprioritize it.
Format your answer in two parts:
1. **Summary Theme**
Summarize the dominant theme from the relevant context in a few sentences.
2. **Matching Files**
Make a list of 10 matching files. The format for each should be:
<filename> -
<rationale tied to content. Include date or section hints if available.>
CONTEXT:
{context_str}
QUERY:
{query_str}
Now provide the theme and list of matching files."""
)
#
# Main program routine
#
def main():
    """One-shot query: read the question from the command line, print the
    synthesized answer and the source documents that supported it."""
    # Use a local model to generate -- in this case using Ollama
    Settings.llm = Ollama(
        model=llm,
        request_timeout=360.0,  # generous timeout: local generation can be slow
        context_window=8000
    )
    # Load embedding model (same as used for vector store)
    Settings.embed_model = embed_model
    # Load persisted vector store + metadata
    storage_context = StorageContext.from_defaults(persist_dir="./storage_exp")
    index = load_index_from_storage(storage_context)
    # Build regular query engine with custom prompt: retrieve wide (top 15)
    # and synthesize with the custom PROMPT template.
    query_engine = index.as_query_engine(
        similarity_top_k=15,      # pull wide
        text_qa_template=PROMPT,  # custom prompt
    )
    # The query text is everything after the script name.
    if len(sys.argv) < 2:
        print("Usage: python query.py QUERY_TEXT")
        sys.exit(1)
    q = " ".join(sys.argv[1:])
    # Perform the similarity search and apply the prompt.
    response = query_engine.query(q)
    # Print the query response and source documents.
    print("\nResponse:\n")
    print(response.response)
    print("\nSource documents:")
    for node in response.source_nodes:
        meta = getattr(node, "metadata", None) or node.node.metadata
        # BUG FIX: a node without a score yields None, and formatting None
        # with ':.3f' raises TypeError; format only numeric scores.
        score = getattr(node, "score", None)
        score_str = f"{score:.3f}" if isinstance(score, (int, float)) else str(score)
        print(f"{meta.get('file_name')} {meta.get('file_path')} {score_str}")


if __name__ == "__main__":
    main()

60
archived/query_tree.py Normal file
View file

@ -0,0 +1,60 @@
# query_tree.py
#
# Run a query on a vector store
# This is to test summarization using a tree-summarize response mode
# It doesn't work very well, perhaps because of the structure of the data
#
# E.M.F. August 2025
from llama_index.core import (
StorageContext,
load_index_from_storage,
ServiceContext,
Settings,
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama
# Configure the LLM used for summarization. Generation runs locally
# through Ollama, so no API key or network access is required.
Settings.llm = Ollama(
    model="llama3.1:8B", # First model tested
    # model="deepseek-r1:8B", # This model shows its reasoning
    # model="gemma3:1b",
    request_timeout=360.0,  # generous timeout: local generation can be slow
    context_window=8000  # NOTE(review): assumes the model supports 8k context -- confirm
)
def main():
    """Interactive loop that tests tree_summarize response synthesis."""
    # Load embedding model (same as used for vector store)
    embed_model = HuggingFaceEmbedding(model_name="all-mpnet-base-v2")
    Settings.embed_model = embed_model
    # Load persisted vector store + metadata
    storage_context = StorageContext.from_defaults(persist_dir="./storage")
    index = load_index_from_storage(storage_context)
    # tree_summarize synthesizes the answer bottom-up over retrieved nodes
    query_engine = index.as_query_engine(response_mode="tree_summarize")
    # Query loop: 'exit' or 'quit' ends the session
    while True:
        q = input("\nEnter your question (or 'exit'): ").strip()
        if q.lower() in ("exit", "quit"):
            break
        print()
        # BUG FIX: previously queried the literal string
        # "<summarization_query>" instead of the user's input, so the
        # question typed at the prompt was silently ignored.
        response = query_engine.query(q)
        # Print the query response and source documents
        print(response.response)
        print("\nSource documents:")
        for node in response.source_nodes:
            meta = getattr(node, "metadata", None) or node.node.metadata
            print(meta.get("file_name"), "---", meta.get("file_path"), getattr(node, "score", None))


if __name__ == "__main__":
    main()

27
archived/vs_metrics.py Normal file
View file

@ -0,0 +1,27 @@
# vs_metrics.py
# Quantify vector store properties and performance
#
# E.M.F. August 2025
# Read in vector store
# What are properties of the vector store?
# - number of vectors
# - distribution of distances
# - clustering?
from llama_index.core import (
StorageContext,
load_index_from_storage,
ServiceContext,
Settings,
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
# Load embedding model (same as used for vector store)
# NOTE(review): must match the model the ./storage index was built with,
# or similarity numbers computed against it will be meaningless.
embed_model = HuggingFaceEmbedding(model_name="all-mpnet-base-v2")
Settings.embed_model = embed_model
# Load persisted vector store + metadata for inspection
storage_context = StorageContext.from_defaults(persist_dir="./storage")
index = load_index_from_storage(storage_context)