ssearch/archived/query_multitool.py
Eric e9fc99ddc6 Initial commit: RAG pipeline for semantic search over personal journal archive
Vector search with cross-encoder re-ranking, hybrid BM25+vector retrieval,
incremental index updates, and multiple LLM backends (Ollama local, OpenAI API).
2026-02-20 06:02:28 -05:00

106 lines
No EOL
4.1 KiB
Python

"""
This is output generated by ChatGPT to implement a new regex + vector search engine
"""
from __future__ import annotations
from typing import List, Iterable
import json, re
from llama_index.core import VectorStoreIndex, Settings
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import NodeWithScore, QueryBundle
from llama_index.core.retrievers import BaseRetriever, EnsembleRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core import Document
# 0) Configure your LLM + embeddings up front
# Example: Settings.llm = <your Command-R wrapper> ; Settings.embed_model = <your embeddings>
# (You can also pass an llm explicitly into the retriever if you prefer.)
# Settings.llm.complete("hello") should work in v0.10+
# 1) Prepare nodes once (so regex + vector share the same chunks)
def build_nodes(docs: List[Document], chunk_size: int = 1024, overlap: int = 100):
    """Chunk *docs* into sentence-aligned nodes shared by every retriever.

    Building the nodes once guarantees the regex and vector retrievers
    score exactly the same chunks.
    """
    sentence_splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
    return sentence_splitter.get_nodes_from_documents(docs)
# 2) LLM-guided regex retriever
# 2) LLM-guided regex retriever
class RegexRetriever(BaseRetriever):
    """Keyword/regex retriever whose search terms are chosen by an LLM.

    The LLM distills the user query into up to 6 distinctive terms; each
    node is scored by how many distinct terms match it (word-bounded,
    case-insensitive by default), and the ``top_k`` best nodes are returned.
    """

    def __init__(self, nodes: Iterable, llm=None, top_k: int = 5, flags=re.IGNORECASE):
        super().__init__()
        self._nodes = list(nodes)        # materialize so we can scan repeatedly
        self._llm = llm or Settings.llm  # fall back to the globally configured LLM
        self._top_k = top_k
        self._flags = flags

    def _extract_terms(self, query: str) -> List[str]:
        """Ask the LLM for up to ~6 distinctive keywords/short phrases.

        Returns a list of strings. Falls back to naive word extraction from
        the raw query whenever the LLM response is not a valid JSON array.
        """
        prompt = f"""
You extract search terms for a boolean/regex search.
Query: {query}
Rules:
- Return ONLY a JSON array of strings.
- Use up to 6 concise keywords/short phrases.
- Keep phrases short (<= 3 words).
- Avoid stopwords, punctuation, and generic terms.
- No explanations, no extra text.
"""
        raw = self._llm.complete(prompt).text.strip()
        # Models frequently wrap JSON in markdown code fences despite the
        # instructions; unwrap before parsing instead of dropping to the
        # fallback.
        if raw.startswith("```"):
            raw = raw.strip("`").strip()
            if raw.lower().startswith("json"):
                raw = raw[4:].strip()
        try:
            terms = json.loads(raw)
            if not isinstance(terms, list):
                # A bare JSON string would otherwise be iterated character by
                # character below; treat anything but an array as a failure.
                raise ValueError("expected a JSON array of strings")
            # basic sanitize: keep non-empty strings only
            terms = [t for t in terms if isinstance(t, str) and t.strip()]
        except Exception:
            # simple fall-back if JSON parse fails: query words longer than 2 chars
            terms = [w for w in re.findall(r"\w+", query) if len(w) > 2][:6]
        return terms[:6]

    def _compile_patterns(self, terms: List[str]) -> List[re.Pattern]:
        """Compile each term into a word-bounded, whitespace-tolerant pattern."""
        pats = []
        for t in terms:
            # Escape user/LLM output, add word boundaries; allow whitespace inside short phrases
            escaped = re.escape(t)
            # turn '\ ' (escaped space) back into '\s+' to match any whitespace in phrases
            escaped = escaped.replace(r"\ ", r"\s+")
            pats.append(re.compile(rf"\b{escaped}\b", self._flags))
        return pats

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Score every node by the number of distinct term patterns it matches."""
        terms = self._extract_terms(query_bundle.query_str)
        patterns = self._compile_patterns(terms)
        scored: List[tuple] = []
        for n in self._nodes:
            txt = n.get_content(metadata_mode="all")
            # simple score = number of distinct term hits (phrase vs single
            # word could be weighted differently here if desired)
            hits = sum(1 for p in patterns if p.search(txt))
            if hits:
                scored.append((n, float(hits)))
        scored.sort(key=lambda x: x[1], reverse=True)
        return [NodeWithScore(node=n, score=s) for n, s in scored[: self._top_k]]
# 3) Wire it all together
# 3) Wire it all together
def build_query_engine(docs: List[Document], k_vec=5, k_regex=5, weights=(0.7, 0.3)):
    """Build a query engine that fuses vector and LLM-regex retrieval.

    Both retrievers operate over the SAME nodes so their scores refer to
    identical chunks. ``weights`` is (vector, regex) — bump the regex weight
    when you want more recall from exact-term matches.
    """
    # NOTE(review): the original code instantiated `EnsembleRetriever`, which
    # is a LangChain API and is not exported by llama_index.core.retrievers;
    # llama-index's weighted fusion retriever is QueryFusionRetriever
    # (reciprocal-rank fusion, matching the original comment's intent).
    # Confirm the exact signature against the installed llama-index version.
    from llama_index.core.retrievers import QueryFusionRetriever

    nodes = build_nodes(docs)
    # Vector index over the SAME nodes
    vindex = VectorStoreIndex(nodes)
    vector_ret = vindex.as_retriever(similarity_top_k=k_vec)
    regex_ret = RegexRetriever(nodes, top_k=k_regex)
    fused = QueryFusionRetriever(
        retrievers=[vector_ret, regex_ret],
        retriever_weights=list(weights),  # tune this: more recall from regex? bump weight on regex
        similarity_top_k=max(k_vec, k_regex),
        num_queries=1,  # don't ask the LLM to generate extra query variations
        mode="reciprocal_rerank",  # Reciprocal Rank Fusion
    )
    return RetrieverQueryEngine(retriever=fused)
# 4) Use it
# docs = SimpleDirectoryReader("data").load_data()
# qe = build_query_engine(docs)
# print(qe.query("Find entries with strong feelings of depression."))