Initial commit: RAG pipeline for semantic search over personal journal archive
Vector search with cross-encoder re-ranking, hybrid BM25+vector retrieval, incremental index updates, and multiple LLM backends (Ollama local, OpenAI API).
This commit is contained in:
commit
e9fc99ddc6
43 changed files with 7349 additions and 0 deletions
106
archived/query_multitool.py
Normal file
106
archived/query_multitool.py
Normal file
|
|
@ -0,0 +1,106 @@
|
|||
"""
|
||||
This module was generated with ChatGPT to implement a combined regex + vector search engine.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import List, Iterable
|
||||
import json, re
|
||||
|
||||
from llama_index.core import VectorStoreIndex, Settings
|
||||
from llama_index.core.node_parser import SentenceSplitter
|
||||
from llama_index.core.schema import NodeWithScore, QueryBundle
|
||||
from llama_index.core.retrievers import BaseRetriever, EnsembleRetriever
|
||||
from llama_index.core.query_engine import RetrieverQueryEngine
|
||||
from llama_index.core import Document
|
||||
|
||||
# 0) Configure your LLM + embeddings up front
|
||||
# Example: Settings.llm = <your Command-R wrapper> ; Settings.embed_model = <your embeddings>
|
||||
# (You can also pass an llm explicitly into the retriever if you prefer.)
|
||||
# Settings.llm.complete("hello") should work in v0.10+
|
||||
|
||||
# 1) Prepare nodes once (so regex + vector share the same chunks)
|
||||
def build_nodes(docs: List[Document], chunk_size: int = 1024, overlap: int = 100):
    """Chunk *docs* into sentence-aware nodes shared by every retriever.

    Regex and vector retrieval must score the exact same chunks, so the
    nodes are built once here and handed to both.
    """
    return SentenceSplitter(
        chunk_size=chunk_size, chunk_overlap=overlap
    ).get_nodes_from_documents(docs)
|
||||
|
||||
# 2) LLM-guided regex retriever
|
||||
class RegexRetriever(BaseRetriever):
    """Retriever that asks the LLM for keywords, then regex-matches nodes.

    The LLM turns the natural-language query into up to six distinctive
    terms; each node is then scored by how many distinct terms it contains
    and the top ``top_k`` nodes are returned.
    """

    def __init__(self, nodes: Iterable, llm=None, top_k: int = 5, flags=re.IGNORECASE):
        super().__init__()
        self._nodes = list(nodes)        # materialize so we can scan repeatedly
        self._llm = llm or Settings.llm  # fall back to the globally configured LLM
        self._top_k = top_k
        self._flags = flags

    def _extract_terms(self, query: str) -> List[str]:
        """Ask the LLM for up to ~6 distinctive keywords/short phrases. Return a list of strings."""
        prompt = f"""
You extract search terms for a boolean/regex search.
Query: {query}

Rules:
- Return ONLY a JSON array of strings.
- Use up to 6 concise keywords/short phrases.
- Keep phrases short (<= 3 words).
- Avoid stopwords, punctuation, and generic terms.
- No explanations, no extra text.
"""
        raw = self._llm.complete(prompt).text.strip()
        try:
            parsed = json.loads(raw)
            # BUG FIX: json.loads may return a bare string/dict/number when
            # the LLM ignores the "JSON array" instruction; iterating a
            # string would yield single characters as "terms". Accept only
            # a real list; anything else takes the fallback path below.
            if not isinstance(parsed, list):
                raise ValueError("expected a JSON array of strings")
            # basic sanitize: keep non-empty strings, trimmed
            terms = [t.strip() for t in parsed if isinstance(t, str) and t.strip()]
        except Exception:
            terms = []
        if not terms:
            # Fallback when JSON parsing fails or yields nothing usable:
            # take long-ish words straight from the query itself.
            terms = [w for w in re.findall(r"\w+", query) if len(w) > 2][:6]
        return terms[:6]

    def _compile_patterns(self, terms: List[str]) -> List[re.Pattern]:
        """Compile each term into a word-boundary regex (phrases match any whitespace)."""
        pats = []
        for t in terms:
            # Escape user/LLM output, add word boundaries; allow whitespace inside short phrases
            escaped = re.escape(t)
            # turn '\ ' (escaped space) back into '\s+' to match any whitespace in phrases
            escaped = escaped.replace(r"\ ", r"\s+")
            # NOTE(review): \b does not anchor terms that start/end with a
            # non-word character (e.g. "C++") — such terms simply won't match.
            pats.append(re.compile(rf"\b{escaped}\b", self._flags))
        return pats

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Score every node by the number of distinct matching terms; return the top_k."""
        terms = self._extract_terms(query_bundle.query_str)
        patterns = self._compile_patterns(terms)

        scored: List[tuple] = []
        for n in self._nodes:
            txt = n.get_content(metadata_mode="all")
            # simple score = number of distinct term hits
            # (you can weight phrase vs single word if you like)
            hits = sum(1 for p in patterns if p.search(txt))
            if hits:
                scored.append((n, float(hits)))

        scored.sort(key=lambda x: x[1], reverse=True)
        return [NodeWithScore(node=n, score=s) for n, s in scored[: self._top_k]]
|
||||
|
||||
# 3) Wire it all together
|
||||
def build_query_engine(docs: List[Document], k_vec=5, k_regex=5, weights=(0.7, 0.3)):
    """Wire vector and LLM-regex retrieval into one fused query engine.

    Both retrievers operate over the same chunked nodes, so their scores
    always refer to identical text spans.
    """
    nodes = build_nodes(docs)

    # Vector index over the SAME nodes
    ensemble = EnsembleRetriever(
        retrievers=[
            VectorStoreIndex(nodes).as_retriever(similarity_top_k=k_vec),
            RegexRetriever(nodes, top_k=k_regex),
        ],
        # tune this: more recall from regex? bump weight on regex
        # uses Reciprocal Rank Fusion by default
        weights=list(weights),
    )

    return RetrieverQueryEngine(retriever=ensemble)
|
||||
|
||||
# 4) Use it
|
||||
# docs = SimpleDirectoryReader("data").load_data()
|
||||
# qe = build_query_engine(docs)
|
||||
# print(qe.query("Find entries with strong feelings of depression."))
|
||||
Loading…
Add table
Add a link
Reference in a new issue