""" This is output generated by ChatG to implement a new regex + vector search engine """ from __future__ import annotations from typing import List, Iterable import json, re from llama_index.core import VectorStoreIndex, Settings from llama_index.core.node_parser import SentenceSplitter from llama_index.core.schema import NodeWithScore, QueryBundle from llama_index.core.retrievers import BaseRetriever, EnsembleRetriever from llama_index.core.query_engine import RetrieverQueryEngine from llama_index.core import Document # 0) Configure your LLM + embeddings up front # Example: Settings.llm = ; Settings.embed_model = # (You can also pass an llm explicitly into the retriever if you prefer.) # Settings.llm.complete("hello") should work in v0.10+ # 1) Prepare nodes once (so regex + vector share the same chunks) def build_nodes(docs: List[Document], chunk_size: int = 1024, overlap: int = 100): splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=overlap) return splitter.get_nodes_from_documents(docs) # 2) LLM-guided regex retriever class RegexRetriever(BaseRetriever): def __init__(self, nodes: Iterable, llm=None, top_k: int = 5, flags=re.IGNORECASE): super().__init__() self._nodes = list(nodes) self._llm = llm or Settings.llm self._top_k = top_k self._flags = flags def _extract_terms(self, query: str) -> List[str]: """Ask the LLM for up to ~6 distinctive keywords/short phrases. Return a list of strings.""" prompt = f""" You extract search terms for a boolean/regex search. Query: {query} Rules: - Return ONLY a JSON array of strings. - Use up to 6 concise keywords/short phrases. - Keep phrases short (<= 3 words). - Avoid stopwords, punctuation, and generic terms. - No explanations, no extra text. """ raw = self._llm.complete(prompt).text.strip() try: terms = json.loads(raw) # basic sanitize terms = [t for t in terms if isinstance(t, str) and t.strip()] except Exception: # simple fall-back if JSON parse fails terms = [w for w in re.findall(r"\w+", query) if len(w) > 2][:6] return terms[:6] def _compile_patterns(self, terms: List[str]) -> List[re.Pattern]: pats = [] for t in terms: # Escape user/LLM output, add word boundaries; allow whitespace inside short phrases escaped = re.escape(t) # turn '\ ' (escaped space) back into '\s+' to match any whitespace in phrases escaped = escaped.replace(r"\ ", r"\s+") pats.append(re.compile(rf"\b{escaped}\b", self._flags)) return pats def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]: terms = self._extract_terms(query_bundle.query_str) patterns = self._compile_patterns(terms) scored: List[tuple] = [] for n in self._nodes: txt = n.get_content(metadata_mode="all") hits = 0 for p in patterns: if p.search(txt): hits += 1 if hits: # simple score = number of distinct term hits (you can weight phrase vs single word if you like) scored.append((n, float(hits))) scored.sort(key=lambda x: x[1], reverse=True) return [NodeWithScore(node=n, score=s) for n, s in scored[: self._top_k]] # 3) Wire it all together def build_query_engine(docs: List[Document], k_vec=5, k_regex=5, weights=(0.7, 0.3)): nodes = build_nodes(docs) # Vector index over the SAME nodes vindex = VectorStoreIndex(nodes) vector_ret = vindex.as_retriever(similarity_top_k=k_vec) regex_ret = RegexRetriever(nodes, top_k=k_regex) ensemble = EnsembleRetriever( retrievers=[vector_ret, regex_ret], weights=list(weights), # tune this: more recall from regex? bump weight on regex # uses Reciprocal Rank Fusion by default ) return RetrieverQueryEngine(retriever=ensemble) # 4) Use it # docs = SimpleDirectoryReader("data").load_data() # qe = build_query_engine(docs) # print(qe.query("Find entries with strong feelings of depression."))