ssearch/archived/query_multitool.py
Eric e9fc99ddc6 Initial commit: RAG pipeline for semantic search over personal journal archive
Vector search with cross-encoder re-ranking, hybrid BM25+vector retrieval,
incremental index updates, and multiple LLM backends (Ollama local, OpenAI API).
2026-02-20 06:02:28 -05:00

106 lines
No EOL
4.1 KiB
Python

"""
This is output generated by ChatGPT to implement a new regex + vector search engine
"""
from __future__ import annotations
from typing import List, Iterable
import json, re
from llama_index.core import VectorStoreIndex, Settings
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import NodeWithScore, QueryBundle
from llama_index.core.retrievers import BaseRetriever, EnsembleRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core import Document
# 0) Configure your LLM + embeddings up front
# Example: Settings.llm = <your Command-R wrapper> ; Settings.embed_model = <your embeddings>
# (You can also pass an llm explicitly into the retriever if you prefer.)
# Settings.llm.complete("hello") should work in v0.10+
# 1) Prepare nodes once (so regex + vector share the same chunks)
def build_nodes(docs: List[Document], chunk_size: int = 1024, overlap: int = 100):
    """Chunk *docs* into sentence-aligned nodes shared by every retriever.

    Building the nodes once guarantees the regex and vector retrievers
    score exactly the same chunks.
    """
    sentence_splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
    return sentence_splitter.get_nodes_from_documents(docs)
# 2) LLM-guided regex retriever
# 2) LLM-guided regex retriever
class RegexRetriever(BaseRetriever):
    """Keyword/regex retriever whose search terms are chosen by an LLM.

    The LLM distills the user query into up to 6 distinctive terms; each
    node is scored by how many distinct terms match it (word-bounded,
    case-insensitive by default), and the ``top_k`` best nodes are returned.
    """

    def __init__(self, nodes: Iterable, llm=None, top_k: int = 5, flags=re.IGNORECASE):
        super().__init__()
        self._nodes = list(nodes)        # materialize so we can scan repeatedly
        self._llm = llm or Settings.llm  # fall back to the globally configured LLM
        self._top_k = top_k
        self._flags = flags

    def _extract_terms(self, query: str) -> List[str]:
        """Ask the LLM for up to ~6 distinctive keywords/short phrases.

        Returns a list of strings. Falls back to naive word extraction from
        the raw query whenever the LLM response is not a valid JSON array.
        """
        prompt = f"""
You extract search terms for a boolean/regex search.
Query: {query}
Rules:
- Return ONLY a JSON array of strings.
- Use up to 6 concise keywords/short phrases.
- Keep phrases short (<= 3 words).
- Avoid stopwords, punctuation, and generic terms.
- No explanations, no extra text.
"""
        raw = self._llm.complete(prompt).text.strip()
        # Models frequently wrap JSON in markdown code fences despite the
        # instructions; unwrap before parsing instead of dropping to the
        # fallback.
        if raw.startswith("```"):
            raw = raw.strip("`").strip()
            if raw.lower().startswith("json"):
                raw = raw[4:].strip()
        try:
            terms = json.loads(raw)
            if not isinstance(terms, list):
                # A bare JSON string would otherwise be iterated character by
                # character below; treat anything but an array as a failure.
                raise ValueError("expected a JSON array of strings")
            # basic sanitize: keep non-empty strings only
            terms = [t for t in terms if isinstance(t, str) and t.strip()]
        except Exception:
            # simple fall-back if JSON parse fails: query words longer than 2 chars
            terms = [w for w in re.findall(r"\w+", query) if len(w) > 2][:6]
        return terms[:6]

    def _compile_patterns(self, terms: List[str]) -> List[re.Pattern]:
        """Compile each term into a word-bounded, whitespace-tolerant pattern."""
        pats = []
        for t in terms:
            # Escape user/LLM output, add word boundaries; allow whitespace inside short phrases
            escaped = re.escape(t)
            # turn '\ ' (escaped space) back into '\s+' to match any whitespace in phrases
            escaped = escaped.replace(r"\ ", r"\s+")
            pats.append(re.compile(rf"\b{escaped}\b", self._flags))
        return pats

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Score every node by the number of distinct term patterns it matches."""
        terms = self._extract_terms(query_bundle.query_str)
        patterns = self._compile_patterns(terms)
        scored: List[tuple] = []
        for n in self._nodes:
            txt = n.get_content(metadata_mode="all")
            # simple score = number of distinct term hits (phrase vs single
            # word could be weighted differently here if desired)
            hits = sum(1 for p in patterns if p.search(txt))
            if hits:
                scored.append((n, float(hits)))
        scored.sort(key=lambda x: x[1], reverse=True)
        return [NodeWithScore(node=n, score=s) for n, s in scored[: self._top_k]]
# 3) Wire it all together
# 3) Wire it all together
def build_query_engine(docs: List[Document], k_vec=5, k_regex=5, weights=(0.7, 0.3)):
    """Build a query engine that fuses vector and LLM-regex retrieval.

    Both retrievers operate over the SAME nodes so their scores refer to
    identical chunks. ``weights`` is (vector, regex) — bump the regex weight
    when you want more recall from exact-term matches.
    """
    # NOTE(review): the original code instantiated `EnsembleRetriever`, which
    # is a LangChain API and is not exported by llama_index.core.retrievers;
    # llama-index's weighted fusion retriever is QueryFusionRetriever
    # (reciprocal-rank fusion, matching the original comment's intent).
    # Confirm the exact signature against the installed llama-index version.
    from llama_index.core.retrievers import QueryFusionRetriever

    nodes = build_nodes(docs)
    # Vector index over the SAME nodes
    vindex = VectorStoreIndex(nodes)
    vector_ret = vindex.as_retriever(similarity_top_k=k_vec)
    regex_ret = RegexRetriever(nodes, top_k=k_regex)
    fused = QueryFusionRetriever(
        retrievers=[vector_ret, regex_ret],
        retriever_weights=list(weights),  # tune this: more recall from regex? bump weight on regex
        similarity_top_k=max(k_vec, k_regex),
        num_queries=1,  # don't ask the LLM to generate extra query variations
        mode="reciprocal_rerank",  # Reciprocal Rank Fusion
    )
    return RetrieverQueryEngine(retriever=fused)
# 4) Use it
# docs = SimpleDirectoryReader("data").load_data()
# qe = build_query_engine(docs)
# print(qe.query("Find entries with strong feelings of depression."))