Initial commit: RAG pipeline for semantic search over personal journal archive
Vector search with cross-encoder re-ranking, hybrid BM25+vector retrieval, incremental index updates, and multiple LLM backends (Ollama local, OpenAI API).
This commit is contained in:
commit
e9fc99ddc6
43 changed files with 7349 additions and 0 deletions
106
archived/query_multitool.py
Normal file
106
archived/query_multitool.py
Normal file
|
|
@ -0,0 +1,106 @@
|
|||
"""
|
||||
This module was generated with ChatGPT to implement a combined regex + vector search engine.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
from typing import List, Iterable
|
||||
import json, re
|
||||
|
||||
from llama_index.core import VectorStoreIndex, Settings
|
||||
from llama_index.core.node_parser import SentenceSplitter
|
||||
from llama_index.core.schema import NodeWithScore, QueryBundle
|
||||
from llama_index.core.retrievers import BaseRetriever, EnsembleRetriever
|
||||
from llama_index.core.query_engine import RetrieverQueryEngine
|
||||
from llama_index.core import Document
|
||||
|
||||
# 0) Configure your LLM + embeddings up front
|
||||
# Example: Settings.llm = <your Command-R wrapper> ; Settings.embed_model = <your embeddings>
|
||||
# (You can also pass an llm explicitly into the retriever if you prefer.)
|
||||
# Settings.llm.complete("hello") should work in v0.10+
|
||||
|
||||
# 1) Prepare nodes once (so regex + vector share the same chunks)
|
||||
def build_nodes(docs: List[Document], chunk_size: int = 1024, overlap: int = 100):
    """Chunk *docs* into sentence-aware nodes shared by every retriever.

    Regex and vector retrieval must score the exact same chunks, so the
    nodes are built once here and handed to both.
    """
    return SentenceSplitter(
        chunk_size=chunk_size, chunk_overlap=overlap
    ).get_nodes_from_documents(docs)
|
||||
|
||||
# 2) LLM-guided regex retriever
|
||||
class RegexRetriever(BaseRetriever):
    """Retriever that asks the LLM for keywords, then regex-matches nodes.

    The LLM turns the natural-language query into up to six distinctive
    terms; each node is then scored by how many distinct terms it contains
    and the top ``top_k`` nodes are returned.
    """

    def __init__(self, nodes: Iterable, llm=None, top_k: int = 5, flags=re.IGNORECASE):
        super().__init__()
        self._nodes = list(nodes)        # materialize so we can scan repeatedly
        self._llm = llm or Settings.llm  # fall back to the globally configured LLM
        self._top_k = top_k
        self._flags = flags

    def _extract_terms(self, query: str) -> List[str]:
        """Ask the LLM for up to ~6 distinctive keywords/short phrases. Return a list of strings."""
        prompt = f"""
You extract search terms for a boolean/regex search.
Query: {query}

Rules:
- Return ONLY a JSON array of strings.
- Use up to 6 concise keywords/short phrases.
- Keep phrases short (<= 3 words).
- Avoid stopwords, punctuation, and generic terms.
- No explanations, no extra text.
"""
        raw = self._llm.complete(prompt).text.strip()
        try:
            parsed = json.loads(raw)
            # BUG FIX: json.loads may return a bare string/dict/number when
            # the LLM ignores the "JSON array" instruction; iterating a
            # string would yield single characters as "terms". Accept only
            # a real list; anything else takes the fallback path below.
            if not isinstance(parsed, list):
                raise ValueError("expected a JSON array of strings")
            # basic sanitize: keep non-empty strings, trimmed
            terms = [t.strip() for t in parsed if isinstance(t, str) and t.strip()]
        except Exception:
            terms = []
        if not terms:
            # Fallback when JSON parsing fails or yields nothing usable:
            # take long-ish words straight from the query itself.
            terms = [w for w in re.findall(r"\w+", query) if len(w) > 2][:6]
        return terms[:6]

    def _compile_patterns(self, terms: List[str]) -> List[re.Pattern]:
        """Compile each term into a word-boundary regex (phrases match any whitespace)."""
        pats = []
        for t in terms:
            # Escape user/LLM output, add word boundaries; allow whitespace inside short phrases
            escaped = re.escape(t)
            # turn '\ ' (escaped space) back into '\s+' to match any whitespace in phrases
            escaped = escaped.replace(r"\ ", r"\s+")
            # NOTE(review): \b does not anchor terms that start/end with a
            # non-word character (e.g. "C++") — such terms simply won't match.
            pats.append(re.compile(rf"\b{escaped}\b", self._flags))
        return pats

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Score every node by the number of distinct matching terms; return the top_k."""
        terms = self._extract_terms(query_bundle.query_str)
        patterns = self._compile_patterns(terms)

        scored: List[tuple] = []
        for n in self._nodes:
            txt = n.get_content(metadata_mode="all")
            # simple score = number of distinct term hits
            # (you can weight phrase vs single word if you like)
            hits = sum(1 for p in patterns if p.search(txt))
            if hits:
                scored.append((n, float(hits)))

        scored.sort(key=lambda x: x[1], reverse=True)
        return [NodeWithScore(node=n, score=s) for n, s in scored[: self._top_k]]
|
||||
|
||||
# 3) Wire it all together
|
||||
def build_query_engine(docs: List[Document], k_vec=5, k_regex=5, weights=(0.7, 0.3)):
    """Wire vector and LLM-regex retrieval into one fused query engine.

    Both retrievers operate over the same chunked nodes, so their scores
    always refer to identical text spans.
    """
    nodes = build_nodes(docs)

    # Vector index over the SAME nodes
    ensemble = EnsembleRetriever(
        retrievers=[
            VectorStoreIndex(nodes).as_retriever(similarity_top_k=k_vec),
            RegexRetriever(nodes, top_k=k_regex),
        ],
        # tune this: more recall from regex? bump weight on regex
        # uses Reciprocal Rank Fusion by default
        weights=list(weights),
    )

    return RetrieverQueryEngine(retriever=ensemble)
|
||||
|
||||
# 4) Use it
|
||||
# docs = SimpleDirectoryReader("data").load_data()
|
||||
# qe = build_query_engine(docs)
|
||||
# print(qe.query("Find entries with strong feelings of depression."))
|
||||
Loading…
Add table
Add a link
Reference in a new issue