Updated to Gemma4 models.

2026-04-09 10:35:48 -04:00 · 2026-04-09 10:35:48 -04:00 · 3347a242ef
commit 3347a242ef
parent 53404dd396
4 changed files with 46 additions and 10 deletions
--- a/query_hybrid.py
+++ b/query_hybrid.py
@@ -43,8 +43,10 @@ import sys
 # Embedding model (must match build_store.py)
 EMBED_MODEL = HuggingFaceEmbedding(cache_folder="./models", model_name="BAAI/bge-large-en-v1.5", local_files_only=True)

-# LLM model for generation
-LLM_MODEL = "command-r7b"
+# LLM model for generation. With command-r7b, use temperature 0.3.
+#LLM_MODEL = "command-r7b"
+# Testing gemma4:e4b; the recommended temperature for it is 1.0.
+LLM_MODEL = "gemma4:e4b"

 # Cross-encoder model for re-ranking (cached in ./models/)
 RERANK_MODEL = "cross-encoder/ms-marco-MiniLM-L-12-v2"
@@ -89,7 +91,8 @@ def main():
    # Note: Ollama temperature defaults to 0.8
    Settings.llm = Ollama(
        model=LLM_MODEL,
-        temperature=0.3,
+        temperature=1.0,
+        thinking=True,          # enable
        request_timeout=360.0,
        context_window=8000,
    )
@@ -153,9 +156,20 @@ def main():
    n_bm25_only = len([n for n in bm25_nodes if n.node.node_id not in {v.node.node_id for v in vector_nodes}])
    n_both = len(vector_nodes) + len(bm25_nodes) - len(merged)

+    # Estimate context length (prompt + node text)
+    context_text = "\n\n".join(n.get_content() for n in reranked)
+    prompt_text = PROMPT.format(context_str=context_text, query_str=q)
+    n_context_tokens = len(prompt_text.split())  # rough word count; ~1.3 tokens/word
+
    print(f"\nQuery: {q}")
-    print(f"Vector: {len(vector_nodes)}, BM25: {len(bm25_nodes)}, "
+    print(f"Vector: {len(vector_nodes)} ({n_vector_only} unique), "
+          f"BM25: {len(bm25_nodes)} ({n_bm25_only} unique), "
          f"overlap: {n_both}, merged: {len(merged)}, re-ranked to: {len(reranked)}")
+    
+    # The token estimate assumes ~1.3 tokens per word, which is only a rough approximation.
+    # An exact count would require the model's tokenizer, but this gives a useful ballpark
+    # for gauging how much of the context window is used.
+    print(f"Context: ~{n_context_tokens} words (~{int(n_context_tokens * 1.3)} tokens)")

    # Synthesize response with LLM
    synthesizer = get_response_synthesizer(text_qa_template=PROMPT)
@@ -169,7 +183,7 @@ def main():
    for node in response.source_nodes:
        meta = getattr(node, "metadata", None) or node.node.metadata
        score = getattr(node, "score", None)
-        print(f"{meta.get('file_name')}  {meta.get('file_path')}  {score:.3f}")
+        print(f"data/{meta.get('file_name')}  {score:.3f}")


 if __name__ == "__main__":