Updated to Gemma4 models.
This commit is contained in:
parent
53404dd396
commit
3347a242ef
4 changed files with 46 additions and 10 deletions
|
|
@ -43,8 +43,10 @@ import sys
|
|||
# Embedding model (must match build_store.py)
|
||||
EMBED_MODEL = HuggingFaceEmbedding(cache_folder="./models", model_name="BAAI/bge-large-en-v1.5", local_files_only=True)
|
||||
|
||||
# LLM model for generation
|
||||
LLM_MODEL = "command-r7b"
|
||||
# LLM for generation. Use temperature 0.3 with command-r7b.
|
||||
#LLM_MODEL = "command-r7b"
|
||||
# Testing gemma4:e4b. The recommended temperature for this model is 1.0.
|
||||
LLM_MODEL = "gemma4:e4b"
|
||||
|
||||
# Cross-encoder model for re-ranking (cached in ./models/)
|
||||
RERANK_MODEL = "cross-encoder/ms-marco-MiniLM-L-12-v2"
|
||||
|
|
@ -89,7 +91,8 @@ def main():
|
|||
# Note: Ollama temperature defaults to 0.8
|
||||
Settings.llm = Ollama(
|
||||
model=LLM_MODEL,
|
||||
temperature=0.3,
|
||||
temperature=1.0,
|
||||
thinking=True, # enable the model's thinking (reasoning) mode
|
||||
request_timeout=360.0,
|
||||
context_window=8000,
|
||||
)
|
||||
|
|
@ -153,9 +156,20 @@ def main():
|
|||
n_bm25_only = len([n for n in bm25_nodes if n.node.node_id not in {v.node.node_id for v in vector_nodes}])
|
||||
n_both = len(vector_nodes) + len(bm25_nodes) - len(merged)
|
||||
|
||||
# Estimate context length (prompt + node text)
|
||||
context_text = "\n\n".join(n.get_content() for n in reranked)
|
||||
prompt_text = PROMPT.format(context_str=context_text, query_str=q)
|
||||
n_context_tokens = len(prompt_text.split()) # rough word count; ~1.3 tokens/word
|
||||
|
||||
print(f"\nQuery: {q}")
|
||||
print(f"Vector: {len(vector_nodes)}, BM25: {len(bm25_nodes)}, "
|
||||
print(f"Vector: {len(vector_nodes)} ({n_vector_only} unique), "
|
||||
f"BM25: {len(bm25_nodes)} ({n_bm25_only} unique), "
|
||||
f"overlap: {n_both}, merged: {len(merged)}, re-ranked to: {len(reranked)}")
|
||||
|
||||
# The token estimate uses a ~1.3 tokens/word ratio, which is a rough approximation.
|
||||
# For an exact count you'd need the model's tokenizer, but this gives a useful ballpark
|
||||
# for gauging how much of the context window we use.
|
||||
print(f"Context: ~{n_context_tokens} words (~{int(n_context_tokens * 1.3)} tokens)")
|
||||
|
||||
# Synthesize response with LLM
|
||||
synthesizer = get_response_synthesizer(text_qa_template=PROMPT)
|
||||
|
|
@ -169,7 +183,7 @@ def main():
|
|||
for node in response.source_nodes:
|
||||
meta = getattr(node, "metadata", None) or node.node.metadata
|
||||
score = getattr(node, "score", None)
|
||||
print(f"{meta.get('file_name')} {meta.get('file_path')} {score:.3f}")
|
||||
print(f"data/{meta.get('file_name')} {score:.3f}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue