Migrate to uv; fix clippings paths and wrapper

2026-06-08 07:53:34 -04:00 · 2026-06-08 07:53:34 -04:00 · 4df608c440
commit 4df608c440
parent 3347a242ef
7 changed files with 5137 additions and 7 deletions
--- a/.python-version
+++ b/.python-version
@ -0,0 +1 @@
+3.12
--- a/README.md
+++ b/README.md
@ -30,20 +30,28 @@ ssearch/
 ├── clippings/                  # Symlink to clippings (PDFs, TXT, webarchive, RTF)
 ├── store/                      # Persisted journal vector store
 ├── models/                     # Cached HuggingFace models (offline)
-├── requirements.txt            # Python dependencies
+├── pyproject.toml              # Project metadata and dependencies (uv)
+├── uv.lock                     # Pinned dependency lockfile (uv)
 ```

 ## Setup

-**Prerequisites**: Python 3.12, [Ollama](https://ollama.com) with `gemma4:e4b` or similar pulled.
+**Prerequisites**: [uv](https://docs.astral.sh/uv/) and [Ollama](https://ollama.com) with `gemma4:e4b` (or a similar model) pulled. uv manages the Python 3.12 toolchain and dependencies.

 ```bash
 cd ssearch
-python3 -m venv .venv
-source .venv/bin/activate
-pip install -r requirements.txt
+uv sync                    # create .venv and install the search pipeline's dependencies
+uv sync --group notebook   # also install Jupyter / analysis deps (optional)
 ```

+`uv sync` reads `pyproject.toml` and the pinned `uv.lock`, creating `.venv` automatically (no manual `python -m venv` or activation needed). Run any script with `uv run`:
+
+```bash
+uv run python retrieve.py "your query"
+```
+
+You can still `source .venv/bin/activate` and call `python` directly if you prefer.
+
 The `data/` symlink should point to the journal archive (plain `.txt` files). The `clippings/` symlink should point to the clippings folder. The embedding model (`BAAI/bge-large-en-v1.5`) and cross-encoder (`cross-encoder/ms-marco-MiniLM-L-12-v2`) are cached in `./models/` for offline use.

 ### Offline model loading
--- a/clippings_search/build_clippings.py
+++ b/clippings_search/build_clippings.py
@ -15,6 +15,13 @@
 # libraries, because huggingface_hub.constants evaluates HF_HUB_OFFLINE
 # at import time.
 import os
+from pathlib import Path
+
+# This script lives in clippings_search/ but uses ./ paths relative to the
+# project root (./models, ./clippings, ./clippings_search/store_clippings).
+# Anchor the CWD to the project root so it works no matter where it's invoked from.
+os.chdir(Path(__file__).resolve().parent.parent)
+
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 os.environ["SENTENCE_TRANSFORMERS_HOME"] = "./models"
 os.environ["HF_HUB_OFFLINE"] = "1"
@ -30,7 +37,6 @@ from llama_index.core import (
 from llama_index.vector_stores.chroma import ChromaVectorStore
 from llama_index.embeddings.huggingface import HuggingFaceEmbedding
 from llama_index.core.node_parser import SentenceSplitter
-from pathlib import Path
 import argparse
 import datetime
 import time
--- a/clippings_search/retrieve_clippings.py
+++ b/clippings_search/retrieve_clippings.py
@ -11,6 +11,13 @@
 # libraries, because huggingface_hub.constants evaluates HF_HUB_OFFLINE
 # at import time.
 import os
+from pathlib import Path
+
+# This script lives in clippings_search/ but uses ./ paths relative to the
+# project root (./models, ./clippings_search/store_clippings). Anchor the CWD
+# to the project root so it works no matter where it's invoked from.
+os.chdir(Path(__file__).resolve().parent.parent)
+
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 os.environ["SENTENCE_TRANSFORMERS_HOME"] = "./models"
 os.environ["HF_HUB_OFFLINE"] = "1"
--- a/pyproject.toml
+++ b/pyproject.toml
@ -0,0 +1,46 @@
+[project]
+name = "ssearch"
+version = "0.1.0"
+description = "Semantic search over a personal journal archive and a clippings library (RAG: LlamaIndex + HuggingFace embeddings + Ollama/OpenAI)."
+readme = "README.md"
+requires-python = ">=3.12"
+license = { text = "MIT" }
+authors = [{ name = "E. M. Furst" }]
+
+dependencies = [
+    # --- RAG framework (version-sensitive: pinned exactly) ---
+    "llama-index-core==0.14.14",
+    "llama-index-embeddings-huggingface==0.6.1",
+    "llama-index-readers-file==0.5.6",
+    "llama-index-llms-ollama==0.9.1",
+    "llama-index-llms-openai==0.6.18",
+    "llama-index-retrievers-bm25==0.6.5",
+    "llama-index-vector-stores-chroma==0.5.5",
+    # --- Vector store (clippings) ---
+    "chromadb==1.5.1",
+    # --- Embeddings / cross-encoder re-ranking (pulls torch) ---
+    "sentence-transformers>=5.1.0",
+    # transformers + huggingface-hub pinned: a transformers update once broke
+    # offline model loading (AutoTokenizer phoning home). See devlog 2026-02-20.
+    "transformers==4.55.0",
+    "huggingface-hub==0.34.4",
+    # --- Keyword search / document loaders ---
+    "nltk>=3.9.1",
+    "beautifulsoup4>=4.13.4",
+    "striprtf>=0.0.26",
+    "pypdf>=6.7.1",
+]
+
+[dependency-groups]
+# Analysis + exploration notebooks (not needed to run the search pipeline).
+# Install with: uv sync --group notebook
+notebook = [
+    "jupyter>=1.1.1",
+    "ipykernel>=6.30.0",
+    "ipywidgets>=8.1.0",
+    "matplotlib>=3.10.0",
+    "seaborn>=0.13.0",
+    "pandas>=2.2.0",
+    "scikit-learn>=1.7.0",
+    "scipy>=1.16.0",
+]
--- a/run_retrieve.sh
+++ b/run_retrieve.sh
@ -31,7 +31,7 @@ while true; do
    time_start=$(date +%s)

    # Call the python script with the query and format the output
-    python3 $QUERY_SCRIPT --query "$query" | \
+    python3 $QUERY_SCRIPT "$query" | \
        expand | sed -E 's|(.* )(.*/data)|\1./data|' | fold -s -w 131
    
    time_end=$(date +%s)
--- a/uv.lock
+++ b/uv.lock