Migrate to uv; fix clippings paths and wrapper
This commit is contained in:
parent
3347a242ef
commit
4df608c440
7 changed files with 5137 additions and 7 deletions
1
.python-version
Normal file
1
.python-version
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
3.12
|
||||||
18
README.md
18
README.md
|
|
@ -30,20 +30,28 @@ ssearch/
|
||||||
├── clippings/ # Symlink to clippings (PDFs, TXT, webarchive, RTF)
|
├── clippings/ # Symlink to clippings (PDFs, TXT, webarchive, RTF)
|
||||||
├── store/ # Persisted journal vector store
|
├── store/ # Persisted journal vector store
|
||||||
├── models/ # Cached HuggingFace models (offline)
|
├── models/ # Cached HuggingFace models (offline)
|
||||||
├── requirements.txt # Python dependencies
|
├── pyproject.toml # Project metadata and dependencies (uv)
|
||||||
|
├── uv.lock # Pinned dependency lockfile (uv)
|
||||||
```
|
```
|
||||||
|
|
||||||
## Setup
|
## Setup
|
||||||
|
|
||||||
**Prerequisites**: Python 3.12, [Ollama](https://ollama.com) with `gemma4:e4b` or similar pulled.
|
**Prerequisites**: [uv](https://docs.astral.sh/uv/) and [Ollama](https://ollama.com) with `gemma4:e4b` (or a similar model) already pulled. uv manages the Python 3.12 toolchain and dependencies.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
cd ssearch
|
cd ssearch
|
||||||
python3 -m venv .venv
|
uv sync # create .venv and install the search pipeline
|
||||||
source .venv/bin/activate
|
uv sync --group notebook # also install Jupyter / analysis deps (optional)
|
||||||
pip install -r requirements.txt
|
|
||||||
```
|
```
|
||||||
|
|
||||||
|
`uv sync` reads `pyproject.toml` and the pinned `uv.lock`, creating `.venv` automatically (no manual `python -m venv` or activation needed). Run any script with `uv run`:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
uv run python retrieve.py "your query"
|
||||||
|
```
|
||||||
|
|
||||||
|
You can still `source .venv/bin/activate` and call `python` directly if you prefer.
|
||||||
|
|
||||||
The `data/` symlink should point to the journal archive (plain `.txt` files). The `clippings/` symlink should point to the clippings folder. The embedding model (`BAAI/bge-large-en-v1.5`) and cross-encoder (`cross-encoder/ms-marco-MiniLM-L-12-v2`) are cached in `./models/` for offline use.
|
The `data/` symlink should point to the journal archive (plain `.txt` files). The `clippings/` symlink should point to the clippings folder. The embedding model (`BAAI/bge-large-en-v1.5`) and cross-encoder (`cross-encoder/ms-marco-MiniLM-L-12-v2`) are cached in `./models/` for offline use.
|
||||||
|
|
||||||
### Offline model loading
|
### Offline model loading
|
||||||
|
|
|
||||||
|
|
@ -15,6 +15,13 @@
|
||||||
# libraries, because huggingface_hub.constants evaluates HF_HUB_OFFLINE
|
# libraries, because huggingface_hub.constants evaluates HF_HUB_OFFLINE
|
||||||
# at import time.
|
# at import time.
|
||||||
import os
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# This script lives in clippings_search/ but uses ./ paths relative to the
|
||||||
|
# project root (./models, ./clippings, ./clippings_search/store_clippings).
|
||||||
|
# Anchor the CWD to the project root so it works no matter where it's invoked from.
|
||||||
|
os.chdir(Path(__file__).resolve().parent.parent)
|
||||||
|
|
||||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||||
os.environ["SENTENCE_TRANSFORMERS_HOME"] = "./models"
|
os.environ["SENTENCE_TRANSFORMERS_HOME"] = "./models"
|
||||||
os.environ["HF_HUB_OFFLINE"] = "1"
|
os.environ["HF_HUB_OFFLINE"] = "1"
|
||||||
|
|
@ -30,7 +37,6 @@ from llama_index.core import (
|
||||||
from llama_index.vector_stores.chroma import ChromaVectorStore
|
from llama_index.vector_stores.chroma import ChromaVectorStore
|
||||||
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
|
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
|
||||||
from llama_index.core.node_parser import SentenceSplitter
|
from llama_index.core.node_parser import SentenceSplitter
|
||||||
from pathlib import Path
|
|
||||||
import argparse
|
import argparse
|
||||||
import datetime
|
import datetime
|
||||||
import time
|
import time
|
||||||
|
|
|
||||||
|
|
@ -11,6 +11,13 @@
|
||||||
# libraries, because huggingface_hub.constants evaluates HF_HUB_OFFLINE
|
# libraries, because huggingface_hub.constants evaluates HF_HUB_OFFLINE
|
||||||
# at import time.
|
# at import time.
|
||||||
import os
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# This script lives in clippings_search/ but uses ./ paths relative to the
|
||||||
|
# project root (./models, ./clippings_search/store_clippings). Anchor the CWD
|
||||||
|
# to the project root so it works no matter where it's invoked from.
|
||||||
|
os.chdir(Path(__file__).resolve().parent.parent)
|
||||||
|
|
||||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||||
os.environ["SENTENCE_TRANSFORMERS_HOME"] = "./models"
|
os.environ["SENTENCE_TRANSFORMERS_HOME"] = "./models"
|
||||||
os.environ["HF_HUB_OFFLINE"] = "1"
|
os.environ["HF_HUB_OFFLINE"] = "1"
|
||||||
|
|
|
||||||
46
pyproject.toml
Normal file
46
pyproject.toml
Normal file
|
|
@ -0,0 +1,46 @@
|
||||||
|
[project]
|
||||||
|
name = "ssearch"
|
||||||
|
version = "0.1.0"
|
||||||
|
description = "Semantic search over a personal journal archive and a clippings library (RAG: LlamaIndex + HuggingFace embeddings + Ollama/OpenAI)."
|
||||||
|
readme = "README.md"
|
||||||
|
requires-python = ">=3.12"
|
||||||
|
license = { text = "MIT" }
|
||||||
|
authors = [{ name = "E. M. Furst" }]
|
||||||
|
|
||||||
|
dependencies = [
|
||||||
|
# --- RAG framework (version-sensitive: pinned exactly) ---
|
||||||
|
"llama-index-core==0.14.14",
|
||||||
|
"llama-index-embeddings-huggingface==0.6.1",
|
||||||
|
"llama-index-readers-file==0.5.6",
|
||||||
|
"llama-index-llms-ollama==0.9.1",
|
||||||
|
"llama-index-llms-openai==0.6.18",
|
||||||
|
"llama-index-retrievers-bm25==0.6.5",
|
||||||
|
"llama-index-vector-stores-chroma==0.5.5",
|
||||||
|
# --- Vector store (clippings) ---
|
||||||
|
"chromadb==1.5.1",
|
||||||
|
# --- Embeddings / cross-encoder re-ranking (pulls torch) ---
|
||||||
|
"sentence-transformers>=5.1.0",
|
||||||
|
# transformers + huggingface-hub pinned: a transformers update once broke
|
||||||
|
# offline model loading (AutoTokenizer phoning home). See devlog 2026-02-20.
|
||||||
|
"transformers==4.55.0",
|
||||||
|
"huggingface-hub==0.34.4",
|
||||||
|
# --- Keyword search / document loaders ---
|
||||||
|
"nltk>=3.9.1",
|
||||||
|
"beautifulsoup4>=4.13.4",
|
||||||
|
"striprtf>=0.0.26",
|
||||||
|
"pypdf>=6.7.1",
|
||||||
|
]
|
||||||
|
|
||||||
|
[dependency-groups]
|
||||||
|
# Analysis + exploration notebooks (not needed to run the search pipeline).
|
||||||
|
# Install with: uv sync --group notebook
|
||||||
|
notebook = [
|
||||||
|
"jupyter>=1.1.1",
|
||||||
|
"ipykernel>=6.30.0",
|
||||||
|
"ipywidgets>=8.1.0",
|
||||||
|
"matplotlib>=3.10.0",
|
||||||
|
"seaborn>=0.13.0",
|
||||||
|
"pandas>=2.2.0",
|
||||||
|
"scikit-learn>=1.7.0",
|
||||||
|
"scipy>=1.16.0",
|
||||||
|
]
|
||||||
|
|
@ -31,7 +31,7 @@ while true; do
|
||||||
time_start=$(date +%s)
|
time_start=$(date +%s)
|
||||||
|
|
||||||
# Call the python script with the query and format the output
|
# Call the python script with the query and format the output
|
||||||
python3 $QUERY_SCRIPT --query "$query" | \
|
python3 $QUERY_SCRIPT "$query" | \
|
||||||
expand | sed -E 's|(.* )(.*/data)|\1./data|' | fold -s -w 131
|
expand | sed -E 's|(.* )(.*/data)|\1./data|' | fold -s -w 131
|
||||||
|
|
||||||
time_end=$(date +%s)
|
time_end=$(date +%s)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue