# search_keywords.py
#
# Keyword search: extract terms from a query using POS tagging, then grep
# across journal files for matches.
#
# Complements the vector search pipeline by catching exact names, places,
# and dates that embeddings can miss. No vector store or LLM needed.
#
# Term extraction uses NLTK POS tagging to keep nouns (NN*), proper nouns
# (NNP*), and adjectives (JJ*) -- skipping stopwords and function words
# automatically. Consecutive proper nouns are joined into multi-word phrases
# (e.g., "Robert Wright" stays as one search term, not "robert" + "wright").
#
# E.M.F. February 2026
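#
# Example run (illustrative -- the exact terms depend on how the NLTK
# tagger labels each word in the query):
#
#   $ python search_keywords.py "trip to Lisbon with Anna"
#   Query: trip to Lisbon with Anna
#   Extracted terms: lisbon, anna, trip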

import sys
import re
from pathlib import Path

import nltk

#
# Globals
#
DATA_DIR = Path("./data")
CONTEXT_LINES = 2          # lines of context around each match
MAX_MATCHES_PER_FILE = 3   # cap matches shown per file to avoid flooding

# POS tags to keep: nouns, proper nouns, adjectives
KEEP_TAGS = {"NN", "NNS", "NNP", "NNPS", "JJ", "JJS", "JJR"}

# Proper noun tags (consecutive runs are joined as phrases)
PROPER_NOUN_TAGS = {"NNP", "NNPS"}

# Minimum word length to keep (filters out short noise)
MIN_WORD_LEN = 3

def ensure_nltk_data():
    """Download NLTK data if not already present."""
    for resource, name in [
        ("tokenizers/punkt_tab", "punkt_tab"),
        ("taggers/averaged_perceptron_tagger_eng", "averaged_perceptron_tagger_eng"),
    ]:
        try:
            nltk.data.find(resource)
        except LookupError:
            print(f"Downloading NLTK resource: {name}")
            nltk.download(name, quiet=True)

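# Note: nltk.download() stores resources under NLTK's default data path
# (typically ~/nltk_data), so the LookupError branch above should only
# fire on the first run.
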
def extract_terms(query):
    """Extract key terms from a query using POS tagging.

    Tokenizes the query, runs POS tagging, and keeps nouns, proper nouns,
    and adjectives. Consecutive proper nouns (NNP/NNPS) are joined into
    multi-word phrases (e.g., "Robert Wright" → "robert wright").

    Returns a list of terms (lowercase), phrases listed first.
    """
    tokens = nltk.word_tokenize(query)
    tagged = nltk.pos_tag(tokens)

    phrases = []       # multi-word proper noun phrases
    single_terms = []  # individual nouns/adjectives
    proper_run = []    # accumulator for consecutive proper nouns

    for word, tag in tagged:
        if tag in PROPER_NOUN_TAGS:
            proper_run.append(word)
        else:
            # Flush any accumulated proper noun run
            if proper_run:
                phrase = " ".join(proper_run).lower()
                if len(phrase) >= MIN_WORD_LEN:
                    phrases.append(phrase)
                proper_run = []
            # Keep other nouns and adjectives as single terms
            if tag in KEEP_TAGS and len(word) >= MIN_WORD_LEN:
                single_terms.append(word.lower())

    # Flush final proper noun run
    if proper_run:
        phrase = " ".join(proper_run).lower()
        if len(phrase) >= MIN_WORD_LEN:
            phrases.append(phrase)

    # Phrases first (more specific), then single terms
    all_terms = phrases + single_terms
    return list(dict.fromkeys(all_terms))  # deduplicate, preserve order

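# A quick sanity check (tag assignments vary across NLTK versions, so
# treat the exact output as illustrative):
#
#   >>> extract_terms("dinner with Robert Wright at the Blue Door cafe")
#   ['robert wright', 'blue door', 'dinner', 'cafe']
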
def search_files(terms, data_dir, context_lines=CONTEXT_LINES):
    """Search all .txt files in data_dir for the given terms.

    Returns a list of (file_path, match_count, matches) where matches is a
    list of (line_number, context_block) tuples.
    """
    if not terms:
        return []

    # Build a single regex pattern that matches any term (case-insensitive)
    pattern = re.compile(
        r"\b(" + "|".join(re.escape(t) for t in terms) + r")\b",
        re.IGNORECASE,
    )

    results = []
    txt_files = sorted(data_dir.glob("*.txt"))

    for fpath in txt_files:
        try:
            lines = fpath.read_text(encoding="utf-8").splitlines()
        except (OSError, UnicodeDecodeError):
            continue

        matches = []
        match_count = 0
        seen_lines = set()  # avoid overlapping context blocks

        for i, line in enumerate(lines):
            if pattern.search(line):
                match_count += 1
                if i in seen_lines:
                    continue

                # Extract context window
                start = max(0, i - context_lines)
                end = min(len(lines), i + context_lines + 1)
                block = []
                for j in range(start, end):
                    seen_lines.add(j)
                    marker = ">>>" if j == i else "   "
                    block.append(f" {marker} {j+1:4d}: {lines[j]}")

                matches.append((i + 1, "\n".join(block)))

        if match_count > 0:
            results.append((fpath, match_count, matches))

    # Sort by match count (most matches first)
    results.sort(key=lambda x: x[1], reverse=True)
    return results

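# Each context block printed by main() looks roughly like this (">>>" marks
# the matching line; numbers are 1-based line numbers within the file):
#
#          7: Slept in, then walked the canal loop.
#   >>>    8: Coffee with Robert Wright downtown.
#          9: We argued about meditation apps again.
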
def main():
    if len(sys.argv) < 2:
        print("Usage: python search_keywords.py QUERY_TEXT")
        sys.exit(1)

    ensure_nltk_data()

    q = " ".join(sys.argv[1:])

    # Extract terms
    terms = extract_terms(q)
    if not terms:
        print(f"Query: {q}")
        print("No searchable terms extracted. Try a more specific query.")
        sys.exit(0)

    print(f"Query: {q}")
    print(f"Extracted terms: {', '.join(terms)}\n")

    # Search
    results = search_files(terms, DATA_DIR)

    if not results:
        print("No matches found.")
        sys.exit(0)

    # Summary
    total_matches = sum(r[1] for r in results)
    print(f"Found {total_matches} matches across {len(results)} files\n")

    # Detailed output
    for fpath, match_count, matches in results:
        print("=" * 60)
        print(f"--- {fpath.name} ({match_count} matches) ---")
        print("=" * 60)
        for line_num, block in matches[:MAX_MATCHES_PER_FILE]:
            print(block)
            print()
        if len(matches) > MAX_MATCHES_PER_FILE:
            print(f"  ... and {len(matches) - MAX_MATCHES_PER_FILE} more matches\n")


if __name__ == "__main__":
    main()