Sync RAG and semantic-search updates from che-computing
- 03-rag, 04-semantic-search: env-var-before-imports fix in build/query scripts
- 03-rag: new libraries section, fetch_arxiv.py, exercises for a larger corpus and for finding current SOTA models, formal references (Lewis, Booth)
- 04-semantic-search: libraries pointer back to Part III, larger-corpus subsection, model-update exercise, formal references
- 06-neural-networks: add Nielsen reference (recommended by a student)
- README: vocab.md link, agentic systems in description, Ollama prereq for 02-05
- New: vocab.md (glossary organized by section)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
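The env-var-before-imports fix addresses a common Python pitfall: some libraries read configuration from the environment at import (or first-initialization) time, so the variable must be set before the import statement runs. A minimal sketch of the pattern; the specific variable the build/query scripts set is not shown in this diff, and CUDA_VISIBLE_DEVICES is only a familiar stand-in:

import os

# Must happen before the import: a library that reads the variable during
# import or first initialization ignores changes made afterwards.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch  # noqa: E402 -- deliberate late import; torch now sees only GPU 0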
parent b37661e983
commit 59e5f86884
9 changed files with 359 additions and 17 deletions
03-rag/fetch_arxiv.py (new file, 112 lines)
@@ -0,0 +1,112 @@
"""fetch_arxiv.py

Fetch arXiv abstracts and write each as a separate text file in ./data.
This builds a larger, more interesting corpus for RAG experiments.

Default: 100 most recent abstracts in cs.LG (machine learning).
Try other categories: physics.chem-ph, cond-mat.soft, cs.AI, cs.CL,
physics.flu-dyn, etc.

Usage:
    python fetch_arxiv.py                    # default: cs.LG, 100 papers
    python fetch_arxiv.py --category cs.AI
    python fetch_arxiv.py --category cs.LG --max 200 --output data_arxiv

CHEG 667-013
"""

import argparse
import os
import re
import time
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET


def fetch_abstracts(category, max_results, batch_size=50):
    """Fetch arXiv abstracts in batches via the API."""
    base_url = "https://export.arxiv.org/api/query"
    ns = {"atom": "http://www.w3.org/2005/Atom"}
    entries = []

    for start in range(0, max_results, batch_size):
        n = min(batch_size, max_results - start)
        params = {
            "search_query": f"cat:{category}",
            "sortBy": "submittedDate",
            "sortOrder": "descending",
            "start": start,
            "max_results": n,
        }
        url = f"{base_url}?{urllib.parse.urlencode(params)}"
        print(f"  fetching {start+1}-{start+n}...")
        with urllib.request.urlopen(url) as resp:
            data = resp.read()
        root = ET.fromstring(data)
        batch = root.findall("atom:entry", ns)
        if not batch:
            break
        entries.extend(batch)
        time.sleep(3)  # arXiv asks for a 3-second delay between requests
    return entries


def safe_filename(s, max_len=80):
    """Convert a title to a filesystem-safe filename."""
    s = re.sub(r"\s+", "_", s.strip())
    s = re.sub(r"[^A-Za-z0-9._-]", "", s)
    return s[:max_len]


def write_abstract(entry, outdir):
    """Extract title, authors, date, abstract; write to a text file."""
    ns = {"atom": "http://www.w3.org/2005/Atom"}
    title = entry.find("atom:title", ns).text.strip()
    summary = entry.find("atom:summary", ns).text.strip()
    published = entry.find("atom:published", ns).text.strip()[:10]
    authors = [
        a.find("atom:name", ns).text
        for a in entry.findall("atom:author", ns)
    ]
    arxiv_id = entry.find("atom:id", ns).text.strip().split("/")[-1]

    fname = f"{published}_{safe_filename(title)}.txt"
    path = os.path.join(outdir, fname)
    body = (
        f"Title: {title}\n"
        f"Authors: {', '.join(authors)}\n"
        f"Date: {published}\n"
        f"arXiv: {arxiv_id}\n"
        f"\n"
        f"{summary}\n"
    )
    with open(path, "w") as f:
        f.write(body)
    return fname


def main():
    # The module docstring above supplies the --help text; the raw formatter
    # preserves its line breaks instead of collapsing them.
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--category", default="cs.LG",
                        help="arXiv category (default: cs.LG)")
    parser.add_argument("--max", type=int, default=100,
                        help="number of abstracts to fetch (default: 100)")
    parser.add_argument("--output", default="data",
                        help="output directory (default: data)")
    args = parser.parse_args()

    os.makedirs(args.output, exist_ok=True)
    print(f"Fetching {args.max} abstracts from arXiv:{args.category} -> {args.output}/")
    entries = fetch_abstracts(args.category, args.max)
    print(f"Got {len(entries)} entries. Writing to {args.output}/...")
    for e in entries:
        try:
            write_abstract(e, args.output)
        except Exception as exc:
            print(f"  skipped one: {exc}")
    print(f"Done. {len(os.listdir(args.output))} files in {args.output}/")


if __name__ == "__main__":
    main()
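After a run, each abstract lands in its own file under the output directory, with the metadata header written by write_abstract(). A quick sanity check, sketched assuming the default data/ output directory:

import os

# Show the filename and the "Title: ..." header line for the first three files.
for fname in sorted(os.listdir("data"))[:3]:
    with open(os.path.join("data", fname)) as f:
        first_line = f.readline().rstrip()
    print(f"{fname}\n  {first_line}")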