# fetch_arxiv.py
#
# Fetch arXiv abstracts and write each as a separate text file in ./data.
# This builds a larger, more interesting corpus for RAG experiments.
#
# Default: 100 most recent abstracts in cs.LG (machine learning).
# Try other categories: physics.chem-ph, cond-mat.soft, cs.AI, cs.CL,
# physics.flu-dyn, etc.
#
# Usage:
#   python fetch_arxiv.py                                    # default: cs.LG, 100 papers
#   python fetch_arxiv.py --category cs.AI
#   python fetch_arxiv.py --category cs.LG --max 200 --output data_arxiv
#
# CHEG 667-013

import argparse
import os
import re
import time
import urllib.error  # HTTPError / URLError, caught in fetch_with_retry below
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET


USER_AGENT = "fetch_arxiv.py/1.0 (CHEG 667-013 educational use)"


def fetch_with_retry(url, max_attempts=5):
    """GET a URL with retry-and-backoff for transient errors.

    arXiv's API frequently returns 503 even for well-paced requests, especially
    if the User-Agent is the default Python one. We set a real User-Agent and
    retry with exponential backoff.
    """
    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    delay = 5
    for attempt in range(1, max_attempts + 1):
        try:
            with urllib.request.urlopen(request, timeout=30) as resp:
                return resp.read()
        except urllib.error.HTTPError as e:
            if e.code in (429, 500, 502, 503, 504) and attempt < max_attempts:
                print(f" HTTP {e.code}; retrying in {delay}s "
                      f"(attempt {attempt}/{max_attempts})")
                time.sleep(delay)
                delay *= 2
                continue
            raise
        except urllib.error.URLError as e:
            if attempt < max_attempts:
                print(f" Network error: {e.reason}; retrying in {delay}s "
                      f"(attempt {attempt}/{max_attempts})")
                time.sleep(delay)
                delay *= 2
                continue
            raise
    raise RuntimeError(f"Failed to fetch {url} after {max_attempts} attempts")


def fetch_abstracts(category, max_results, batch_size=50):
    """Fetch arXiv abstracts in batches via the API."""
    base_url = "https://export.arxiv.org/api/query"
    ns = {"atom": "http://www.w3.org/2005/Atom"}
    entries = []

    for start in range(0, max_results, batch_size):
        n = min(batch_size, max_results - start)
        params = {
            "search_query": f"cat:{category}",
            "sortBy": "submittedDate",
            "sortOrder": "descending",
            "start": start,
            "max_results": n,
        }
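        # The encoded request ends up looking like (with the defaults above,
        # first batch): https://export.arxiv.org/api/query?search_query=cat%3Acs.LG&sortBy=submittedDate&sortOrder=descending&start=0&max_results=50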
        url = f"{base_url}?{urllib.parse.urlencode(params)}"
        print(f" fetching {start+1}-{start+n}...")
        data = fetch_with_retry(url)
        root = ET.fromstring(data)
        batch = root.findall("atom:entry", ns)
        if not batch:
            break
        entries.extend(batch)
        time.sleep(3)  # arXiv asks for a 3-second delay between requests
    return entries


def safe_filename(s, max_len=80):
    """Convert a title to a filesystem-safe filename."""
    s = re.sub(r"\s+", "_", s.strip())
    s = re.sub(r"[^A-Za-z0-9._-]", "", s)
    return s[:max_len]


def write_abstract(entry, outdir):
    """Extract title, authors, date, abstract; write to a text file."""
    ns = {"atom": "http://www.w3.org/2005/Atom"}
    title = entry.find("atom:title", ns).text.strip()
    summary = entry.find("atom:summary", ns).text.strip()
    published = entry.find("atom:published", ns).text.strip()[:10]
    authors = [
        a.find("atom:name", ns).text
        for a in entry.findall("atom:author", ns)
    ]
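    # The Atom <id> element is a URL such as http://arxiv.org/abs/2401.01234v1,
    # so the last path segment is the (versioned) arXiv identifier.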
    arxiv_id = entry.find("atom:id", ns).text.strip().split("/")[-1]

    fname = f"{published}_{safe_filename(title)}.txt"
    path = os.path.join(outdir, fname)
    body = (
        f"Title: {title}\n"
        f"Authors: {', '.join(authors)}\n"
        f"Date: {published}\n"
        f"arXiv: {arxiv_id}\n"
        f"\n"
        f"{summary}\n"
    )
    with open(path, "w", encoding="utf-8") as f:
        f.write(body)
    return fname


def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--category", default="cs.LG",
                        help="arXiv category (default: cs.LG)")
    parser.add_argument("--max", type=int, default=100,
                        help="number of abstracts to fetch (default: 100)")
    parser.add_argument("--output", default="data",
                        help="output directory (default: data)")
    args = parser.parse_args()

    os.makedirs(args.output, exist_ok=True)
    print(f"Fetching {args.max} abstracts from arXiv:{args.category} -> {args.output}/")
    entries = fetch_abstracts(args.category, args.max)
    print(f"Got {len(entries)} entries. Writing to {args.output}/...")
    for e in entries:
        try:
            fname = write_abstract(e, args.output)
        except Exception as exc:
            print(f" skipped one: {exc}")
            continue
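    # Note: os.listdir counts every file currently in the output directory,
    # including any left over from earlier runs that used the same --output.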
    print(f"Done. {len(os.listdir(args.output))} files in {args.output}/")


if __name__ == "__main__":
    main()
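

# Not executed, just a minimal sketch: once this script has populated ./data,
# the abstracts can be read back in for the RAG experiments mentioned at the
# top. The helper name load_corpus and the glob pattern are illustrative, not
# part of this script.
#
#   import glob
#
#   def load_corpus(outdir="data"):
#       """Return (filename, text) pairs for every abstract file in outdir."""
#       docs = []
#       for path in sorted(glob.glob(os.path.join(outdir, "*.txt"))):
#           with open(path, encoding="utf-8") as f:
#               docs.append((os.path.basename(path), f.read()))
#       return docs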