# fetch_arxiv.py
#
# Fetch arXiv abstracts and write each as a separate text file in ./data.
# This builds a larger, more interesting corpus for RAG experiments.
#
# Default: 100 most recent abstracts in cs.LG (machine learning).
# Try other categories: physics.chem-ph, cond-mat.soft, cs.AI, cs.CL,
# physics.flu-dyn, etc.
#
# Usage:
#   python fetch_arxiv.py                                    # default: cs.LG, 100 papers
#   python fetch_arxiv.py --category cs.AI
#   python fetch_arxiv.py --category cs.LG --max 200 --output data_arxiv
#
# CHEG 667-013

import argparse
import os
import re
import time
import urllib.error  # HTTPError / URLError, caught in fetch_with_retry below
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET


USER_AGENT = "fetch_arxiv.py/1.0 (CHEG 667-013 educational use)"


def fetch_with_retry(url, max_attempts=5):
    """GET a URL with retry-and-backoff for transient errors.

    arXiv's API frequently returns 503 even for well-paced requests, especially
    if the User-Agent is the default Python one. We set a real User-Agent and
    retry with exponential backoff.
    """
    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    delay = 5
    for attempt in range(1, max_attempts + 1):
        try:
            with urllib.request.urlopen(request, timeout=30) as resp:
                return resp.read()
        except urllib.error.HTTPError as e:
            if e.code in (429, 500, 502, 503, 504) and attempt < max_attempts:
                print(f" HTTP {e.code}; retrying in {delay}s "
                      f"(attempt {attempt}/{max_attempts})")
                time.sleep(delay)
                delay *= 2
                continue
            raise
        except urllib.error.URLError as e:
            if attempt < max_attempts:
                print(f" Network error: {e.reason}; retrying in {delay}s "
                      f"(attempt {attempt}/{max_attempts})")
                time.sleep(delay)
                delay *= 2
                continue
            raise
    raise RuntimeError(f"Failed to fetch {url} after {max_attempts} attempts")


def fetch_abstracts(category, max_results, batch_size=50):
    """Fetch arXiv abstracts in batches via the API."""
    base_url = "https://export.arxiv.org/api/query"
    ns = {"atom": "http://www.w3.org/2005/Atom"}
    entries = []

    for start in range(0, max_results, batch_size):
        n = min(batch_size, max_results - start)
        params = {
            "search_query": f"cat:{category}",
            "sortBy": "submittedDate",
            "sortOrder": "descending",
            "start": start,
            "max_results": n,
        }
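        # The encoded request ends up looking like (with the defaults above,
        # first batch): https://export.arxiv.org/api/query?search_query=cat%3Acs.LG&sortBy=submittedDate&sortOrder=descending&start=0&max_results=50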
        url = f"{base_url}?{urllib.parse.urlencode(params)}"
        print(f" fetching {start+1}-{start+n}...")
        data = fetch_with_retry(url)
        root = ET.fromstring(data)
        batch = root.findall("atom:entry", ns)
        if not batch:
            break
        entries.extend(batch)
        time.sleep(3)  # arXiv asks for a 3-second delay between requests
    return entries


def safe_filename(s, max_len=80):
    """Convert a title to a filesystem-safe filename."""
    s = re.sub(r"\s+", "_", s.strip())
    s = re.sub(r"[^A-Za-z0-9._-]", "", s)
    return s[:max_len]


def write_abstract(entry, outdir):
    """Extract title, authors, date, abstract; write to a text file."""
    ns = {"atom": "http://www.w3.org/2005/Atom"}
    title = entry.find("atom:title", ns).text.strip()
    summary = entry.find("atom:summary", ns).text.strip()
    published = entry.find("atom:published", ns).text.strip()[:10]
    authors = [
        a.find("atom:name", ns).text
        for a in entry.findall("atom:author", ns)
    ]
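    # The Atom <id> element is a URL such as http://arxiv.org/abs/2401.01234v1,
    # so the last path segment is the (versioned) arXiv identifier.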
    arxiv_id = entry.find("atom:id", ns).text.strip().split("/")[-1]

    fname = f"{published}_{safe_filename(title)}.txt"
    path = os.path.join(outdir, fname)
    body = (
        f"Title: {title}\n"
        f"Authors: {', '.join(authors)}\n"
        f"Date: {published}\n"
        f"arXiv: {arxiv_id}\n"
        f"\n"
        f"{summary}\n"
    )
    with open(path, "w", encoding="utf-8") as f:
        f.write(body)
    return fname


def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--category", default="cs.LG",
                        help="arXiv category (default: cs.LG)")
    parser.add_argument("--max", type=int, default=100,
                        help="number of abstracts to fetch (default: 100)")
    parser.add_argument("--output", default="data",
                        help="output directory (default: data)")
    args = parser.parse_args()

    os.makedirs(args.output, exist_ok=True)
    print(f"Fetching {args.max} abstracts from arXiv:{args.category} -> {args.output}/")
    entries = fetch_abstracts(args.category, args.max)
    print(f"Got {len(entries)} entries. Writing to {args.output}/...")
    for e in entries:
        try:
            fname = write_abstract(e, args.output)
        except Exception as exc:
            print(f" skipped one: {exc}")
            continue
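    # Note: os.listdir counts every file currently in the output directory,
    # including any left over from earlier runs that used the same --output.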
    print(f"Done. {len(os.listdir(args.output))} files in {args.output}/")


if __name__ == "__main__":
    main()
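

# Not executed, just a minimal sketch: once this script has populated ./data,
# the abstracts can be read back in for the RAG experiments mentioned at the
# top. The helper name load_corpus and the glob pattern are illustrative, not
# part of this script.
#
#   import glob
#
#   def load_corpus(outdir="data"):
#       """Return (filename, text) pairs for every abstract file in outdir."""
#       docs = []
#       for path in sorted(glob.glob(os.path.join(outdir, "*.txt"))):
#           with open(path, encoding="utf-8") as f:
#               docs.append((os.path.basename(path), f.read()))
#       return docs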