From 6a03a0b9d142c0f9b3da8bbce97cbe399d672da2 Mon Sep 17 00:00:00 2001
From: Eric Furst
Date: Fri, 1 May 2026 10:25:40 -0400
Subject: [PATCH] fetch_arxiv: retry on transient errors, set User-Agent

arXiv frequently returns 503 to default Python urllib clients. Set a real
User-Agent header and retry on 429/5xx with exponential backoff.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 03-rag/fetch_arxiv.py | 38 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 36 insertions(+), 2 deletions(-)

diff --git a/03-rag/fetch_arxiv.py b/03-rag/fetch_arxiv.py
index c5c56bc..88c553b 100644
--- a/03-rag/fetch_arxiv.py
+++ b/03-rag/fetch_arxiv.py
@@ -23,6 +23,41 @@
 import urllib.request
 import xml.etree.ElementTree as ET
 
+USER_AGENT = "fetch_arxiv.py/1.0 (CHEG 667-013 educational use)"
+
+
+def fetch_with_retry(url, max_attempts=5):
+    """GET a URL with retry-and-backoff for transient errors.
+
+    arXiv's API frequently returns 503 even for well-paced requests,
+    especially if the User-Agent is the default Python one. We set a real
+    User-Agent and retry with exponential backoff.
+    """
+    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
+    delay = 5
+    for attempt in range(1, max_attempts + 1):
+        try:
+            with urllib.request.urlopen(request, timeout=30) as resp:
+                return resp.read()
+        except urllib.error.HTTPError as e:
+            if e.code in (429, 500, 502, 503, 504) and attempt < max_attempts:
+                print(f" HTTP {e.code}; retrying in {delay}s "
+                      f"(attempt {attempt}/{max_attempts})")
+                time.sleep(delay)
+                delay *= 2
+                continue
+            raise
+        except urllib.error.URLError as e:
+            if attempt < max_attempts:
+                print(f" Network error: {e.reason}; retrying in {delay}s "
+                      f"(attempt {attempt}/{max_attempts})")
+                time.sleep(delay)
+                delay *= 2
+                continue
+            raise
+    raise RuntimeError(f"Failed to fetch {url} after {max_attempts} attempts")
+
+
 def fetch_abstracts(category, max_results, batch_size=50):
     """Fetch arXiv abstracts in batches via the API."""
     base_url = "https://export.arxiv.org/api/query"
@@ -40,8 +75,7 @@
         }
         url = f"{base_url}?{urllib.parse.urlencode(params)}"
         print(f" fetching {start+1}-{start+n}...")
-        with urllib.request.urlopen(url) as resp:
-            data = resp.read()
+        data = fetch_with_retry(url)
         root = ET.fromstring(data)
         batch = root.findall("atom:entry", ns)
        if not batch:
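
A minimal way to exercise the new helper in isolation (illustrative only, not
part of the patch; assumes it is run from inside 03-rag/ so that fetch_arxiv
is importable, and the query parameters below are placeholders):

    import urllib.parse

    from fetch_arxiv import fetch_with_retry

    # One-result arXiv API query; any category works here.
    params = {"search_query": "cat:cond-mat.soft", "start": 0, "max_results": 1}
    url = "https://export.arxiv.org/api/query?" + urllib.parse.urlencode(params)

    data = fetch_with_retry(url)  # retries 429/500/502/503/504 with backoff
    print(data[:200])             # first bytes of the returned Atom feed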