fetch_arxiv: retry on transient errors, set User-Agent

arXiv frequently returns 503 to default Python urllib clients. Set a
real User-Agent header and retry on 429/5xx with exponential backoff.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Eric Furst 2026-05-01 10:25:40 -04:00
commit 6a03a0b9d1

@@ -23,6 +23,41 @@ import urllib.request
 import xml.etree.ElementTree as ET
 
 
+USER_AGENT = "fetch_arxiv.py/1.0 (CHEG 667-013 educational use)"
+
+
+def fetch_with_retry(url, max_attempts=5):
+    """GET a URL with retry-and-backoff for transient errors.
+
+    arXiv's API frequently returns 503 even for well-paced requests, especially
+    if the User-Agent is the default Python one. We set a real User-Agent and
+    retry with exponential backoff.
+    """
+    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
+    delay = 5
+    for attempt in range(1, max_attempts + 1):
+        try:
+            with urllib.request.urlopen(request, timeout=30) as resp:
+                return resp.read()
+        except urllib.error.HTTPError as e:
+            if e.code in (429, 500, 502, 503, 504) and attempt < max_attempts:
+                print(f" HTTP {e.code}; retrying in {delay}s "
+                      f"(attempt {attempt}/{max_attempts})")
+                time.sleep(delay)
+                delay *= 2
+                continue
+            raise
+        except urllib.error.URLError as e:
+            if attempt < max_attempts:
+                print(f" Network error: {e.reason}; retrying in {delay}s "
+                      f"(attempt {attempt}/{max_attempts})")
+                time.sleep(delay)
+                delay *= 2
+                continue
+            raise
+    raise RuntimeError(f"Failed to fetch {url} after {max_attempts} attempts")
+
+
 def fetch_abstracts(category, max_results, batch_size=50):
     """Fetch arXiv abstracts in batches via the API."""
     base_url = "https://export.arxiv.org/api/query"
@@ -40,8 +75,7 @@ def fetch_abstracts(category, max_results, batch_size=50):
         }
         url = f"{base_url}?{urllib.parse.urlencode(params)}"
         print(f" fetching {start+1}-{start+n}...")
-        with urllib.request.urlopen(url) as resp:
-            data = resp.read()
+        data = fetch_with_retry(url)
         root = ET.fromstring(data)
         batch = root.findall("atom:entry", ns)
         if not batch:
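
Reviewer's note, not part of the diff above: a rough sketch of how the retry path can be exercised locally without hitting arXiv, by stubbing urlopen to fail twice with 503 and then succeed, and patching time.sleep so the real 5 s and 10 s waits are skipped. It assumes fetch_arxiv.py imports cleanly as a module (i.e. the fetching logic sits behind a __main__ guard, which this commit does not show).

    import io
    import urllib.error
    from unittest import mock

    import fetch_arxiv  # assumption: fetch_arxiv.py is importable from the repo root

    url = "https://export.arxiv.org/api/query"
    flaky = [
        # first two calls raise transient 503s; the third returns a minimal Atom feed
        urllib.error.HTTPError(url, 503, "Service Unavailable", {}, None),
        urllib.error.HTTPError(url, 503, "Service Unavailable", {}, None),
        io.BytesIO(b"<feed></feed>"),
    ]

    with mock.patch("urllib.request.urlopen", side_effect=flaky), \
         mock.patch("time.sleep"):  # skip the real backoff delays
        data = fetch_arxiv.fetch_with_retry(url)

    assert data == b"<feed></feed>"  # two retries, success on the third attempt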