fetch_arxiv: retry on transient errors, set User-Agent
arXiv frequently returns 503 to default Python urllib clients. Set a real User-Agent header and retry on 429/5xx with exponential backoff. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
59e5f86884
commit
6a03a0b9d1
1 changed file with 36 additions and 2 deletions
|
|
@ -23,6 +23,41 @@ import urllib.request
|
|||
import xml.etree.ElementTree as ET
|
||||
|
||||
|
||||
# Descriptive User-Agent sent on every request: arXiv frequently 503s the
# default Python urllib client, so we identify ourselves explicitly.
USER_AGENT = "fetch_arxiv.py/1.0 (CHEG 667-013 educational use)"
|
||||
|
||||
|
||||
def fetch_with_retry(url, max_attempts=5):
    """GET a URL, retrying transient failures with exponential backoff.

    arXiv's API frequently returns 503 even for well-paced requests,
    especially when the User-Agent is Python's default one, so every
    request carries a real User-Agent header.  HTTP 429/5xx responses
    and network-level errors are retried with a doubling delay; any
    other HTTP error propagates immediately.

    Returns the response body as bytes, or raises the last error (or a
    RuntimeError as a defensive fallback) once attempts are exhausted.
    """
    transient = (429, 500, 502, 503, 504)
    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    wait = 5
    attempt = 0
    while attempt < max_attempts:
        attempt += 1
        out_of_retries = attempt == max_attempts
        try:
            response = urllib.request.urlopen(request, timeout=30)
        except urllib.error.HTTPError as err:
            # Only server-side/throttling codes are worth retrying.
            if out_of_retries or err.code not in transient:
                raise
            print(f" HTTP {err.code}; retrying in {wait}s "
                  f"(attempt {attempt}/{max_attempts})")
        except urllib.error.URLError as err:
            # DNS failures, timeouts, connection resets, etc.
            if out_of_retries:
                raise
            print(f" Network error: {err.reason}; retrying in {wait}s "
                  f"(attempt {attempt}/{max_attempts})")
        else:
            with response:
                return response.read()
        time.sleep(wait)
        wait *= 2
    # Defensive: the loop always returns or re-raises before reaching here.
    raise RuntimeError(f"Failed to fetch {url} after {max_attempts} attempts")
|
||||
|
||||
|
||||
def fetch_abstracts(category, max_results, batch_size=50):
|
||||
"""Fetch arXiv abstracts in batches via the API."""
|
||||
base_url = "https://export.arxiv.org/api/query"
|
||||
|
|
@ -40,8 +75,7 @@ def fetch_abstracts(category, max_results, batch_size=50):
|
|||
}
|
||||
url = f"{base_url}?{urllib.parse.urlencode(params)}"
|
||||
print(f" fetching {start+1}-{start+n}...")
|
||||
with urllib.request.urlopen(url) as resp:
|
||||
data = resp.read()
|
||||
data = fetch_with_retry(url)
|
||||
root = ET.fromstring(data)
|
||||
batch = root.findall("atom:entry", ns)
|
||||
if not batch:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue