fetch_arxiv: retry on transient errors, set User-Agent
arXiv frequently returns 503 to default Python urllib clients. Set a real User-Agent header and retry on 429/5xx with exponential backoff. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
59e5f86884
commit
6a03a0b9d1
1 changed file with 36 additions and 2 deletions
|
|
@ -23,6 +23,41 @@ import urllib.request
|
|||
import xml.etree.ElementTree as ET
|
||||
|
||||
|
||||
# Descriptive User-Agent sent on every request: arXiv frequently 503s the
# default Python urllib client, so we identify ourselves explicitly.
USER_AGENT = "fetch_arxiv.py/1.0 (CHEG 667-013 educational use)"
|
||||
|
||||
|
||||
def fetch_with_retry(url, max_attempts=5):
    """GET a URL, retrying transient failures with exponential backoff.

    arXiv's API frequently returns 503 even for well-paced requests,
    especially when the User-Agent is Python's default one, so every
    request carries a real User-Agent header.  HTTP 429/5xx responses
    and network-level errors are retried with a doubling delay; any
    other HTTP error propagates immediately.

    Returns the response body as bytes, or raises the last error (or a
    RuntimeError as a defensive fallback) once attempts are exhausted.
    """
    transient = (429, 500, 502, 503, 504)
    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    wait = 5
    attempt = 0
    while attempt < max_attempts:
        attempt += 1
        out_of_retries = attempt == max_attempts
        try:
            response = urllib.request.urlopen(request, timeout=30)
        except urllib.error.HTTPError as err:
            # Only server-side/throttling codes are worth retrying.
            if out_of_retries or err.code not in transient:
                raise
            print(f" HTTP {err.code}; retrying in {wait}s "
                  f"(attempt {attempt}/{max_attempts})")
        except urllib.error.URLError as err:
            # DNS failures, timeouts, connection resets, etc.
            if out_of_retries:
                raise
            print(f" Network error: {err.reason}; retrying in {wait}s "
                  f"(attempt {attempt}/{max_attempts})")
        else:
            with response:
                return response.read()
        time.sleep(wait)
        wait *= 2
    # Defensive: the loop always returns or re-raises before reaching here.
    raise RuntimeError(f"Failed to fetch {url} after {max_attempts} attempts")
|
||||
|
||||
|
||||
def fetch_abstracts(category, max_results, batch_size=50):
|
||||
"""Fetch arXiv abstracts in batches via the API."""
|
||||
base_url = "https://export.arxiv.org/api/query"
|
||||
|
|
@ -40,8 +75,7 @@ def fetch_abstracts(category, max_results, batch_size=50):
|
|||
}
|
||||
url = f"{base_url}?{urllib.parse.urlencode(params)}"
|
||||
print(f" fetching {start+1}-{start+n}...")
|
||||
with urllib.request.urlopen(url) as resp:
|
||||
data = resp.read()
|
||||
data = fetch_with_retry(url)
|
||||
root = ET.fromstring(data)
|
||||
batch = root.findall("atom:entry", ns)
|
||||
if not batch:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue