From 6a03a0b9d142c0f9b3da8bbce97cbe399d672da2 Mon Sep 17 00:00:00 2001
From: Eric Furst
Date: Fri, 1 May 2026 10:25:40 -0400
Subject: [PATCH] fetch_arxiv: retry on transient errors, set User-Agent

arXiv frequently returns 503 to default Python urllib clients. Set a real
User-Agent header and retry on 429/5xx with exponential backoff.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 03-rag/fetch_arxiv.py | 38 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 36 insertions(+), 2 deletions(-)

diff --git a/03-rag/fetch_arxiv.py b/03-rag/fetch_arxiv.py
index c5c56bc..88c553b 100644
--- a/03-rag/fetch_arxiv.py
+++ b/03-rag/fetch_arxiv.py
@@ -23,6 +23,41 @@
 import urllib.request
 import xml.etree.ElementTree as ET
 
+USER_AGENT = "fetch_arxiv.py/1.0 (CHEG 667-013 educational use)"
+
+
+def fetch_with_retry(url, max_attempts=5):
+    """GET a URL with retry-and-backoff for transient errors.
+
+    arXiv's API frequently returns 503 even for well-paced requests,
+    especially if the User-Agent is the default Python one. We set a real
+    User-Agent and retry with exponential backoff.
+    """
+    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
+    delay = 5
+    for attempt in range(1, max_attempts + 1):
+        try:
+            with urllib.request.urlopen(request, timeout=30) as resp:
+                return resp.read()
+        except urllib.error.HTTPError as e:
+            if e.code in (429, 500, 502, 503, 504) and attempt < max_attempts:
+                print(f" HTTP {e.code}; retrying in {delay}s "
+                      f"(attempt {attempt}/{max_attempts})")
+                time.sleep(delay)
+                delay *= 2
+                continue
+            raise
+        except urllib.error.URLError as e:
+            if attempt < max_attempts:
+                print(f" Network error: {e.reason}; retrying in {delay}s "
+                      f"(attempt {attempt}/{max_attempts})")
+                time.sleep(delay)
+                delay *= 2
+                continue
+            raise
+    raise RuntimeError(f"Failed to fetch {url} after {max_attempts} attempts")
+
+
 def fetch_abstracts(category, max_results, batch_size=50):
     """Fetch arXiv abstracts in batches via the API."""
     base_url = "https://export.arxiv.org/api/query"
@@ -40,8 +75,7 @@
         }
         url = f"{base_url}?{urllib.parse.urlencode(params)}"
         print(f" fetching {start+1}-{start+n}...")
-        with urllib.request.urlopen(url) as resp:
-            data = resp.read()
+        data = fetch_with_retry(url)
         root = ET.fromstring(data)
         batch = root.findall("atom:entry", ns)
        if not batch:
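
A minimal way to exercise the new helper in isolation (illustrative only, not
part of the patch; assumes it is run from inside 03-rag/ so that fetch_arxiv
is importable, and the query parameters below are placeholders):

    import urllib.parse

    from fetch_arxiv import fetch_with_retry

    # One-result arXiv API query; any category works here.
    params = {"search_query": "cat:cond-mat.soft", "start": 0, "max_results": 1}
    url = "https://export.arxiv.org/api/query?" + urllib.parse.urlencode(params)

    data = fetch_with_retry(url)  # retries 429/500/502/503/504 with backoff
    print(data[:200])             # first bytes of the returned Atom feed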