"""fetch_arxiv.py

Fetch arXiv abstracts and write each as a separate text file in ./data.
This builds a larger, more interesting corpus for RAG experiments.

Default: 100 most recent abstracts in cs.LG (machine learning).
Try other categories: physics.chem-ph, cond-mat.soft, cs.AI, cs.CL,
physics.flu-dyn, etc.

Usage:
    python fetch_arxiv.py                          # default: cs.LG, 100 papers
    python fetch_arxiv.py --category cs.AI
    python fetch_arxiv.py --category cs.LG --max 200 --output data_arxiv
"""
# CHEG 667-013

import argparse
import os
import re
import time
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET


def fetch_abstracts(category, max_results, batch_size=50):
    """Fetch arXiv abstracts in batches via the API."""
    base_url = "https://export.arxiv.org/api/query"
    ns = {"atom": "http://www.w3.org/2005/Atom"}
    entries = []
    for start in range(0, max_results, batch_size):
        n = min(batch_size, max_results - start)
        params = {
            "search_query": f"cat:{category}",
            "sortBy": "submittedDate",
            "sortOrder": "descending",
            "start": start,
            "max_results": n,
        }
        url = f"{base_url}?{urllib.parse.urlencode(params)}"
        print(f"  fetching {start + 1}-{start + n}...")
        with urllib.request.urlopen(url, timeout=30) as resp:
            data = resp.read()
        root = ET.fromstring(data)
        batch = root.findall("atom:entry", ns)
        if not batch:
            break
        entries.extend(batch)
        time.sleep(3)  # arXiv asks for a 3-second delay between requests
    return entries


def safe_filename(s, max_len=80):
    """Convert a title to a filesystem-safe filename."""
    s = re.sub(r"\s+", "_", s.strip())
    s = re.sub(r"[^A-Za-z0-9._-]", "", s)
    return s[:max_len]


def write_abstract(entry, outdir):
    """Extract title, authors, date, abstract; write to a text file."""
    ns = {"atom": "http://www.w3.org/2005/Atom"}
    # Atom titles wrap across lines; collapse internal whitespace to one space.
    title = re.sub(r"\s+", " ", entry.find("atom:title", ns).text.strip())
    summary = entry.find("atom:summary", ns).text.strip()
    published = entry.find("atom:published", ns).text.strip()[:10]
    authors = [
        a.find("atom:name", ns).text
        for a in entry.findall("atom:author", ns)
    ]
    arxiv_id = entry.find("atom:id", ns).text.strip().split("/")[-1]

    fname = f"{published}_{safe_filename(title)}.txt"
    path = os.path.join(outdir, fname)
    body = (
        f"Title: {title}\n"
        f"Authors: {', '.join(authors)}\n"
        f"Date: {published}\n"
        f"arXiv: {arxiv_id}\n"
        f"\n"
        f"{summary}\n"
    )
    with open(path, "w", encoding="utf-8") as f:
        f.write(body)
    return fname


def main():
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--category", default="cs.LG",
                        help="arXiv category (default: cs.LG)")
    parser.add_argument("--max", type=int, default=100,
                        help="number of abstracts to fetch (default: 100)")
    parser.add_argument("--output", default="data",
                        help="output directory (default: data)")
    args = parser.parse_args()

    os.makedirs(args.output, exist_ok=True)
    print(f"Fetching {args.max} abstracts from arXiv:{args.category} -> {args.output}/")
    entries = fetch_abstracts(args.category, args.max)
    print(f"Got {len(entries)} entries. Writing to {args.output}/...")
    for e in entries:
        try:
            write_abstract(e, args.output)
        except Exception as exc:
            print(f"  skipped one: {exc}")
    print(f"Done. {len(os.listdir(args.output))} files in {args.output}/")


if __name__ == "__main__":
    main()
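
# ---------------------------------------------------------------------------
# A minimal sketch of reading the corpus back in for a downstream RAG step,
# left as a comment so the script above stays a single-purpose fetcher.
# The directory name "data" matches this script's default output; the helper
# name load_corpus is hypothetical, not part of any library.
#
#     import os
#
#     def load_corpus(dirpath="data"):
#         """Return the text of every abstract file in dirpath."""
#         docs = []
#         for fname in sorted(os.listdir(dirpath)):
#             if fname.endswith(".txt"):
#                 with open(os.path.join(dirpath, fname), encoding="utf-8") as f:
#                     docs.append(f.read())
#         return docs
# ---------------------------------------------------------------------------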