ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

download-arxiv.py (4305B)


      1 #!/usr/bin/env python3
      2 """
      3 Download PDFs from arXiv for registry entries with status 'queued' and an arxiv_id.
      4 
      5 Usage:
      6     python scripts/download-arxiv.py              # Download all queued arXiv papers
      7     python scripts/download-arxiv.py --limit 10   # Download up to 10
      8     python scripts/download-arxiv.py --dry-run     # Show what would be downloaded
      9     python scripts/download-arxiv.py --id metr-rct-2025  # Download a specific paper
     10 """
     11 
     12 import json
     13 import sys
     14 import os
     15 import time
     16 import urllib.request
     17 import urllib.error
     18 from pathlib import Path
     19 
# Parent of the directory that contains this script (resolved to an absolute path).
ROOT = Path(__file__).resolve().parent.parent
# Registry file; read and written one JSON object per line.
REGISTRY_PATH = ROOT / "registry.jsonl"
# Each paper is stored as PAPERS_DIR/<entry id>/paper.pdf.
PAPERS_DIR = ROOT / "papers"

# arXiv rate limit: no more than 1 request per 3 seconds
DELAY_SECONDS = 3
     26 
     27 
     28 def load_registry():
     29     entries = []
     30     with open(REGISTRY_PATH) as f:
     31         for line in f:
     32             line = line.strip()
     33             if line:
     34                 entries.append(json.loads(line))
     35     return entries
     36 
     37 
     38 def save_registry(entries):
     39     with open(REGISTRY_PATH, "w") as f:
     40         for entry in entries:
     41             f.write(json.dumps(entry, ensure_ascii=False) + "\n")
     42 
     43 
     44 def download_pdf(arxiv_id, dest_path):
     45     """Download a PDF from arXiv. Returns True on success."""
     46     url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
     47     try:
     48         req = urllib.request.Request(url, headers={
     49             "User-Agent": "ai-research-survey/1.0 (systematic review project; mailto:research@example.com)"
     50         })
     51         with urllib.request.urlopen(req, timeout=60) as resp:
     52             content = resp.read()
     53             if len(content) < 1000:
     54                 print(f"  WARNING: Response too small ({len(content)} bytes), probably an error page")
     55                 return False
     56             dest_path.parent.mkdir(parents=True, exist_ok=True)
     57             with open(dest_path, "wb") as f:
     58                 f.write(content)
     59             size_mb = len(content) / (1024 * 1024)
     60             print(f"  Downloaded {size_mb:.1f} MB")
     61             return True
     62     except urllib.error.HTTPError as e:
     63         print(f"  HTTP error {e.code}: {e.reason}")
     64         return False
     65     except urllib.error.URLError as e:
     66         print(f"  URL error: {e.reason}")
     67         return False
     68     except Exception as e:
     69         print(f"  Error: {e}")
     70         return False
     71 
     72 
     73 def main():
     74     args = sys.argv[1:]
     75     dry_run = "--dry-run" in args
     76     limit = None
     77     specific_id = None
     78 
     79     for i, arg in enumerate(args):
     80         if arg == "--limit" and i + 1 < len(args):
     81             limit = int(args[i + 1])
     82         if arg == "--id" and i + 1 < len(args):
     83             specific_id = args[i + 1]
     84 
     85     entries = load_registry()
     86 
     87     # Find downloadable papers
     88     candidates = []
     89     for entry in entries:
     90         if specific_id and entry["id"] != specific_id:
     91             continue
     92         if not entry.get("arxiv_id"):
     93             continue
     94         if entry["status"] != "queued" and not specific_id:
     95             continue
     96         pdf_path = PAPERS_DIR / entry["id"] / "paper.pdf"
     97         if pdf_path.exists():
     98             continue
     99         candidates.append(entry)
    100 
    101     if limit:
    102         candidates = candidates[:limit]
    103 
    104     if not candidates:
    105         print("No papers to download.")
    106         return
    107 
    108     print(f"{'Would download' if dry_run else 'Downloading'} {len(candidates)} paper(s):\n")
    109 
    110     downloaded = 0
    111     failed = 0
    112 
    113     for i, entry in enumerate(candidates):
    114         print(f"[{i+1}/{len(candidates)}] {entry['id']} (arXiv:{entry['arxiv_id']})")
    115         print(f"  {entry['title']}")
    116 
    117         if dry_run:
    118             continue
    119 
    120         pdf_path = PAPERS_DIR / entry["id"] / "paper.pdf"
    121         if download_pdf(entry["arxiv_id"], pdf_path):
    122             entry["status"] = "downloaded"
    123             entry["directory"] = f"papers/{entry['id']}"
    124             downloaded += 1
    125         else:
    126             failed += 1
    127 
    128         # Rate limit: wait between requests
    129         if i < len(candidates) - 1:
    130             time.sleep(DELAY_SECONDS)
    131 
    132     if not dry_run:
    133         save_registry(entries)
    134         print(f"\nDone. Downloaded: {downloaded}, Failed: {failed}")
    135         if downloaded:
    136             print(f"Registry updated ({downloaded} entries set to 'downloaded').")
    137     else:
    138         print(f"\nDry run complete. Use without --dry-run to download.")
    139 
    140 
# Run the downloader only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

Impressum · Datenschutz