Add arXiv PDF download script - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

commit a6c809bdf74b788c3f3285e3a37f649be5193fe9
parent 9168d67f29d824ef189647a7b5049acf0e55cdca
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Fri, 27 Feb 2026 21:08:24 +0100

Add arXiv PDF download script

Downloads PDFs for registry entries with status 'queued' and an arxiv_id.
Updates registry status to 'downloaded' on success. Respects arXiv rate
limits (3s between requests). Supports --dry-run, --limit N, and --id.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Diffstat:
A scripts/download-arxiv.py  | 142 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

1 file changed, 142 insertions(+), 0 deletions(-)
diff --git a/scripts/download-arxiv.py b/scripts/download-arxiv.py
@@ -0,0 +1,142 @@
+#!/usr/bin/env python3
+"""
+Download PDFs from arXiv for registry entries with status 'queued' and an arxiv_id.
+
+Usage:
+    python scripts/download-arxiv.py              # Download all queued arXiv papers
+    python scripts/download-arxiv.py --limit 10   # Download up to 10
+    python scripts/download-arxiv.py --dry-run     # Show what would be downloaded
+    python scripts/download-arxiv.py --id metr-rct-2025  # Download a specific paper
+"""
+
+import json
+import sys
+import os
+import time
+import urllib.request
+import urllib.error
+from pathlib import Path
+
+# Repository root: this script lives in <root>/scripts/, so go up two levels
+# from the resolved script path.
+ROOT = Path(__file__).resolve().parent.parent
+# Registry file in JSON Lines form (one JSON document per line).
+REGISTRY_PATH = ROOT / "registry.jsonl"
+# Downloaded PDFs are stored as papers/<entry id>/paper.pdf under this folder.
+PAPERS_DIR = ROOT / "papers"
+
+# arXiv rate limit: no more than 1 request per 3 seconds
+# Used as the pause (in seconds) between consecutive download attempts.
+DELAY_SECONDS = 3
+
+
+def load_registry():
+    entries = []
+    with open(REGISTRY_PATH) as f:
+        for line in f:
+            line = line.strip()
+            if line:
+                entries.append(json.loads(line))
+    return entries
+
+
+def save_registry(entries):
+    with open(REGISTRY_PATH, "w") as f:
+        for entry in entries:
+            f.write(json.dumps(entry, ensure_ascii=False) + "\n")
+
+
+def download_pdf(arxiv_id, dest_path):
+    """Download a PDF from arXiv. Returns True on success."""
+    url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
+    try:
+        req = urllib.request.Request(url, headers={
+            "User-Agent": "ai-research-survey/1.0 (systematic review project; mailto:research@example.com)"
+        })
+        with urllib.request.urlopen(req, timeout=60) as resp:
+            content = resp.read()
+            if len(content) < 1000:
+                print(f"  WARNING: Response too small ({len(content)} bytes), probably an error page")
+                return False
+            dest_path.parent.mkdir(parents=True, exist_ok=True)
+            with open(dest_path, "wb") as f:
+                f.write(content)
+            size_mb = len(content) / (1024 * 1024)
+            print(f"  Downloaded {size_mb:.1f} MB")
+            return True
+    except urllib.error.HTTPError as e:
+        print(f"  HTTP error {e.code}: {e.reason}")
+        return False
+    except urllib.error.URLError as e:
+        print(f"  URL error: {e.reason}")
+        return False
+    except Exception as e:
+        print(f"  Error: {e}")
+        return False
+
+
+def main():
+    args = sys.argv[1:]
+    dry_run = "--dry-run" in args
+    limit = None
+    specific_id = None
+
+    for i, arg in enumerate(args):
+        if arg == "--limit" and i + 1 < len(args):
+            limit = int(args[i + 1])
+        if arg == "--id" and i + 1 < len(args):
+            specific_id = args[i + 1]
+
+    entries = load_registry()
+
+    # Find downloadable papers
+    candidates = []
+    for entry in entries:
+        if specific_id and entry["id"] != specific_id:
+            continue
+        if not entry.get("arxiv_id"):
+            continue
+        if entry["status"] != "queued" and not specific_id:
+            continue
+        pdf_path = PAPERS_DIR / entry["id"] / "paper.pdf"
+        if pdf_path.exists():
+            continue
+        candidates.append(entry)
+
+    if limit:
+        candidates = candidates[:limit]
+
+    if not candidates:
+        print("No papers to download.")
+        return
+
+    print(f"{'Would download' if dry_run else 'Downloading'} {len(candidates)} paper(s):\n")
+
+    downloaded = 0
+    failed = 0
+
+    for i, entry in enumerate(candidates):
+        print(f"[{i+1}/{len(candidates)}] {entry['id']} (arXiv:{entry['arxiv_id']})")
+        print(f"  {entry['title']}")
+
+        if dry_run:
+            continue
+
+        pdf_path = PAPERS_DIR / entry["id"] / "paper.pdf"
+        if download_pdf(entry["arxiv_id"], pdf_path):
+            entry["status"] = "downloaded"
+            entry["directory"] = f"papers/{entry['id']}"
+            downloaded += 1
+        else:
+            failed += 1
+
+        # Rate limit: wait between requests
+        if i < len(candidates) - 1:
+            time.sleep(DELAY_SECONDS)
+
+    if not dry_run:
+        save_registry(entries)
+        print(f"\nDone. Downloaded: {downloaded}, Failed: {failed}")
+        if downloaded:
+            print(f"Registry updated ({downloaded} entries set to 'downloaded').")
+    else:
+        print(f"\nDry run complete. Use without --dry-run to download.")
+
+
+# Run the downloader only when this file is executed as a script, not when
+# it is imported.
+if __name__ == "__main__":
+    main()

	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs