ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

commit a6c809bdf74b788c3f3285e3a37f649be5193fe9
parent 9168d67f29d824ef189647a7b5049acf0e55cdca
Author: Brian Graham <brian@buildingbetterteams.de>
Date:   Fri, 27 Feb 2026 21:08:24 +0100

Add arXiv PDF download script

Downloads PDFs for registry entries with status 'queued' and an arxiv_id.
Updates registry status to 'downloaded' on success. Respects arXiv rate
limits (3s between requests). Supports --dry-run, --limit N, and --id.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Diffstat:
A scripts/download-arxiv.py | 142 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 142 insertions(+), 0 deletions(-)

#!/usr/bin/env python3
# File: scripts/download-arxiv.py
"""
Download PDFs from arXiv for registry entries with status 'queued' and an arxiv_id.

Usage:
    python scripts/download-arxiv.py                     # Download all queued arXiv papers
    python scripts/download-arxiv.py --limit 10          # Download up to 10
    python scripts/download-arxiv.py --dry-run           # Show what would be downloaded
    python scripts/download-arxiv.py --id metr-rct-2025  # Download a specific paper
"""

import argparse
import json
import sys
import os
import time
import urllib.request
import urllib.error
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence

ROOT = Path(__file__).resolve().parent.parent
REGISTRY_PATH = ROOT / "registry.jsonl"
PAPERS_DIR = ROOT / "papers"

# arXiv rate limit: no more than 1 request per 3 seconds
DELAY_SECONDS = 3

# Responses shorter than this are treated as error pages rather than papers.
MIN_PDF_BYTES = 1000

USER_AGENT = "ai-research-survey/1.0 (systematic review project; mailto:research@example.com)"


def _atomic_write_bytes(path: Path, data: bytes) -> None:
    """Write *data* to *path* through a sibling temp file plus ``os.replace``.

    Readers (and later runs of this script) therefore only ever see either
    the previous complete file or the new complete file, never a truncated
    one. The temp file is removed if anything goes wrong, including Ctrl-C.
    """
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = path.with_name(path.name + ".tmp")
    try:
        with open(tmp_path, "wb") as f:
            f.write(data)
        os.replace(tmp_path, path)
    except BaseException:
        try:
            tmp_path.unlink()
        except OSError:
            pass  # temp file was never created or is already gone
        raise


def load_registry(path: Optional[Path] = None) -> List[Dict[str, Any]]:
    """Read the JSON-lines registry and return its entries in file order.

    Args:
        path: Registry file to read; defaults to ``REGISTRY_PATH``.

    Returns:
        One decoded JSON value per non-blank line.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    registry_path = REGISTRY_PATH if path is None else Path(path)
    # Explicit UTF-8: save_registry writes non-ASCII characters verbatim, so
    # relying on the locale's default codec breaks on non-UTF-8 systems.
    with open(registry_path, encoding="utf-8") as f:
        stripped = (line.strip() for line in f)
        return [json.loads(line) for line in stripped if line]


def save_registry(entries: List[Dict[str, Any]], path: Optional[Path] = None) -> None:
    """Write *entries* back to the registry, one JSON object per line.

    The whole payload is serialized before the file is touched and is then
    swapped in atomically, so a serialization error or a crash mid-write
    cannot leave a truncated registry behind.

    Args:
        entries: Registry entries to persist.
        path: Registry file to write; defaults to ``REGISTRY_PATH``.
    """
    registry_path = REGISTRY_PATH if path is None else Path(path)
    payload = "".join(
        json.dumps(entry, ensure_ascii=False) + "\n" for entry in entries
    )
    _atomic_write_bytes(registry_path, payload.encode("utf-8"))


def download_pdf(arxiv_id, dest_path):
    """Download a PDF from arXiv. Returns True on success.

    Args:
        arxiv_id: arXiv identifier interpolated into the PDF URL.
        dest_path: Where to store the PDF; parent directories are created.

    Returns:
        True if a plausible PDF was written to *dest_path*. False on any
        network, HTTP, validation or filesystem error (the reason is printed).
    """
    url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
    try:
        req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
        with urllib.request.urlopen(req, timeout=60) as resp:
            content = resp.read()
        if len(content) < MIN_PDF_BYTES:
            print(f" WARNING: Response too small ({len(content)} bytes), probably an error page")
            return False
        # Error/CAPTCHA pages can be larger than the size threshold, so also
        # require the PDF magic marker near the start of the body.
        if b"%PDF" not in content[:1024]:
            print(" WARNING: Response is not a PDF (missing %PDF header), probably an error page")
            return False
        # Atomic write: a partially written paper.pdf would be skipped as
        # "already downloaded" by every later run.
        _atomic_write_bytes(dest_path, content)
        size_mb = len(content) / (1024 * 1024)
        print(f" Downloaded {size_mb:.1f} MB")
        return True
    except urllib.error.HTTPError as e:
        print(f" HTTP error {e.code}: {e.reason}")
        return False
    except urllib.error.URLError as e:
        print(f" URL error: {e.reason}")
        return False
    except Exception as e:
        # Deliberately broad: this is the per-paper boundary of a batch job,
        # and one bad paper (timeout, disk error, ...) must not abort the rest.
        print(f" Error: {e}")
        return False


def select_candidates(entries, specific_id=None, limit=None, papers_dir=None):
    """Return the registry entries that still need their PDF downloaded.

    An entry qualifies when it has an ``id`` and an ``arxiv_id``, its PDF is
    not already on disk, and either its status is ``'queued'`` or it is the
    explicitly requested *specific_id* (which bypasses the status check).

    Args:
        entries: Registry entries (dicts) in registry order.
        specific_id: If given, only the entry with this ``id`` is considered.
        limit: Maximum number of entries to return; ``None`` means no limit.
        papers_dir: Directory holding ``<id>/paper.pdf``; defaults to
            ``PAPERS_DIR``.

    Raises:
        ValueError: If *limit* is negative.
    """
    if limit is not None and limit < 0:
        raise ValueError("limit must not be negative")
    base_dir = PAPERS_DIR if papers_dir is None else Path(papers_dir)

    candidates = []
    for entry in entries:
        entry_id = entry.get("id")
        if not entry_id:
            continue  # without an id there is no destination directory
        if specific_id and entry_id != specific_id:
            continue
        if not entry.get("arxiv_id"):
            continue
        if not specific_id and entry.get("status") != "queued":
            continue
        if (base_dir / entry_id / "paper.pdf").exists():
            continue
        candidates.append(entry)

    if limit is not None:
        candidates = candidates[:limit]
    return candidates


def _positive_int(text: str) -> int:
    """argparse type: parse *text* as an integer >= 1."""
    value = int(text)  # ValueError is reported by argparse as an invalid value
    if value < 1:
        raise argparse.ArgumentTypeError("must be a positive integer")
    return value


def _parse_args(argv: Sequence[str]) -> argparse.Namespace:
    """Parse the command line (``--limit N``, ``--dry-run``, ``--id ID``)."""
    parser = argparse.ArgumentParser(
        description="Download PDFs from arXiv for queued registry entries.",
        # No prefix matching: a mistyped flag must be an error, not a guess.
        allow_abbrev=False,
    )
    parser.add_argument("--limit", type=_positive_int, default=None, metavar="N",
                        help="download at most N papers")
    parser.add_argument("--dry-run", action="store_true",
                        help="show what would be downloaded without downloading")
    parser.add_argument("--id", dest="paper_id", default=None, metavar="ID",
                        help="download one specific registry entry, regardless of status")
    return parser.parse_args(argv)


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Command-line entry point.

    Args:
        argv: Arguments to parse; defaults to ``sys.argv[1:]``.

    Unknown or malformed options terminate with argparse's usage error, so a
    typo such as ``--dryrun`` can no longer trigger real downloads.
    """
    args = _parse_args(sys.argv[1:] if argv is None else argv)
    dry_run = args.dry_run

    entries = load_registry()
    candidates = select_candidates(entries, specific_id=args.paper_id, limit=args.limit)

    if not candidates:
        print("No papers to download.")
        return

    print(f"{'Would download' if dry_run else 'Downloading'} {len(candidates)} paper(s):\n")

    downloaded = 0
    failed = 0
    total = len(candidates)

    try:
        for position, entry in enumerate(candidates, start=1):
            print(f"[{position}/{total}] {entry['id']} (arXiv:{entry['arxiv_id']})")
            print(f" {entry.get('title', '(untitled)')}")

            if dry_run:
                continue

            pdf_path = PAPERS_DIR / entry["id"] / "paper.pdf"
            if download_pdf(entry["arxiv_id"], pdf_path):
                entry["status"] = "downloaded"
                entry["directory"] = f"papers/{entry['id']}"
                downloaded += 1
            else:
                failed += 1

            # Rate limit: wait between requests
            if position < total:
                time.sleep(DELAY_SECONDS)
    finally:
        # Persist progress even when the batch is interrupted (e.g. Ctrl-C).
        # Otherwise the PDFs already on disk keep status 'queued' and are
        # skipped as "already present" on every later run. Nothing is
        # rewritten when no entry changed (always the case for a dry run).
        if downloaded:
            save_registry(entries)

    if dry_run:
        print("\nDry run complete. Use without --dry-run to download.")
        return

    print(f"\nDone. Downloaded: {downloaded}, Failed: {failed}")
    if downloaded:
        print(f"Registry updated ({downloaded} entries set to 'downloaded').")


if __name__ == "__main__":
    main()

Impressum · Datenschutz