commit a6c809bdf74b788c3f3285e3a37f649be5193fe9
parent 9168d67f29d824ef189647a7b5049acf0e55cdca
Author: Brian Graham <brian@buildingbetterteams.de>
Date: Fri, 27 Feb 2026 21:08:24 +0100
Add arXiv PDF download script
Downloads PDFs for registry entries with status 'queued' and an arxiv_id.
Updates registry status to 'downloaded' on success. Respects arXiv rate
limits (3s between requests). Supports --dry-run, --limit N, and --id.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Diffstat:
1 file changed, 142 insertions(+), 0 deletions(-)
diff --git a/scripts/download-arxiv.py b/scripts/download-arxiv.py
@@ -0,0 +1,142 @@
+#!/usr/bin/env python3
+"""
+Download PDFs from arXiv for registry entries with status 'queued' and an arxiv_id.
+
+Usage:
+ python scripts/download-arxiv.py # Download all queued arXiv papers
+ python scripts/download-arxiv.py --limit 10 # Download up to 10
+ python scripts/download-arxiv.py --dry-run # Show what would be downloaded
+ python scripts/download-arxiv.py --id metr-rct-2025 # Download a specific paper
+"""
+
+import json
+import sys
+import os
+import time
+import urllib.request
+import urllib.error
+from pathlib import Path
+
# Directory containing this script; the repository root is one level above it.
_SCRIPT_DIR = Path(__file__).resolve().parent
ROOT = _SCRIPT_DIR.parent

# Registry file (one JSON object per line) and the folder PDFs are saved under.
REGISTRY_PATH = ROOT.joinpath("registry.jsonl")
PAPERS_DIR = ROOT.joinpath("papers")

# arXiv rate limit: no more than 1 request per 3 seconds
DELAY_SECONDS = 3
+
+
def load_registry(path=None):
    """Load the JSONL registry into a list of entries.

    Args:
        path: Registry file to read. Defaults to ``REGISTRY_PATH``.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    registry_path = REGISTRY_PATH if path is None else path
    # Explicit UTF-8: save_registry writes with ensure_ascii=False, so the file
    # may hold non-ASCII text that a platform's locale default cannot decode.
    with open(registry_path, encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        # Blank / whitespace-only lines are tolerated and skipped.
        return [json.loads(line) for line in stripped_lines if line]
+
+
def save_registry(entries, path=None):
    """Write the registry back to disk as JSONL, one entry per line.

    The data is first written to a sibling temporary file and then moved over
    the target with ``os.replace``, so an interruption or serialization error
    part-way through never leaves a truncated registry behind.

    Args:
        entries: Iterable of JSON-serializable registry entries.
        path: Registry file to write. Defaults to ``REGISTRY_PATH``.

    Raises:
        OSError: If the temporary file cannot be written or moved into place.
        TypeError: If an entry is not JSON-serializable.
    """
    target = Path(REGISTRY_PATH if path is None else path)
    tmp_path = target.with_name(target.name + ".tmp")
    try:
        # Explicit UTF-8: ensure_ascii=False emits raw non-ASCII characters,
        # which a locale-default encoding may be unable to represent.
        with open(tmp_path, "w", encoding="utf-8") as f:
            f.writelines(
                json.dumps(entry, ensure_ascii=False) + "\n" for entry in entries
            )
        os.replace(tmp_path, target)
    except BaseException:
        # Leave the existing registry untouched and drop the partial temp file.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
        raise
+
def download_pdf(arxiv_id, dest_path, base_url="https://arxiv.org/pdf"):
    """Download the PDF for ``arxiv_id`` and store it at ``dest_path``.

    The response is rejected when it is shorter than 1000 bytes or does not
    carry a ``%PDF-`` signature near its start, so an HTML error page is never
    stored as a paper. The file is written to a sibling ``.part`` file and
    then moved into place, so ``dest_path`` only ever exists as a complete
    download.

    Args:
        arxiv_id: arXiv identifier; ``.pdf`` is appended to build the URL.
        dest_path: Destination file. Parent directories are created on success.
        base_url: URL prefix the identifier is appended to.

    Returns:
        True if a PDF was saved to ``dest_path``; False on any failure (the
        reason is printed, nothing is raised for ordinary errors).
    """
    url = f"{base_url.rstrip('/')}/{arxiv_id}.pdf"
    dest_path = Path(dest_path)
    tmp_path = dest_path.with_name(dest_path.name + ".part")
    try:
        req = urllib.request.Request(url, headers={
            "User-Agent": "ai-research-survey/1.0 (systematic review project; mailto:research@example.com)"
        })
        with urllib.request.urlopen(req, timeout=60) as resp:
            content = resp.read()

        if len(content) < 1000:
            print(f"  WARNING: Response too small ({len(content)} bytes), probably an error page")
            return False
        # A successful HTTP status does not guarantee a PDF body; look for
        # the PDF signature before accepting the payload.
        if b"%PDF-" not in content[:1024]:
            print("  WARNING: Response is not a PDF (no %PDF- header), probably an error page")
            return False

        dest_path.parent.mkdir(parents=True, exist_ok=True)
        with open(tmp_path, "wb") as f:
            f.write(content)
        # Move into place only once fully written: callers treat an existing
        # dest_path as "already downloaded".
        os.replace(tmp_path, dest_path)

        size_mb = len(content) / (1024 * 1024)
        print(f"  Downloaded {size_mb:.1f} MB")
        return True
    except urllib.error.HTTPError as e:
        print(f"  HTTP error {e.code}: {e.reason}")
        return False
    except urllib.error.URLError as e:
        print(f"  URL error: {e.reason}")
        return False
    except Exception as e:
        # Deliberate best-effort boundary: one bad paper must not abort a batch.
        print(f"  Error: {e}")
        return False
    finally:
        # Remove a leftover partial file; after a successful os.replace (or
        # when nothing was written) there is nothing to delete.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
+
+
def main(argv=None):
    """Command-line entry point: download queued arXiv PDFs.

    Selects registry entries that have an ``arxiv_id``, whose status is
    'queued' (the status filter is skipped when ``--id`` is given) and whose
    ``paper.pdf`` does not exist yet, then downloads them one at a time with
    a pause of ``DELAY_SECONDS`` between requests. Each successful entry gets
    status 'downloaded' and a ``directory`` field.

    Args:
        argv: Argument list to parse; defaults to ``sys.argv[1:]``.

    Returns:
        0 when nothing failed (including dry runs and empty selections),
        1 when at least one download failed.
    """
    import argparse  # Local import: only the CLI entry point needs it.

    parser = argparse.ArgumentParser(
        description="Download PDFs from arXiv for queued registry entries.",
    )
    parser.add_argument("--dry-run", action="store_true",
                        help="show what would be downloaded")
    parser.add_argument("--limit", type=int, metavar="N",
                        help="download up to N papers")
    parser.add_argument("--id", dest="paper_id", metavar="ID",
                        help="download a specific paper, whatever its status")
    args = parser.parse_args(argv)
    # A negative limit would silently slice from the end of the list.
    if args.limit is not None and args.limit < 0:
        parser.error("--limit must be a non-negative integer")

    entries = load_registry()

    # Find downloadable papers
    candidates = []
    for entry in entries:
        entry_id = entry.get("id")
        if not entry_id:
            continue
        if args.paper_id and entry_id != args.paper_id:
            continue
        if not entry.get("arxiv_id"):
            continue
        if entry.get("status") != "queued" and not args.paper_id:
            continue
        if (PAPERS_DIR / entry_id / "paper.pdf").exists():
            continue
        candidates.append(entry)

    # `is not None` so that an explicit `--limit 0` selects nothing instead
    # of being mistaken for "no limit".
    if args.limit is not None:
        candidates = candidates[:args.limit]

    if not candidates:
        print("No papers to download.")
        return 0

    print(f"{'Would download' if args.dry_run else 'Downloading'} {len(candidates)} paper(s):\n")

    downloaded = 0
    failed = 0
    last_index = len(candidates) - 1

    try:
        for i, entry in enumerate(candidates):
            print(f"[{i+1}/{len(candidates)}] {entry['id']} (arXiv:{entry['arxiv_id']})")
            print(f"  {entry.get('title', '')}")

            if args.dry_run:
                continue

            pdf_path = PAPERS_DIR / entry["id"] / "paper.pdf"
            if download_pdf(entry["arxiv_id"], pdf_path):
                entry["status"] = "downloaded"
                entry["directory"] = f"papers/{entry['id']}"
                downloaded += 1
            else:
                failed += 1

            # Rate limit: wait between requests
            if i < last_index:
                time.sleep(DELAY_SECONDS)
    finally:
        # Persist progress even on Ctrl-C or an unexpected error. Otherwise
        # PDFs already on disk would be skipped by the exists() check on the
        # next run while their entries stayed 'queued' forever.
        if downloaded:
            save_registry(entries)

    if args.dry_run:
        print(f"\nDry run complete. Use without --dry-run to download.")
        return 0

    print(f"\nDone. Downloaded: {downloaded}, Failed: {failed}")
    if downloaded:
        print(f"Registry updated ({downloaded} entries set to 'downloaded').")
    return 1 if failed else 0
+
if __name__ == "__main__":
    # Hand main()'s return value to the shell as the exit status; a None
    # return still exits with status 0.
    sys.exit(main())