download-arxiv.py (4305B)
#!/usr/bin/env python3
"""
Download PDFs from arXiv for registry entries with status 'queued' and an arxiv_id.

Usage:
    python scripts/download-arxiv.py                      # Download all queued arXiv papers
    python scripts/download-arxiv.py --limit 10           # Download up to 10
    python scripts/download-arxiv.py --dry-run            # Show what would be downloaded
    python scripts/download-arxiv.py --id metr-rct-2025   # Download a specific paper
"""

import argparse
import json
import sys
import os
import time
import urllib.request
import urllib.error
from pathlib import Path

ROOT = Path(__file__).resolve().parent.parent
REGISTRY_PATH = ROOT / "registry.jsonl"
PAPERS_DIR = ROOT / "papers"

# arXiv rate limit: no more than 1 request per 3 seconds
DELAY_SECONDS = 3

# Responses smaller than this are treated as error pages rather than papers.
MIN_PDF_BYTES = 1000
# Every PDF starts with this marker; readers tolerate it anywhere in the
# first 1024 bytes, so the same window is used here.
PDF_MAGIC = b"%PDF-"


def load_registry(path=None):
    """Load the JSONL registry and return its entries as a list of dicts.

    Args:
        path: Registry file to read. Defaults to ``REGISTRY_PATH``.

    Returns:
        One decoded JSON object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    registry_path = REGISTRY_PATH if path is None else Path(path)
    # The registry is written with ensure_ascii=False, so it must be read as
    # UTF-8 regardless of the platform's default locale encoding.
    with open(registry_path, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


def save_registry(entries, path=None):
    """Write *entries* to the registry as one JSON object per line.

    The data is written to a sibling temporary file which then replaces the
    registry, so an interrupted run never leaves a truncated registry behind.

    Args:
        entries: Iterable of JSON-serialisable dicts.
        path: Registry file to write. Defaults to ``REGISTRY_PATH``.
    """
    registry_path = REGISTRY_PATH if path is None else Path(path)
    tmp_path = registry_path.with_name(registry_path.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        f.writelines(
            json.dumps(entry, ensure_ascii=False) + "\n" for entry in entries
        )
    os.replace(tmp_path, registry_path)


def _looks_like_pdf(content):
    """Return True if *content* carries the PDF header marker near its start."""
    return PDF_MAGIC in content[:1024]


def _atomic_write_bytes(dest_path, content):
    """Write *content* to *dest_path* via a temporary file and a rename.

    Guarantees that *dest_path* either does not exist or holds the complete
    payload; a partial file would otherwise be mistaken for a finished
    download on the next run.
    """
    dest_path.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = dest_path.with_name(dest_path.name + ".part")
    try:
        with open(tmp_path, "wb") as f:
            f.write(content)
        os.replace(tmp_path, dest_path)
    except BaseException:
        # Clean up the partial file, then let the original error propagate.
        try:
            tmp_path.unlink()
        except OSError:
            pass  # Nothing to clean up (or cleanup itself failed).
        raise


def download_pdf(arxiv_id, dest_path):
    """Download a PDF from arXiv.

    Args:
        arxiv_id: arXiv identifier, inserted verbatim into the PDF URL.
        dest_path: Where to store the PDF; parent directories are created.

    Returns:
        True on success. False if the request failed, the response was too
        small or not a PDF, or the file could not be written. Failures are
        reported on stdout and never raised, so a batch run can continue.
    """
    url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
    req = urllib.request.Request(url, headers={
        "User-Agent": "ai-research-survey/1.0 (systematic review project; mailto:research@example.com)"
    })
    try:
        with urllib.request.urlopen(req, timeout=60) as resp:
            content = resp.read()
        if len(content) < MIN_PDF_BYTES:
            print(f" WARNING: Response too small ({len(content)} bytes), probably an error page")
            return False
        if not _looks_like_pdf(content):
            # Large HTML pages pass the size check but are not papers.
            print(" WARNING: Response is not a PDF, probably an error page")
            return False
        _atomic_write_bytes(Path(dest_path), content)
    except urllib.error.HTTPError as e:
        print(f" HTTP error {e.code}: {e.reason}")
        return False
    except urllib.error.URLError as e:
        print(f" URL error: {e.reason}")
        return False
    except Exception as e:
        # Deliberate best-effort boundary: one bad paper must not abort the batch.
        print(f" Error: {e}")
        return False
    size_mb = len(content) / (1024 * 1024)
    print(f" Downloaded {size_mb:.1f} MB")
    return True


def _non_negative_int(text):
    """argparse type: parse *text* as an integer that is >= 0."""
    try:
        value = int(text)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid integer: {text!r}") from None
    if value < 0:
        raise argparse.ArgumentTypeError(f"must be >= 0, got {value}")
    return value


def _parse_args(argv):
    """Parse command-line arguments into a namespace.

    The namespace has ``limit`` (int or None), ``dry_run`` (bool) and
    ``specific_id`` (str or None).
    """
    parser = argparse.ArgumentParser(
        description="Download PDFs from arXiv for registry entries with "
                    "status 'queued' and an arxiv_id.",
    )
    parser.add_argument("--limit", type=_non_negative_int, default=None,
                        metavar="N", help="download at most N papers")
    parser.add_argument("--dry-run", action="store_true",
                        help="show what would be downloaded without downloading")
    parser.add_argument("--id", dest="specific_id", default=None,
                        metavar="PAPER_ID",
                        help="download this registry entry regardless of its status")
    return parser.parse_args(argv)


def _select_candidates(entries, specific_id=None, limit=None, papers_dir=None):
    """Return the registry entries that still need a PDF.

    An entry qualifies when it has an ``arxiv_id``, has no ``paper.pdf`` on
    disk yet, and either matches *specific_id* or (when no id is given) has
    status ``queued``. At most *limit* entries are returned; ``None`` means
    no limit and ``0`` means none.
    """
    base_dir = PAPERS_DIR if papers_dir is None else Path(papers_dir)
    candidates = []
    for entry in entries:
        if specific_id and entry["id"] != specific_id:
            continue
        if not entry.get("arxiv_id"):
            continue
        if not specific_id and entry.get("status") != "queued":
            continue
        if (base_dir / entry["id"] / "paper.pdf").exists():
            continue
        candidates.append(entry)
    # Compare against None explicitly so that "--limit 0" really means zero.
    if limit is not None:
        candidates = candidates[:limit]
    return candidates


def _print_header(position, total, entry):
    """Print the progress line and title for one candidate."""
    print(f"[{position}/{total}] {entry['id']} (arXiv:{entry['arxiv_id']})")
    print(f" {entry.get('title', '')}")


def main(argv=None):
    """Entry point: select queued papers, download them, update the registry.

    Args:
        argv: Argument list without the program name. Defaults to
            ``sys.argv[1:]``.
    """
    args = _parse_args(sys.argv[1:] if argv is None else argv)
    entries = load_registry()
    candidates = _select_candidates(entries, args.specific_id, args.limit)

    if not candidates:
        print("No papers to download.")
        return

    total = len(candidates)
    print(f"{'Would download' if args.dry_run else 'Downloading'} {total} paper(s):\n")

    if args.dry_run:
        for position, entry in enumerate(candidates, start=1):
            _print_header(position, total, entry)
        print("\nDry run complete. Use without --dry-run to download.")
        return

    downloaded = 0
    failed = 0
    try:
        for position, entry in enumerate(candidates, start=1):
            _print_header(position, total, entry)

            pdf_path = PAPERS_DIR / entry["id"] / "paper.pdf"
            if download_pdf(entry["arxiv_id"], pdf_path):
                entry["status"] = "downloaded"
                entry["directory"] = f"papers/{entry['id']}"
                downloaded += 1
            else:
                failed += 1

            # Rate limit: wait between requests
            if position < total:
                time.sleep(DELAY_SECONDS)
    finally:
        # Persist progress even if the run is interrupted (e.g. Ctrl-C).
        # Papers whose PDF exists are skipped on later runs, so an unsaved
        # status change would otherwise be lost permanently.
        if downloaded:
            save_registry(entries)

    print(f"\nDone. Downloaded: {downloaded}, Failed: {failed}")
    if downloaded:
        print(f"Registry updated ({downloaded} entries set to 'downloaded').")


if __name__ == "__main__":
    main()