#!/usr/bin/env python3
"""
Download PDFs for registry entries that have DOIs but no arxiv_id.

Strategies tried in order:
  1. Extract arXiv ID from arXiv DOI (10.48550/arXiv.XXXX.XXXXX) and download directly
  2. Semantic Scholar open access PDF
  3. Unpaywall
  4. CORE API (free key: https://core.ac.uk/services/api)
  5. OpenAlex
  6. Sci-Hub (opt-in: --scihub flag)

Usage:
  python scripts/download-doi.py                  # Download all eligible
  python scripts/download-doi.py --limit 10       # First 10 only
  python scripts/download-doi.py --dry-run        # Show what would be downloaded
  python scripts/download-doi.py --id SLUG        # Specific paper
  python scripts/download-doi.py --scihub         # Also try Sci-Hub
  python scripts/download-doi.py --workers 20     # Parallel workers (default: 10)
  python scripts/download-doi.py --core-key KEY   # CORE API key (or set CORE_API_KEY env var)

Environment:
  CORE_API_KEY    default for --core-key
  CONTACT_EMAIL   address sent in the User-Agent and as the Unpaywall/OpenAlex
                  contact parameter (default: research@example.com)
"""

import argparse
import json
import os
import re
import sys
import threading
import time
import urllib.error
import urllib.parse
import urllib.request
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path

ROOT = Path(__file__).resolve().parent.parent
REGISTRY_PATH = ROOT / "registry.jsonl"
PAPERS_DIR = ROOT / "papers"

# Single source for the contact address that was previously repeated in the
# User-Agent, the Unpaywall URL and the OpenAlex URL.
CONTACT_EMAIL = os.environ.get("CONTACT_EMAIL", "research@example.com")
USER_AGENT = f"ai-research-survey/1.0 (systematic review; mailto:{CONTACT_EMAIL})"


# ── Rate limiting ────────────────────────────────────────────────────────

class RateLimiter:
    """Thread-safe per-API rate limiter."""

    def __init__(self, min_interval):
        # Minimum number of seconds between two consecutive wait() returns.
        self.min_interval = min_interval
        self._lock = threading.Lock()
        self._last = 0.0

    def wait(self):
        """Block until at least ``min_interval`` seconds have passed since the last call.

        The sleep happens while the lock is held, so concurrent callers queue
        up behind each other and are released one interval apart.
        """
        with self._lock:
            now = time.monotonic()
            gap = self.min_interval - (now - self._last)
            if gap > 0:
                time.sleep(gap)
            self._last = time.monotonic()


RATE = {
    "arxiv": RateLimiter(3.0),      # arXiv policy: 1 req / 3s
    "s2": RateLimiter(0.5),         # Semantic Scholar free tier
    "unpaywall": RateLimiter(0.5),
    "core": RateLimiter(1.0),
    "openalex": RateLimiter(0.2),   # generous with mailto
    "scihub": RateLimiter(2.0),
}

_print_lock = threading.Lock()
_registry_lock = threading.Lock()


def log(slug, msg):
    """Print one per-paper log line; the lock keeps worker output from interleaving."""
    with _print_lock:
        print(f"  [{slug}] {msg}", flush=True)


# ── Shared helpers ───────────────────────────────────────────────────────

def load_registry():
    """Read REGISTRY_PATH (JSON Lines) and return its entries as a list of dicts.

    Blank lines are skipped. A malformed line raises ``json.JSONDecodeError``.
    """
    entries = []
    # Explicit UTF-8: save_registry writes non-ASCII characters verbatim.
    with open(REGISTRY_PATH, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                entries.append(json.loads(line))
    return entries


def save_registry(entries):
    """Write ``entries`` back to REGISTRY_PATH, one JSON object per line.

    The data is written to a sibling temp file and then moved over the
    registry with ``os.replace`` so an interrupted write cannot leave a
    truncated registry behind. Concurrent calls are serialised by a lock.
    """
    tmp_path = REGISTRY_PATH.with_name(REGISTRY_PATH.name + ".tmp")
    with _registry_lock:
        with open(tmp_path, "w", encoding="utf-8") as f:
            for entry in entries:
                f.write(json.dumps(entry, ensure_ascii=False) + "\n")
        os.replace(tmp_path, REGISTRY_PATH)


def api_get_json(url, headers=None):
    """GET a URL and parse JSON response. Returns None on any error."""
    req_headers = {"User-Agent": USER_AGENT}
    if headers:
        req_headers.update(headers)
    req = urllib.request.Request(url, headers=req_headers)
    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            return json.loads(resp.read().decode("utf-8"))
    except Exception:
        # Deliberate best-effort: every lookup strategy treats None as
        # "this source had nothing" and the caller moves on to the next one.
        return None


def download_pdf(url, dest_path, slug=""):
    """Download a PDF from a URL. Returns True on success.

    The response is rejected when it is shorter than 1000 bytes, or when it
    neither starts with the ``%PDF-`` magic nor declares a PDF content type.
    The file is written to ``<dest_path>.part`` first and renamed afterwards,
    so ``dest_path`` only ever exists as a complete download.
    """
    try:
        req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
        with urllib.request.urlopen(req, timeout=60) as resp:
            content_type = resp.headers.get("Content-Type", "")
            content = resp.read()
    except Exception as e:
        log(slug, f"download failed: {e}")
        return False

    if len(content) < 1000:
        log(slug, f"rejected: response too small ({len(content)} bytes)")
        return False
    if content[:5] != b"%PDF-" and "pdf" not in content_type.lower():
        log(slug, f"rejected: not a PDF (Content-Type: {content_type or 'unknown'})")
        return False

    tmp_path = dest_path.with_name(dest_path.name + ".part")
    try:
        dest_path.parent.mkdir(parents=True, exist_ok=True)
        with open(tmp_path, "wb") as f:
            f.write(content)
        os.replace(tmp_path, dest_path)
    except OSError as e:
        log(slug, f"download failed: {e}")
        try:
            tmp_path.unlink()
        except OSError:
            pass
        return False

    size_mb = len(content) / (1024 * 1024)
    log(slug, f"downloaded {size_mb:.1f} MB")
    return True


_DOI_PREFIXES = (
    "https://doi.org/",
    "http://doi.org/",
    "https://dx.doi.org/",
    "http://dx.doi.org/",
    "doi:",
)


def clean_doi(raw):
    """Return the bare DOI from ``raw``, stripping a resolver URL or ``doi:`` prefix.

    ``None`` and empty input yield ``""``. Prefix matching is
    case-insensitive and surrounding whitespace is removed.
    """
    if not raw:
        return ""
    doi = str(raw).strip()
    lowered = doi.lower()
    for prefix in _DOI_PREFIXES:
        if lowered.startswith(prefix):
            return doi[len(prefix):].strip()
    return doi


def _quote_doi(doi):
    """Percent-encode a DOI for use in a URL path, keeping ``/`` separators."""
    return urllib.parse.quote(doi, safe="/")


# ── Strategy 0: arXiv DOI ────────────────────────────────────────────────

def extract_arxiv_from_doi(doi):
    """Return the arXiv ID embedded in a ``10.48550/arXiv.<id>`` DOI, else None."""
    if doi.startswith("10.48550/"):
        suffix = doi[9:]
        if suffix.lower().startswith("arxiv."):
            # An empty ID (DOI ends right after "arXiv.") is not usable.
            return suffix[6:] or None
    return None


def try_arxiv(arxiv_id, dest_path, slug=""):
    """Download ``arxiv_id`` from arxiv.org to ``dest_path``. Returns True on success."""
    RATE["arxiv"].wait()
    url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
    return download_pdf(url, dest_path, slug)


# ── Strategy 1: Semantic Scholar ─────────────────────────────────────────

def try_semantic_scholar(doi):
    """Look up ``doi`` on Semantic Scholar.

    Returns ``(pdf_url, arxiv_id)``; either element is None when the
    response does not provide it, and both are None when the request fails.
    """
    RATE["s2"].wait()
    data = api_get_json(
        f"https://api.semanticscholar.org/graph/v1/paper/DOI:{_quote_doi(doi)}"
        "?fields=openAccessPdf,externalIds"
    )
    if not data:
        return None, None
    # `or {}` also covers a JSON null, which .get(key, {}) would pass through.
    ext = data.get("externalIds") or {}
    arxiv_id = ext.get("ArXiv")
    pdf_info = data.get("openAccessPdf") or {}
    pdf_url = pdf_info.get("url") or None
    return pdf_url, arxiv_id


# ── Strategy 2: Unpaywall ────────────────────────────────────────────────

def try_unpaywall(doi):
    """Return a PDF URL for ``doi`` from Unpaywall, or None.

    Prefers ``best_oa_location`` and falls back to the first entry of
    ``oa_locations`` that has a ``url_for_pdf``.
    """
    RATE["unpaywall"].wait()
    email = urllib.parse.quote(CONTACT_EMAIL, safe="@")
    data = api_get_json(
        f"https://api.unpaywall.org/v2/{_quote_doi(doi)}?email={email}"
    )
    if not data:
        return None
    best = data.get("best_oa_location") or {}
    if best.get("url_for_pdf"):
        return best["url_for_pdf"]
    for loc in data.get("oa_locations") or []:
        if loc.get("url_for_pdf"):
            return loc["url_for_pdf"]
    return None


# ── Strategy 3: CORE API ─────────────────────────────────────────────────

def try_core(doi, api_key=None):
    """Return a download URL for ``doi`` from the CORE search API, or None.

    When ``api_key`` is given it is sent both as the ``apiKey`` query
    parameter and as a Bearer token, as the original implementation did.
    """
    RATE["core"].wait()
    params = {"q": f'doi:"{doi}"', "limit": "3"}
    headers = {}
    if api_key:
        params["apiKey"] = api_key
        headers["Authorization"] = f"Bearer {api_key}"
    url = f"https://api.core.ac.uk/v3/search/works?{urllib.parse.urlencode(params)}"
    data = api_get_json(url, headers=headers)
    if not data:
        return None
    for result in data.get("results") or []:
        dl_url = result.get("downloadUrl")
        if dl_url:
            return dl_url
        for link in result.get("links") or []:
            if link.get("type") in ("download", "pdf") and link.get("url"):
                return link["url"]
    return None


# ── Strategy 4: OpenAlex ─────────────────────────────────────────────────

def try_openalex(doi):
    """Return a PDF (or open-access landing) URL for ``doi`` from OpenAlex, or None."""
    RATE["openalex"].wait()
    encoded = urllib.parse.quote(f"https://doi.org/{doi}", safe="")
    email = urllib.parse.quote(CONTACT_EMAIL, safe="@")
    url = (
        f"https://api.openalex.org/works/{encoded}"
        "?select=open_access,best_oa_location"
        f"&mailto={email}"
    )
    data = api_get_json(url)
    if not data:
        return None
    best = data.get("best_oa_location") or {}
    if best.get("pdf_url"):
        return best["pdf_url"]
    oa = data.get("open_access") or {}
    return oa.get("oa_url")


# ── Strategy 5: Sci-Hub ──────────────────────────────────────────────────

SCIHUB_MIRRORS = [
    "https://sci-hub.se",
    "https://sci-hub.st",
    "https://sci-hub.ru",
]


def try_scihub(doi, dest_path, slug=""):
    """Try each Sci-Hub mirror in turn. Returns True once a PDF has been saved."""
    for base in SCIHUB_MIRRORS:
        RATE["scihub"].wait()
        url = f"{base}/{doi}"
        try:
            req = urllib.request.Request(
                url,
                headers={
                    "User-Agent": USER_AGENT,
                    "Accept": "text/html,application/xhtml+xml",
                },
            )
            with urllib.request.urlopen(req, timeout=30) as resp:
                html = resp.read().decode("utf-8", errors="ignore")
        except Exception:
            # Mirror unreachable or erroring: move on to the next one.
            continue
        pdf_url = _extract_scihub_pdf_url(html, base)
        if pdf_url and download_pdf(pdf_url, dest_path, slug):
            return True
    return False


def _extract_scihub_pdf_url(html, base):
    """Find the PDF URL in a Sci-Hub page, or return None.

    Checks, in order: an ``<iframe id="pdf">`` (either attribute order), an
    ``<embed>`` whose src contains ``.pdf``, and a ``location.href=`` redirect
    to a ``.pdf`` URL.
    """
    m = re.search(
        r'<iframe[^>]+id=["\']pdf["\'][^>]*src=["\']([^"\']+)["\']', html
    )
    if not m:
        m = re.search(
            r'<iframe[^>]+src=["\']([^"\']+)["\'][^>]*id=["\']pdf["\']', html
        )
    if m:
        return _fix_url(m.group(1), base)
    m = re.search(r'<embed[^>]+src=["\']([^"\']+\.pdf[^"\']*)["\']', html)
    if m:
        return _fix_url(m.group(1), base)
    m = re.search(r"location\.href=['\"]([^'\"]+\.pdf[^'\"]*)['\"]", html)
    if m:
        return _fix_url(m.group(1), base)
    return None


def _fix_url(url, base):
    """Resolve ``url`` against the mirror ``base``.

    Handles protocol-relative (``//host/x``), root-relative (``/x``) and
    plain relative (``x``) references; absolute URLs are returned unchanged.
    """
    return urllib.parse.urljoin(base + "/", url)


# ── Per-paper worker ─────────────────────────────────────────────────────

def process_paper(entry, use_scihub=False, core_key=None):
    """Try all strategies for one paper. Returns (success, found_arxiv).

    ``entry`` is mutated in place: ``arxiv_id`` is set when one is
    discovered, and ``status``/``directory`` are set on success.
    Entries without a DOI skip every strategy and are reported as failed.
    """
    slug = entry["id"]
    doi = clean_doi(entry.get("doi"))
    pdf_path = PAPERS_DIR / slug / "paper.pdf"
    success = False
    found_arxiv = False
    tried_arxiv = None  # arXiv ID already attempted, to avoid a pointless retry

    if doi:
        # Strategy 0: arXiv DOI
        arxiv_id = extract_arxiv_from_doi(doi)
        if arxiv_id:
            log(slug, f"arXiv DOI → {arxiv_id}")
            entry["arxiv_id"] = arxiv_id
            found_arxiv = True
            tried_arxiv = arxiv_id
            success = try_arxiv(arxiv_id, pdf_path, slug)

        # Strategy 1: Semantic Scholar
        if not success:
            log(slug, "trying S2")
            pdf_url, s2_arxiv = try_semantic_scholar(doi)
            if s2_arxiv and not entry.get("arxiv_id"):
                entry["arxiv_id"] = s2_arxiv
                found_arxiv = True
                log(slug, f"found arXiv:{s2_arxiv}")
            if pdf_url:
                success = download_pdf(pdf_url, pdf_path, slug)
            # Fall back to arXiv when there was no PDF link or it did not
            # yield a PDF, unless that exact ID already failed above.
            if not success and s2_arxiv and s2_arxiv != tried_arxiv:
                success = try_arxiv(s2_arxiv, pdf_path, slug)

        # Strategies 2-4: services that only hand back a candidate URL.
        url_lookups = (
            ("Unpaywall", lambda: try_unpaywall(doi)),
            ("CORE", lambda: try_core(doi, core_key)),
            ("OpenAlex", lambda: try_openalex(doi)),
        )
        for name, lookup in url_lookups:
            if success:
                break
            log(slug, f"trying {name}")
            pdf_url = lookup()
            if pdf_url:
                success = download_pdf(pdf_url, pdf_path, slug)

        # Strategy 5: Sci-Hub (opt-in)
        if not success and use_scihub:
            log(slug, "trying Sci-Hub")
            success = try_scihub(doi, pdf_path, slug)

    if success:
        entry["status"] = "downloaded"
        entry["directory"] = f"papers/{slug}"
        log(slug, "OK")
    else:
        log(slug, "FAILED — no open access PDF found")

    return success, found_arxiv


# ── Main ─────────────────────────────────────────────────────────────────

def _parse_args(argv):
    """Parse command-line flags; unknown arguments are reported and ignored."""
    parser = argparse.ArgumentParser(
        description="Download PDFs for registry entries that have DOIs but no arxiv_id."
    )
    parser.add_argument("--dry-run", action="store_true",
                        help="show what would be downloaded")
    parser.add_argument("--scihub", action="store_true",
                        help="also try Sci-Hub")
    parser.add_argument("--limit", type=int, default=None,
                        help="process only the first N candidates")
    parser.add_argument("--id", dest="specific_id", default=None,
                        help="process only the paper with this slug")
    parser.add_argument("--workers", type=int, default=10,
                        help="parallel workers (default: 10)")
    parser.add_argument("--core-key", default=os.environ.get("CORE_API_KEY"),
                        help="CORE API key (or set CORE_API_KEY env var)")
    args, unknown = parser.parse_known_args(argv)
    if unknown:
        print(f"warning: ignoring unrecognized arguments: {' '.join(unknown)}",
              file=sys.stderr)
    return args


def _select_candidates(entries, specific_id=None, limit=None):
    """Return the registry entries this script should attempt to download."""
    candidates = []
    for entry in entries:
        if specific_id and entry["id"] != specific_id:
            continue
        if entry.get("arxiv_id"):
            continue  # Handled by download-arxiv.py
        if entry.get("status") != "queued" and not specific_id:
            continue
        if not entry.get("doi") and not entry.get("source_url"):
            continue
        pdf_path = PAPERS_DIR / entry["id"] / "paper.pdf"
        if pdf_path.exists():
            continue
        candidates.append(entry)
    # A missing, zero or negative limit means "no limit".
    if limit and limit > 0:
        candidates = candidates[:limit]
    return candidates


def _write_manual_list(failed_list):
    """Write the papers that could not be fetched to manual-download-needed.txt."""
    # UTF-8 is required: the header itself contains a non-ASCII arrow.
    with open(ROOT / "manual-download-needed.txt", "w", encoding="utf-8") as f:
        f.write(f"# Papers requiring manual download ({len(failed_list)} total)\n")
        f.write("# All automated download strategies exhausted.\n")
        f.write("# Download manually → papers/<slug>/paper.pdf\n\n")
        for e in failed_list:
            url = e.get("source_url") or e.get("doi") or "no URL"
            f.write(f"{e['id']}\n")
            f.write(f"  {e.get('title', '?')}\n")
            f.write(f"  {e.get('year', '?')} | {e.get('venue', '?')}\n")
            f.write(f"  {url}\n\n")


def main():
    """Command-line entry point: select candidates, download in parallel, save results."""
    args = _parse_args(sys.argv[1:])
    workers = max(1, args.workers)  # ThreadPoolExecutor rejects max_workers < 1

    entries = load_registry()
    candidates = _select_candidates(entries, args.specific_id, args.limit)

    if not candidates:
        print("No papers to download.")
        return

    strategies = ["arXiv-DOI", "S2", "Unpaywall", "CORE", "OpenAlex"]
    if args.scihub:
        strategies.append("Sci-Hub")
    print(f"{'Would try' if args.dry_run else 'Trying'} {len(candidates)} paper(s) "
          f"with {workers} workers "
          f"[strategies: {', '.join(strategies)}]\n")

    if args.dry_run:
        for i, entry in enumerate(candidates):
            title = entry.get("title") or ""
            print(f"[{i+1}/{len(candidates)}] {entry['id']}  {title[:70]}")
        print(f"\nDry run complete. {len(candidates)} candidates.")
        return

    downloaded = 0
    found_arxiv = 0
    failed = 0
    failed_list = []
    save_interval = 50  # save registry every N completions

    try:
        with ThreadPoolExecutor(max_workers=workers) as pool:
            futures = {
                pool.submit(process_paper, entry, args.scihub, args.core_key): entry
                for entry in candidates
            }
            try:
                # Results are consumed on the main thread, so the counters and
                # the periodic registry save need no extra locking.
                for n, fut in enumerate(as_completed(futures), 1):
                    entry = futures[fut]
                    try:
                        success, got_arxiv = fut.result()
                    except Exception as e:
                        # One broken entry must not abort the whole run.
                        log(entry["id"], f"exception: {e}")
                        success, got_arxiv = False, False

                    if success:
                        downloaded += 1
                    else:
                        failed += 1
                        failed_list.append(entry)
                    if got_arxiv:
                        found_arxiv += 1

                    with _print_lock:
                        print(f"[{n}/{len(candidates)}] "
                              f"ok={downloaded} fail={failed}", flush=True)

                    # periodic save
                    if n % save_interval == 0:
                        save_registry(entries)
                        with _print_lock:
                            print(f"  (registry saved at {n})", flush=True)
            except KeyboardInterrupt:
                # Drop queued work so shutdown only waits for in-flight papers.
                for fut in futures:
                    fut.cancel()
                raise
    finally:
        # final save — also runs on interruption so finished work is kept
        save_registry(entries)

    _write_manual_list(failed_list)

    print(f"\nDone. Downloaded: {downloaded}, Failed: {failed}")
    if found_arxiv:
        print(f"Found {found_arxiv} new arXiv IDs")
    if failed_list:
        print(f"Remaining {len(failed_list)} written to manual-download-needed.txt")


if __name__ == "__main__":
    main()