download-doi.py - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

download-doi.py (16306B)
      1 #!/usr/bin/env python3
      2 """
      3 Download PDFs for registry entries that have DOIs but no arxiv_id.
      4 
      5 Strategies tried in order:
      6 1. Extract arXiv ID from arXiv DOI (10.48550/arXiv.XXXX.XXXXX) and download directly
      7 2. Semantic Scholar open access PDF
      8 3. Unpaywall
      9 4. CORE API (free key: https://core.ac.uk/services/api)
     10 5. OpenAlex
     11 6. Sci-Hub (opt-in: --scihub flag)
     12 
     13 Usage:
     14     python scripts/download-doi.py              # Download all eligible
     15     python scripts/download-doi.py --limit 10   # First 10 only
     16     python scripts/download-doi.py --dry-run    # Show what would be downloaded
     17     python scripts/download-doi.py --id SLUG    # Specific paper
     18     python scripts/download-doi.py --scihub     # Also try Sci-Hub
     19     python scripts/download-doi.py --workers 20 # Parallel workers (default: 10)
     20     python scripts/download-doi.py --core-key KEY  # CORE API key (or set CORE_API_KEY env var)
     21 """
     22 
     23 import json
     24 import os
     25 import re
     26 import sys
     27 import time
     28 import threading
     29 import urllib.error
     30 import urllib.parse
     31 import urllib.request
     32 from concurrent.futures import ThreadPoolExecutor, as_completed
     33 from pathlib import Path
     34 
# Directory two levels above this file (the parent of the directory that
# contains the script); the registry and the papers tree are resolved from it.
ROOT = Path(__file__).resolve().parent.parent
# JSON Lines registry read by load_registry() and rewritten by save_registry().
REGISTRY_PATH = ROOT / "registry.jsonl"
# Each paper is stored as PAPERS_DIR / <entry id> / "paper.pdf".
PAPERS_DIR = ROOT / "papers"

# Sent as the User-Agent header on every HTTP request this script makes.
USER_AGENT = "ai-research-survey/1.0 (systematic review; mailto:research@example.com)"
     40 
     41 
     42 # ── Rate limiting ────────────────────────────────────────────────────────
     43 
     44 class RateLimiter:
     45     """Thread-safe per-API rate limiter."""
     46     def __init__(self, min_interval):
     47         self.min_interval = min_interval
     48         self._lock = threading.Lock()
     49         self._last = 0.0
     50 
     51     def wait(self):
     52         with self._lock:
     53             now = time.monotonic()
     54             gap = self.min_interval - (now - self._last)
     55             if gap > 0:
     56                 time.sleep(gap)
     57             self._last = time.monotonic()
     58 
     59 
# One shared limiter per upstream service.  The try_* helpers call
# RATE[<name>].wait() before issuing their request, so the spacing applies
# across all worker threads.  Values are minimum seconds between requests.
RATE = {
    "arxiv":     RateLimiter(3.0),   # arXiv policy: 1 req / 3s
    "s2":        RateLimiter(0.5),   # Semantic Scholar free tier
    "unpaywall": RateLimiter(0.5),
    "core":      RateLimiter(1.0),
    "openalex":  RateLimiter(0.2),   # generous with mailto
    "scihub":    RateLimiter(2.0),
}

# Held around print() calls so lines from different threads do not interleave.
_print_lock = threading.Lock()
     70 
     71 
     72 def log(slug, msg):
     73     with _print_lock:
     74         print(f"  [{slug}] {msg}", flush=True)
     75 
     76 
     77 # ── Shared helpers ───────────────────────────────────────────────────────
     78 
     79 def load_registry():
     80     entries = []
     81     with open(REGISTRY_PATH) as f:
     82         for line in f:
     83             line = line.strip()
     84             if line:
     85                 entries.append(json.loads(line))
     86     return entries
     87 
     88 
     89 def save_registry(entries):
     90     with open(REGISTRY_PATH, "w") as f:
     91         for entry in entries:
     92             f.write(json.dumps(entry, ensure_ascii=False) + "\n")
     93 
     94 
     95 def api_get_json(url, headers=None):
     96     """GET a URL and parse JSON response. Returns None on any error."""
     97     req_headers = {"User-Agent": USER_AGENT}
     98     if headers:
     99         req_headers.update(headers)
    100     req = urllib.request.Request(url, headers=req_headers)
    101     try:
    102         with urllib.request.urlopen(req, timeout=30) as resp:
    103             return json.loads(resp.read().decode())
    104     except Exception:
    105         return None
    106 
    107 
    108 def download_pdf(url, dest_path, slug=""):
    109     """Download a PDF from a URL. Returns True on success."""
    110     try:
    111         req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    112         with urllib.request.urlopen(req, timeout=60) as resp:
    113             content_type = resp.headers.get("Content-Type", "")
    114             content = resp.read()
    115             if len(content) < 1000:
    116                 return False
    117             if content[:5] != b"%PDF-" and "pdf" not in content_type.lower():
    118                 return False
    119             dest_path.parent.mkdir(parents=True, exist_ok=True)
    120             with open(dest_path, "wb") as f:
    121                 f.write(content)
    122             size_mb = len(content) / (1024 * 1024)
    123             log(slug, f"downloaded {size_mb:.1f} MB")
    124             return True
    125     except Exception as e:
    126         log(slug, f"download failed: {e}")
    127         return False
    128 
    129 
    130 def clean_doi(raw):
    131     """Strip https://doi.org/ prefix if present."""
    132     for prefix in ("https://doi.org/", "http://doi.org/"):
    133         if raw.startswith(prefix):
    134             return raw[len(prefix):]
    135     return raw
    136 
    137 
    138 # ── Strategy 0: arXiv DOI ────────────────────────────────────────────────
    139 
    140 def extract_arxiv_from_doi(doi):
    141     if doi.startswith("10.48550/"):
    142         suffix = doi[9:]
    143         if suffix.lower().startswith("arxiv."):
    144             return suffix[6:]
    145     return None
    146 
    147 
    148 def try_arxiv(arxiv_id, dest_path, slug=""):
    149     RATE["arxiv"].wait()
    150     url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
    151     return download_pdf(url, dest_path, slug)
    152 
    153 
    154 # ── Strategy 1: Semantic Scholar ─────────────────────────────────────────
    155 
    156 def try_semantic_scholar(doi):
    157     RATE["s2"].wait()
    158     data = api_get_json(
    159         f"https://api.semanticscholar.org/graph/v1/paper/DOI:{doi}"
    160         "?fields=openAccessPdf,externalIds"
    161     )
    162     if not data:
    163         return None, None
    164     ext = data.get("externalIds", {})
    165     arxiv_id = ext.get("ArXiv")
    166     pdf_info = data.get("openAccessPdf")
    167     pdf_url = pdf_info.get("url") if pdf_info else None
    168     return pdf_url, arxiv_id
    169 
    170 
    171 # ── Strategy 2: Unpaywall ────────────────────────────────────────────────
    172 
    173 def try_unpaywall(doi):
    174     RATE["unpaywall"].wait()
    175     data = api_get_json(
    176         f"https://api.unpaywall.org/v2/{doi}?email=research@example.com"
    177     )
    178     if not data:
    179         return None
    180     best = data.get("best_oa_location") or {}
    181     if best.get("url_for_pdf"):
    182         return best["url_for_pdf"]
    183     for loc in data.get("oa_locations", []):
    184         if loc.get("url_for_pdf"):
    185             return loc["url_for_pdf"]
    186     return None
    187 
    188 
    189 # ── Strategy 3: CORE API ─────────────────────────────────────────────────
    190 
    191 def try_core(doi, api_key=None):
    192     RATE["core"].wait()
    193     params = {"q": f'doi:"{doi}"', "limit": "3"}
    194     if api_key:
    195         params["apiKey"] = api_key
    196     url = f"https://api.core.ac.uk/v3/search/works?{urllib.parse.urlencode(params)}"
    197     headers = {}
    198     if api_key:
    199         headers["Authorization"] = f"Bearer {api_key}"
    200     data = api_get_json(url, headers=headers)
    201     if not data:
    202         return None
    203     for result in data.get("results", []):
    204         dl_url = result.get("downloadUrl")
    205         if dl_url:
    206             return dl_url
    207         for link in result.get("links", []):
    208             if link.get("type") in ("download", "pdf") and link.get("url"):
    209                 return link["url"]
    210     return None
    211 
    212 
    213 # ── Strategy 4: OpenAlex ─────────────────────────────────────────────────
    214 
    215 def try_openalex(doi):
    216     RATE["openalex"].wait()
    217     encoded = urllib.parse.quote(f"https://doi.org/{doi}", safe="")
    218     url = (
    219         f"https://api.openalex.org/works/{encoded}"
    220         "?select=open_access,best_oa_location"
    221         "&mailto=research@example.com"
    222     )
    223     data = api_get_json(url)
    224     if not data:
    225         return None
    226     best = data.get("best_oa_location") or {}
    227     if best.get("pdf_url"):
    228         return best["pdf_url"]
    229     oa = data.get("open_access") or {}
    230     return oa.get("oa_url")
    231 
    232 
    233 # ── Strategy 5: Sci-Hub ──────────────────────────────────────────────────
    234 
# Base URLs tried in order by try_scihub(); the DOI is appended as the path.
SCIHUB_MIRRORS = [
    "https://sci-hub.se",
    "https://sci-hub.st",
    "https://sci-hub.ru",
]
    240 
    241 
    242 def try_scihub(doi, dest_path, slug=""):
    243     for base in SCIHUB_MIRRORS:
    244         RATE["scihub"].wait()
    245         url = f"{base}/{doi}"
    246         try:
    247             req = urllib.request.Request(
    248                 url,
    249                 headers={
    250                     "User-Agent": USER_AGENT,
    251                     "Accept": "text/html,application/xhtml+xml",
    252                 },
    253             )
    254             with urllib.request.urlopen(req, timeout=30) as resp:
    255                 html = resp.read().decode("utf-8", errors="ignore")
    256         except Exception:
    257             continue
    258         pdf_url = _extract_scihub_pdf_url(html, base)
    259         if pdf_url and download_pdf(pdf_url, dest_path, slug):
    260             return True
    261     return False
    262 
    263 
    264 def _extract_scihub_pdf_url(html, base):
    265     m = re.search(
    266         r'<iframe[^>]+id=["\']pdf["\'][^>]*src=["\']([^"\']+)["\']', html
    267     )
    268     if not m:
    269         m = re.search(
    270             r'<iframe[^>]+src=["\']([^"\']+)["\'][^>]*id=["\']pdf["\']', html
    271         )
    272     if m:
    273         return _fix_url(m.group(1), base)
    274     m = re.search(r'<embed[^>]+src=["\']([^"\']+\.pdf[^"\']*)["\']', html)
    275     if m:
    276         return _fix_url(m.group(1), base)
    277     m = re.search(r"location\.href=['\"]([^'\"]+\.pdf[^'\"]*)['\"]", html)
    278     if m:
    279         return _fix_url(m.group(1), base)
    280     return None
    281 
    282 
    283 def _fix_url(url, base):
    284     if url.startswith("//"):
    285         return "https:" + url
    286     if url.startswith("/"):
    287         return base + url
    288     return url
    289 
    290 
    291 # ── Per-paper worker ─────────────────────────────────────────────────────
    292 
    293 def process_paper(entry, use_scihub=False, core_key=None):
    294     """Try all strategies for one paper. Returns (success, found_arxiv)."""
    295     slug = entry["id"]
    296     doi = clean_doi(entry.get("doi", ""))
    297     pdf_path = PAPERS_DIR / slug / "paper.pdf"
    298     success = False
    299     found_arxiv = False
    300 
    301     # Strategy 0: arXiv DOI
    302     if doi and not success:
    303         arxiv_id = extract_arxiv_from_doi(doi)
    304         if arxiv_id:
    305             log(slug, f"arXiv DOI → {arxiv_id}")
    306             entry["arxiv_id"] = arxiv_id
    307             found_arxiv = True
    308             success = try_arxiv(arxiv_id, pdf_path, slug)
    309 
    310     # Strategy 1: Semantic Scholar
    311     if doi and not success:
    312         log(slug, "trying S2")
    313         pdf_url, s2_arxiv = try_semantic_scholar(doi)
    314         if s2_arxiv and not entry.get("arxiv_id"):
    315             entry["arxiv_id"] = s2_arxiv
    316             found_arxiv = True
    317             log(slug, f"found arXiv:{s2_arxiv}")
    318         if pdf_url:
    319             success = download_pdf(pdf_url, pdf_path, slug)
    320         elif s2_arxiv:
    321             success = try_arxiv(s2_arxiv, pdf_path, slug)
    322 
    323     # Strategy 2: Unpaywall
    324     if doi and not success:
    325         log(slug, "trying Unpaywall")
    326         pdf_url = try_unpaywall(doi)
    327         if pdf_url:
    328             success = download_pdf(pdf_url, pdf_path, slug)
    329 
    330     # Strategy 3: CORE
    331     if doi and not success:
    332         log(slug, "trying CORE")
    333         pdf_url = try_core(doi, core_key)
    334         if pdf_url:
    335             success = download_pdf(pdf_url, pdf_path, slug)
    336 
    337     # Strategy 4: OpenAlex
    338     if doi and not success:
    339         log(slug, "trying OpenAlex")
    340         pdf_url = try_openalex(doi)
    341         if pdf_url:
    342             success = download_pdf(pdf_url, pdf_path, slug)
    343 
    344     # Strategy 5: Sci-Hub (opt-in)
    345     if doi and not success and use_scihub:
    346         log(slug, "trying Sci-Hub")
    347         success = try_scihub(doi, pdf_path, slug)
    348 
    349     if success:
    350         entry["status"] = "downloaded"
    351         entry["directory"] = f"papers/{slug}"
    352         log(slug, "OK")
    353     else:
    354         log(slug, "FAILED — no open access PDF found")
    355 
    356     return success, found_arxiv
    357 
    358 
    359 # ── Main ─────────────────────────────────────────────────────────────────
    360 
    361 def main():
    362     args = sys.argv[1:]
    363     dry_run = "--dry-run" in args
    364     use_scihub = "--scihub" in args
    365     limit = None
    366     specific_id = None
    367     workers = 10
    368     core_key = os.environ.get("CORE_API_KEY")
    369 
    370     for i, arg in enumerate(args):
    371         if arg == "--limit" and i + 1 < len(args):
    372             limit = int(args[i + 1])
    373         if arg == "--id" and i + 1 < len(args):
    374             specific_id = args[i + 1]
    375         if arg == "--core-key" and i + 1 < len(args):
    376             core_key = args[i + 1]
    377         if arg == "--workers" and i + 1 < len(args):
    378             workers = int(args[i + 1])
    379 
    380     entries = load_registry()
    381 
    382     candidates = []
    383     for entry in entries:
    384         if specific_id and entry["id"] != specific_id:
    385             continue
    386         if entry.get("arxiv_id"):
    387             continue  # Handled by download-arxiv.py
    388         if entry["status"] != "queued" and not specific_id:
    389             continue
    390         if not entry.get("doi") and not entry.get("source_url"):
    391             continue
    392         pdf_path = PAPERS_DIR / entry["id"] / "paper.pdf"
    393         if pdf_path.exists():
    394             continue
    395         candidates.append(entry)
    396 
    397     if limit:
    398         candidates = candidates[:limit]
    399 
    400     if not candidates:
    401         print("No papers to download.")
    402         return
    403 
    404     strategies = ["arXiv-DOI", "S2", "Unpaywall", "CORE", "OpenAlex"]
    405     if use_scihub:
    406         strategies.append("Sci-Hub")
    407     print(f"{'Would try' if dry_run else 'Trying'} {len(candidates)} paper(s) "
    408           f"with {workers} workers "
    409           f"[strategies: {', '.join(strategies)}]\n")
    410 
    411     if dry_run:
    412         for i, entry in enumerate(candidates):
    413             print(f"[{i+1}/{len(candidates)}] {entry['id']}  {entry['title'][:70]}")
    414         print(f"\nDry run complete. {len(candidates)} candidates.")
    415         return
    416 
    417     downloaded = 0
    418     found_arxiv = 0
    419     failed = 0
    420     failed_list = []
    421     _counter_lock = threading.Lock()
    422     save_interval = 50  # save registry every N completions
    423     completions = 0
    424 
    425     def on_done(future, entry):
    426         nonlocal downloaded, found_arxiv, failed, completions
    427         try:
    428             success, got_arxiv = future.result()
    429         except Exception as e:
    430             log(entry["id"], f"exception: {e}")
    431             success, got_arxiv = False, False
    432 
    433         with _counter_lock:
    434             if success:
    435                 downloaded += 1
    436             else:
    437                 failed += 1
    438                 failed_list.append(entry)
    439             if got_arxiv:
    440                 found_arxiv += 1
    441             completions += 1
    442             n = completions
    443             with _print_lock:
    444                 print(f"[{n}/{len(candidates)}] "
    445                       f"ok={downloaded} fail={failed}", flush=True)
    446 
    447             # periodic save
    448             if n % save_interval == 0:
    449                 save_registry(entries)
    450                 with _print_lock:
    451                     print(f"  (registry saved at {n})", flush=True)
    452 
    453     with ThreadPoolExecutor(max_workers=workers) as pool:
    454         futures = []
    455         for entry in candidates:
    456             fut = pool.submit(process_paper, entry, use_scihub, core_key)
    457             fut.add_done_callback(lambda f, e=entry: on_done(f, e))
    458             futures.append(fut)
    459 
    460         # wait for all to complete
    461         for fut in futures:
    462             fut.result()
    463 
    464     # final save
    465     save_registry(entries)
    466 
    467     with open(ROOT / "manual-download-needed.txt", "w") as f:
    468         f.write(f"# Papers requiring manual download ({len(failed_list)} total)\n")
    469         f.write("# All automated download strategies exhausted.\n")
    470         f.write("# Download manually → papers/<slug>/paper.pdf\n\n")
    471         for e in failed_list:
    472             url = e.get("source_url") or e.get("doi") or "no URL"
    473             f.write(f"{e['id']}\n")
    474             f.write(f"  {e['title']}\n")
    475             f.write(f"  {e.get('year', '?')} | {e.get('venue', '?')}\n")
    476             f.write(f"  {url}\n\n")
    477 
    478     print(f"\nDone. Downloaded: {downloaded}, Failed: {failed}")
    479     if found_arxiv:
    480         print(f"Found {found_arxiv} new arXiv IDs")
    481     if failed_list:
    482         print(f"Remaining {len(failed_list)} written to manual-download-needed.txt")
    483 
    484 
# Run the downloader only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs