enrich-metadata.py - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

enrich-metadata.py (6020B)
      1 #!/usr/bin/env python3
      2 """
      3 Enrich paper metadata from Semantic Scholar API.
      4 
      5 Queries by DOI or title, writes papers/<slug>/metadata.json with:
      6 author_count, affiliations, venue, citation_count, is_open_access, fields_of_study.
      7 
      8 Usage:
      9     python scripts/enrich-metadata.py                    # All papers with scan.json but no metadata.json
     10     python scripts/enrich-metadata.py --limit 5          # First 5
     11     python scripts/enrich-metadata.py --slug <slug>      # Single paper
     12 
     13 Waits RATE_LIMIT (3) seconds between requests: the unauthenticated S2 API nominally allows 1 request per second but often returns 429 at that rate.
     14 """
     15 
     16 import json
     17 import sys
     18 import time
     19 import urllib.request
     20 import urllib.parse
     21 import urllib.error
     22 from pathlib import Path
     23 
     24 ROOT = Path(__file__).resolve().parent.parent
     25 PAPERS_DIR = ROOT / "papers"
     26 REGISTRY = ROOT / "registry.jsonl"
     27 
     28 S2_API = "https://api.semanticscholar.org/graph/v1"
     29 S2_FIELDS = "title,authors,venue,year,citationCount,isOpenAccess,fieldsOfStudy,externalIds"
     30 
     31 # Rate limit: S2 free tier is 1 req/sec but often 429s at that rate
     32 RATE_LIMIT = 3.0
     33 MAX_RETRIES = 3
     34 
     35 
     36 def load_registry():
     37     """Load registry as dict keyed by slug."""
     38     registry = {}
     39     with open(REGISTRY) as f:
     40         for line in f:
     41             line = line.strip()
     42             if not line:
     43                 continue
     44             entry = json.loads(line)
     45             registry[entry["id"]] = entry
     46     return registry
     47 
     48 
     49 def query_s2_by_doi(doi):
     50     """Query Semantic Scholar by DOI."""
     51     url = f"{S2_API}/paper/DOI:{urllib.parse.quote(doi, safe='')}?fields={S2_FIELDS}"
     52     return _fetch_json(url)
     53 
     54 
     55 def query_s2_by_title(title):
     56     """Query Semantic Scholar by title search."""
     57     params = urllib.parse.urlencode({"query": title, "limit": 1, "fields": S2_FIELDS})
     58     url = f"{S2_API}/paper/search?{params}"
     59     result = _fetch_json(url)
     60     if result and result.get("data") and len(result["data"]) > 0:
     61         return result["data"][0]
     62     return None
     63 
     64 
     65 def _fetch_json(url):
     66     """Fetch JSON from URL with retry on 429."""
     67     for attempt in range(MAX_RETRIES):
     68         try:
     69             req = urllib.request.Request(url, headers={"User-Agent": "ai-research-survey/1.0"})
     70             with urllib.request.urlopen(req, timeout=15) as resp:
     71                 return json.loads(resp.read())
     72         except urllib.error.HTTPError as e:
     73             if e.code == 429 and attempt < MAX_RETRIES - 1:
     74                 wait = (attempt + 1) * 5
     75                 print(f"429, waiting {wait}s...", end=" ", flush=True)
     76                 time.sleep(wait)
     77                 continue
     78             print(f"  API error: {e}", file=sys.stderr)
     79             return None
     80         except (urllib.error.URLError, json.JSONDecodeError, TimeoutError) as e:
     81             print(f"  API error: {e}", file=sys.stderr)
     82             return None
     83 
     84 
     85 def extract_metadata(s2_data):
     86     """Extract structured metadata from S2 API response."""
     87     if not s2_data:
     88         return None
     89 
     90     authors = s2_data.get("authors", [])
     91     affiliations = []
     92     for a in authors:
     93         if a.get("affiliations"):
     94             affiliations.extend(a["affiliations"])
     95 
     96     return {
     97         "author_count": len(authors),
     98         "affiliations": list(set(affiliations)) if affiliations else [],
     99         "venue": s2_data.get("venue", ""),
    100         "citation_count": s2_data.get("citationCount", 0),
    101         "is_open_access": s2_data.get("isOpenAccess", False),
    102         "fields_of_study": s2_data.get("fieldsOfStudy") or [],
    103         "s2_paper_id": s2_data.get("paperId", ""),
    104     }
    105 
    106 
    107 def main():
    108     args = sys.argv[1:]
    109     limit = None
    110     target_slug = None
    111     v2_only = False
    112 
    113     i = 0
    114     while i < len(args):
    115         if args[i] == "--limit" and i + 1 < len(args):
    116             limit = int(args[i + 1])
    117             i += 2
    118         elif args[i] == "--slug" and i + 1 < len(args):
    119             target_slug = args[i + 1]
    120             i += 2
    121         elif args[i] == "--v2-only":
    122             v2_only = True
    123             i += 1
    124         else:
    125             i += 1
    126 
    127     registry = load_registry()
    128 
    129     # Find papers to enrich
    130     if target_slug:
    131         slugs = [target_slug]
    132     else:
    133         slugs = []
    134         for scan_path in sorted(PAPERS_DIR.glob("*/scan.json")):
    135             slug = scan_path.parent.name
    136             meta_path = scan_path.parent / "metadata.json"
    137             if meta_path.exists():
    138                 continue
    139             if v2_only:
    140                 scan_data = json.loads(scan_path.read_text())
    141                 if scan_data.get("scan_version") != 2:
    142                     continue
    143             slugs.append(slug)
    144             if limit and len(slugs) >= limit:
    145                 break
    146 
    147     print(f"Enriching {len(slugs)} papers...")
    148     success = 0
    149     failed = 0
    150 
    151     for slug in slugs:
    152         entry = registry.get(slug, {})
    153         doi = entry.get("doi", "")
    154         title = entry.get("title", "")
    155 
    156         # Also try reading title from scan.json
    157         scan_path = PAPERS_DIR / slug / "scan.json"
    158         if scan_path.exists() and not title:
    159             try:
    160                 scan_data = json.loads(scan_path.read_text())
    161                 title = scan_data.get("paper", {}).get("title", "")
    162                 if not doi:
    163                     doi = scan_data.get("paper", {}).get("doi", "")
    164             except (json.JSONDecodeError, KeyError):
    165                 pass
    166 
    167         print(f"  {slug}...", end=" ", flush=True)
    168 
    169         s2_data = None
    170         if doi:
    171             s2_data = query_s2_by_doi(doi)
    172             time.sleep(RATE_LIMIT)
    173 
    174         if not s2_data and title:
    175             s2_data = query_s2_by_title(title)
    176             time.sleep(RATE_LIMIT)
    177 
    178         metadata = extract_metadata(s2_data)
    179         if metadata:
    180             meta_path = PAPERS_DIR / slug / "metadata.json"
    181             meta_path.write_text(json.dumps(metadata, indent=2, ensure_ascii=False) + "\n")
    182             print(f"OK (citations: {metadata['citation_count']})")
    183             success += 1
    184         else:
    185             print("not found")
    186             failed += 1
    187 
    188     print(f"\nDone: {success} enriched, {failed} not found")
    189 
    190 
    191 if __name__ == "__main__":
    192     main()
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs