enrich-metadata.py (6020B)
#!/usr/bin/env python3
"""
Enrich paper metadata from Semantic Scholar API.

Queries by DOI or title, writes papers/<slug>/metadata.json with:
 author_count, affiliations, venue, citation_count, is_open_access,
 fields_of_study, s2_paper_id.

Usage:
 python scripts/enrich-metadata.py                # All papers with scan.json but no metadata.json
 python scripts/enrich-metadata.py --limit 5      # First 5
 python scripts/enrich-metadata.py --slug <slug>  # Single paper
 python scripts/enrich-metadata.py --v2-only      # Only papers whose scan.json has scan_version == 2

Sleeps RATE_LIMIT seconds after every request and backs off on HTTP 429.
"""

import argparse
import http.client
import json
import sys
import time
import urllib.error
import urllib.parse
import urllib.request
from pathlib import Path

ROOT = Path(__file__).resolve().parent.parent
PAPERS_DIR = ROOT / "papers"
REGISTRY = ROOT / "registry.jsonl"

S2_API = "https://api.semanticscholar.org/graph/v1"
# NOTE(review): "authors" on its own appears to return only authorId/name, in
# which case extract_metadata() never sees affiliations. The API probably needs
# "authors.affiliations" here -- confirm against the S2 docs for BOTH the
# /paper/{id} and /paper/search endpoints before changing the field list.
S2_FIELDS = "title,authors,venue,year,citationCount,isOpenAccess,fieldsOfStudy,externalIds"

# Seconds to sleep after each request. The S2 free tier is nominally
# 1 req/sec but often answers 429 at that rate, hence the larger pause.
RATE_LIMIT = 3.0
# Maximum attempts per URL; only HTTP 429 responses are retried.
MAX_RETRIES = 3


def load_registry():
    """Load registry.jsonl as a dict keyed by each entry's "id" (the slug).

    Blank lines are skipped. Later entries with the same id overwrite
    earlier ones.

    Raises:
        OSError: if the registry file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if an entry has no "id" field.
    """
    registry = {}
    # Explicit encoding: the default is locale-dependent and breaks on
    # non-ASCII titles on some platforms.
    with open(REGISTRY, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            entry = json.loads(line)
            registry[entry["id"]] = entry
    return registry


def query_s2_by_doi(doi):
    """Query Semantic Scholar by DOI.

    Returns the decoded paper record, or None if the request failed.
    """
    # safe='' so the "/" inside a DOI is percent-encoded as well.
    quoted = urllib.parse.quote(doi, safe="")
    return _fetch_json(f"{S2_API}/paper/DOI:{quoted}?fields={S2_FIELDS}")


def query_s2_by_title(title):
    """Query Semantic Scholar by title search.

    Returns the first search hit, or None when the request failed or the
    search returned no results.
    """
    params = urllib.parse.urlencode({"query": title, "limit": 1, "fields": S2_FIELDS})
    result = _fetch_json(f"{S2_API}/paper/search?{params}")
    if not isinstance(result, dict):
        return None
    hits = result.get("data")
    return hits[0] if hits else None


def _fetch_json(url):
    """Fetch JSON from URL with retry on 429.

    HTTP 429 is retried up to MAX_RETRIES attempts with a linear back-off
    (5s, 10s, ...). Any other failure is reported on stderr and yields None,
    so one bad paper never aborts the whole run.
    """
    req = urllib.request.Request(url, headers={"User-Agent": "ai-research-survey/1.0"})
    for attempt in range(MAX_RETRIES):
        try:
            with urllib.request.urlopen(req, timeout=15) as resp:
                return json.loads(resp.read())
        except urllib.error.HTTPError as e:
            if e.code == 429 and attempt < MAX_RETRIES - 1:
                wait = (attempt + 1) * 5
                print(f"429, waiting {wait}s...", end=" ", flush=True)
                time.sleep(wait)
                continue
            print(f" API error: {e}", file=sys.stderr)
            return None
        except (OSError, http.client.HTTPException, ValueError) as e:
            # OSError covers URLError, socket timeouts during the body read
            # and connection resets; HTTPException covers truncated
            # responses; ValueError covers JSON and unicode decode errors.
            print(f" API error: {e}", file=sys.stderr)
            return None
    return None


def extract_metadata(s2_data):
    """Extract structured metadata from S2 API response.

    Args:
        s2_data: decoded paper record, or None / empty when lookup failed.

    Returns:
        A JSON-serialisable dict, or None when s2_data is falsy. Fields
        that the API reports as null are normalised to an empty or zero
        value of the expected type.
    """
    if not s2_data:
        return None

    # `or` rather than a .get() default: the API may send an explicit null,
    # which .get(key, default) would pass straight through.
    authors = s2_data.get("authors") or []
    affiliations = []
    for author in authors:
        if author and author.get("affiliations"):
            affiliations.extend(author["affiliations"])

    return {
        "author_count": len(authors),
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # written file is identical from run to run (a set is not ordered).
        "affiliations": list(dict.fromkeys(affiliations)),
        "venue": s2_data.get("venue") or "",
        "citation_count": s2_data.get("citationCount") or 0,
        "is_open_access": bool(s2_data.get("isOpenAccess")),
        "fields_of_study": s2_data.get("fieldsOfStudy") or [],
        "s2_paper_id": s2_data.get("paperId") or "",
    }


def _parse_args(argv):
    """Parse command-line options; unknown arguments are reported and ignored."""
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--limit", type=int, default=None,
                        help="enrich at most this many papers")
    parser.add_argument("--slug", default=None,
                        help="enrich only this paper")
    parser.add_argument("--v2-only", dest="v2_only", action="store_true",
                        help="only papers whose scan.json has scan_version == 2")
    # parse_known_args keeps the script tolerant of stray arguments.
    options, unknown = parser.parse_known_args(argv)
    if unknown:
        print(f"Ignoring unknown arguments: {' '.join(unknown)}", file=sys.stderr)
    return options


def _load_scan(scan_path):
    """Return the decoded scan.json as a dict, or {} if missing or unreadable."""
    if not scan_path.exists():
        return {}
    try:
        scan_data = json.loads(scan_path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as e:
        print(f" cannot read {scan_path}: {e}", file=sys.stderr)
        return {}
    return scan_data if isinstance(scan_data, dict) else {}


def _find_pending_slugs(limit, v2_only):
    """List slugs that have scan.json but no metadata.json, in sorted order."""
    slugs = []
    for scan_path in sorted(PAPERS_DIR.glob("*/scan.json")):
        if (scan_path.parent / "metadata.json").exists():
            continue
        if v2_only and _load_scan(scan_path).get("scan_version") != 2:
            continue
        slugs.append(scan_path.parent.name)
        # A limit of 0 (or no limit) means "no cap".
        if limit and len(slugs) >= limit:
            break
    return slugs


def _lookup_identifiers(slug, registry):
    """Return (doi, title) for a slug, filling gaps from its scan.json."""
    entry = registry.get(slug) or {}
    doi = entry.get("doi") or ""
    title = entry.get("title") or ""

    # Consult scan.json whenever EITHER value is missing, so a DOI recorded
    # only in the scan is still used when the registry supplies the title.
    if not (doi and title):
        paper = _load_scan(PAPERS_DIR / slug / "scan.json").get("paper")
        if isinstance(paper, dict):
            title = title or paper.get("title") or ""
            doi = doi or paper.get("doi") or ""
    return doi, title


def main():
    """Enrich the selected papers and print a per-paper and final summary."""
    options = _parse_args(sys.argv[1:])
    registry = load_registry()

    # Find papers to enrich
    if options.slug:
        slugs = [options.slug]
    else:
        slugs = _find_pending_slugs(options.limit, options.v2_only)

    print(f"Enriching {len(slugs)} papers...")
    success = 0
    failed = 0

    for slug in slugs:
        doi, title = _lookup_identifiers(slug, registry)

        print(f" {slug}...", end=" ", flush=True)

        s2_data = None
        if doi:
            s2_data = query_s2_by_doi(doi)
            time.sleep(RATE_LIMIT)

        # Fall back to a title search when there is no DOI or it missed.
        if not s2_data and title:
            s2_data = query_s2_by_title(title)
            time.sleep(RATE_LIMIT)

        metadata = extract_metadata(s2_data)
        if metadata:
            meta_path = PAPERS_DIR / slug / "metadata.json"
            # --slug may name a paper whose directory does not exist yet.
            meta_path.parent.mkdir(parents=True, exist_ok=True)
            meta_path.write_text(
                json.dumps(metadata, indent=2, ensure_ascii=False) + "\n",
                encoding="utf-8",
            )
            print(f"OK (citations: {metadata['citation_count']})")
            success += 1
        else:
            print("not found")
            failed += 1

    print(f"\nDone: {success} enriched, {failed} not found")


if __name__ == "__main__":
    main()