run-scan-v4-haiku.py (20773B)
#!/usr/bin/env python3
"""
V4 Haiku scan: fast coverage pass for all papers.

For each paper with paper.txt and paper_type.json:
1. Read paper text + paper_type
2. Run Haiku to answer shared core + type-specific questions
3. If existing v2/v3 Opus scan exists, merge: Opus answers override Haiku
4. Write scan-v4.json (separate from scan.json to preserve v2/v3 data)

Usage:
    python3 scripts/run-scan-v4-haiku.py                   # All unscanned
    python3 scripts/run-scan-v4-haiku.py --limit 10        # First N
    python3 scripts/run-scan-v4-haiku.py --parallel 8      # Concurrent (Haiku is fast)
    python3 scripts/run-scan-v4-haiku.py --id metr-rct-2025  # Specific paper
    python3 scripts/run-scan-v4-haiku.py --force           # Re-scan all
"""

import functools
import json
import subprocess
import sys
import urllib.parse
import urllib.request
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path

ROOT = Path(__file__).resolve().parent.parent
PAPERS_DIR = ROOT / "papers"
SCHEMA_PATH = ROOT / "schema" / "scan-v4.schema.json"
_V2_SCHEMA_PATH = ROOT / "schema" / "scan.schema.json"


# ── Schema loading ────────────────────────────────────────────────────

@functools.lru_cache(maxsize=None)
def _load_json(path):
    """Parse the JSON file at *path* once and memoize the result."""
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def _schema():
    """Return the v4 scan schema (source of the question descriptions)."""
    return _load_json(SCHEMA_PATH)


# ── Build prompt from schema ──────────────────────────────────────────

def build_questions_text(category_obj):
    """Render the questions of one schema category as a markdown bullet list.

    Each entry of ``category_obj["properties"]`` becomes one line of the form
    ``- **name**: description``. A question without a usable description
    falls back to its own name so the line is never left blank.
    """
    props = category_obj.get("properties", {})
    return "\n".join(
        f" - **{qname}**: {qdef.get('description') or qname}"
        for qname, qdef in props.items()
    )


def _render_categories(categories):
    """Render a mapping of category name -> schema object as prompt sections."""
    return "".join(
        f"\n### {cat_name}\n{cat_def.get('description', cat_name)}\n"
        f"{build_questions_text(cat_def)}\n"
        for cat_name, cat_def in categories.items()
    )


def build_prompt(paper_type, paper_text, paper_id, registry_entry, hn_data):
    """Build the v4 Haiku scan prompt.

    Args:
        paper_type: Key into the schema's ``type_checklist`` modules.
        paper_text: Full text of the paper, appended verbatim to the prompt.
        paper_id: Unused; kept so existing callers keep working.
        registry_entry: Registry record for the paper (may be empty/None).
        hn_data: Hacker News summary embedded in the output template.

    Returns:
        The complete prompt string.
    """
    schema_props = _schema()["properties"]
    core_cats = schema_props["checklist"]["properties"]
    type_mod = schema_props["type_checklist"]["properties"].get(paper_type, {})
    type_cats = type_mod.get("properties", {})

    core_section = _render_categories(core_cats)
    type_section = _render_categories(type_cats)

    reg_json = json.dumps(registry_entry, indent=2, ensure_ascii=False) if registry_entry else "{}"

    return f"""You are scanning a research paper for methodological quality. This is a {paper_type} paper.

Answer every question with a JSON object containing:
- "applies": true/false (is this criterion relevant to this paper?)
- "answer": true/false (does the paper satisfy it? false when applies=false)
- "justification": "1-2 sentences citing specific evidence"
- "source": "haiku"

Be strict. Absence of evidence = answer: false. Do not be generous.

## Registry Entry
```json
{reg_json}
```

## Shared Core Questions (answer ALL of these)
{core_section}

## {paper_type.title()} Module Questions (answer ALL of these)
{type_section}

## Additional Required Fields

### Claims
Extract 3-8 key empirical claims. For each: {{"claim": "...", "evidence": "...", "supported": "strong|moderate|weak|unsupported"}}

### Key Findings
2-4 sentence summary of the paper's most important findings.

### Red Flags
List methodological concerns: {{"flag": "short label", "detail": "explanation"}}

### Methodology Tags
Assign from: rct, observational, benchmark-eval, case-study, meta-analysis, theoretical, qualitative

### Cited Papers
Extract 3-10 survey-relevant references: {{"title": "...", "relevance": "..."}}

### Engagement Factors
Rate 0-3 on each dimension:
- practical_relevance: Can practitioners use this?
- surprise_contrarian: Challenges conventional wisdom?
- fear_safety: Raises AI risk concerns?
- drama_conflict: Controversy angle?
- demo_ability: Can someone try it now?
- brand_recognition: Famous lab or product?
Each: {{"score": 0-3, "justification": "1 sentence"}}

## Output

Respond with a single JSON object:
{{
  "scan_version": 4,
  "paper_type": "{paper_type}",
  "paper": {{"title": "...", "authors": [...], "year": ..., "venue": "...", "arxiv_id": "...", "doi": "..."}},
  "checklist": {{<shared core categories with questions>}},
  "type_checklist": {{"{paper_type}": {{<type-specific categories with questions>}}}},
  "claims": [...],
  "methodology_tags": [...],
  "key_findings": "...",
  "red_flags": [...],
  "cited_papers": [...],
  "engagement_factors": {{...}},
  "hn_data": {json.dumps(hn_data)}
}}

## Paper Text
{paper_text}
"""


# ── V2/V3 → V4 Merge Logic ───────────────────────────────────────────

# Shared-core questions keep their category and name between v2/v3 and v4;
# only the enclosing section ("checklist") is made explicit.
_CORE_QUESTIONS = {
    "claims_and_evidence": (
        "abstract_claims_supported",
        "causal_claims_justified",
        "generalization_bounded",
        "alternative_explanations_discussed",
        "proxy_outcome_distinction",
    ),
    "limitations_and_scope": (
        "limitations_section_present",
        "threats_to_validity_specific",
        "scope_boundaries_stated",
    ),
    "conflicts_of_interest": (
        "funding_disclosed",
        "affiliations_disclosed",
        "funder_independent_of_outcome",
        "financial_interests_declared",
    ),
}

# Map v2/v3 "category.question" → v4 location (section, category, question)
V2_TO_V4_CORE = {
    f"{cat}.{qname}": ("checklist", cat, qname)
    for cat, qnames in _CORE_QUESTIONS.items()
    for qname in qnames
}

# v2/v3 categories that move under v4 type_checklist.empirical.
_V2_EMPIRICAL_CATEGORIES = (
    "artifacts",
    "statistical_methodology",
    "evaluation_design",
    "setup_transparency",
    "data_integrity",
    "contamination",
    "human_studies",
    "cost_and_practicality",
    # conditional modules
    "experimental_rigor",
    "data_leakage",
    "survey_methodology",
)


@functools.lru_cache(maxsize=None)
def _empirical_map():
    """Map v2/v3 "category.question" → ("type_checklist", "empirical", cat, q).

    Question names come from the v2 schema file, which is read once instead
    of once per category.
    """
    v2_categories = _load_json(_V2_SCHEMA_PATH)["properties"]["checklist"]["properties"]
    return {
        f"{cat}.{qname}": ("type_checklist", "empirical", cat, qname)
        for cat in _V2_EMPIRICAL_CATEGORIES
        for qname in v2_categories.get(cat, {}).get("properties", {})
    }


def __getattr__(name):
    """Resolve the former eager constants lazily (PEP 562).

    ``SCHEMA`` and ``V2_TO_V4_EMPIRICAL`` used to be built at import time by
    reading the schema files; they stay available as module attributes but
    the files are only touched on first access.
    """
    if name == "SCHEMA":
        return _schema()
    if name == "V2_TO_V4_EMPIRICAL":
        return _empirical_map()
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


def _child_dict(container, key):
    """Return ``container[key]`` as a dict, installing a fresh one if needed.

    Model output is not schema-validated, so a section may be missing, null
    or of the wrong type; anything that is not a dict is replaced.
    """
    child = container.get(key)
    if not isinstance(child, dict):
        child = {}
        container[key] = child
    return child


def _is_complete_answer(answer):
    """True when *answer* is a non-empty dict carrying both verdict fields."""
    return (
        isinstance(answer, dict)
        and bool(answer)
        and "applies" in answer
        and "answer" in answer
    )


def merge_opus_answers(v4_scan, v2_scan, paper_type):
    """Overlay Opus v2/v3 answers onto a Haiku v4 scan.

    Core questions are always merged; the empirical module is merged only
    when ``paper_type == "empirical"``. Opus answers that lack ``applies`` or
    ``answer`` are skipped, leaving the Haiku answer in place. For v3+ scans
    the Opus ``engagement_factors`` replace the Haiku ones.

    Args:
        v4_scan: Parsed Haiku scan; modified in place.
        v2_scan: Parsed v2/v3 Opus scan.
        paper_type: Paper type of the v4 scan.

    Returns:
        ``(v4_scan, stats)`` where ``stats`` has the keys ``agreements``,
        ``disagreements`` (Haiku vs. Opus on ``applies``/``answer``, counted
        before overriding) and ``opus_overrides``.
    """
    stats = {"agreements": 0, "disagreements": 0, "opus_overrides": 0}

    v2_checklist = v2_scan.get("checklist")
    if not isinstance(v2_checklist, dict):
        v2_checklist = {}

    targets = list(V2_TO_V4_CORE.items())
    if paper_type == "empirical":
        targets.extend(_empirical_map().items())

    for v2_key, (*parents, qname) in targets:
        v2_cat, v2_qname = v2_key.split(".", 1)
        v2_cat_data = v2_checklist.get(v2_cat)
        v2_answer = v2_cat_data.get(v2_qname) if isinstance(v2_cat_data, dict) else None
        if not _is_complete_answer(v2_answer):
            continue

        # Walk (and create where necessary) the path to the v4 category.
        category = v4_scan
        for key in parents:
            category = _child_dict(category, key)

        haiku_answer = category.get(qname)
        if haiku_answer and isinstance(haiku_answer, dict):
            same = (
                haiku_answer.get("applies") == v2_answer["applies"]
                and haiku_answer.get("answer") == v2_answer["answer"]
            )
            stats["agreements" if same else "disagreements"] += 1

        # Override with Opus
        category[qname] = {
            "applies": v2_answer["applies"],
            "answer": v2_answer["answer"],
            "justification": v2_answer.get("justification", ""),
            "source": "opus",
        }
        stats["opus_overrides"] += 1

    # Also merge engagement factors from v3 if Opus-generated
    v2_ef = v2_scan.get("engagement_factors")
    if v2_ef and v2_scan.get("scan_version", 1) >= 3:
        v4_scan["engagement_factors"] = v2_ef

    return v4_scan, stats


# ── HN Fetch ──────────────────────────────────────────────────────────

def _empty_hn():
    """Return a fresh "no Hacker News activity" record."""
    return {"threads": [], "top_points": 0, "total_points": 0, "total_comments": 0}


def fetch_hn(paper_id, arxiv_id=""):
    """Fetch HN data. Returns dict compatible with hn_data schema.

    A cached ``papers/<paper_id>/hn.json`` wins; otherwise the Algolia HN
    search API is queried for *arxiv_id*. The lookup is best-effort: without
    an arXiv id, or on any network/parse failure, an empty record is
    returned (failures are reported on stderr).
    """
    hn_path = PAPERS_DIR / paper_id / "hn.json"
    if hn_path.exists():
        with open(hn_path, encoding="utf-8") as f:
            return json.load(f)

    if not arxiv_id:
        return _empty_hn()

    params = urllib.parse.urlencode({"query": arxiv_id, "tags": "story", "hitsPerPage": 10})
    req = urllib.request.Request(
        f"https://hn.algolia.com/api/v1/search?{params}",
        headers={"User-Agent": "research-survey/1.0"},
    )
    try:
        # The context manager closes the HTTP response even if parsing fails.
        with urllib.request.urlopen(req, timeout=10) as resp:
            data = json.loads(resp.read())
        threads = [
            {
                "hn_id": h.get("objectID", ""),
                "title": h.get("title", ""),
                "points": h.get("points", 0) or 0,
                "comments": h.get("num_comments", 0) or 0,
                "url": f"https://news.ycombinator.com/item?id={h.get('objectID', '')}",
            }
            for h in data.get("hits", [])
        ]
        threads.sort(key=lambda t: -t["points"])
        return {
            "threads": threads,
            "top_points": threads[0]["points"] if threads else 0,
            "total_points": sum(t["points"] for t in threads),
            "total_comments": sum(t["comments"] for t in threads),
        }
    except Exception as exc:
        # Deliberately broad: HN data is optional and must never fail a scan.
        print(f" WARN: HN lookup failed for {paper_id}: {exc}", file=sys.stderr)
        return _empty_hn()


# ── Scan One Paper ────────────────────────────────────────────────────

def load_registry():
    """Load ``registry.jsonl`` into a dict keyed by each entry's ``id``."""
    entries = {}
    with open(ROOT / "registry.jsonl", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                entry = json.loads(line)
                entries[entry["id"]] = entry
    return entries


def _run_claude(prompt, model):
    """Run the ``claude`` CLI once with *prompt* on stdin."""
    return subprocess.run(
        ["claude", "-p", "-", "--model", model, "--max-turns", "1"],
        input=prompt,
        capture_output=True, text=True, timeout=600,
        cwd=str(ROOT),
    )


def _iter_answers(v4_scan):
    """Yield every answer dict (one with an ``applies`` key) in the scan.

    Covers ``checklist`` (category -> question) and ``type_checklist``
    (paper type -> category -> question); malformed levels are skipped.
    """
    groups = [v4_scan.get("checklist")]
    type_checklist = v4_scan.get("type_checklist")
    if isinstance(type_checklist, dict):
        groups.extend(type_checklist.values())
    for group in groups:
        if not isinstance(group, dict):
            continue
        for category in group.values():
            if not isinstance(category, dict):
                continue
            for answer in category.values():
                if isinstance(answer, dict) and "applies" in answer:
                    yield answer


def scan_one(paper_id, registry, force=False):
    """Run v4 Haiku scan on one paper. Returns (paper_id, ok, reason, stats).

    Never raises for per-paper problems (bad input files, CLI failure,
    unparsable output): those are reported as ``ok=False`` with a reason so
    one bad paper cannot abort a batch. ``stats`` is the merge statistics
    dict from ``merge_opus_answers`` or ``{}``.
    """
    paper_dir = PAPERS_DIR / paper_id
    v4_path = paper_dir / "scan-v4.json"
    if v4_path.exists() and not force:
        return paper_id, True, "already scanned", {}

    txt_path = paper_dir / "paper.txt"
    type_path = paper_dir / "paper_type.json"

    if not txt_path.exists():
        return paper_id, False, "no paper.txt", {}
    if not type_path.exists():
        return paper_id, False, "no paper_type.json", {}

    try:
        with open(type_path, encoding="utf-8") as f:
            paper_type = json.load(f).get("paper_type")
        if not paper_type:
            return paper_id, False, "invalid paper_type", {}

        paper_text = txt_path.read_text(encoding="utf-8").replace("\x00", "")
        reg_entry = registry.get(paper_id, {})
        arxiv_id = reg_entry.get("arxiv_id", "")

        # Fetch HN data
        hn_data = fetch_hn(paper_id, arxiv_id)

        # Build and run prompt
        prompt = build_prompt(paper_type, paper_text, paper_id, reg_entry, hn_data)

        # Pick model: haiku for most papers (with a sonnet retry on failure),
        # sonnet straight away for large ones.
        models = ["sonnet"] if len(paper_text) > 50000 else ["haiku", "sonnet"]
        for model in models:
            result = _run_claude(prompt, model)
            if result.returncode == 0:
                break
        else:
            suffix = " (sonnet fallback)" if len(models) > 1 else ""
            return paper_id, False, f"claude exit {result.returncode}{suffix}", {}

        output = result.stdout.strip()
        json_start = output.find("{")
        json_end = output.rfind("}") + 1
        if json_start == -1 or json_end == 0:
            return paper_id, False, "no JSON in output", {}

        v4_scan = json.loads(output[json_start:json_end])

        # Ensure required fields
        v4_scan["scan_version"] = 4
        v4_scan["paper_type"] = paper_type
        v4_scan["hn_data"] = hn_data

        # Stamp every answer with the model that actually produced it. The
        # prompt asks for "source": "haiku" regardless of the model used, so
        # the model-reported value cannot be trusted and is overwritten.
        scan_model = model  # haiku or sonnet
        for answer in _iter_answers(v4_scan):
            answer["source"] = scan_model

        # Merge Opus answers if v2/v3 scan exists
        merge_stats = {}
        v2_path = paper_dir / "scan.json"
        if v2_path.exists():
            with open(v2_path, encoding="utf-8") as f:
                v2_scan = json.load(f)
            if v2_scan.get("scan_version", 1) >= 2:
                v4_scan, merge_stats = merge_opus_answers(v4_scan, v2_scan, paper_type)

        # Write v4 scan via a temp file so an interrupted run never leaves a
        # truncated scan-v4.json that later runs would skip as "already scanned".
        tmp_path = v4_path.with_name(v4_path.name + ".tmp")
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(v4_scan, f, ensure_ascii=False, indent=2)
        tmp_path.replace(v4_path)

        opus_n = merge_stats.get("opus_overrides", 0)
        agree = merge_stats.get("agreements", 0)
        disagree = merge_stats.get("disagreements", 0)
        if opus_n == 0:
            coverage = f"{scan_model}-only"
        else:
            coverage = f"merged({scan_model}+opus={opus_n},agree={agree},disagree={disagree})"
        return paper_id, True, coverage, merge_stats

    except json.JSONDecodeError as e:
        return paper_id, False, f"JSON error: {e}", {}
    except subprocess.TimeoutExpired:
        return paper_id, False, "timeout", {}
    except Exception as e:
        return paper_id, False, f"error: {e}", {}


# ── Main ──────────────────────────────────────────────────────────────

def _option_value(args, flag):
    """Return the token following the last *flag* in *args*, or None."""
    value = None
    for i, arg in enumerate(args[:-1]):
        if arg == flag:
            value = args[i + 1]
    return value


def main():
    """CLI entry point: select candidate papers, scan them, print a summary."""
    args = sys.argv[1:]
    force = "--force" in args
    specific_id = _option_value(args, "--id")

    raw_limit = _option_value(args, "--limit")
    limit = int(raw_limit) if raw_limit is not None else None
    raw_parallel = _option_value(args, "--parallel")
    parallel = int(raw_parallel) if raw_parallel is not None else 1

    # Fail fast on missing/invalid schema files before any paper is touched
    # (and before worker threads start).
    _schema()
    _empirical_map()

    registry = load_registry()

    # Collect candidates: papers with paper.txt + paper_type.json
    candidates = []
    for type_path in sorted(PAPERS_DIR.glob("*/paper_type.json")):
        paper_dir = type_path.parent
        pid = paper_dir.name
        if specific_id and pid != specific_id:
            continue
        if (paper_dir / "scan-v4.json").exists() and not force and not specific_id:
            continue
        if not (paper_dir / "paper.txt").exists():
            continue
        candidates.append(pid)

    # `is not None` so that an explicit `--limit 0` is honoured.
    if limit is not None:
        candidates = candidates[:max(limit, 0)]

    if not candidates:
        print("No papers to scan.")
        return

    print(f"V4 Haiku scan: {len(candidates)} papers"
          f"{f' (parallel={parallel})' if parallel > 1 else ''}:\n")

    totals = {"ok": 0, "failed": 0, "agreements": 0, "disagreements": 0}

    def record(pid, ok, reason, stats, show_merged):
        """Fold one scan result into the running totals and report it."""
        if not ok:
            totals["failed"] += 1
            print(f" FAIL: {pid} — {reason}")
            return
        totals["ok"] += 1
        totals["agreements"] += stats.get("agreements", 0)
        totals["disagreements"] += stats.get("disagreements", 0)
        if show_merged and "merged" in reason:
            print(f" OK: {pid} — {reason}")

    if parallel > 1:
        with ThreadPoolExecutor(max_workers=parallel) as executor:
            futures = [executor.submit(scan_one, pid, registry, force) for pid in candidates]
            for future in as_completed(futures):
                record(*future.result(), show_merged=True)
    else:
        for i, pid in enumerate(candidates):
            if (i + 1) % 20 == 0:
                print(f" ... {i+1}/{len(candidates)}")
            record(*scan_one(pid, registry, force), show_merged=False)

    print(f"\nDone. OK: {totals['ok']}, Failed: {totals['failed']}")
    compared = totals["agreements"] + totals["disagreements"]
    if compared > 0:
        rate = totals["agreements"] / compared * 100
        print(f"Haiku-Opus agreement: {totals['agreements']}/{compared} ({rate:.1f}%)")


if __name__ == "__main__":
    main()