run-scan-v5-haiku.py (13974B)
#!/usr/bin/env python3
"""
V5 Haiku scan: fast coverage pass for all papers. PURE Haiku output — no merge.

For each paper with paper.txt and paper_type.json:
  1. Read paper text + paper_type
  2. Run Haiku to answer shared core + type-specific questions
  3. Write scan-v5.json with raw Haiku/Sonnet answers (no Opus merge).

The build pipeline handles Opus/Haiku merging at read time. This keeps v5
files pure for calibration analysis (Haiku vs Opus per question).

Usage:
  python3 scripts/run-scan-v5-haiku.py                    # All unscanned
  python3 scripts/run-scan-v5-haiku.py --limit 10         # First N
  python3 scripts/run-scan-v5-haiku.py --parallel 8       # Concurrent
  python3 scripts/run-scan-v5-haiku.py --id metr-rct-2025 # Specific paper
  python3 scripts/run-scan-v5-haiku.py --force            # Re-scan all
"""

import argparse
import functools
import json
import subprocess
import sys
import urllib.parse
import urllib.request
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path

ROOT = Path(__file__).resolve().parent.parent
PAPERS_DIR = ROOT / "papers"
SCHEMA_PATH = ROOT / "schema" / "scan-v4.schema.json"

# Papers whose text is longer than this many characters go to Sonnet directly.
SONNET_THRESHOLD_CHARS = 50000

# Timeout, in seconds, for a single invocation of the claude CLI.
CLAUDE_TIMEOUT_SECONDS = 600


# ── Schema (question descriptions) ────────────────────────────────────

@functools.lru_cache(maxsize=1)
def _load_schema():
    """Read and cache the scan schema stored at SCHEMA_PATH.

    Loading is deferred until first use so that importing this module does
    not require the schema file to exist; main() calls this up front so a
    missing schema still aborts the run before any paper is scanned.
    """
    with open(SCHEMA_PATH, encoding="utf-8") as f:
        return json.load(f)


def __getattr__(name):
    """Serve the legacy module attribute ``SCHEMA`` lazily (PEP 562)."""
    if name == "SCHEMA":
        return _load_schema()
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


# ── Build prompt from schema ──────────────────────────────────────────

def build_questions_text(category_obj):
    """Render the questions of one schema category as markdown bullets.

    Args:
        category_obj: Schema object for a category; its "properties" mapping
            (question name -> question definition) supplies the questions.

    Returns:
        One " - **name**: description" line per question, joined with
        newlines. Empty string when the category has no "properties".
    """
    lines = []
    for qname, qdef in category_obj.get("properties", {}).items():
        desc = qdef.get("description", "")
        if not desc and "$ref" in qdef:
            # Falls back to the question name only when the "description"
            # key is absent altogether; an explicit empty string is kept.
            desc = qdef.get("description", qname)
        lines.append(f" - **{qname}**: {desc}")
    return "\n".join(lines)


def _render_categories(categories):
    """Render a mapping of category name -> schema object as prompt sections."""
    return "".join(
        f"\n### {cat_name}\n{cat_def.get('description', cat_name)}\n"
        f"{build_questions_text(cat_def)}\n"
        for cat_name, cat_def in categories.items()
    )


def build_prompt(paper_type, paper_text, paper_id, registry_entry, hn_data):
    """Build the v5 Haiku scan prompt.

    Args:
        paper_type: Key into the schema's "type_checklist" properties. A type
            the schema does not list yields an empty module section.
        paper_text: Full paper text, appended at the end of the prompt.
        paper_id: Accepted for call compatibility; not used in the prompt.
        registry_entry: Registry record embedded as JSON ("{}" when falsy).
        hn_data: Hacker News summary embedded verbatim in the output skeleton.

    Returns:
        The complete prompt string.
    """
    schema_props = _load_schema()["properties"]
    core_cats = schema_props["checklist"]["properties"]
    type_mod = schema_props["type_checklist"]["properties"].get(paper_type, {})
    type_cats = type_mod.get("properties", {})

    core_section = _render_categories(core_cats)
    type_section = _render_categories(type_cats)

    reg_json = json.dumps(registry_entry, indent=2, ensure_ascii=False) if registry_entry else "{}"

    return f"""You are scanning a research paper for methodological quality. This is a {paper_type} paper.

Answer every question with a JSON object containing:
- "applies": true/false (is this criterion relevant to this paper?)
- "answer": true/false (does the paper satisfy it? false when applies=false)
- "justification": "1-2 sentences citing specific evidence"
- "source": "haiku"

Be strict. Absence of evidence = answer: false. Do not be generous.

## Registry Entry
```json
{reg_json}
```

## Shared Core Questions (answer ALL of these)
{core_section}

## {paper_type.title()} Module Questions (answer ALL of these)
{type_section}

## Additional Required Fields

### Claims
Extract 3-8 key empirical claims. For each: {{"claim": "...", "evidence": "...", "supported": "strong|moderate|weak|unsupported"}}

### Key Findings
2-4 sentence summary of the paper's most important findings.

### Red Flags
List methodological concerns: {{"flag": "short label", "detail": "explanation"}}

### Methodology Tags
Assign from: rct, observational, benchmark-eval, case-study, meta-analysis, theoretical, qualitative

### Cited Papers
Extract 3-10 survey-relevant references: {{"title": "...", "relevance": "..."}}

### Engagement Factors
Rate 0-3 on each dimension:
- practical_relevance: Can practitioners use this?
- surprise_contrarian: Challenges conventional wisdom?
- fear_safety: Raises AI risk concerns?
- drama_conflict: Controversy angle?
- demo_ability: Can someone try it now?
- brand_recognition: Famous lab or product?
Each: {{"score": 0-3, "justification": "1 sentence"}}

## Output

Respond with a single JSON object:
{{
  "scan_version": 5,
  "paper_type": "{paper_type}",
  "paper": {{"title": "...", "authors": [...], "year": ..., "venue": "...", "arxiv_id": "...", "doi": "..."}},
  "checklist": {{<shared core categories with questions>}},
  "type_checklist": {{"{paper_type}": {{<type-specific categories with questions>}}}},
  "claims": [...],
  "methodology_tags": [...],
  "key_findings": "...",
  "red_flags": [...],
  "cited_papers": [...],
  "engagement_factors": {{...}},
  "hn_data": {json.dumps(hn_data)}
}}

## Paper Text
{paper_text}
"""


# ── HN Fetch ──────────────────────────────────────────────────────────

def _empty_hn():
    """Return a fresh "no Hacker News activity" record."""
    return {"threads": [], "top_points": 0, "total_points": 0, "total_comments": 0}


def fetch_hn(paper_id, arxiv_id=""):
    """Fetch HN data. Returns dict compatible with hn_data schema.

    A cached papers/<paper_id>/hn.json is returned as-is when present.
    Otherwise the Algolia HN search API is queried for stories matching
    ``arxiv_id``. The lookup is best-effort: without an arxiv_id, or on any
    failure of the request or of parsing its response, an empty record is
    returned instead of raising.

    Args:
        paper_id: Paper directory name under PAPERS_DIR.
        arxiv_id: Search query; a falsy value skips the network lookup.

    Returns:
        Dict with "threads" (sorted by points, highest first), "top_points",
        "total_points" and "total_comments".
    """
    hn_path = PAPERS_DIR / paper_id / "hn.json"
    if hn_path.exists():
        with open(hn_path, encoding="utf-8") as f:
            return json.load(f)

    if not arxiv_id:
        return _empty_hn()

    try:
        params = urllib.parse.urlencode({"query": arxiv_id, "tags": "story", "hitsPerPage": 10})
        req = urllib.request.Request(f"https://hn.algolia.com/api/v1/search?{params}",
                                     headers={"User-Agent": "research-survey/1.0"})
        # Context manager closes the HTTP response even if parsing fails.
        with urllib.request.urlopen(req, timeout=10) as resp:
            data = json.loads(resp.read())
        threads = [
            {
                "hn_id": h.get("objectID", ""),
                "title": h.get("title", ""),
                # The `or 0` also normalises explicit nulls in the response.
                "points": h.get("points", 0) or 0,
                "comments": h.get("num_comments", 0) or 0,
                "url": f"https://news.ycombinator.com/item?id={h.get('objectID', '')}",
            }
            for h in data.get("hits", [])
        ]
        threads.sort(key=lambda t: -t["points"])
        return {
            "threads": threads,
            "top_points": threads[0]["points"] if threads else 0,
            "total_points": sum(t["points"] for t in threads),
            "total_comments": sum(t["comments"] for t in threads),
        }
    except Exception:
        # Deliberately broad: HN data is optional and must never fail a scan.
        return _empty_hn()


# ── Scan One Paper ────────────────────────────────────────────────────

def load_registry():
    """Load ROOT/registry.jsonl into a dict keyed by each entry's "id".

    Blank lines are skipped. Raises if the file is missing, a line is not
    valid JSON, or an entry has no "id" key.
    """
    entries = {}
    with open(ROOT / "registry.jsonl", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                entry = json.loads(line)
                entries[entry["id"]] = entry
    return entries


def _run_claude(prompt, model):
    """Run the claude CLI once on ``prompt`` (via stdin) with the given model."""
    return subprocess.run(
        ["claude", "-p", "-", "--model", model, "--max-turns", "1"],
        input=prompt,
        capture_output=True, text=True, encoding="utf-8",
        timeout=CLAUDE_TIMEOUT_SECONDS,
        cwd=str(ROOT),
    )


def _stderr_hint(result):
    """Return the first 200 characters of a process's stripped stderr."""
    return result.stderr.strip()[:200] if result.stderr else ""


def _extract_json_object(output):
    """Parse the span from the first "{" to the last "}" in ``output``.

    Returns None when the text contains no such span; raises
    json.JSONDecodeError when the span is not valid JSON.
    """
    text = output.strip()
    start = text.find("{")
    end = text.rfind("}") + 1
    if start == -1 or end == 0:
        return None
    return json.loads(text[start:end])


def _iter_answers(categories):
    """Yield every answer dict (one carrying "applies") in a category mapping."""
    if not isinstance(categories, dict):
        return
    for cat_data in categories.values():
        if isinstance(cat_data, dict):
            for qd in cat_data.values():
                if isinstance(qd, dict) and "applies" in qd:
                    yield qd


def _mark_answer_sources(scan, model):
    """Stamp every checklist answer in ``scan`` with the producing model.

    The prompt asks the model to emit "source": "haiku" regardless of which
    model runs, so the field is always overwritten; otherwise Sonnet answers
    would be persisted with a Haiku label.
    """
    for qd in _iter_answers(scan.get("checklist")):
        qd["source"] = model
    type_checklist = scan.get("type_checklist")
    if isinstance(type_checklist, dict):
        # type_checklist has one extra level: paper type -> categories.
        for ptype_data in type_checklist.values():
            for qd in _iter_answers(ptype_data):
                qd["source"] = model


def _write_json_atomic(path, payload):
    """Write ``payload`` as UTF-8 JSON to ``path`` via a temp file + rename.

    A scan is considered done as soon as its output file exists, so the file
    must never be visible in a partially written state.
    """
    tmp_path = path.with_name(path.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)
    tmp_path.replace(path)


def scan_one(paper_id, registry, force=False):
    """Run v5 Haiku scan on one paper. Returns (paper_id, ok, reason, stats).

    Haiku is used unless the paper text exceeds SONNET_THRESHOLD_CHARS, in
    which case Sonnet is used; a non-zero Haiku exit is retried once with
    Sonnet. On success the parsed model output is written to
    papers/<paper_id>/scan-v5.json.

    Args:
        paper_id: Paper directory name under PAPERS_DIR.
        registry: Mapping of paper id -> registry entry (see load_registry).
        force: Re-scan even when scan-v5.json already exists.

    Returns:
        Tuple (paper_id, ok, reason, stats). ``ok`` is True on success or when
        the paper was already scanned; ``reason`` is a short status or error
        message; ``stats`` is currently always an empty dict. Per-paper
        failures are reported through the tuple rather than raised.
    """
    paper_dir = PAPERS_DIR / paper_id
    v5_path = paper_dir / "scan-v5.json"
    if v5_path.exists() and not force:
        return paper_id, True, "already scanned", {}

    txt_path = paper_dir / "paper.txt"
    type_path = paper_dir / "paper_type.json"

    if not txt_path.exists():
        return paper_id, False, "no paper.txt", {}
    if not type_path.exists():
        return paper_id, False, "no paper_type.json", {}

    try:
        with open(type_path, encoding="utf-8") as f:
            paper_type = json.load(f).get("paper_type")
    except (ValueError, AttributeError):
        # Malformed JSON / bad encoding (ValueError) or a non-object top-level
        # value (AttributeError): report it instead of aborting the batch.
        paper_type = None
    if not paper_type:
        return paper_id, False, "invalid paper_type", {}

    try:
        paper_text = txt_path.read_text(encoding="utf-8").replace("\x00", "")
        reg_entry = registry.get(paper_id, {})
        arxiv_id = reg_entry.get("arxiv_id", "")

        # Fetch HN data
        hn_data = fetch_hn(paper_id, arxiv_id)

        # Build and run prompt
        prompt = build_prompt(paper_type, paper_text, paper_id, reg_entry, hn_data)

        # Pick model: haiku for most papers, sonnet for large ones
        model = "sonnet" if len(paper_text) > SONNET_THRESHOLD_CHARS else "haiku"

        result = _run_claude(prompt, model)
        if result.returncode != 0:
            stderr_hint = _stderr_hint(result)
            if model != "haiku":
                return paper_id, False, f"claude exit {result.returncode}: {stderr_hint}", {}
            # Retry with sonnet if haiku failed
            model = "sonnet"
            result = _run_claude(prompt, model)
            if result.returncode != 0:
                stderr_hint2 = _stderr_hint(result)
                return (paper_id, False,
                        f"claude exit {result.returncode} (sonnet fallback): "
                        f"{stderr_hint2 or stderr_hint}", {})

        v5_scan = _extract_json_object(result.stdout)
        if v5_scan is None:
            return paper_id, False, "no JSON in output", {}

        # Ensure required fields
        v5_scan["scan_version"] = 5
        v5_scan["paper_type"] = paper_type
        v5_scan["hn_data"] = hn_data

        # Mark all answers with the model that produced them
        scan_model = model  # haiku or sonnet
        _mark_answer_sources(v5_scan, scan_model)

        # Write v5 scan — pure Haiku/Sonnet output, NO merge with Opus.
        # The build pipeline will overlay Opus answers at read time when both exist.
        # Keeping them separate preserves the ability to compare per-question.
        _write_json_atomic(v5_path, v5_scan)

        return paper_id, True, f"{scan_model}-only", {}

    except json.JSONDecodeError as e:
        return paper_id, False, f"JSON error: {e}", {}
    except subprocess.TimeoutExpired:
        return paper_id, False, "timeout", {}
    except Exception as e:
        return paper_id, False, f"error: {e}", {}


# ── Main ──────────────────────────────────────────────────────────────

def _parse_args(argv):
    """Parse the command-line flags documented in the module docstring."""
    parser = argparse.ArgumentParser(
        description="V5 Haiku scan: fast coverage pass for all papers.",
        allow_abbrev=False,
    )
    parser.add_argument("--limit", type=int, default=None, metavar="N",
                        help="scan at most the first N candidate papers")
    parser.add_argument("--parallel", type=int, default=1, metavar="N",
                        help="number of concurrent scans (default: 1)")
    parser.add_argument("--id", dest="specific_id", default=None, metavar="PAPER_ID",
                        help="scan only this paper")
    parser.add_argument("--force", action="store_true",
                        help="re-scan papers that already have scan-v5.json")
    args = parser.parse_args(argv)
    if args.limit is not None and args.limit < 0:
        parser.error("--limit must be >= 0")
    return args


def main(argv=None):
    """Scan every candidate paper and print a summary.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``.
    """
    args = _parse_args(sys.argv[1:] if argv is None else argv)
    force = args.force
    limit = args.limit
    specific_id = args.specific_id
    parallel = args.parallel

    # Fail fast on a missing or invalid schema before touching any paper.
    _load_schema()
    registry = load_registry()

    # Collect candidates: papers with paper.txt + paper_type.json
    candidates = []
    for type_path in sorted(PAPERS_DIR.glob("*/paper_type.json")):
        paper_dir = type_path.parent
        pid = paper_dir.name
        if specific_id and pid != specific_id:
            continue
        if (paper_dir / "scan-v5.json").exists() and not force and not specific_id:
            continue
        if not (paper_dir / "paper.txt").exists():
            continue
        candidates.append(pid)

    # `is not None` so that an explicit --limit 0 selects nothing.
    if limit is not None:
        candidates = candidates[:limit]

    if not candidates:
        print("No papers to scan.")
        return

    print(f"V5 Haiku scan: {len(candidates)} papers"
          f"{f' (parallel={parallel})' if parallel > 1 else ''}:\n")

    ok_count = 0
    fail_count = 0

    if parallel > 1:
        with ThreadPoolExecutor(max_workers=parallel) as executor:
            futures = {executor.submit(scan_one, pid, registry, force): pid for pid in candidates}
            for future in as_completed(futures):
                pid, ok, reason, _stats = future.result()
                if ok:
                    ok_count += 1
                    if ok_count % 10 == 0:
                        print(f" ... {ok_count} OK so far")
                else:
                    fail_count += 1
                    print(f" FAIL: {pid} — {reason}")
    else:
        for position, candidate in enumerate(candidates, start=1):
            if position % 20 == 0:
                print(f" ... {position}/{len(candidates)}")
            pid, ok, reason, _stats = scan_one(candidate, registry, force)
            if ok:
                ok_count += 1
            else:
                fail_count += 1
                print(f" FAIL: {pid} — {reason}")

    print(f"\nDone. OK: {ok_count}, Failed: {fail_count}")


if __name__ == "__main__":
    main()