run-scan-v4-haiku.py - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

run-scan-v4-haiku.py (20773B)
      1 #!/usr/bin/env python3
      2 """
      3 V4 Haiku scan: fast coverage pass for all papers.
      4 
      5 For each paper with paper.txt and paper_type.json:
      6 1. Read paper text + paper_type
      7 2. Run Haiku (Sonnet for very long papers, or as a fallback if Haiku fails) to answer shared core + type-specific questions
      8 3. If a v2/v3 Opus scan exists, merge: Opus answers override Haiku
      9 4. Write scan-v4.json (separate from scan.json to preserve v2/v3 data)
     10 
     11 Usage:
     12     python3 scripts/run-scan-v4-haiku.py                    # All unscanned
     13     python3 scripts/run-scan-v4-haiku.py --limit 10         # First N
     14     python3 scripts/run-scan-v4-haiku.py --parallel 8       # Concurrent (Haiku is fast)
     15     python3 scripts/run-scan-v4-haiku.py --id metr-rct-2025 # Specific paper
     16     python3 scripts/run-scan-v4-haiku.py --force            # Re-scan all
     17 """
     18 
     19 import json
     20 import subprocess
     21 import sys
     22 import urllib.parse
     23 import urllib.request
     24 from concurrent.futures import ThreadPoolExecutor, as_completed
     25 from pathlib import Path
     26 
     27 ROOT = Path(__file__).resolve().parent.parent
     28 PAPERS_DIR = ROOT / "papers"
     29 SCHEMA_PATH = ROOT / "schema" / "scan-v4.schema.json"
     30 
     31 # Load schema for question descriptions
     32 with open(SCHEMA_PATH) as f:
     33     SCHEMA = json.load(f)
     34 
     35 # ── Build prompt from schema ──────────────────────────────────────────
     36 
     37 def build_questions_text(category_obj):
     38     """Extract question descriptions from a schema category object."""
     39     lines = []
     40     props = category_obj.get("properties", {})
     41     for qname, qdef in props.items():
     42         desc = qdef.get("description", "")
     43         if not desc and "$ref" in qdef:
     44             desc = qdef.get("description", qname)
     45         lines.append(f"  - **{qname}**: {desc}")
     46     return "\n".join(lines)
     47 
     48 
     49 def build_prompt(paper_type, paper_text, paper_id, registry_entry, hn_data):
     50     """Build the v4 Haiku scan prompt."""
     51     core_cats = SCHEMA["properties"]["checklist"]["properties"]
     52     type_mod = SCHEMA["properties"]["type_checklist"]["properties"].get(paper_type, {})
     53     type_cats = type_mod.get("properties", {})
     54 
     55     core_section = ""
     56     for cat_name, cat_def in core_cats.items():
     57         desc = cat_def.get("description", cat_name)
     58         core_section += f"\n### {cat_name}\n{desc}\n{build_questions_text(cat_def)}\n"
     59 
     60     type_section = ""
     61     for cat_name, cat_def in type_cats.items():
     62         desc = cat_def.get("description", cat_name)
     63         type_section += f"\n### {cat_name}\n{desc}\n{build_questions_text(cat_def)}\n"
     64 
     65     reg_json = json.dumps(registry_entry, indent=2, ensure_ascii=False) if registry_entry else "{}"
     66 
     67     return f"""You are scanning a research paper for methodological quality. This is a {paper_type} paper.
     68 
     69 Answer every question with a JSON object containing:
     70 - "applies": true/false (is this criterion relevant to this paper?)
     71 - "answer": true/false (does the paper satisfy it? false when applies=false)
     72 - "justification": "1-2 sentences citing specific evidence"
     73 - "source": "haiku"
     74 
     75 Be strict. Absence of evidence = answer: false. Do not be generous.
     76 
     77 ## Registry Entry
     78 ```json
     79 {reg_json}
     80 ```
     81 
     82 ## Shared Core Questions (answer ALL of these)
     83 {core_section}
     84 
     85 ## {paper_type.title()} Module Questions (answer ALL of these)
     86 {type_section}
     87 
     88 ## Additional Required Fields
     89 
     90 ### Claims
     91 Extract 3-8 key empirical claims. For each: {{"claim": "...", "evidence": "...", "supported": "strong|moderate|weak|unsupported"}}
     92 
     93 ### Key Findings
     94 2-4 sentence summary of the paper's most important findings.
     95 
     96 ### Red Flags
     97 List methodological concerns: {{"flag": "short label", "detail": "explanation"}}
     98 
     99 ### Methodology Tags
    100 Assign from: rct, observational, benchmark-eval, case-study, meta-analysis, theoretical, qualitative
    101 
    102 ### Cited Papers
    103 Extract 3-10 survey-relevant references: {{"title": "...", "relevance": "..."}}
    104 
    105 ### Engagement Factors
    106 Rate 0-3 on each dimension:
    107 - practical_relevance: Can practitioners use this?
    108 - surprise_contrarian: Challenges conventional wisdom?
    109 - fear_safety: Raises AI risk concerns?
    110 - drama_conflict: Controversy angle?
    111 - demo_ability: Can someone try it now?
    112 - brand_recognition: Famous lab or product?
    113 Each: {{"score": 0-3, "justification": "1 sentence"}}
    114 
    115 ## Output
    116 
    117 Respond with a single JSON object:
    118 {{
    119   "scan_version": 4,
    120   "paper_type": "{paper_type}",
    121   "paper": {{"title": "...", "authors": [...], "year": ..., "venue": "...", "arxiv_id": "...", "doi": "..."}},
    122   "checklist": {{<shared core categories with questions>}},
    123   "type_checklist": {{"{paper_type}": {{<type-specific categories with questions>}}}},
    124   "claims": [...],
    125   "methodology_tags": [...],
    126   "key_findings": "...",
    127   "red_flags": [...],
    128   "cited_papers": [...],
    129   "engagement_factors": {{...}},
    130   "hn_data": {json.dumps(hn_data)}
    131 }}
    132 
    133 ## Paper Text
    134 {paper_text}
    135 """
    136 
    137 
    138 # ── V2/V3 → V4 Merge Logic ───────────────────────────────────────────
    139 
    140 # Map v2/v3 category.question → v4 location
    141 V2_TO_V4_CORE = {
    142     # claims_and_evidence → checklist.claims_and_evidence
    143     "claims_and_evidence.abstract_claims_supported": ("checklist", "claims_and_evidence", "abstract_claims_supported"),
    144     "claims_and_evidence.causal_claims_justified": ("checklist", "claims_and_evidence", "causal_claims_justified"),
    145     "claims_and_evidence.generalization_bounded": ("checklist", "claims_and_evidence", "generalization_bounded"),
    146     "claims_and_evidence.alternative_explanations_discussed": ("checklist", "claims_and_evidence", "alternative_explanations_discussed"),
    147     "claims_and_evidence.proxy_outcome_distinction": ("checklist", "claims_and_evidence", "proxy_outcome_distinction"),
    148     # limitations_and_scope → checklist.limitations_and_scope
    149     "limitations_and_scope.limitations_section_present": ("checklist", "limitations_and_scope", "limitations_section_present"),
    150     "limitations_and_scope.threats_to_validity_specific": ("checklist", "limitations_and_scope", "threats_to_validity_specific"),
    151     "limitations_and_scope.scope_boundaries_stated": ("checklist", "limitations_and_scope", "scope_boundaries_stated"),
    152     # conflicts_of_interest → checklist.conflicts_of_interest
    153     "conflicts_of_interest.funding_disclosed": ("checklist", "conflicts_of_interest", "funding_disclosed"),
    154     "conflicts_of_interest.affiliations_disclosed": ("checklist", "conflicts_of_interest", "affiliations_disclosed"),
    155     "conflicts_of_interest.funder_independent_of_outcome": ("checklist", "conflicts_of_interest", "funder_independent_of_outcome"),
    156     "conflicts_of_interest.financial_interests_declared": ("checklist", "conflicts_of_interest", "financial_interests_declared"),
    157 }
    158 
    159 # v2/v3 empirical questions → v4 type_checklist.empirical
    160 V2_TO_V4_EMPIRICAL = {}
    161 for cat in ["artifacts", "statistical_methodology", "evaluation_design", "setup_transparency",
    162             "data_integrity", "contamination", "human_studies", "cost_and_practicality"]:
    163     # Get question names from v2 schema
    164     v2_schema_path = ROOT / "schema" / "scan.schema.json"
    165     with open(v2_schema_path) as f:
    166         v2_schema = json.load(f)
    167     cat_props = v2_schema["properties"]["checklist"]["properties"].get(cat, {}).get("properties", {})
    168     for qname in cat_props:
    169         V2_TO_V4_EMPIRICAL[f"{cat}.{qname}"] = ("type_checklist", "empirical", cat, qname)
    170 
    171 # Also map conditional modules
    172 for cat in ["experimental_rigor", "data_leakage", "survey_methodology"]:
    173     cat_props = v2_schema["properties"]["checklist"]["properties"].get(cat, {}).get("properties", {})
    174     for qname in cat_props:
    175         V2_TO_V4_EMPIRICAL[f"{cat}.{qname}"] = ("type_checklist", "empirical", cat, qname)
    176 
    177 
    178 def merge_opus_answers(v4_scan, v2_scan, paper_type):
    179     """Overlay Opus v2/v3 answers onto Haiku v4 scan. Returns merged scan + agreement stats."""
    180     agreements = 0
    181     disagreements = 0
    182     opus_overrides = 0
    183 
    184     v2_checklist = v2_scan.get("checklist", {})
    185 
    186     # Merge core questions
    187     for v2_key, (section, cat, qname) in V2_TO_V4_CORE.items():
    188         v2_cat, v2_qname = v2_key.split(".")
    189         v2_answer = v2_checklist.get(v2_cat, {}).get(v2_qname)
    190         if not v2_answer or not isinstance(v2_answer, dict):
    191             continue
    192 
    193         v4_section = v4_scan.get(section, {})
    194         v4_cat_data = v4_section.get(cat, {})
    195         v4_answer = v4_cat_data.get(qname)
    196 
    197         if v4_answer and isinstance(v4_answer, dict):
    198             # Compare
    199             if v4_answer.get("applies") == v2_answer.get("applies") and v4_answer.get("answer") == v2_answer.get("answer"):
    200                 agreements += 1
    201             else:
    202                 disagreements += 1
    203 
    204         # Override with Opus
    205         opus_item = {
    206             "applies": v2_answer["applies"],
    207             "answer": v2_answer["answer"],
    208             "justification": v2_answer.get("justification", ""),
    209             "source": "opus",
    210         }
    211         if cat not in v4_section:
    212             v4_section[cat] = {}
    213         v4_section[cat][qname] = opus_item
    214         v4_scan[section] = v4_section
    215         opus_overrides += 1
    216 
    217     # Merge empirical type questions (only if paper is empirical)
    218     if paper_type == "empirical":
    219         for v2_key, path in V2_TO_V4_EMPIRICAL.items():
    220             v2_cat, v2_qname = v2_key.split(".")
    221             v2_answer = v2_checklist.get(v2_cat, {}).get(v2_qname)
    222             if not v2_answer or not isinstance(v2_answer, dict):
    223                 continue
    224 
    225             if len(path) == 4:
    226                 section, ptype, cat, qname = path
    227             else:
    228                 continue
    229 
    230             # Navigate to v4 location
    231             type_cl = v4_scan.get(section, {})
    232             type_data = type_cl.get(ptype, {})
    233             cat_data = type_data.get(cat, {})
    234             v4_answer = cat_data.get(qname)
    235 
    236             if v4_answer and isinstance(v4_answer, dict):
    237                 if v4_answer.get("applies") == v2_answer.get("applies") and v4_answer.get("answer") == v2_answer.get("answer"):
    238                     agreements += 1
    239                 else:
    240                     disagreements += 1
    241 
    242             opus_item = {
    243                 "applies": v2_answer["applies"],
    244                 "answer": v2_answer["answer"],
    245                 "justification": v2_answer.get("justification", ""),
    246                 "source": "opus",
    247             }
    248             if cat not in cat_data:
    249                 cat_data[qname] = opus_item
    250             else:
    251                 cat_data[qname] = opus_item
    252             type_data[cat] = cat_data
    253             type_cl[ptype] = type_data
    254             v4_scan[section] = type_cl
    255             opus_overrides += 1
    256 
    257     # Also merge engagement factors from v3 if Opus-generated
    258     v2_ef = v2_scan.get("engagement_factors")
    259     if v2_ef and v2_scan.get("scan_version", 1) >= 3:
    260         v4_scan["engagement_factors"] = v2_ef
    261 
    262     return v4_scan, {"agreements": agreements, "disagreements": disagreements, "opus_overrides": opus_overrides}
    263 
    264 
    265 # ── HN Fetch ──────────────────────────────────────────────────────────
    266 
    267 def fetch_hn(paper_id, arxiv_id=""):
    268     """Fetch HN data. Returns dict compatible with hn_data schema."""
    269     hn_path = PAPERS_DIR / paper_id / "hn.json"
    270     if hn_path.exists():
    271         with open(hn_path) as f:
    272             return json.load(f)
    273 
    274     if not arxiv_id:
    275         return {"threads": [], "top_points": 0, "total_points": 0, "total_comments": 0}
    276 
    277     try:
    278         params = urllib.parse.urlencode({"query": arxiv_id, "tags": "story", "hitsPerPage": 10})
    279         req = urllib.request.Request(f"https://hn.algolia.com/api/v1/search?{params}",
    280                                      headers={"User-Agent": "research-survey/1.0"})
    281         resp = urllib.request.urlopen(req, timeout=10)
    282         data = json.loads(resp.read())
    283         hits = data.get("hits", [])
    284         threads = []
    285         for h in hits:
    286             threads.append({
    287                 "hn_id": h.get("objectID", ""),
    288                 "title": h.get("title", ""),
    289                 "points": h.get("points", 0) or 0,
    290                 "comments": h.get("num_comments", 0) or 0,
    291                 "url": f"https://news.ycombinator.com/item?id={h.get('objectID', '')}",
    292             })
    293         threads.sort(key=lambda t: -t["points"])
    294         return {
    295             "threads": threads,
    296             "top_points": threads[0]["points"] if threads else 0,
    297             "total_points": sum(t["points"] for t in threads),
    298             "total_comments": sum(t["comments"] for t in threads),
    299         }
    300     except Exception:
    301         return {"threads": [], "top_points": 0, "total_points": 0, "total_comments": 0}
    302 
    303 
    304 # ── Scan One Paper ────────────────────────────────────────────────────
    305 
    306 def load_registry():
    307     entries = {}
    308     with open(ROOT / "registry.jsonl") as f:
    309         for line in f:
    310             if line.strip():
    311                 e = json.loads(line)
    312                 entries[e["id"]] = e
    313     return entries
    314 
    315 
    316 def scan_one(paper_id, registry, force=False):
    317     """Run v4 Haiku scan on one paper. Returns (paper_id, ok, reason, stats)."""
    318     v4_path = PAPERS_DIR / paper_id / "scan-v4.json"
    319     if v4_path.exists() and not force:
    320         return paper_id, True, "already scanned", {}
    321 
    322     txt_path = PAPERS_DIR / paper_id / "paper.txt"
    323     type_path = PAPERS_DIR / paper_id / "paper_type.json"
    324 
    325     if not txt_path.exists():
    326         return paper_id, False, "no paper.txt", {}
    327     if not type_path.exists():
    328         return paper_id, False, "no paper_type.json", {}
    329 
    330     with open(type_path) as f:
    331         paper_type = json.load(f).get("paper_type")
    332     if not paper_type:
    333         return paper_id, False, "invalid paper_type", {}
    334 
    335     paper_text = txt_path.read_text(encoding="utf-8").replace("\x00", "")
    336     reg_entry = registry.get(paper_id, {})
    337     arxiv_id = reg_entry.get("arxiv_id", "")
    338 
    339     # Fetch HN data
    340     hn_data = fetch_hn(paper_id, arxiv_id)
    341 
    342     # Build and run prompt
    343     prompt = build_prompt(paper_type, paper_text, paper_id, reg_entry, hn_data)
    344 
    345     # Pick model: haiku for most papers, sonnet for large ones
    346     model = "haiku"
    347     if len(paper_text) > 50000:
    348         model = "sonnet"
    349 
    350     try:
    351         result = subprocess.run(
    352             ["claude", "-p", "-", "--model", model, "--max-turns", "1"],
    353             input=prompt,
    354             capture_output=True, text=True, timeout=600,
    355             cwd=str(ROOT),
    356         )
    357 
    358         if result.returncode != 0:
    359             # Retry with sonnet if haiku failed
    360             if model == "haiku":
    361                 model = "sonnet"
    362                 result = subprocess.run(
    363                     ["claude", "-p", "-", "--model", model, "--max-turns", "1"],
    364                     input=prompt,
    365                     capture_output=True, text=True, timeout=600,
    366                     cwd=str(ROOT),
    367                 )
    368                 if result.returncode != 0:
    369                     return paper_id, False, f"claude exit {result.returncode} (sonnet fallback)", {}
    370             else:
    371                 return paper_id, False, f"claude exit {result.returncode}", {}
    372 
    373         output = result.stdout.strip()
    374         json_start = output.find("{")
    375         json_end = output.rfind("}") + 1
    376         if json_start == -1 or json_end == 0:
    377             return paper_id, False, "no JSON in output", {}
    378 
    379         v4_scan = json.loads(output[json_start:json_end])
    380 
    381         # Ensure required fields
    382         v4_scan["scan_version"] = 4
    383         v4_scan["paper_type"] = paper_type
    384         v4_scan["hn_data"] = hn_data
    385 
    386         # Mark all answers with the model that produced them
    387         scan_model = model  # haiku or sonnet
    388         for section_key in ["checklist", "type_checklist"]:
    389             section = v4_scan.get(section_key, {})
    390             if section_key == "type_checklist":
    391                 for ptype_key, ptype_data in section.items():
    392                     if isinstance(ptype_data, dict):
    393                         for cat_data in ptype_data.values():
    394                             if isinstance(cat_data, dict):
    395                                 for qd in cat_data.values():
    396                                     if isinstance(qd, dict) and "applies" in qd and "source" not in qd:
    397                                         qd["source"] = scan_model
    398             else:
    399                 for cat_data in section.values():
    400                     if isinstance(cat_data, dict):
    401                         for qd in cat_data.values():
    402                             if isinstance(qd, dict) and "applies" in qd and "source" not in qd:
    403                                 qd["source"] = scan_model
    404 
    405         # Merge Opus answers if v2/v3 scan exists
    406         merge_stats = {}
    407         v2_path = PAPERS_DIR / paper_id / "scan.json"
    408         if v2_path.exists():
    409             with open(v2_path) as f:
    410                 v2_scan = json.load(f)
    411             if v2_scan.get("scan_version", 1) >= 2:
    412                 v4_scan, merge_stats = merge_opus_answers(v4_scan, v2_scan, paper_type)
    413 
    414         # Write v4 scan
    415         with open(v4_path, "w") as f:
    416             json.dump(v4_scan, f, ensure_ascii=False, indent=2)
    417 
    418         opus_n = merge_stats.get("opus_overrides", 0)
    419         agree = merge_stats.get("agreements", 0)
    420         disagree = merge_stats.get("disagreements", 0)
    421         model_tag = scan_model
    422         coverage = f"{model_tag}-only" if opus_n == 0 else f"merged({model_tag}+opus={opus_n},agree={agree},disagree={disagree})"
    423         return paper_id, True, coverage, merge_stats
    424 
    425     except json.JSONDecodeError as e:
    426         return paper_id, False, f"JSON error: {e}", {}
    427     except subprocess.TimeoutExpired:
    428         return paper_id, False, "timeout", {}
    429     except Exception as e:
    430         return paper_id, False, f"error: {e}", {}
    431 
    432 
    433 # ── Main ──────────────────────────────────────────────────────────────
    434 
    435 def main():
    436     args = sys.argv[1:]
    437     force = "--force" in args
    438     limit = None
    439     specific_id = None
    440     parallel = 1
    441 
    442     for i, arg in enumerate(args):
    443         if arg == "--limit" and i + 1 < len(args):
    444             limit = int(args[i + 1])
    445         if arg == "--id" and i + 1 < len(args):
    446             specific_id = args[i + 1]
    447         if arg == "--parallel" and i + 1 < len(args):
    448             parallel = int(args[i + 1])
    449 
    450     registry = load_registry()
    451 
    452     # Collect candidates: papers with paper.txt + paper_type.json
    453     candidates = []
    454     for type_path in sorted(PAPERS_DIR.glob("*/paper_type.json")):
    455         pid = type_path.parent.name
    456         if specific_id and pid != specific_id:
    457             continue
    458         v4_path = type_path.parent / "scan-v4.json"
    459         if v4_path.exists() and not force and not specific_id:
    460             continue
    461         txt_path = type_path.parent / "paper.txt"
    462         if not txt_path.exists():
    463             continue
    464         candidates.append(pid)
    465 
    466     if limit:
    467         candidates = candidates[:limit]
    468 
    469     if not candidates:
    470         print("No papers to scan.")
    471         return
    472 
    473     print(f"V4 Haiku scan: {len(candidates)} papers"
    474           f"{f' (parallel={parallel})' if parallel > 1 else ''}:\n")
    475 
    476     total_agree = 0
    477     total_disagree = 0
    478     ok_count = 0
    479     fail_count = 0
    480 
    481     if parallel > 1:
    482         with ThreadPoolExecutor(max_workers=parallel) as executor:
    483             futures = {executor.submit(scan_one, pid, registry, force): pid for pid in candidates}
    484             for future in as_completed(futures):
    485                 pid, ok, reason, stats = future.result()
    486                 if ok:
    487                     ok_count += 1
    488                     total_agree += stats.get("agreements", 0)
    489                     total_disagree += stats.get("disagreements", 0)
    490                     if "merged" in reason:
    491                         print(f"  OK: {pid} — {reason}")
    492                 else:
    493                     fail_count += 1
    494                     print(f"  FAIL: {pid} — {reason}")
    495     else:
    496         for i, pid in enumerate(candidates):
    497             if (i + 1) % 20 == 0:
    498                 print(f"  ... {i+1}/{len(candidates)}")
    499             pid, ok, reason, stats = scan_one(pid, registry, force)
    500             if ok:
    501                 ok_count += 1
    502                 total_agree += stats.get("agreements", 0)
    503                 total_disagree += stats.get("disagreements", 0)
    504             else:
    505                 fail_count += 1
    506                 print(f"  FAIL: {pid} — {reason}")
    507 
    508     print(f"\nDone. OK: {ok_count}, Failed: {fail_count}")
    509     if total_agree + total_disagree > 0:
    510         rate = total_agree / (total_agree + total_disagree) * 100
    511         print(f"Haiku-Opus agreement: {total_agree}/{total_agree + total_disagree} ({rate:.1f}%)")
    512 
    513 
# Run the scan only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs