run-scan-v5-haiku.py - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

run-scan-v5-haiku.py (13974B)
      1 #!/usr/bin/env python3
      2 """
      3 V5 Haiku scan: fast coverage pass for all papers. PURE Haiku output — no merge.
      4 
      5 For each paper with paper.txt and paper_type.json:
      6 1. Read paper text + paper_type
      7 2. Run Haiku to answer shared core + type-specific questions
      8 3. Write scan-v5.json with raw Haiku/Sonnet answers (no Opus merge).
      9 
     10 The build pipeline handles Opus/Haiku merging at read time. This keeps v5
     11 files pure for calibration analysis (Haiku vs Opus per question).
     12 
     13 Usage:
     14     python3 scripts/run-scan-v5-haiku.py                    # All unscanned
     15     python3 scripts/run-scan-v5-haiku.py --limit 10         # First N
     16     python3 scripts/run-scan-v5-haiku.py --parallel 8       # Concurrent
     17     python3 scripts/run-scan-v5-haiku.py --id metr-rct-2025 # Specific paper
     18     python3 scripts/run-scan-v5-haiku.py --force            # Re-scan all
     19 """
     20 
     21 import json
     22 import subprocess
     23 import sys
     24 import urllib.parse
     25 import urllib.request
     26 from concurrent.futures import ThreadPoolExecutor, as_completed
     27 from pathlib import Path
     28 
     29 ROOT = Path(__file__).resolve().parent.parent
     30 PAPERS_DIR = ROOT / "papers"
     31 SCHEMA_PATH = ROOT / "schema" / "scan-v4.schema.json"
     32 
     33 # Load schema for question descriptions
     34 with open(SCHEMA_PATH) as f:
     35     SCHEMA = json.load(f)
     36 
     37 # ── Build prompt from schema ──────────────────────────────────────────
     38 
     39 def build_questions_text(category_obj):
     40     """Extract question descriptions from a schema category object."""
     41     lines = []
     42     props = category_obj.get("properties", {})
     43     for qname, qdef in props.items():
     44         desc = qdef.get("description", "")
     45         if not desc and "$ref" in qdef:
     46             desc = qdef.get("description", qname)
     47         lines.append(f"  - **{qname}**: {desc}")
     48     return "\n".join(lines)
     49 
     50 
     51 def build_prompt(paper_type, paper_text, paper_id, registry_entry, hn_data):
     52     """Build the v5 Haiku scan prompt."""
     53     core_cats = SCHEMA["properties"]["checklist"]["properties"]
     54     type_mod = SCHEMA["properties"]["type_checklist"]["properties"].get(paper_type, {})
     55     type_cats = type_mod.get("properties", {})
     56 
     57     core_section = ""
     58     for cat_name, cat_def in core_cats.items():
     59         desc = cat_def.get("description", cat_name)
     60         core_section += f"\n### {cat_name}\n{desc}\n{build_questions_text(cat_def)}\n"
     61 
     62     type_section = ""
     63     for cat_name, cat_def in type_cats.items():
     64         desc = cat_def.get("description", cat_name)
     65         type_section += f"\n### {cat_name}\n{desc}\n{build_questions_text(cat_def)}\n"
     66 
     67     reg_json = json.dumps(registry_entry, indent=2, ensure_ascii=False) if registry_entry else "{}"
     68 
     69     return f"""You are scanning a research paper for methodological quality. This is a {paper_type} paper.
     70 
     71 Answer every question with a JSON object containing:
     72 - "applies": true/false (is this criterion relevant to this paper?)
     73 - "answer": true/false (does the paper satisfy it? false when applies=false)
     74 - "justification": "1-2 sentences citing specific evidence"
     75 - "source": "haiku"
     76 
     77 Be strict. Absence of evidence = answer: false. Do not be generous.
     78 
     79 ## Registry Entry
     80 ```json
     81 {reg_json}
     82 ```
     83 
     84 ## Shared Core Questions (answer ALL of these)
     85 {core_section}
     86 
     87 ## {paper_type.title()} Module Questions (answer ALL of these)
     88 {type_section}
     89 
     90 ## Additional Required Fields
     91 
     92 ### Claims
     93 Extract 3-8 key empirical claims. For each: {{"claim": "...", "evidence": "...", "supported": "strong|moderate|weak|unsupported"}}
     94 
     95 ### Key Findings
     96 2-4 sentence summary of the paper's most important findings.
     97 
     98 ### Red Flags
     99 List methodological concerns: {{"flag": "short label", "detail": "explanation"}}
    100 
    101 ### Methodology Tags
    102 Assign from: rct, observational, benchmark-eval, case-study, meta-analysis, theoretical, qualitative
    103 
    104 ### Cited Papers
    105 Extract 3-10 survey-relevant references: {{"title": "...", "relevance": "..."}}
    106 
    107 ### Engagement Factors
    108 Rate 0-3 on each dimension:
    109 - practical_relevance: Can practitioners use this?
    110 - surprise_contrarian: Challenges conventional wisdom?
    111 - fear_safety: Raises AI risk concerns?
    112 - drama_conflict: Controversy angle?
    113 - demo_ability: Can someone try it now?
    114 - brand_recognition: Famous lab or product?
    115 Each: {{"score": 0-3, "justification": "1 sentence"}}
    116 
    117 ## Output
    118 
    119 Respond with a single JSON object:
    120 {{
    121   "scan_version": 5,
    122   "paper_type": "{paper_type}",
    123   "paper": {{"title": "...", "authors": [...], "year": ..., "venue": "...", "arxiv_id": "...", "doi": "..."}},
    124   "checklist": {{<shared core categories with questions>}},
    125   "type_checklist": {{"{paper_type}": {{<type-specific categories with questions>}}}},
    126   "claims": [...],
    127   "methodology_tags": [...],
    128   "key_findings": "...",
    129   "red_flags": [...],
    130   "cited_papers": [...],
    131   "engagement_factors": {{...}},
    132   "hn_data": {json.dumps(hn_data)}
    133 }}
    134 
    135 ## Paper Text
    136 {paper_text}
    137 """
    138 
    139 
    140 # ── HN Fetch ──────────────────────────────────────────────────────────
    141 
    142 def fetch_hn(paper_id, arxiv_id=""):
    143     """Fetch HN data. Returns dict compatible with hn_data schema."""
    144     hn_path = PAPERS_DIR / paper_id / "hn.json"
    145     if hn_path.exists():
    146         with open(hn_path) as f:
    147             return json.load(f)
    148 
    149     if not arxiv_id:
    150         return {"threads": [], "top_points": 0, "total_points": 0, "total_comments": 0}
    151 
    152     try:
    153         params = urllib.parse.urlencode({"query": arxiv_id, "tags": "story", "hitsPerPage": 10})
    154         req = urllib.request.Request(f"https://hn.algolia.com/api/v1/search?{params}",
    155                                      headers={"User-Agent": "research-survey/1.0"})
    156         resp = urllib.request.urlopen(req, timeout=10)
    157         data = json.loads(resp.read())
    158         hits = data.get("hits", [])
    159         threads = []
    160         for h in hits:
    161             threads.append({
    162                 "hn_id": h.get("objectID", ""),
    163                 "title": h.get("title", ""),
    164                 "points": h.get("points", 0) or 0,
    165                 "comments": h.get("num_comments", 0) or 0,
    166                 "url": f"https://news.ycombinator.com/item?id={h.get('objectID', '')}",
    167             })
    168         threads.sort(key=lambda t: -t["points"])
    169         return {
    170             "threads": threads,
    171             "top_points": threads[0]["points"] if threads else 0,
    172             "total_points": sum(t["points"] for t in threads),
    173             "total_comments": sum(t["comments"] for t in threads),
    174         }
    175     except Exception:
    176         return {"threads": [], "top_points": 0, "total_points": 0, "total_comments": 0}
    177 
    178 
    179 # ── Scan One Paper ────────────────────────────────────────────────────
    180 
    181 def load_registry():
    182     entries = {}
    183     with open(ROOT / "registry.jsonl") as f:
    184         for line in f:
    185             if line.strip():
    186                 e = json.loads(line)
    187                 entries[e["id"]] = e
    188     return entries
    189 
    190 
    191 def scan_one(paper_id, registry, force=False):
    192     """Run v5 Haiku scan on one paper. Returns (paper_id, ok, reason, stats)."""
    193     v5_path = PAPERS_DIR / paper_id / "scan-v5.json"
    194     if v5_path.exists() and not force:
    195         return paper_id, True, "already scanned", {}
    196 
    197     txt_path = PAPERS_DIR / paper_id / "paper.txt"
    198     type_path = PAPERS_DIR / paper_id / "paper_type.json"
    199 
    200     if not txt_path.exists():
    201         return paper_id, False, "no paper.txt", {}
    202     if not type_path.exists():
    203         return paper_id, False, "no paper_type.json", {}
    204 
    205     with open(type_path) as f:
    206         paper_type = json.load(f).get("paper_type")
    207     if not paper_type:
    208         return paper_id, False, "invalid paper_type", {}
    209 
    210     paper_text = txt_path.read_text(encoding="utf-8").replace("\x00", "")
    211     reg_entry = registry.get(paper_id, {})
    212     arxiv_id = reg_entry.get("arxiv_id", "")
    213 
    214     # Fetch HN data
    215     hn_data = fetch_hn(paper_id, arxiv_id)
    216 
    217     # Build and run prompt
    218     prompt = build_prompt(paper_type, paper_text, paper_id, reg_entry, hn_data)
    219 
    220     # Pick model: haiku for most papers, sonnet for large ones
    221     model = "haiku"
    222     if len(paper_text) > 50000:
    223         model = "sonnet"
    224 
    225     try:
    226         result = subprocess.run(
    227             ["claude", "-p", "-", "--model", model, "--max-turns", "1"],
    228             input=prompt,
    229             capture_output=True, text=True, timeout=600,
    230             cwd=str(ROOT),
    231         )
    232 
    233         if result.returncode != 0:
    234             stderr_hint = result.stderr.strip()[:200] if result.stderr else ""
    235             # Retry with sonnet if haiku failed
    236             if model == "haiku":
    237                 model = "sonnet"
    238                 result = subprocess.run(
    239                     ["claude", "-p", "-", "--model", model, "--max-turns", "1"],
    240                     input=prompt,
    241                     capture_output=True, text=True, timeout=600,
    242                     cwd=str(ROOT),
    243                 )
    244                 if result.returncode != 0:
    245                     stderr_hint2 = result.stderr.strip()[:200] if result.stderr else ""
    246                     return paper_id, False, f"claude exit {result.returncode} (sonnet fallback): {stderr_hint2 or stderr_hint}", {}
    247             else:
    248                 return paper_id, False, f"claude exit {result.returncode}: {stderr_hint}", {}
    249 
    250         output = result.stdout.strip()
    251         json_start = output.find("{")
    252         json_end = output.rfind("}") + 1
    253         if json_start == -1 or json_end == 0:
    254             return paper_id, False, "no JSON in output", {}
    255 
    256         v5_scan = json.loads(output[json_start:json_end])
    257 
    258         # Ensure required fields
    259         v5_scan["scan_version"] = 5
    260         v5_scan["paper_type"] = paper_type
    261         v5_scan["hn_data"] = hn_data
    262 
    263         # Mark all answers with the model that produced them
    264         scan_model = model  # haiku or sonnet
    265         for section_key in ["checklist", "type_checklist"]:
    266             section = v5_scan.get(section_key, {})
    267             if section_key == "type_checklist":
    268                 for ptype_key, ptype_data in section.items():
    269                     if isinstance(ptype_data, dict):
    270                         for cat_data in ptype_data.values():
    271                             if isinstance(cat_data, dict):
    272                                 for qd in cat_data.values():
    273                                     if isinstance(qd, dict) and "applies" in qd and "source" not in qd:
    274                                         qd["source"] = scan_model
    275             else:
    276                 for cat_data in section.values():
    277                     if isinstance(cat_data, dict):
    278                         for qd in cat_data.values():
    279                             if isinstance(qd, dict) and "applies" in qd and "source" not in qd:
    280                                 qd["source"] = scan_model
    281 
    282         # Write v5 scan — pure Haiku/Sonnet output, NO merge with Opus.
    283         # The build pipeline will overlay Opus answers at read time when both exist.
    284         # Keeping them separate preserves the ability to compare per-question.
    285         with open(v5_path, "w") as f:
    286             json.dump(v5_scan, f, ensure_ascii=False, indent=2)
    287 
    288         return paper_id, True, f"{scan_model}-only", {}
    289 
    290     except json.JSONDecodeError as e:
    291         return paper_id, False, f"JSON error: {e}", {}
    292     except subprocess.TimeoutExpired:
    293         return paper_id, False, "timeout", {}
    294     except Exception as e:
    295         return paper_id, False, f"error: {e}", {}
    296 
    297 
    298 # ── Main ──────────────────────────────────────────────────────────────
    299 
    300 def main():
    301     args = sys.argv[1:]
    302     force = "--force" in args
    303     limit = None
    304     specific_id = None
    305     parallel = 1
    306 
    307     for i, arg in enumerate(args):
    308         if arg == "--limit" and i + 1 < len(args):
    309             limit = int(args[i + 1])
    310         if arg == "--id" and i + 1 < len(args):
    311             specific_id = args[i + 1]
    312         if arg == "--parallel" and i + 1 < len(args):
    313             parallel = int(args[i + 1])
    314 
    315     registry = load_registry()
    316 
    317     # Collect candidates: papers with paper.txt + paper_type.json
    318     candidates = []
    319     for type_path in sorted(PAPERS_DIR.glob("*/paper_type.json")):
    320         pid = type_path.parent.name
    321         if specific_id and pid != specific_id:
    322             continue
    323         v5_path = type_path.parent / "scan-v5.json"
    324         if v5_path.exists() and not force and not specific_id:
    325             continue
    326         txt_path = type_path.parent / "paper.txt"
    327         if not txt_path.exists():
    328             continue
    329         candidates.append(pid)
    330 
    331     if limit:
    332         candidates = candidates[:limit]
    333 
    334     if not candidates:
    335         print("No papers to scan.")
    336         return
    337 
    338     print(f"V5 Haiku scan: {len(candidates)} papers"
    339           f"{f' (parallel={parallel})' if parallel > 1 else ''}:\n")
    340 
    341     ok_count = 0
    342     fail_count = 0
    343 
    344     if parallel > 1:
    345         with ThreadPoolExecutor(max_workers=parallel) as executor:
    346             futures = {executor.submit(scan_one, pid, registry, force): pid for pid in candidates}
    347             for future in as_completed(futures):
    348                 pid, ok, reason, stats = future.result()
    349                 if ok:
    350                     ok_count += 1
    351                     if ok_count % 10 == 0:
    352                         print(f"  ... {ok_count} OK so far")
    353                 else:
    354                     fail_count += 1
    355                     print(f"  FAIL: {pid} — {reason}")
    356     else:
    357         for i, pid in enumerate(candidates):
    358             if (i + 1) % 20 == 0:
    359                 print(f"  ... {i+1}/{len(candidates)}")
    360             pid, ok, reason, stats = scan_one(pid, registry, force)
    361             if ok:
    362                 ok_count += 1
    363             else:
    364                 fail_count += 1
    365                 print(f"  FAIL: {pid} — {reason}")
    366 
    367     print(f"\nDone. OK: {ok_count}, Failed: {fail_count}")
    368 
    369 
    370 if __name__ == "__main__":
    371     main()
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs