catchup-v3.py - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

catchup-v3.py (9619B)
      1 #!/usr/bin/env python3
      2 """
      3 Catch-up script: add v3 engagement factors to existing v2 scans.
      4 
      5 Reads each v2 scan.json, sends its metadata, key findings, claims, red flags and the full paper.txt to Opus
      6 for engagement factor classification, merges results into v3 scan.json.
      7 
      8 Usage:
      9     python3 scripts/catchup-v3.py                    # All v2 scans without v3
     10     python3 scripts/catchup-v3.py --limit 10         # First N
     11     python3 scripts/catchup-v3.py --parallel 4       # Concurrent
     12     python3 scripts/catchup-v3.py --id metr-rct-2025 # Specific paper
     13 """
     14 
     15 import json
     16 import subprocess
     17 import sys
     18 from concurrent.futures import ThreadPoolExecutor, as_completed
     19 from pathlib import Path
     20 
# Repository root: two directory levels above this file's resolved path
# (the usage examples run the script as scripts/catchup-v3.py).
ROOT = Path(__file__).resolve().parent.parent
# Directory scanned for "<paper_id>/scan.json" entries.
PAPERS_DIR = ROOT / "papers"

# Prompt template for the engagement classification. It is filled in with
# str.format(), so the placeholders ({title}, {authors}, {year}, {venue},
# {tags}, {key_findings}, {claims}, {red_flags}) are substituted and the
# literal braces of the JSON example at the end are doubled ({{ / }}).
# The six dimension names listed here are the keys the response is
# validated against.
ENGAGEMENT_PROMPT = """You are classifying a research paper on 6 dimensions that predict social media engagement (Hacker News, Reddit, tech newsletters). Rate each dimension 0-3 based on the paper's content, framing, and likely audience appeal.

## Dimensions

1. **practical_relevance** (0-3): Can a developer/practitioner use this at work? Tools, techniques, frameworks they can apply immediately.
   - 0: Pure theory, no practical application
   - 1: Potentially useful but requires significant adaptation
   - 2: Contains actionable techniques or findings practitioners can apply
   - 3: Directly usable tool, library, or technique with immediate workflow impact

2. **surprise_contrarian** (0-3): Does this challenge conventional wisdom or reveal something unexpected?
   - 0: Confirms what everyone already believes
   - 1: Minor unexpected finding buried in expected results
   - 2: Main finding is somewhat surprising or counterintuitive
   - 3: Directly contradicts a widely-held belief with evidence

3. **fear_safety** (0-3): Does this raise concerns about AI risks, security vulnerabilities, or misuse potential?
   - 0: No safety/risk angle
   - 1: Mentions risks as a secondary concern
   - 2: Safety/risk is a major theme with concrete demonstrations
   - 3: Demonstrates a novel attack, vulnerability, or existential concern

4. **drama_conflict** (0-3): Is there a controversy, company rivalry, or "the emperor has no clothes" angle?
   - 0: No controversy or conflict
   - 1: Mild tension (e.g., questions a popular approach)
   - 2: Directly challenges a specific company's claims or a popular benchmark
   - 3: Major controversy — "benchmarks are fake", "company X is lying", replication failure

5. **demo_ability** (0-3): Can someone try this themselves right now?
   - 0: No code, no demo, no way to interact
   - 1: Code exists but requires significant setup
   - 2: Reproducible with moderate effort, clear instructions
   - 3: Live demo, web app, or pip-installable tool you can try in minutes

6. **brand_recognition** (0-3): Is this from a famous lab or about a famous product?
   - 0: Unknown lab/authors, obscure topic
   - 1: Recognized institution but not a household name in tech
   - 2: Major tech company or famous research lab (Google Brain, FAIR, etc.)
   - 3: About a product millions use (ChatGPT, Copilot, Cursor) or from OpenAI/Anthropic/DeepMind

## Paper Information

Title: {title}
Authors: {authors}
Year: {year}
Venue: {venue}
Tags: {tags}

Key Findings:
{key_findings}

Claims:
{claims}

Red Flags:
{red_flags}

## Output

Respond with ONLY a JSON object, no other text:
{{
  "engagement_factors": {{
    "practical_relevance": {{"score": <0-3>, "justification": "<1 sentence>"}},
    "surprise_contrarian": {{"score": <0-3>, "justification": "<1 sentence>"}},
    "fear_safety": {{"score": <0-3>, "justification": "<1 sentence>"}},
    "drama_conflict": {{"score": <0-3>, "justification": "<1 sentence>"}},
    "demo_ability": {{"score": <0-3>, "justification": "<1 sentence>"}},
    "brand_recognition": {{"score": <0-3>, "justification": "<1 sentence>"}}
  }}
}}"""
     94 
     95 
     96 def classify_one(paper_id):
     97     """Run engagement classification on one paper. Returns (paper_id, ok, reason)."""
     98     scan_path = PAPERS_DIR / paper_id / "scan.json"
     99     if not scan_path.exists():
    100         return paper_id, False, "no scan.json"
    101 
    102     with open(scan_path) as f:
    103         scan = json.load(f)
    104 
    105     if scan.get("scan_version", 1) < 2:
    106         return paper_id, False, "v1 scan, skip"
    107 
    108     if scan.get("scan_version") == 3:
    109         return paper_id, True, "already v3"
    110 
    111     # Read the full paper text
    112     txt_path = PAPERS_DIR / paper_id / "paper.txt"
    113     if not txt_path.exists():
    114         return paper_id, False, "no paper.txt"
    115     paper_text = txt_path.read_text(encoding="utf-8").replace("\x00", "")
    116 
    117     # Build prompt with full paper text + existing scan context
    118     paper = scan.get("paper", {})
    119     claims_text = "\n".join(
    120         f"- [{c.get('supported', '?')}] {c.get('claim', '')}"
    121         for c in scan.get("claims", [])
    122     )
    123     red_flags_text = "\n".join(
    124         f"- {r.get('flag', '')}: {r.get('detail', '')}"
    125         for r in scan.get("red_flags", [])
    126     )
    127 
    128     prompt = ENGAGEMENT_PROMPT.format(
    129         title=paper.get("title", ""),
    130         authors=", ".join(paper.get("authors", [])[:5]),
    131         year=paper.get("year", ""),
    132         venue=paper.get("venue", ""),
    133         tags=", ".join(scan.get("methodology_tags", [])),
    134         key_findings=scan.get("key_findings", ""),
    135         claims=claims_text or "(none)",
    136         red_flags=red_flags_text or "(none)",
    137     ) + f"\n\n## Full Paper Text\n{paper_text}"
    138 
    139     try:
    140         result = subprocess.run(
    141             ["claude", "-p", "-", "--model", "opus", "--max-turns", "1"],
    142             input=prompt,
    143             capture_output=True, text=True, timeout=300,
    144             cwd=str(ROOT),
    145         )
    146 
    147         if result.returncode != 0:
    148             return paper_id, False, f"claude exit {result.returncode}"
    149 
    150         # Parse JSON from output — find the JSON object in the response
    151         output = result.stdout.strip()
    152         # Try to extract JSON from the output
    153         json_start = output.find("{")
    154         json_end = output.rfind("}") + 1
    155         if json_start == -1 or json_end == 0:
    156             return paper_id, False, "no JSON in output"
    157 
    158         parsed = json.loads(output[json_start:json_end])
    159         factors = parsed.get("engagement_factors", parsed)
    160 
    161         # Validate structure
    162         required = ["practical_relevance", "surprise_contrarian", "fear_safety",
    163                      "drama_conflict", "demo_ability", "brand_recognition"]
    164         for key in required:
    165             if key not in factors:
    166                 return paper_id, False, f"missing factor: {key}"
    167             if "score" not in factors[key]:
    168                 return paper_id, False, f"missing score in {key}"
    169 
    170         # Merge into v3
    171         scan["scan_version"] = 3
    172         scan["engagement_factors"] = factors
    173 
    174         with open(scan_path, "w") as f:
    175             json.dump(scan, f, ensure_ascii=False, indent=2)
    176 
    177         scores = [factors[k]["score"] for k in required]
    178         return paper_id, True, f"v3 [{','.join(str(s) for s in scores)}]"
    179 
    180     except json.JSONDecodeError as e:
    181         return paper_id, False, f"JSON parse error: {e}"
    182     except subprocess.TimeoutExpired:
    183         return paper_id, False, "timeout"
    184     except Exception as e:
    185         return paper_id, False, f"error: {e}"
    186 
    187 
    188 def main():
    189     args = sys.argv[1:]
    190     limit = None
    191     specific_id = None
    192     parallel = 1
    193 
    194     for i, arg in enumerate(args):
    195         if arg == "--limit" and i + 1 < len(args):
    196             limit = int(args[i + 1])
    197         if arg == "--id" and i + 1 < len(args):
    198             specific_id = args[i + 1]
    199         if arg == "--parallel" and i + 1 < len(args):
    200             parallel = int(args[i + 1])
    201 
    202     # Collect candidates
    203     candidates = []
    204     for scan_path in sorted(PAPERS_DIR.glob("*/scan.json")):
    205         pid = scan_path.parent.name
    206         if specific_id and pid != specific_id:
    207             continue
    208         with open(scan_path) as f:
    209             s = json.load(f)
    210         if s.get("scan_version", 1) < 2:
    211             continue
    212         if s.get("scan_version") == 3 and not specific_id:
    213             continue
    214         candidates.append(pid)
    215 
    216     if limit:
    217         candidates = candidates[:limit]
    218 
    219     if not candidates:
    220         print("No papers to classify.")
    221         return
    222 
    223     print(f"Classifying engagement factors for {len(candidates)} papers"
    224           f"{f' (parallel={parallel})' if parallel > 1 else ''}:\n")
    225 
    226     results = {"ok": 0, "fail": 0}
    227     failures = []
    228 
    229     if parallel > 1:
    230         with ThreadPoolExecutor(max_workers=parallel) as executor:
    231             futures = {executor.submit(classify_one, pid): pid for pid in candidates}
    232             for future in as_completed(futures):
    233                 pid, ok, reason = future.result()
    234                 if ok:
    235                     results["ok"] += 1
    236                     if reason != "already v3":
    237                         print(f"  OK: {pid} — {reason}")
    238                 else:
    239                     results["fail"] += 1
    240                     failures.append((pid, reason))
    241                     print(f"  FAIL: {pid} — {reason}")
    242     else:
    243         for i, pid in enumerate(candidates):
    244             print(f"[{i+1}/{len(candidates)}] {pid}")
    245             _, ok, reason = classify_one(pid)
    246             if ok:
    247                 results["ok"] += 1
    248                 print(f"  {reason}")
    249             else:
    250                 results["fail"] += 1
    251                 failures.append((pid, reason))
    252                 print(f"  FAIL: {reason}")
    253 
    254     print(f"\nDone. OK: {results['ok']}, Failed: {results['fail']}")
    255     if failures:
    256         print("Failures:")
    257         for pid, reason in failures[:20]:
    258             print(f"  {pid}: {reason}")
    259 
    260 
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs