build-explorer-data.py - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

build-explorer-data.py (59664B)
      1 #!/usr/bin/env python3
      2 """
      3 Build data files for the static data explorer.
      4 
      5 Reads scan.json files (all rubric versions), metadata.json, hn.json, citation-graph.json, and registry.jsonl.
      6 Outputs view-specific JSON files for fast loading, plus a full explorer.json for power users.
      7 
      8 Output files:
      9   explorer/public/data/dashboard.json    — aggregation stats only
     10   explorer/public/data/findings.json     — deep analysis findings
     11   explorer/public/data/papers-index.json — table data without checklists
     12   explorer/public/data/papers/{slug}.json — full detail per paper
     13   explorer/public/data/network.json      — citation network
     14   explorer/public/data/tensions.json     — claim tensions
     15   explorer/public/data/explorer.json     — full monolith for queries
     16 
     17 Usage:
     18     python3 scripts/build-explorer-data.py
     19 """
     20 
     21 import json
     22 import re
     23 from collections import Counter, defaultdict
     24 from pathlib import Path
     25 
# Repository root: two directory levels above this script's resolved location.
ROOT = Path(__file__).resolve().parent.parent
# Input locations (registry, per-paper directories, analysis artifacts) and
# the directory the generated JSON files are written to.
REGISTRY_PATH = ROOT / "registry.jsonl"
PAPERS_DIR = ROOT / "papers"
ANALYSIS_DIR = ROOT / "analysis"
OUTPUT_DIR = ROOT / "explorer" / "public" / "data"

# Rubric categories looked up in every checklist. The order matters: the
# per-paper "dna" array is built by iterating this list.
BASE_CATEGORIES = [
    "artifacts", "statistical_methodology", "evaluation_design",
    "claims_and_evidence", "setup_transparency", "limitations_and_scope",
    "data_integrity", "conflicts_of_interest", "contamination",
    "human_studies", "cost_and_practicality",
]
# Categories that may be absent from a checklist (described in build() as
# modules that older scans lack or that do not apply to every paper).
CONDITIONAL_CATEGORIES = [
    "experimental_rigor", "data_leakage", "survey_methodology",
]
# Full category list, base categories first.
ALL_CATEGORIES = BASE_CATEGORIES + CONDITIONAL_CATEGORIES

# Matches an http(s) URL on one of the listed code/data hosting domains,
# running until whitespace or one of the delimiter characters , ) " ' < >.
CODE_URL_RE = re.compile(
    r'https?://(?:github\.com|gitlab\.com|zenodo\.org|bitbucket\.org|huggingface\.co)[^\s,)\"\'<>]+'
)
     46 
     47 
     48 def classify_archetype(cat_scores):
     49     ed = cat_scores.get("evaluation_design", 0)
     50     sm = cat_scores.get("statistical_methodology", 0)
     51     ar = cat_scores.get("artifacts", 0)
     52     if ed >= 0.8 and sm >= 0.5 and ar >= 0.5:
     53         return "Complete"
     54     if ed >= 0.8 and ar >= 0.4 and sm < 0.3:
     55         return "Builder"
     56     if ed >= 0.8 and sm < 0.3 and ar < 0.3:
     57         return "Theater"
     58     if ed < 0.6 and sm < 0.3:
     59         return "Minimal"
     60     return "Mixed"
     61 
     62 
     63 def compute_category_score(category_data):
     64     applicable = 0
     65     passed = 0
     66     for q_name, q_data in category_data.items():
     67         if not isinstance(q_data, dict) or "applies" not in q_data:
     68             continue
     69         if q_data["applies"]:
     70             applicable += 1
     71             if q_data.get("answer", False):
     72                 passed += 1
     73     if applicable == 0:
     74         return None
     75     return passed / applicable
     76 
     77 
     78 def compute_overall_score(checklist, category_weights=None):
     79     """Overall rubric score for a paper.
     80 
     81     Default (category_weights=None): flat per-question average. A category
     82     with more applicable questions naturally contributes more. Equivalent to
     83     the pre-calibration behavior.
     84 
     85     With category_weights: weighted mean of per-category pass rates. Each
     86     category contributes w_c * (passed_c / applicable_c). Categories with
     87     zero applicable questions drop out cleanly. Learned weights come from
     88     scripts/calibration/fit-weights.py.
     89     """
     90     if category_weights is None:
     91         applicable = 0
     92         passed = 0
     93         for cat_name, cat_data in checklist.items():
     94             if not isinstance(cat_data, dict):
     95                 continue
     96             for q_name, q_data in cat_data.items():
     97                 if not isinstance(q_data, dict) or "applies" not in q_data:
     98                     continue
     99                 if q_data["applies"]:
    100                     applicable += 1
    101                     if q_data.get("answer", False):
    102                         passed += 1
    103         if applicable == 0:
    104             return None
    105         return passed / applicable
    106 
    107     # Weighted-per-category mean
    108     num = 0.0
    109     den = 0.0
    110     any_applicable = False
    111     for cat_name, cat_data in checklist.items():
    112         if not isinstance(cat_data, dict):
    113             continue
    114         cat_app = 0
    115         cat_pas = 0
    116         for q_name, q_data in cat_data.items():
    117             if not isinstance(q_data, dict) or "applies" not in q_data:
    118                 continue
    119             if q_data["applies"]:
    120                 cat_app += 1
    121                 if q_data.get("answer", False):
    122                     cat_pas += 1
    123         if cat_app == 0:
    124             continue
    125         any_applicable = True
    126         w = category_weights.get(cat_name, 1.0)
    127         num += w * (cat_pas / cat_app)
    128         den += w
    129     if not any_applicable or den == 0:
    130         return None
    131     return num / den
    132 
    133 
    134 def flatten_checklist(checklist):
    135     flat = []
    136     for cat_name in ALL_CATEGORIES:
    137         cat_data = checklist.get(cat_name, {})
    138         if not isinstance(cat_data, dict):
    139             continue
    140         for q_name, q_data in cat_data.items():
    141             if not isinstance(q_data, dict) or "applies" not in q_data:
    142                 continue
    143             flat.append({
    144                 "category": cat_name,
    145                 "question": q_name,
    146                 "applies": q_data["applies"],
    147                 "answer": q_data.get("answer", False),
    148                 "justification": q_data.get("justification", ""),
    149             })
    150     return flat
    151 
    152 
    153 def detect_games(checklist, score, cat_scores):
    154     games = []
    155     ci = checklist.get("statistical_methodology", {}).get("confidence_intervals_or_error_bars", {})
    156     var = checklist.get("statistical_methodology", {}).get("variance_reported", {})
    157     if ci.get("applies") and not ci.get("answer") and var.get("applies") and not var.get("answer"):
    158         games.append("Big Numbers No Error Bars")
    159     ac = checklist.get("claims_and_evidence", {}).get("abstract_claims_supported", {})
    160     gb = checklist.get("claims_and_evidence", {}).get("generalization_bounded", {})
    161     if (ac.get("applies") and not ac.get("answer")) or (gb.get("applies") and not gb.get("answer")):
    162         games.append("Overclaiming")
    163     cr = checklist.get("artifacts", {}).get("code_released", {})
    164     env = checklist.get("artifacts", {}).get("environment_specified", {})
    165     ri = checklist.get("artifacts", {}).get("reproduction_instructions", {})
    166     if cr.get("applies") and cr.get("answer"):
    167         if (env.get("applies") and not env.get("answer")) or (ri.get("applies") and not ri.get("answer")):
    168             games.append("Open Source Theater")
    169     bc = checklist.get("contamination", {}).get("benchmark_contamination_addressed", {})
    170     if bc.get("applies") and not bc.get("answer"):
    171         games.append("Contamination Dodge")
    172     # Cherry-picked Comparisons
    173     bc2 = checklist.get("evaluation_design", {}).get("baselines_contemporary", {})
    174     if bc2.get("applies") and not bc2.get("answer"):
    175         games.append("Cherry-picked Comparisons")
    176     # All Show No Substance
    177     ed = cat_scores.get("evaluation_design", 0)
    178     sm = cat_scores.get("statistical_methodology", 0)
    179     ar = cat_scores.get("artifacts", 0)
    180     if ed >= 0.8 and sm < 0.2 and ar < 0.2:
    181         games.append("All Show No Substance")
    182     # Trust Us: no raw data AND no code — completely unverifiable
    183     rd = checklist.get("data_integrity", {}).get("raw_data_available", {})
    184     if rd.get("applies") and not rd.get("answer") and cr.get("applies") and not cr.get("answer"):
    185         games.append("Trust Us")
    186     # The Black Box: no prompts AND no hyperparameters — can't replicate
    187     pr = checklist.get("setup_transparency", {}).get("prompts_provided", {})
    188     hp = checklist.get("setup_transparency", {}).get("hyperparameters_reported", {})
    189     if pr.get("applies") and not pr.get("answer") and hp.get("applies") and not hp.get("answer"):
    190         games.append("The Black Box")
    191     # Moving Goalpost: causal claims without causal design
    192     cc = checklist.get("claims_and_evidence", {}).get("causal_claims_justified", {})
    193     if cc.get("applies") and not cc.get("answer"):
    194         games.append("Moving Goalpost")
    195     # Limitation Theater: has section but says nothing specific
    196     ls = checklist.get("limitations_and_scope", {}).get("limitations_section_present", {})
    197     tv = checklist.get("limitations_and_scope", {}).get("threats_to_validity_specific", {})
    198     sb = checklist.get("limitations_and_scope", {}).get("scope_boundaries_stated", {})
    199     if ls.get("applies") and ls.get("answer") and tv.get("applies") and not tv.get("answer") and sb.get("applies") and not sb.get("answer"):
    200         games.append("Limitation Theater")
    201     return games
    202 
    203 
    204 def extract_code_url(checklist):
    205     cr = checklist.get("artifacts", {}).get("code_released", {})
    206     if cr.get("applies") and cr.get("answer"):
    207         urls = CODE_URL_RE.findall(cr.get("justification", ""))
    208         if urls:
    209             return urls[0].rstrip(".,;:")
    210     return None
    211 
    212 
    213 def load_registry():
    214     entries = {}
    215     with open(REGISTRY_PATH) as f:
    216         for line in f:
    217             line = line.strip()
    218             if line:
    219                 entry = json.loads(line)
    220                 entries[entry["id"]] = entry
    221     return entries
    222 
    223 
    224 def load_citation_graph():
    225     path = ANALYSIS_DIR / "citation-graph.json"
    226     if not path.exists():
    227         return {"nodes": [], "edges": []}
    228     with open(path) as f:
    229         return json.load(f)
    230 
    231 
    232 def load_metadata(paper_id):
    233     path = PAPERS_DIR / paper_id / "metadata.json"
    234     if not path.exists():
    235         return {}
    236     with open(path) as f:
    237         return json.load(f)
    238 
    239 
    240 def load_hn(paper_id):
    241     path = PAPERS_DIR / paper_id / "hn.json"
    242     if not path.exists():
    243         return {}
    244     with open(path) as f:
    245         return json.load(f)
    246 
    247 
    248 def write_json(path, data):
    249     path.parent.mkdir(parents=True, exist_ok=True)
    250     with open(path, "w") as f:
    251         json.dump(data, f, ensure_ascii=False, separators=(",", ":"))
    252 
    253 
    254 def safe_mean(scores):
    255     return round(sum(scores) / len(scores), 1) if scores else 0
    256 
    257 
    258 def safe_median(scores):
    259     if not scores:
    260         return 0
    261     s = sorted(scores)
    262     return round(s[len(s) // 2], 1)
    263 
    264 
    265 def load_category_weights():
    266     """Load learned weights from scripts/calibration/weights.json if present.
    267     Falls back to None (uniform flat-question averaging) when absent."""
    268     path = Path(__file__).resolve().parent / "calibration" / "weights.json"
    269     if not path.exists():
    270         return None
    271     with open(path) as f:
    272         data = json.load(f)
    273     return data.get("weights")
    274 
    275 
    276 def build():
    277     registry = load_registry()
    278     citation_data = load_citation_graph()
    279     category_weights = load_category_weights()
    280     if category_weights:
    281         print(f"Using learned category weights ({len(category_weights)} categories)")
    282     else:
    283         print("No calibration/weights.json; using uniform per-question weights")
    284 
    285     # Accumulators
    286     papers_full = []
    287     papers_index = []
    288     paper_details = {}
    289     # Partitioned papers: scored the same way but kept out of aggregates and
    290     # papers-index so they don't skew the agentic-AI corpus numbers. Individual
    291     # detail JSONs still written so /sigint/papers/{slug} works.
    292     #   reference-benchmark: rubric calibration specimens (Wakefield, Ioannidis,
    293     #                        Attention). Listed in calibration.json.
    294     #   benchmark-eval:      papers that introduce benchmarks used BY the field.
    295     #                        They're reference material, not subjects being
    296     #                        evaluated. Listed in benchmarks.json.
    297     calibration_papers = []
    298     benchmark_papers = []
    299     all_scores = []
    300     cat_pass_counts = defaultdict(lambda: {"passed": 0, "applicable": 0})
    301     year_scores = defaultdict(list)
    302     tag_counts = Counter()
    303     archetype_counts = Counter()
    304     game_counts = Counter()
    305     total_papers = 0
    306 
    307     # Findings accumulators
    308     question_pass_counts = defaultdict(lambda: {"passed": 0, "applicable": 0})
    309     year_cat_scores = defaultdict(lambda: defaultdict(lambda: {"passed": 0, "applicable": 0}))
    310     venue_scores = defaultdict(list)
    311     citation_band_scores = defaultdict(list)
    312     benchmark_only_by_year = defaultdict(lambda: {"benchmark_only": 0, "total": 0})
    313     funding_groups = {"disclosed": [], "not_disclosed": []}
    314     score_map = {}  # paper_id -> score_pct (built incrementally for homophily)
    315 
    316     tensions = {
    317         "productivity": {"positive": [], "nuanced": []},
    318         "benchmarks": {"positive": [], "nuanced": []},
    319         "agents": {"positive": [], "nuanced": []},
    320         "security": {"positive": [], "nuanced": []},
    321         "code_quality": {"positive": [], "nuanced": []},
    322         "scaling": {"positive": [], "nuanced": []},
    323     }
    324 
    325     for scan_path in sorted(PAPERS_DIR.glob("*/scan.json")):
    326         paper_id = scan_path.parent.name
    327         with open(scan_path) as f:
    328             scan = json.load(f)
    329 
    330         # Include all scans regardless of version. The v1 rubric (50 questions)
    331         # is a proper subset of v2+ (57 questions, adding data_leakage,
    332         # experimental_rigor, and survey_methodology modules). compute_overall_score
    333         # uses passed/applicable over present questions, so v1 papers degrade
    334         # gracefully: their 50 applicable questions are scored normally and the
    335         # 7 v2+-only questions are treated as absent (same as any paper where
    336         # a conditional module doesn't apply).
    337 
    338         checklist = scan.get("checklist", {})
    339         paper_meta = scan.get("paper", {})
    340         reg_entry = registry.get(paper_id, {})
    341         metadata = load_metadata(paper_id)
    342         hn_data = load_hn(paper_id)
    343         reg_tags = reg_entry.get("tags") or []
    344         is_reference = "reference-benchmark" in reg_tags
    345         is_benchmark_paper = "benchmark-eval" in reg_tags
    346         is_calibration = is_reference or is_benchmark_paper
    347 
    348         overall = compute_overall_score(checklist, category_weights)
    349         if overall is None:
    350             continue
    351 
    352         # Classify paper type: empirical if both stats and eval have applicable questions
    353         def _has_applicable(cat_name):
    354             cd = checklist.get(cat_name, {})
    355             if not isinstance(cd, dict):
    356                 return False
    357             return any(isinstance(qd, dict) and qd.get("applies", False)
    358                        for qd in cd.values())
    359 
    360         is_empirical = _has_applicable("statistical_methodology") and _has_applicable("evaluation_design")
    361         paper_type = "empirical" if is_empirical else "non-empirical"
    362 
    363         cat_scores = {}
    364         for cat in ALL_CATEGORIES:
    365             cat_data = checklist.get(cat, {})
    366             if cat_data and isinstance(cat_data, dict):
    367                 cs = compute_category_score(cat_data)
    368                 if cs is not None:
    369                     cat_scores[cat] = cs
    370 
    371         score_pct = round(overall * 100, 1)
    372         score_map[paper_id] = score_pct
    373 
    374         year = paper_meta.get("year") or reg_entry.get("year")
    375         venue = metadata.get("venue") or paper_meta.get("venue") or reg_entry.get("venue", "")
    376         tags = scan.get("methodology_tags", []) or reg_entry.get("tags", [])
    377         archetype = classify_archetype(cat_scores) if is_empirical else None
    378         games = detect_games(checklist, overall, cat_scores) if is_empirical else []
    379 
    380         # External links
    381         arxiv_id = paper_meta.get("arxiv_id") or reg_entry.get("arxiv_id", "")
    382         doi = paper_meta.get("doi") or reg_entry.get("doi", "")
    383         source_url = reg_entry.get("source_url", "")
    384 
    385         # Code URL extraction
    386         code_url = extract_code_url(checklist)
    387 
    388         # Only empirical, non-calibration papers feed into findings aggregations
    389         if is_empirical and not is_calibration:
    390             total_papers += 1
    391             all_scores.append(score_pct)
    392             year_scores[year].append(score_pct)
    393             for t in tags:
    394                 tag_counts[t] += 1
    395             archetype_counts[archetype] += 1
    396             for g in games:
    397                 game_counts[g] += 1
    398 
    399         claims = scan.get("claims", [])
    400         red_flags = scan.get("red_flags", [])
    401 
    402         # All remaining aggregations are empirical-only and skip calibration
    403         if not is_empirical or is_calibration:
    404             pass  # skip to index/detail construction below
    405         else:
    406             # Category + question aggregations
    407             for cat in ALL_CATEGORIES:
    408                 cat_data = checklist.get(cat, {})
    409                 if not isinstance(cat_data, dict):
    410                     continue
    411                 for q_name, q_data in cat_data.items():
    412                     if not isinstance(q_data, dict) or "applies" not in q_data:
    413                         continue
    414                     if q_data["applies"]:
    415                         cat_pass_counts[cat]["applicable"] += 1
    416                         question_pass_counts[f"{cat}.{q_name}"]["applicable"] += 1
    417                         if q_data.get("answer", False):
    418                             cat_pass_counts[cat]["passed"] += 1
    419                             question_pass_counts[f"{cat}.{q_name}"]["passed"] += 1
    420                         # Year × category
    421                         if year:
    422                             year_cat_scores[year][cat]["applicable"] += 1
    423                             if q_data.get("answer", False):
    424                                 year_cat_scores[year][cat]["passed"] += 1
    425 
    426             # Venue scoring
    427             venue_clean = venue.strip()
    428             if venue_clean and venue_clean.lower() not in ("arxiv", "arxiv.org", ""):
    429                 venue_scores[venue_clean].append(score_pct)
    430 
    431             # Citation band scoring
    432             cit = metadata.get("citation_count")
    433             if cit is not None:
    434                 if cit == 0:
    435                     band = "0"
    436                 elif cit <= 50:
    437                     band = "1-50"
    438                 elif cit <= 500:
    439                     band = "51-500"
    440                 else:
    441                     band = "500+"
    442                 citation_band_scores[band].append(score_pct)
    443 
    444             # Benchmark monoculture
    445             if year:
    446                 benchmark_only_by_year[year]["total"] += 1
    447                 if tags == ["benchmark-eval"]:
    448                     benchmark_only_by_year[year]["benchmark_only"] += 1
    449 
    450             # Funding gap
    451             fd = checklist.get("conflicts_of_interest", {}).get("funding_disclosed", {})
    452             if fd.get("applies"):
    453                 if fd.get("answer"):
    454                     funding_groups["disclosed"].append(score_pct)
    455                 else:
    456                     funding_groups["not_disclosed"].append(score_pct)
    457 
    458         # Tension classification (empirical papers only, calibration excluded)
    459         if is_empirical and not is_calibration:
    460           for claim in claims:
    461             ct = claim.get("claim", "").lower()
    462             entry = {"paper_id": paper_id, "claim": claim["claim"],
    463                      "supported": claim.get("supported", ""), "score": score_pct, "year": year}
    464 
    465             # Productivity
    466             if any(k in ct for k in ["productivity", "developer speed", "completion time", "speedup",
    467                                       "faster", "developer productivity", "coding efficiency",
    468                                       "development time", "time savings", "code faster"]):
    469                 bucket = "positive" if any(k in ct for k in ["faster", "speedup", "improves", "increases",
    470                                                               "gain", "savings", "efficient"]) else "nuanced"
    471                 tensions["productivity"][bucket].append(entry)
    472 
    473             # Benchmarks (expanded)
    474             if any(k in ct for k in ["benchmark", "evaluation", "leaderboard", "swe-bench",
    475                                       "pass@", "accuracy", "f1 score", "performance on",
    476                                       "state-of-the-art", "sota"]):
    477                 bucket = "positive" if any(k in ct for k in ["state-of-the-art", "outperforms", "achieves",
    478                                                               "best", "surpasses", "exceeds"]) else "nuanced"
    479                 tensions["benchmarks"][bucket].append(entry)
    480 
    481             # Agents (expanded)
    482             if any(k in ct for k in ["agent", "autonomous", "multi-agent", "agentic",
    483                                       "tool use", "planning", "chain-of-thought",
    484                                       "reasoning capability"]):
    485                 bucket = "positive" if any(k in ct for k in ["solves", "achieves", "succeeds", "capable",
    486                                                               "outperforms", "enables", "improves"]) else "nuanced"
    487                 tensions["agents"][bucket].append(entry)
    488 
    489             # Security arms race (NEW)
    490             if any(k in ct for k in ["attack", "defense", "jailbreak", "injection", "adversarial",
    491                                       "vulnerability", "safety", "alignment", "harmful", "toxic",
    492                                       "secure", "exploit", "bypass", "mitigat"]):
    493                 bucket = "positive" if any(k in ct for k in ["defense", "protect", "mitigat", "detect",
    494                                                               "prevent", "secure", "effective", "reduces",
    495                                                               "robust"]) else "nuanced"
    496                 tensions["security"][bucket].append(entry)
    497 
    498             # Code quality (NEW)
    499             if any(k in ct for k in ["code quality", "bug", "vulnerability", "error", "defect",
    500                                       "repair", "fix", "correct", "hallucin", "incorrect code",
    501                                       "insecure code", "code generation"]):
    502                 bucket = "positive" if any(k in ct for k in ["repair", "fix", "correct", "improve",
    503                                                               "reduc", "effective", "resolve"]) else "nuanced"
    504                 tensions["code_quality"][bucket].append(entry)
    505 
    506             # Scaling debate (NEW)
    507             if any(k in ct for k in ["scaling", "scale", "cost", "efficient", "latency",
    508                                       "token", "compute", "inference", "smaller model",
    509                                       "distill", "compress"]):
    510                 bucket = "positive" if any(k in ct for k in ["efficient", "reduc", "cheaper", "faster",
    511                                                               "smaller", "compet", "saving"]) else "nuanced"
    512                 tensions["scaling"][bucket].append(entry)
    513 
    514         cat_scores_pct = {k: round(v * 100, 1) for k, v in cat_scores.items()}
    515 
    516         # DNA strip: compact array of base category scores (0-100, null if N/A)
    517         dna = [cat_scores_pct.get(cat) for cat in BASE_CATEGORIES]
    518 
    519         # Slim index entry
    520         index_entry = {
    521             "id": paper_id,
    522             "title": paper_meta.get("title", reg_entry.get("title", paper_id)),
    523             "year": year,
    524             "venue": venue,
    525             "tags": tags,
    526             "score": score_pct,
    527             "archetype": archetype,
    528             "games": games,
    529             "arxiv_id": arxiv_id,
    530             "doi": doi,
    531             "code_url": code_url,
    532             "dna": dna,
    533             "paper_type": paper_type,
    534             "hn_points": hn_data.get("top_points", 0),
    535             "engagement": [scan.get("engagement_factors", {}).get(d, {}).get("score") for d in
    536                            ["practical_relevance", "surprise_contrarian", "fear_safety",
    537                             "drama_conflict", "demo_ability", "brand_recognition"]
    538                            ] if scan.get("engagement_factors") else None,
    539         }
    540         if not is_calibration:
    541             papers_index.append(index_entry)
    542 
    543         # Full detail
    544         detail = {
    545             **index_entry,
    546             "category_scores": cat_scores_pct,
    547             "claims": [{"claim": c["claim"], "supported": c.get("supported", "")} for c in claims],
    548             "red_flags": [{"flag": r["flag"], "detail": r["detail"]} for r in red_flags],
    549             "checklist": flatten_checklist(checklist),
    550             "key_findings": scan.get("key_findings", ""),
    551             "active_modules": scan.get("active_modules", []),
    552             "source_url": source_url,
    553             "hn_threads": hn_data.get("threads", []),
    554             "engagement_factors": scan.get("engagement_factors"),
    555         }
    556         # Individual JSON gets written for everyone, so /sigint/papers/{slug}
    557         # continues to work. Only non-calibration contributes to the aggregates.
    558         paper_details[paper_id] = detail
    559         if is_calibration:
    560             # Carry the registry notes through so the consumer can explain
    561             # why each specimen is in the corpus without re-deriving it.
    562             cal_detail = dict(detail)
    563             cal_detail["calibration_notes"] = reg_entry.get("notes", "")
    564             if is_reference:
    565                 calibration_papers.append(cal_detail)
    566             else:  # benchmark-eval only (not also reference-benchmark)
    567                 benchmark_papers.append(cal_detail)
    568         else:
    569             papers_full.append(detail)
    570 
    571     # --- Dashboard aggregations ---
    572     all_scores.sort()
    573     n = len(all_scores)
    574     median = all_scores[n // 2] if n else 0
    575     mean = sum(all_scores) / n if n else 0
    576 
    577     hist_bins = []
    578     for lo in range(0, 100, 5):
    579         hi = lo + 5
    580         count = sum(1 for s in all_scores if lo <= s < hi)
    581         hist_bins.append({"lo": lo, "hi": hi, "count": count})
    582 
    583     cat_rates = {}
    584     for cat in ALL_CATEGORIES:
    585         d = cat_pass_counts[cat]
    586         if d["applicable"] > 0:
    587             cat_rates[cat] = round(d["passed"] / d["applicable"] * 100, 1)
    588 
    589     year_trends = {}
    590     for y in sorted(year_scores.keys()):
    591         scores = year_scores[y]
    592         year_trends[str(y)] = {
    593             "n": len(scores),
    594             "mean": round(sum(scores) / len(scores), 1),
    595             "median": round(sorted(scores)[len(scores) // 2], 1),
    596         }
    597 
    598     game_pcts = {g: round(c / total_papers * 100, 1) for g, c in game_counts.items()}
    599     repro_count = sum(1 for p in papers_full if p["category_scores"].get("artifacts", 0) == 100)
    600 
    601     # --- Registry pipeline stats ---
    602     reg_total = len(registry)
    603     v5_opus = 0
    604     v5_haiku = 0
    605     deprecated_scan = 0
    606     not_scanned = 0
    607     for e in registry.values():
    608         pid = e["id"]
    609         v5_path = PAPERS_DIR / pid / "scan-v5.json"
    610         old_path = PAPERS_DIR / pid / "scan.json"
    611         if v5_path.exists():
    612             with open(v5_path) as f:
    613                 v5 = json.load(f)
    614             # Check if any answers have source="opus"
    615             has_opus = False
    616             for cat_data in v5.get("checklist", {}).values():
    617                 if isinstance(cat_data, dict):
    618                     for qd in cat_data.values():
    619                         if isinstance(qd, dict) and qd.get("source") == "opus":
    620                             has_opus = True
    621                             break
    622                 if has_opus:
    623                     break
    624             if has_opus:
    625                 v5_opus += 1
    626             else:
    627                 v5_haiku += 1
    628         elif old_path.exists():
    629             deprecated_scan += 1
    630         else:
    631             not_scanned += 1
    632 
    633     pipeline = {
    634         "registry_total": reg_total,
    635         "v5_opus": v5_opus,
    636         "v5_haiku": v5_haiku,
    637         "deprecated_scan": deprecated_scan,
    638         "not_scanned": not_scanned,
    639     }
    640 
    641     dashboard = {
    642         "n": total_papers,
    643         "median": round(median, 1),
    644         "mean": round(mean, 1),
    645         "full_reproducibility_pct": round(repro_count / total_papers * 100, 1) if total_papers else 0,
    646         "histogram": hist_bins,
    647         "category_rates": cat_rates,
    648         "year_trends": year_trends,
    649         "game_pcts": game_pcts,
    650         "archetype_counts": dict(archetype_counts),
    651         "tag_counts": dict(tag_counts),
    652         "pipeline": pipeline,
    653     }
    654 
    655     # --- Findings aggregations ---
    656 
    657     # 1. Per-question pass rates (with human-readable descriptions)
    658     Q_DESCRIPTIONS = {
    659         "artifacts.code_released": "Source code publicly released",
    660         "artifacts.data_released": "Dataset publicly available",
    661         "artifacts.environment_specified": "Environment/dependency specs provided",
    662         "artifacts.reproduction_instructions": "Step-by-step reproduction instructions included",
    663         "statistical_methodology.confidence_intervals_or_error_bars": "Confidence intervals or error bars on main results",
    664         "statistical_methodology.significance_tests": "Statistical significance tests for comparative claims",
    665         "statistical_methodology.effect_sizes_reported": "Effect sizes reported, not just p-values",
    666         "statistical_methodology.sample_size_justified": "Sample size justified or power analysis discussed",
    667         "statistical_methodology.variance_reported": "Variance or std dev reported across runs",
    668         "evaluation_design.baselines_included": "Baseline comparisons included",
    669         "evaluation_design.baselines_contemporary": "Baselines are contemporary and competitive",
    670         "evaluation_design.ablation_study": "Ablation study showing which components matter",
    671         "evaluation_design.multiple_metrics": "Multiple evaluation metrics used",
    672         "evaluation_design.human_evaluation": "Human evaluation included, not just automated",
    673         "evaluation_design.held_out_test_set": "Results on held-out test set, not dev/val",
    674         "evaluation_design.per_category_breakdown": "Per-category or per-task breakdowns provided",
    675         "evaluation_design.failure_cases_discussed": "Failure cases shown or discussed",
    676         "evaluation_design.negative_results_reported": "Negative results reported",
    677         "claims_and_evidence.abstract_claims_supported": "All abstract claims supported by results",
    678         "claims_and_evidence.causal_claims_justified": "Causal claims backed by adequate study design",
    679         "claims_and_evidence.generalization_bounded": "Generalizations bounded to tested setting",
    680         "claims_and_evidence.alternative_explanations_discussed": "Alternative explanations discussed",
    681         "claims_and_evidence.proxy_outcome_distinction": "Proxy vs outcome distinction acknowledged",
    682         "setup_transparency.model_versions_specified": "Exact model versions specified",
    683         "setup_transparency.prompts_provided": "Actual prompts/system instructions provided",
    684         "setup_transparency.hyperparameters_reported": "Hyperparameters reported (temperature, etc.)",
    685         "setup_transparency.scaffolding_described": "Agentic scaffolding described in detail",
    686         "setup_transparency.data_preprocessing_documented": "Data preprocessing steps documented",
    687         "limitations_and_scope.limitations_section_present": "Dedicated limitations section present",
    688         "limitations_and_scope.threats_to_validity_specific": "Specific threats to validity discussed",
    689         "limitations_and_scope.scope_boundaries_stated": "Explicit scope boundaries stated",
    690         "data_integrity.raw_data_available": "Raw data available for verification",
    691         "data_integrity.data_collection_described": "Data collection procedure described",
    692         "data_integrity.recruitment_methods_described": "Participant/sample recruitment described",
    693         "data_integrity.data_pipeline_documented": "Full data pipeline documented",
    694         "conflicts_of_interest.funding_disclosed": "Funding source disclosed",
    695         "conflicts_of_interest.affiliations_disclosed": "Author affiliations with evaluated product disclosed",
    696         "conflicts_of_interest.funder_independent_of_outcome": "Funder independent of outcome",
    697         "conflicts_of_interest.financial_interests_declared": "Financial interests declared",
    698         "contamination.training_cutoff_stated": "Model training data cutoff stated",
    699         "contamination.train_test_overlap_discussed": "Train/test overlap discussed",
    700         "contamination.benchmark_contamination_addressed": "Benchmark contamination addressed",
    701         "human_studies.pre_registered": "Study pre-registered",
    702         "human_studies.irb_or_ethics_approval": "IRB or ethics approval mentioned",
    703         "human_studies.demographics_reported": "Participant demographics reported",
    704         "human_studies.inclusion_exclusion_criteria": "Inclusion/exclusion criteria stated",
    705         "human_studies.randomization_described": "Randomization procedure described",
    706         "human_studies.blinding_described": "Blinding described",
    707         "human_studies.attrition_reported": "Participant attrition reported",
    708         "cost_and_practicality.inference_cost_reported": "Inference cost or latency reported",
    709         "cost_and_practicality.compute_budget_stated": "Total computational budget stated",
    710         "experimental_rigor.seed_sensitivity_reported": "Results across multiple random seeds",
    711         "experimental_rigor.number_of_runs_stated": "Number of experimental runs stated",
    712         "experimental_rigor.hyperparameter_search_budget": "Hyperparameter search budget reported",
    713         "experimental_rigor.best_config_selection_justified": "Best config selection justified",
    714         "experimental_rigor.multiple_comparison_correction": "Multiple comparison correction applied",
    715         "experimental_rigor.self_comparison_bias_addressed": "Self-evaluation bias acknowledged",
    716         "experimental_rigor.compute_budget_vs_performance": "Performance reported vs compute budget",
    717         "experimental_rigor.benchmark_construct_validity": "Benchmark construct validity discussed",
    718         "experimental_rigor.scaffold_confound_addressed": "Scaffolding confound addressed",
    719         "data_leakage.temporal_leakage_addressed": "Temporal leakage addressed",
    720         "data_leakage.feature_leakage_addressed": "Feature leakage addressed",
    721         "data_leakage.non_independence_addressed": "Train/test non-independence addressed",
    722         "data_leakage.leakage_detection_method": "Concrete leakage detection method used",
    723         "survey_methodology.prisma_or_structured_protocol": "PRISMA or structured review protocol",
    724         "survey_methodology.quality_assessment_of_sources": "Quality assessment of source papers",
    725         "survey_methodology.publication_bias_discussed": "Publication bias discussed",
    726     }
    727 
    728     q_rates = {}
    729     for key, d in question_pass_counts.items():
    730         if d["applicable"] > 0:
    731             q_rates[key] = {
    732                 "rate": round(d["passed"] / d["applicable"] * 100, 1),
    733                 "n": d["applicable"],
    734                 "desc": Q_DESCRIPTIONS.get(key, ""),
    735             }
    736 
    737     # 2. Year trends by category
    738     year_cat_trends = {}
    739     for y in sorted(year_cat_scores.keys()):
    740         year_cat_trends[str(y)] = {}
    741         for cat in ALL_CATEGORIES:
    742             d = year_cat_scores[y][cat]
    743             if d["applicable"] > 0:
    744                 year_cat_trends[str(y)][cat] = round(d["passed"] / d["applicable"] * 100, 1)
    745 
    746     # 3. Venue & citation scoring
    747     venue_stats = {}
    748     for v, scores in venue_scores.items():
    749         if len(scores) >= 3:
    750             venue_stats[v] = {"n": len(scores), "mean": safe_mean(scores), "median": safe_median(scores)}
    751 
    752     cit_band_stats = {}
    753     for band in ["0", "1-50", "51-500", "500+"]:
    754         scores = citation_band_scores.get(band, [])
    755         if scores:
    756             cit_band_stats[band] = {"n": len(scores), "mean": safe_mean(scores), "median": safe_median(scores)}
    757 
    758     # 4. Optimism-rigor inversion
    759     optimism_rigor = {}
    760     for key, sides in tensions.items():
    761         pos = [c["score"] for c in sides["positive"]]
    762         nua = [c["score"] for c in sides["nuanced"]]
    763         optimism_rigor[key] = {
    764             "positive_n": len(pos), "positive_mean": safe_mean(pos),
    765             "nuanced_n": len(nua), "nuanced_mean": safe_mean(nua),
    766             "gap": round(safe_mean(nua) - safe_mean(pos), 1),
    767         }
    768 
    769     # 5. Quality homophily
    770     threshold = 60
    771     high_quality_ids = {pid for pid, sc in score_map.items() if sc >= threshold}
    772     baseline_pct = round(len(high_quality_ids) / total_papers * 100, 1) if total_papers else 0
    773 
    774     cited_high = 0
    775     cited_total = 0
    776     for edge in citation_data.get("edges", []):
    777         s, t = edge["source"], edge["target"]
    778         if s in high_quality_ids and t in score_map:
    779             cited_total += 1
    780             if score_map[t] >= threshold:
    781                 cited_high += 1
    782 
    783     homophily = {
    784         "threshold": threshold,
    785         "baseline_pct": baseline_pct,
    786         "high_cite_high_pct": round(cited_high / cited_total * 100, 1) if cited_total else 0,
    787         "high_cite_total": cited_total,
    788     }
    789 
    790     # 6. Sampling effect (historical checkpoints + current)
    791     sampling_effect = {
    792         "checkpoints": [
    793             {"n": 135, "median": 53.3},
    794             {"n": 271, "median": 50.6},
    795             {"n": 467, "median": 50.0},
    796             {"n": 745, "median": 48.1},
    797             {"n": 932, "median": 47.1},
    798             {"n": total_papers, "median": round(median, 1)},
    799         ]
    800     }
    801 
    802     # 7. Benchmark monoculture
    803     bench_mono = {}
    804     for y in sorted(benchmark_only_by_year.keys()):
    805         d = benchmark_only_by_year[y]
    806         if d["total"] > 0:
    807             bench_mono[str(y)] = {
    808                 "benchmark_only": d["benchmark_only"],
    809                 "total": d["total"],
    810                 "pct": round(d["benchmark_only"] / d["total"] * 100, 1),
    811             }
    812 
    813     # 8. Funding gap
    814     funding_gap = {}
    815     for group, scores in funding_groups.items():
    816         if scores:
    817             funding_gap[group] = {"n": len(scores), "mean": safe_mean(scores), "median": safe_median(scores)}
    818 
    819     # 9. Reproducibility drill-down
    820     artifacts_qs = ["code_released", "data_released", "environment_specified", "reproduction_instructions"]
    821     repro_detail = {}
    822     for q in artifacts_qs:
    823         key = f"artifacts.{q}"
    824         d = question_pass_counts.get(key, {"passed": 0, "applicable": 0})
    825         if d["applicable"] > 0:
    826             repro_detail[q] = {"rate": round(d["passed"] / d["applicable"] * 100, 1), "n": d["applicable"]}
    827     repro_detail["full_pass_count"] = repro_count
    828     repro_detail["full_pass_pct"] = round(repro_count / total_papers * 100, 1) if total_papers else 0
    829 
    830     # 9b. Reproducibility funnel — cascading filter
    831     repro_funnel = []
    832     step_papers = set(p["id"] for p in papers_full)
    833     repro_funnel.append({"step": "All papers", "n": len(step_papers)})
    834     for q_name, label in [
    835         ("code_released", "Code released"),
    836         ("data_released", "Data released"),
    837         ("environment_specified", "Environment specified"),
    838         ("reproduction_instructions", "Reproduction instructions"),
    839     ]:
    840         next_set = set()
    841         for p in papers_full:
    842             if p["id"] not in step_papers:
    843                 continue
    844             for item in p["checklist"]:
    845                 if item["category"] == "artifacts" and item["question"] == q_name:
    846                     if item["applies"] and item["answer"]:
    847                         next_set.add(p["id"])
    848                     elif not item["applies"]:
    849                         next_set.add(p["id"])  # N/A doesn't filter out
    850                     break
    851         step_papers = next_set
    852         repro_funnel.append({"step": label, "n": len(step_papers)})
    853     repro_detail["funnel"] = repro_funnel
    854 
    855     # 9c. Methodology tag treemap
    856     tag_treemap = []
    857     tag_score_map = defaultdict(list)
    858     for p in papers_full:
    859         for t in p["tags"]:
    860             tag_score_map[t].append(p["score"])
    861     for t, scores in tag_score_map.items():
    862         tag_treemap.append({
    863             "tag": t,
    864             "n": len(scores),
    865             "mean": safe_mean(scores),
    866         })
    867     tag_treemap.sort(key=lambda x: -x["n"])
    868 
    869     # 9d. Two cultures / three clusters
    870     # Cluster definitions based on correlation analysis
    871     cluster_defs = {
    872         "Transparency & Artifacts": ["artifacts", "setup_transparency", "data_integrity"],
    873         "Statistical & Experimental Rigor": ["statistical_methodology", "experimental_rigor", "claims_and_evidence"],
    874         "Contamination Awareness": ["contamination", "data_leakage"],
    875     }
    876     # Compute mean score per cluster per paper, then inter-cluster correlations
    877     cluster_vectors = defaultdict(list)  # cluster_name -> [mean_scores]
    878     for p in papers_full:
    879         cs = p["category_scores"]
    880         for cname, cats in cluster_defs.items():
    881             vals = [cs[c] for c in cats if c in cs]
    882             if vals:
    883                 cluster_vectors[cname].append(sum(vals) / len(vals))
    884             else:
    885                 cluster_vectors[cname].append(None)
    886 
    887     # Two cultures: compute overlap between human_studies and artifacts/setup
    888     two_cultures_papers = []
    889     for p in papers_full:
    890         cs = p["category_scores"]
    891         hs = cs.get("human_studies")
    892         art = cs.get("artifacts")
    893         if hs is not None and art is not None:
    894             two_cultures_papers.append({"human_studies": hs, "artifacts": art, "id": p["id"], "score": p["score"]})
    895 
    896     # 10. Category correlation matrix
    897     # Collect per-paper category score vectors
    898     paper_cat_vectors = []
    899     for p in papers_full:
    900         cs = p["category_scores"]
    901         # Convert percentage back to 0-1 for correlation
    902         vec = {cat: cs[cat] / 100.0 for cat in ALL_CATEGORIES if cat in cs}
    903         if len(vec) >= 5:
    904             paper_cat_vectors.append(vec)
    905 
    def pearson(xs, ys):
        """Return the Pearson correlation coefficient of two paired samples.

        Returns None when ``xs`` holds fewer than 10 observations, or when
        either sample has no spread (its sum of squared deviations is zero),
        since the coefficient is undefined or unreliable in those cases.
        """
        count = len(xs)
        if count < 10:
            return None

        mean_x = sum(xs) / count
        mean_y = sum(ys) / count

        # Deviations from the mean, reused by the covariance and spread terms.
        dev_x = [value - mean_x for value in xs]
        dev_y = [value - mean_y for value in ys]

        cross = sum(a * b for a, b in zip(dev_x, dev_y))
        spread_x = sum(a ** 2 for a in dev_x) ** 0.5
        spread_y = sum(b ** 2 for b in dev_y) ** 0.5

        # A constant series makes the denominator zero; report "no value".
        if not (spread_x and spread_y):
            return None
        return cross / (spread_x * spread_y)
    918 
    919     # Only include categories with enough data
    920     corr_cats = [c for c in ALL_CATEGORIES
    921                  if sum(1 for v in paper_cat_vectors if c in v) >= 30]
    922 
    923     corr_matrix = []
    924     for c1 in corr_cats:
    925         row = []
    926         for c2 in corr_cats:
    927             xs, ys = [], []
    928             for v in paper_cat_vectors:
    929                 if c1 in v and c2 in v:
    930                     xs.append(v[c1])
    931                     ys.append(v[c2])
    932             r = pearson(xs, ys)
    933             row.append({"r": round(r, 3) if r is not None else None, "n": len(xs)})
    934         corr_matrix.append(row)
    935 
    936     correlation = {
    937         "categories": corr_cats,
    938         "matrix": corr_matrix,
    939     }
    940 
    941     # 11. PCA scatter — project papers to 2D from category scores
    942     PCA_CATS = [
    943         "artifacts", "statistical_methodology", "evaluation_design",
    944         "claims_and_evidence", "setup_transparency", "limitations_and_scope",
    945         "data_integrity", "conflicts_of_interest", "cost_and_practicality",
    946     ]
    947 
    948     # Collect vectors, impute missing with median
    949     pca_raw = []
    950     for p in papers_full:
    951         cs = p["category_scores"]
    952         vec = {cat: cs[cat] / 100.0 for cat in PCA_CATS if cat in cs}
    953         non_none = len(vec)
    954         if non_none >= 6:
    955             pca_raw.append({"id": p["id"], "scores": vec, "archetype": p["archetype"], "title": p["title"], "score": p["score"]})
    956 
    957     pca_medians = {}
    958     for cat in PCA_CATS:
    959         vals = sorted(r["scores"].get(cat, None) for r in pca_raw if r["scores"].get(cat) is not None)
    960         pca_medians[cat] = vals[len(vals) // 2] if vals else 0.5
    961 
    962     pca_vecs = []
    963     for r in pca_raw:
    964         vec = [r["scores"].get(cat, pca_medians[cat]) for cat in PCA_CATS]
    965         pca_vecs.append(vec)
    966 
    967     pca_n = len(pca_vecs)
    968     pca_d = len(PCA_CATS)
    969 
    970     # Center
    971     pca_means = [sum(v[j] for v in pca_vecs) / pca_n for j in range(pca_d)]
    972     pca_centered = [[v[j] - pca_means[j] for j in range(pca_d)] for v in pca_vecs]
    973 
    974     # Covariance
    975     pca_cov = [[0.0] * pca_d for _ in range(pca_d)]
    976     for i in range(pca_d):
    977         for j in range(pca_d):
    978             pca_cov[i][j] = sum(row[i] * row[j] for row in pca_centered) / (pca_n - 1)
    979 
    980     # Power iteration for top 2 eigenvectors
    981     import random as _rng
    982     _rng.seed(42)
    983 
    def _power_iter(mat, num_iter=300, deflate=None):
        """Estimate one eigenvector/eigenvalue pair of ``mat`` by power iteration.

        mat:      square matrix given as a list of row lists.
        num_iter: maximum number of multiply-and-normalise rounds.
        deflate:  optional list of vectors whose components are projected out
                  of the iterate before every normalisation
                  (presumably unit-length eigenvectors found earlier — the
                  projection does not renormalise them; confirm at call site).

        The start vector is drawn from ``_rng.gauss(0, 1)``, where ``_rng`` is
        the random-module alias bound in the enclosing scope.

        Returns ``(vector, value)`` where ``value`` is the quadratic form
        ``vector . (mat @ vector)`` evaluated on the final iterate.
        """
        size = len(mat)
        idx = range(size)

        def _apply(vec):
            # Plain matrix-vector product, row by row.
            return [sum(mat[r][c] * vec[c] for c in idx) for r in idx]

        def _strip_known(vec):
            # Remove, one after another, the component along each deflation vector.
            for basis in deflate or ():
                overlap = sum(vec[k] * basis[k] for k in idx)
                vec = [vec[k] - overlap * basis[k] for k in idx]
            return vec

        def _length(vec):
            return sum(c * c for c in vec) ** 0.5

        current = _strip_known([_rng.gauss(0, 1) for _ in idx])
        scale = _length(current)
        current = [c / scale for c in current]

        for _ in range(num_iter):
            candidate = _strip_known(_apply(current))
            scale = _length(candidate)
            if scale == 0:
                # Iterate collapsed to the zero vector; keep the last good one.
                break
            current = [c / scale for c in candidate]

        value = sum(image * comp for image, comp in zip(_apply(current), current))
        return current, value
   1005 
   1006     pc1_vec, ev1 = _power_iter(pca_cov)
   1007     pc2_vec, ev2 = _power_iter(pca_cov, deflate=[pc1_vec])
   1008     total_var = sum(pca_cov[i][i] for i in range(pca_d))
   1009 
   1010     # Project papers
   1011     pca_points = []
   1012     for i, r in enumerate(pca_raw):
   1013         row = pca_centered[i]
   1014         x = sum(row[j] * pc1_vec[j] for j in range(pca_d))
   1015         y = sum(row[j] * pc2_vec[j] for j in range(pca_d))
   1016         pca_points.append({
   1017             "id": r["id"],
   1018             "x": round(x, 4),
   1019             "y": round(y, 4),
   1020             "archetype": r["archetype"],
   1021             "score": r["score"],
   1022         })
   1023 
   1024     pca_result = {
   1025         "points": pca_points,
   1026         "categories": PCA_CATS,
   1027         "pc1_loadings": [round(v, 3) for v in pc1_vec],
   1028         "pc2_loadings": [round(v, 3) for v in pc2_vec],
   1029         "pc1_variance_pct": round(ev1 / total_var * 100, 1),
   1030         "pc2_variance_pct": round(ev2 / total_var * 100, 1),
   1031     }
   1032 
   1033     # 12. HN social attention analysis
   1034     hn_papers = []
   1035     for p in papers_full:
   1036         hn_pts = p.get("hn_points", 0)
   1037         hn_papers.append({"id": p["id"], "title": p["title"], "score": p["score"], "hn_points": hn_pts})
   1038 
   1039     hn_with_attention = [p for p in hn_papers if p["hn_points"] > 0]
   1040     hn_without = [p for p in hn_papers if p["hn_points"] == 0]
   1041 
   1042     # Correlation
   1043     hn_corr = None
   1044     if len(hn_with_attention) >= 10:
   1045         import math as _math
   1046         xs = [p["hn_points"] for p in hn_with_attention]
   1047         ys = [p["score"] for p in hn_with_attention]
   1048         n_hn = len(xs)
   1049         mx, my = sum(xs) / n_hn, sum(ys) / n_hn
   1050         num = sum((x - mx) * (y - my) for x, y in zip(xs, ys))
   1051         dxx = _math.sqrt(sum((x - mx) ** 2 for x in xs))
   1052         dyy = _math.sqrt(sum((y - my) ** 2 for y in ys))
   1053         hn_corr = round(num / (dxx * dyy), 3) if dxx and dyy else 0
   1054 
   1055     # Hidden gems: high methodology, low attention
   1056     hidden_gems = sorted(
   1057         [p for p in hn_papers if p["score"] >= 65 and p["hn_points"] <= 5],
   1058         key=lambda p: -p["score"]
   1059     )[:15]
   1060 
   1061     # Most hyped but weak
   1062     overhyped = sorted(
   1063         [p for p in hn_with_attention if p["score"] < 40 and p["hn_points"] >= 30],
   1064         key=lambda p: -p["hn_points"]
   1065     )[:15]
   1066 
   1067     # Engagement factor correlations with HN (v3 papers only)
   1068     ENGAGEMENT_DIMS = ["practical_relevance", "surprise_contrarian", "fear_safety",
   1069                        "drama_conflict", "demo_ability", "brand_recognition"]
   1070     engagement_corrs = {}
   1071     v3_hn_papers = []
   1072     for p in papers_full:
   1073         ef = p.get("engagement_factors")
   1074         hn_pts = p.get("hn_points", 0)
   1075         if ef and hn_pts > 0:
   1076             scores = {dim: ef[dim]["score"] for dim in ENGAGEMENT_DIMS if dim in ef}
   1077             if len(scores) == 6:
   1078                 v3_hn_papers.append({"hn": hn_pts, **scores})
   1079 
   1080     if len(v3_hn_papers) >= 10:
   1081         import math as _math
   1082         log_hn = [_math.log(p["hn"] + 1) for p in v3_hn_papers]
   1083         for dim in ENGAGEMENT_DIMS:
   1084             vals = [p[dim] for p in v3_hn_papers]
   1085             n_ef = len(vals)
   1086             mx, my = sum(log_hn) / n_ef, sum(vals) / n_ef
   1087             num = sum((x - mx) * (y - my) for x, y in zip(log_hn, vals))
   1088             dxx = _math.sqrt(sum((x - mx) ** 2 for x in log_hn))
   1089             dyy = _math.sqrt(sum((y - my) ** 2 for y in vals))
   1090             engagement_corrs[dim] = round(num / (dxx * dyy), 3) if dxx and dyy else 0
   1091 
   1092     # Engagement factor means for high-HN vs low-HN
   1093     engagement_split = {}
   1094     if v3_hn_papers:
   1095         median_hn = sorted(p["hn"] for p in v3_hn_papers)[len(v3_hn_papers) // 2]
   1096         high_hn = [p for p in v3_hn_papers if p["hn"] > median_hn]
   1097         low_hn = [p for p in v3_hn_papers if p["hn"] <= median_hn]
   1098         for dim in ENGAGEMENT_DIMS:
   1099             engagement_split[dim] = {
   1100                 "high_hn_mean": round(sum(p[dim] for p in high_hn) / len(high_hn), 2) if high_hn else 0,
   1101                 "low_hn_mean": round(sum(p[dim] for p in low_hn) / len(low_hn), 2) if low_hn else 0,
   1102             }
   1103 
   1104     # Scatter points for HN vs methodology (log scale, papers with HN > 0)
   1105     import math as _math
   1106     hn_scatter = [{"id": p["id"], "hn": p["hn_points"], "score": p["score"],
   1107                    "log_hn": round(_math.log(p["hn_points"] + 1), 2)}
   1108                   for p in hn_with_attention]
   1109 
   1110     # Tag comparison: avg HN attention + avg methodology per tag
   1111     tag_hn_comparison = {}
   1112     tag_groups = defaultdict(list)
   1113     for p in papers_full:
   1114         hn_pts = p.get("hn_points", 0)
   1115         for t in p["tags"]:
   1116             tag_groups[t].append({"hn": hn_pts, "score": p["score"]})
   1117     for tag, ps in tag_groups.items():
   1118         on_hn = [p for p in ps if p["hn"] > 0]
   1119         if len(on_hn) >= 5:
   1120             tag_hn_comparison[tag] = {
   1121                 "n": len(on_hn),
   1122                 "mean_hn": round(sum(p["hn"] for p in on_hn) / len(on_hn), 1),
   1123                 "mean_score": round(sum(p["score"] for p in on_hn) / len(on_hn), 1),
   1124             }
   1125 
   1126     # Repost signal: quality by number of HN threads
   1127     repost_bands = {}
   1128     for label, lo, hi in [("1 post", 1, 1), ("2-3", 2, 3), ("4-7", 4, 7), ("8+", 8, 999)]:
   1129         band_papers = [p for p in hn_with_attention
   1130                        if lo <= len(load_hn(p["id"]).get("threads", [])) <= hi]
   1131         if band_papers:
   1132             repost_bands[label] = {
   1133                 "n": len(band_papers),
   1134                 "mean_score": safe_mean([p["score"] for p in band_papers]),
   1135                 "mean_hn": safe_mean([p["hn_points"] for p in band_papers]),
   1136             }
   1137 
   1138     # Controversy signal: comment-to-point ratio vs quality
   1139     controversy = {}
   1140     comment_papers = [p for p in hn_with_attention if p["hn_points"] >= 10]
   1141     if comment_papers:
   1142         for p in comment_papers:
   1143             hn_d = load_hn(p["id"])
   1144             p["_total_comments"] = sum(t.get("comments", 0) for t in hn_d.get("threads", []))
   1145             p["_cpt"] = p["_total_comments"] / p["hn_points"] if p["hn_points"] else 0
   1146         med_cpt = sorted(p["_cpt"] for p in comment_papers)[len(comment_papers) // 2]
   1147         high_disc = [p for p in comment_papers if p["_cpt"] > med_cpt]
   1148         low_disc = [p for p in comment_papers if p["_cpt"] <= med_cpt]
   1149         controversy = {
   1150             "high_discussion_mean": safe_mean([p["score"] for p in high_disc]),
   1151             "low_discussion_mean": safe_mean([p["score"] for p in low_disc]),
   1152             "high_n": len(high_disc),
   1153             "low_n": len(low_disc),
   1154         }
   1155 
   1156     hn_analysis = {
   1157         "total_with_hn": len(hn_with_attention),
   1158         "total_without_hn": len(hn_without),
   1159         "correlation": hn_corr,
   1160         "with_attention_mean": safe_mean([p["score"] for p in hn_with_attention]) if hn_with_attention else 0,
   1161         "without_attention_mean": safe_mean([p["score"] for p in hn_without]) if hn_without else 0,
   1162         "top_hn": sorted(hn_with_attention, key=lambda p: -p["hn_points"])[:20],
   1163         "hidden_gems": hidden_gems,
   1164         "overhyped": overhyped,
   1165         "scatter": hn_scatter,
   1166         "tag_comparison": tag_hn_comparison,
   1167         "repost_signal": repost_bands,
   1168         "controversy": controversy,
   1169         "engagement_correlations": engagement_corrs,
   1170         "engagement_split": engagement_split,
   1171         "engagement_n": len(v3_hn_papers),
   1172     }
   1173 
   1174     findings = {
   1175         "question_rates": q_rates,
   1176         "year_category_trends": year_cat_trends,
   1177         "venue_stats": venue_stats,
   1178         "citation_band_stats": cit_band_stats,
   1179         "optimism_rigor": optimism_rigor,
   1180         "homophily": homophily,
   1181         "sampling_effect": sampling_effect,
   1182         "benchmark_monoculture": bench_mono,
   1183         "funding_gap": funding_gap,
   1184         "repro_detail": repro_detail,
   1185         "game_pcts": game_pcts,
   1186         "correlation": correlation,
   1187         "pca": pca_result,
   1188         "tag_treemap": tag_treemap,
   1189         "two_cultures": two_cultures_papers,
   1190         "hn_analysis": hn_analysis,
   1191     }
   1192 
   1193     # --- Citation network (built from cited_papers in scan.json) ---
   1194     v2_ids = {p["id"] for p in papers_full}
   1195     year_map = {p["id"]: p["year"] for p in papers_full}
   1196     title_map = {p["id"]: p["title"] for p in papers_full}
   1197 
   1198     # Build title→id index from registry (case-insensitive)
   1199     title_to_id = {}
   1200     for entry in registry.values():
   1201         t = entry.get("title", "").lower().strip()
   1202         if t:
   1203             title_to_id[t] = entry["id"]
   1204     for p in papers_full:
   1205         t = p["title"].lower().strip()
   1206         if t:
   1207             title_to_id[t] = p["id"]
   1208 
   1209     # Extract directed edges from cited_papers
   1210     net_edge_set = set()
   1211     for p in papers_full:
   1212         src = p["id"]
   1213         detail = paper_details.get(src, {})
   1214         for item in detail.get("checklist", []):
   1215             pass  # checklist doesn't have cited_papers
   1216         # Read cited_papers from the original scan.json
   1217         scan_path = PAPERS_DIR / src / "scan.json"
   1218         if scan_path.exists():
   1219             with open(scan_path) as f:
   1220                 scan_data = json.load(f)
   1221             for cited in scan_data.get("cited_papers", []):
   1222                 ct = cited.get("title", "").lower().strip()
   1223                 target = title_to_id.get(ct)
   1224                 if target and target != src:
   1225                     net_edge_set.add((src, target))
   1226 
   1227     net_edges = [[s, t] for s, t in net_edge_set]
   1228     in_degree = Counter(t for _, t in net_edge_set)
   1229     out_degree = Counter(s for s, _ in net_edge_set)
   1230 
   1231     # Collect all node IDs that appear in edges
   1232     all_net_ids = set()
   1233     for s, t in net_edge_set:
   1234         all_net_ids.add(s)
   1235         all_net_ids.add(t)
   1236 
   1237     net_nodes = []
   1238     for nid in sorted(all_net_ids):
   1239         reg = registry.get(nid, {})
   1240         net_nodes.append({
   1241             "id": nid,
   1242             "title": title_map.get(nid, reg.get("title", nid)),
   1243             "score": score_map.get(nid),
   1244             "year": year_map.get(nid, reg.get("year")),
   1245             "in_degree": in_degree.get(nid, 0),
   1246             "out_degree": out_degree.get(nid, 0),
   1247             "has_scan": nid in v2_ids,
   1248         })
   1249 
   1250     network = {"nodes": net_nodes, "edges": net_edges}
   1251 
   1252     # --- Network findings ---
   1253     # Foundational leaderboard: top 20 most-cited (by in-degree) with scores
   1254     foundational = []
   1255     for nid, deg in in_degree.most_common(20):
   1256         foundational.append({
   1257             "id": nid,
   1258             "title": title_map.get(nid, registry.get(nid, {}).get("title", nid)),
   1259             "in_degree": deg,
   1260             "score": score_map.get(nid),
   1261         })
   1262 
   1263     # Quality contagion: mean score by % of high-quality citations
   1264     contagion_threshold = 50
   1265     high_q = {pid for pid, sc in score_map.items() if sc >= contagion_threshold}
   1266     contagion_bands = {"0%": [], "1-33%": [], "34-66%": [], "67-100%": []}
   1267     out_map = defaultdict(list)
   1268     for s, t in net_edge_set:
   1269         out_map[s].append(t)
   1270     for pid in score_map:
   1271         cited = [t for t in out_map.get(pid, []) if t in score_map]
   1272         if len(cited) < 2:
   1273             continue
   1274         pct = sum(1 for t in cited if t in high_q) / len(cited) * 100
   1275         if pct == 0:
   1276             band = "0%"
   1277         elif pct <= 33:
   1278             band = "1-33%"
   1279         elif pct <= 66:
   1280             band = "34-66%"
   1281         else:
   1282             band = "67-100%"
   1283         contagion_bands[band].append(score_map[pid])
   1284 
   1285     quality_contagion = {}
   1286     for band_name in ["0%", "1-33%", "34-66%", "67-100%"]:
   1287         ss = contagion_bands[band_name]
   1288         if ss:
   1289             quality_contagion[band_name] = {"n": len(ss), "mean": safe_mean(ss)}
   1290 
   1291     # Rigor diffusion: for top cited papers, mean score of their citers
   1292     in_map = defaultdict(list)
   1293     for s, t in net_edge_set:
   1294         in_map[t].append(s)
   1295     rigor_diffusion = []
   1296     for nid, deg in in_degree.most_common(15):
   1297         citers = [s for s in in_map[nid] if s in score_map]
   1298         rigor_diffusion.append({
   1299             "id": nid,
   1300             "title": title_map.get(nid, registry.get(nid, {}).get("title", nid)),
   1301             "score": score_map.get(nid),
   1302             "in_degree": deg,
   1303             "citer_mean": safe_mean([score_map[c] for c in citers]) if citers else None,
   1304             "citer_n": len(citers),
   1305         })
   1306 
   1307     findings["network_insights"] = {
   1308         "foundational": foundational,
   1309         "quality_contagion": quality_contagion,
   1310         "rigor_diffusion": rigor_diffusion,
   1311     }
   1312 
   1313     # --- Write files ---
   1314     OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
   1315     papers_detail_dir = OUTPUT_DIR / "papers"
   1316     papers_detail_dir.mkdir(parents=True, exist_ok=True)
   1317 
   1318     # Add unscanned registry entries to papers-index
   1319     scanned_ids = {p["id"] for p in papers_index}
   1320     for entry in registry.values():
   1321         if entry["id"] in scanned_ids:
   1322             continue
   1323         if entry.get("status") == "excluded":
   1324             continue
   1325         papers_index.append({
   1326             "id": entry["id"],
   1327             "title": entry.get("title", entry["id"]),
   1328             "year": entry.get("year"),
   1329             "venue": entry.get("venue", ""),
   1330             "tags": entry.get("tags", []),
   1331             "score": None,
   1332             "archetype": None,
   1333             "games": [],
   1334             "arxiv_id": entry.get("arxiv_id", ""),
   1335             "doi": entry.get("doi", ""),
   1336             "code_url": None,
   1337             "dna": None,
   1338             "paper_type": None,
   1339             "hn_points": 0,
   1340             "engagement": None,
   1341         })
   1342 
   1343     write_json(OUTPUT_DIR / "dashboard.json", dashboard)
   1344     write_json(OUTPUT_DIR / "findings.json", findings)
   1345     write_json(OUTPUT_DIR / "papers-index.json", papers_index)
   1346     write_json(OUTPUT_DIR / "network.json", network)
   1347     write_json(OUTPUT_DIR / "tensions.json", tensions)
   1348     write_json(OUTPUT_DIR / "calibration.json", {"papers": calibration_papers})
   1349     write_json(OUTPUT_DIR / "benchmarks.json", {"papers": benchmark_papers})
   1350 
   1351     for slug, detail in paper_details.items():
   1352         write_json(papers_detail_dir / f"{slug}.json", detail)
   1353 
   1354     # Full monolith
   1355     explorer = {
   1356         "generated": "2026-03-21",
   1357         "papers": papers_full,
   1358         "agg": dashboard,
   1359         "findings": findings,
   1360         "tensions": tensions,
   1361         "citation_network": network,
   1362     }
   1363     write_json(OUTPUT_DIR / "explorer.json", explorer)
   1364 
   1365     # Report
   1366     dash_size = (OUTPUT_DIR / "dashboard.json").stat().st_size
   1367     find_size = (OUTPUT_DIR / "findings.json").stat().st_size
   1368     idx_size = (OUTPUT_DIR / "papers-index.json").stat().st_size
   1369     net_size = (OUTPUT_DIR / "network.json").stat().st_size
   1370     tens_size = (OUTPUT_DIR / "tensions.json").stat().st_size
   1371     full_size = (OUTPUT_DIR / "explorer.json").stat().st_size
   1372     code_url_count = sum(1 for p in papers_full if p.get("code_url"))
   1373     print(f"Papers: {total_papers}")
   1374     print(f"Median score: {median:.1f}%")
   1375     print(f"Code URLs extracted: {code_url_count}")
   1376     print(f"Network: {len(net_nodes)} nodes, {len(net_edges)} edges")
   1377     print(f"Files:")
   1378     print(f"  dashboard.json:    {dash_size:>8,} bytes")
   1379     print(f"  findings.json:     {find_size:>8,} bytes")
   1380     print(f"  papers-index.json: {idx_size:>8,} bytes")
   1381     print(f"  papers/*.json:     {len(paper_details):>5} files")
   1382     print(f"  network.json:      {net_size:>8,} bytes")
   1383     print(f"  tensions.json:     {tens_size:>8,} bytes")
   1384     print(f"  explorer.json:     {full_size:>8,} bytes")
   1385 
   1386 
   1387 if __name__ == "__main__":  # script entry point: run the build only when executed directly, not on import
   1388     build()
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs