build-explorer-data.py (59664B)
1 #!/usr/bin/env python3 2 """ 3 Build data files for the static data explorer. 4 5 Reads v2 scan.json files, metadata.json, citation-graph.json, and registry.jsonl. 6 Outputs view-specific JSON files for fast loading, plus a full explorer.json for power users. 7 8 Output files: 9 explorer/public/data/dashboard.json — aggregation stats only 10 explorer/public/data/findings.json — deep analysis findings 11 explorer/public/data/papers-index.json — table data without checklists 12 explorer/public/data/papers/{slug}.json — full detail per paper 13 explorer/public/data/network.json — citation network 14 explorer/public/data/tensions.json — claim tensions 15 explorer/public/data/explorer.json — full monolith for queries 16 17 Usage: 18 python3 scripts/build-explorer-data.py 19 """ 20 21 import json 22 import re 23 from collections import Counter, defaultdict 24 from pathlib import Path 25 26 ROOT = Path(__file__).resolve().parent.parent 27 REGISTRY_PATH = ROOT / "registry.jsonl" 28 PAPERS_DIR = ROOT / "papers" 29 ANALYSIS_DIR = ROOT / "analysis" 30 OUTPUT_DIR = ROOT / "explorer" / "public" / "data" 31 32 BASE_CATEGORIES = [ 33 "artifacts", "statistical_methodology", "evaluation_design", 34 "claims_and_evidence", "setup_transparency", "limitations_and_scope", 35 "data_integrity", "conflicts_of_interest", "contamination", 36 "human_studies", "cost_and_practicality", 37 ] 38 CONDITIONAL_CATEGORIES = [ 39 "experimental_rigor", "data_leakage", "survey_methodology", 40 ] 41 ALL_CATEGORIES = BASE_CATEGORIES + CONDITIONAL_CATEGORIES 42 43 CODE_URL_RE = re.compile( 44 r'https?://(?:github\.com|gitlab\.com|zenodo\.org|bitbucket\.org|huggingface\.co)[^\s,)\"\'<>]+' 45 ) 46 47 48 def classify_archetype(cat_scores): 49 ed = cat_scores.get("evaluation_design", 0) 50 sm = cat_scores.get("statistical_methodology", 0) 51 ar = cat_scores.get("artifacts", 0) 52 if ed >= 0.8 and sm >= 0.5 and ar >= 0.5: 53 return "Complete" 54 if ed >= 0.8 and ar >= 0.4 and sm < 0.3: 55 return "Builder" 56 if ed >= 0.8 and sm < 0.3 and ar < 0.3: 57 return "Theater" 58 if ed < 0.6 and sm < 0.3: 59 return "Minimal" 60 return "Mixed" 61 62 63 def compute_category_score(category_data): 64 applicable = 0 65 passed = 0 66 for q_name, q_data in category_data.items(): 67 if not isinstance(q_data, dict) or "applies" not in q_data: 68 continue 69 if q_data["applies"]: 70 applicable += 1 71 if q_data.get("answer", False): 72 passed += 1 73 if applicable == 0: 74 return None 75 return passed / applicable 76 77 78 def compute_overall_score(checklist, category_weights=None): 79 """Overall rubric score for a paper. 80 81 Default (category_weights=None): flat per-question average. A category 82 with more applicable questions naturally contributes more. Equivalent to 83 the pre-calibration behavior. 84 85 With category_weights: weighted mean of per-category pass rates. Each 86 category contributes w_c * (passed_c / applicable_c). Categories with 87 zero applicable questions drop out cleanly. Learned weights come from 88 scripts/calibration/fit-weights.py. 89 """ 90 if category_weights is None: 91 applicable = 0 92 passed = 0 93 for cat_name, cat_data in checklist.items(): 94 if not isinstance(cat_data, dict): 95 continue 96 for q_name, q_data in cat_data.items(): 97 if not isinstance(q_data, dict) or "applies" not in q_data: 98 continue 99 if q_data["applies"]: 100 applicable += 1 101 if q_data.get("answer", False): 102 passed += 1 103 if applicable == 0: 104 return None 105 return passed / applicable 106 107 # Weighted-per-category mean 108 num = 0.0 109 den = 0.0 110 any_applicable = False 111 for cat_name, cat_data in checklist.items(): 112 if not isinstance(cat_data, dict): 113 continue 114 cat_app = 0 115 cat_pas = 0 116 for q_name, q_data in cat_data.items(): 117 if not isinstance(q_data, dict) or "applies" not in q_data: 118 continue 119 if q_data["applies"]: 120 cat_app += 1 121 if q_data.get("answer", False): 122 cat_pas += 1 123 if cat_app == 0: 124 continue 125 any_applicable = True 126 w = category_weights.get(cat_name, 1.0) 127 num += w * (cat_pas / cat_app) 128 den += w 129 if not any_applicable or den == 0: 130 return None 131 return num / den 132 133 134 def flatten_checklist(checklist): 135 flat = [] 136 for cat_name in ALL_CATEGORIES: 137 cat_data = checklist.get(cat_name, {}) 138 if not isinstance(cat_data, dict): 139 continue 140 for q_name, q_data in cat_data.items(): 141 if not isinstance(q_data, dict) or "applies" not in q_data: 142 continue 143 flat.append({ 144 "category": cat_name, 145 "question": q_name, 146 "applies": q_data["applies"], 147 "answer": q_data.get("answer", False), 148 "justification": q_data.get("justification", ""), 149 }) 150 return flat 151 152 153 def detect_games(checklist, score, cat_scores): 154 games = [] 155 ci = checklist.get("statistical_methodology", {}).get("confidence_intervals_or_error_bars", {}) 156 var = checklist.get("statistical_methodology", {}).get("variance_reported", {}) 157 if ci.get("applies") and not ci.get("answer") and var.get("applies") and not var.get("answer"): 158 games.append("Big Numbers No Error Bars") 159 ac = checklist.get("claims_and_evidence", {}).get("abstract_claims_supported", {}) 160 gb = checklist.get("claims_and_evidence", {}).get("generalization_bounded", {}) 161 if (ac.get("applies") and not ac.get("answer")) or (gb.get("applies") and not gb.get("answer")): 162 games.append("Overclaiming") 163 cr = checklist.get("artifacts", {}).get("code_released", {}) 164 env = checklist.get("artifacts", {}).get("environment_specified", {}) 165 ri = checklist.get("artifacts", {}).get("reproduction_instructions", {}) 166 if cr.get("applies") and cr.get("answer"): 167 if (env.get("applies") and not env.get("answer")) or (ri.get("applies") and not ri.get("answer")): 168 games.append("Open Source Theater") 169 bc = checklist.get("contamination", {}).get("benchmark_contamination_addressed", {}) 170 if bc.get("applies") and not bc.get("answer"): 171 games.append("Contamination Dodge") 172 # Cherry-picked Comparisons 173 bc2 = checklist.get("evaluation_design", {}).get("baselines_contemporary", {}) 174 if bc2.get("applies") and not bc2.get("answer"): 175 games.append("Cherry-picked Comparisons") 176 # All Show No Substance 177 ed = cat_scores.get("evaluation_design", 0) 178 sm = cat_scores.get("statistical_methodology", 0) 179 ar = cat_scores.get("artifacts", 0) 180 if ed >= 0.8 and sm < 0.2 and ar < 0.2: 181 games.append("All Show No Substance") 182 # Trust Us: no raw data AND no code — completely unverifiable 183 rd = checklist.get("data_integrity", {}).get("raw_data_available", {}) 184 if rd.get("applies") and not rd.get("answer") and cr.get("applies") and not cr.get("answer"): 185 games.append("Trust Us") 186 # The Black Box: no prompts AND no hyperparameters — can't replicate 187 pr = checklist.get("setup_transparency", {}).get("prompts_provided", {}) 188 hp = checklist.get("setup_transparency", {}).get("hyperparameters_reported", {}) 189 if pr.get("applies") and not pr.get("answer") and hp.get("applies") and not hp.get("answer"): 190 games.append("The Black Box") 191 # Moving Goalpost: causal claims without causal design 192 cc = checklist.get("claims_and_evidence", {}).get("causal_claims_justified", {}) 193 if cc.get("applies") and not cc.get("answer"): 194 games.append("Moving Goalpost") 195 # Limitation Theater: has section but says nothing specific 196 ls = checklist.get("limitations_and_scope", {}).get("limitations_section_present", {}) 197 tv = checklist.get("limitations_and_scope", {}).get("threats_to_validity_specific", {}) 198 sb = checklist.get("limitations_and_scope", {}).get("scope_boundaries_stated", {}) 199 if ls.get("applies") and ls.get("answer") and tv.get("applies") and not tv.get("answer") and sb.get("applies") and not sb.get("answer"): 200 games.append("Limitation Theater") 201 return games 202 203 204 def extract_code_url(checklist): 205 cr = checklist.get("artifacts", {}).get("code_released", {}) 206 if cr.get("applies") and cr.get("answer"): 207 urls = CODE_URL_RE.findall(cr.get("justification", "")) 208 if urls: 209 return urls[0].rstrip(".,;:") 210 return None 211 212 213 def load_registry(): 214 entries = {} 215 with open(REGISTRY_PATH) as f: 216 for line in f: 217 line = line.strip() 218 if line: 219 entry = json.loads(line) 220 entries[entry["id"]] = entry 221 return entries 222 223 224 def load_citation_graph(): 225 path = ANALYSIS_DIR / "citation-graph.json" 226 if not path.exists(): 227 return {"nodes": [], "edges": []} 228 with open(path) as f: 229 return json.load(f) 230 231 232 def load_metadata(paper_id): 233 path = PAPERS_DIR / paper_id / "metadata.json" 234 if not path.exists(): 235 return {} 236 with open(path) as f: 237 return json.load(f) 238 239 240 def load_hn(paper_id): 241 path = PAPERS_DIR / paper_id / "hn.json" 242 if not path.exists(): 243 return {} 244 with open(path) as f: 245 return json.load(f) 246 247 248 def write_json(path, data): 249 path.parent.mkdir(parents=True, exist_ok=True) 250 with open(path, "w") as f: 251 json.dump(data, f, ensure_ascii=False, separators=(",", ":")) 252 253 254 def safe_mean(scores): 255 return round(sum(scores) / len(scores), 1) if scores else 0 256 257 258 def safe_median(scores): 259 if not scores: 260 return 0 261 s = sorted(scores) 262 return round(s[len(s) // 2], 1) 263 264 265 def load_category_weights(): 266 """Load learned weights from scripts/calibration/weights.json if present. 267 Falls back to None (uniform flat-question averaging) when absent.""" 268 path = Path(__file__).resolve().parent / "calibration" / "weights.json" 269 if not path.exists(): 270 return None 271 with open(path) as f: 272 data = json.load(f) 273 return data.get("weights") 274 275 276 def build(): 277 registry = load_registry() 278 citation_data = load_citation_graph() 279 category_weights = load_category_weights() 280 if category_weights: 281 print(f"Using learned category weights ({len(category_weights)} categories)") 282 else: 283 print("No calibration/weights.json; using uniform per-question weights") 284 285 # Accumulators 286 papers_full = [] 287 papers_index = [] 288 paper_details = {} 289 # Partitioned papers: scored the same way but kept out of aggregates and 290 # papers-index so they don't skew the agentic-AI corpus numbers. Individual 291 # detail JSONs still written so /sigint/papers/{slug} works. 292 # reference-benchmark: rubric calibration specimens (Wakefield, Ioannidis, 293 # Attention). Listed in calibration.json. 294 # benchmark-eval: papers that introduce benchmarks used BY the field. 295 # They're reference material, not subjects being 296 # evaluated. Listed in benchmarks.json. 297 calibration_papers = [] 298 benchmark_papers = [] 299 all_scores = [] 300 cat_pass_counts = defaultdict(lambda: {"passed": 0, "applicable": 0}) 301 year_scores = defaultdict(list) 302 tag_counts = Counter() 303 archetype_counts = Counter() 304 game_counts = Counter() 305 total_papers = 0 306 307 # Findings accumulators 308 question_pass_counts = defaultdict(lambda: {"passed": 0, "applicable": 0}) 309 year_cat_scores = defaultdict(lambda: defaultdict(lambda: {"passed": 0, "applicable": 0})) 310 venue_scores = defaultdict(list) 311 citation_band_scores = defaultdict(list) 312 benchmark_only_by_year = defaultdict(lambda: {"benchmark_only": 0, "total": 0}) 313 funding_groups = {"disclosed": [], "not_disclosed": []} 314 score_map = {} # paper_id -> score_pct (built incrementally for homophily) 315 316 tensions = { 317 "productivity": {"positive": [], "nuanced": []}, 318 "benchmarks": {"positive": [], "nuanced": []}, 319 "agents": {"positive": [], "nuanced": []}, 320 "security": {"positive": [], "nuanced": []}, 321 "code_quality": {"positive": [], "nuanced": []}, 322 "scaling": {"positive": [], "nuanced": []}, 323 } 324 325 for scan_path in sorted(PAPERS_DIR.glob("*/scan.json")): 326 paper_id = scan_path.parent.name 327 with open(scan_path) as f: 328 scan = json.load(f) 329 330 # Include all scans regardless of version. The v1 rubric (50 questions) 331 # is a proper subset of v2+ (57 questions, adding data_leakage, 332 # experimental_rigor, and survey_methodology modules). compute_overall_score 333 # uses passed/applicable over present questions, so v1 papers degrade 334 # gracefully: their 50 applicable questions are scored normally and the 335 # 7 v2+-only questions are treated as absent (same as any paper where 336 # a conditional module doesn't apply). 337 338 checklist = scan.get("checklist", {}) 339 paper_meta = scan.get("paper", {}) 340 reg_entry = registry.get(paper_id, {}) 341 metadata = load_metadata(paper_id) 342 hn_data = load_hn(paper_id) 343 reg_tags = reg_entry.get("tags") or [] 344 is_reference = "reference-benchmark" in reg_tags 345 is_benchmark_paper = "benchmark-eval" in reg_tags 346 is_calibration = is_reference or is_benchmark_paper 347 348 overall = compute_overall_score(checklist, category_weights) 349 if overall is None: 350 continue 351 352 # Classify paper type: empirical if both stats and eval have applicable questions 353 def _has_applicable(cat_name): 354 cd = checklist.get(cat_name, {}) 355 if not isinstance(cd, dict): 356 return False 357 return any(isinstance(qd, dict) and qd.get("applies", False) 358 for qd in cd.values()) 359 360 is_empirical = _has_applicable("statistical_methodology") and _has_applicable("evaluation_design") 361 paper_type = "empirical" if is_empirical else "non-empirical" 362 363 cat_scores = {} 364 for cat in ALL_CATEGORIES: 365 cat_data = checklist.get(cat, {}) 366 if cat_data and isinstance(cat_data, dict): 367 cs = compute_category_score(cat_data) 368 if cs is not None: 369 cat_scores[cat] = cs 370 371 score_pct = round(overall * 100, 1) 372 score_map[paper_id] = score_pct 373 374 year = paper_meta.get("year") or reg_entry.get("year") 375 venue = metadata.get("venue") or paper_meta.get("venue") or reg_entry.get("venue", "") 376 tags = scan.get("methodology_tags", []) or reg_entry.get("tags", []) 377 archetype = classify_archetype(cat_scores) if is_empirical else None 378 games = detect_games(checklist, overall, cat_scores) if is_empirical else [] 379 380 # External links 381 arxiv_id = paper_meta.get("arxiv_id") or reg_entry.get("arxiv_id", "") 382 doi = paper_meta.get("doi") or reg_entry.get("doi", "") 383 source_url = reg_entry.get("source_url", "") 384 385 # Code URL extraction 386 code_url = extract_code_url(checklist) 387 388 # Only empirical, non-calibration papers feed into findings aggregations 389 if is_empirical and not is_calibration: 390 total_papers += 1 391 all_scores.append(score_pct) 392 year_scores[year].append(score_pct) 393 for t in tags: 394 tag_counts[t] += 1 395 archetype_counts[archetype] += 1 396 for g in games: 397 game_counts[g] += 1 398 399 claims = scan.get("claims", []) 400 red_flags = scan.get("red_flags", []) 401 402 # All remaining aggregations are empirical-only and skip calibration 403 if not is_empirical or is_calibration: 404 pass # skip to index/detail construction below 405 else: 406 # Category + question aggregations 407 for cat in ALL_CATEGORIES: 408 cat_data = checklist.get(cat, {}) 409 if not isinstance(cat_data, dict): 410 continue 411 for q_name, q_data in cat_data.items(): 412 if not isinstance(q_data, dict) or "applies" not in q_data: 413 continue 414 if q_data["applies"]: 415 cat_pass_counts[cat]["applicable"] += 1 416 question_pass_counts[f"{cat}.{q_name}"]["applicable"] += 1 417 if q_data.get("answer", False): 418 cat_pass_counts[cat]["passed"] += 1 419 question_pass_counts[f"{cat}.{q_name}"]["passed"] += 1 420 # Year × category 421 if year: 422 year_cat_scores[year][cat]["applicable"] += 1 423 if q_data.get("answer", False): 424 year_cat_scores[year][cat]["passed"] += 1 425 426 # Venue scoring 427 venue_clean = venue.strip() 428 if venue_clean and venue_clean.lower() not in ("arxiv", "arxiv.org", ""): 429 venue_scores[venue_clean].append(score_pct) 430 431 # Citation band scoring 432 cit = metadata.get("citation_count") 433 if cit is not None: 434 if cit == 0: 435 band = "0" 436 elif cit <= 50: 437 band = "1-50" 438 elif cit <= 500: 439 band = "51-500" 440 else: 441 band = "500+" 442 citation_band_scores[band].append(score_pct) 443 444 # Benchmark monoculture 445 if year: 446 benchmark_only_by_year[year]["total"] += 1 447 if tags == ["benchmark-eval"]: 448 benchmark_only_by_year[year]["benchmark_only"] += 1 449 450 # Funding gap 451 fd = checklist.get("conflicts_of_interest", {}).get("funding_disclosed", {}) 452 if fd.get("applies"): 453 if fd.get("answer"): 454 funding_groups["disclosed"].append(score_pct) 455 else: 456 funding_groups["not_disclosed"].append(score_pct) 457 458 # Tension classification (empirical papers only, calibration excluded) 459 if is_empirical and not is_calibration: 460 for claim in claims: 461 ct = claim.get("claim", "").lower() 462 entry = {"paper_id": paper_id, "claim": claim["claim"], 463 "supported": claim.get("supported", ""), "score": score_pct, "year": year} 464 465 # Productivity 466 if any(k in ct for k in ["productivity", "developer speed", "completion time", "speedup", 467 "faster", "developer productivity", "coding efficiency", 468 "development time", "time savings", "code faster"]): 469 bucket = "positive" if any(k in ct for k in ["faster", "speedup", "improves", "increases", 470 "gain", "savings", "efficient"]) else "nuanced" 471 tensions["productivity"][bucket].append(entry) 472 473 # Benchmarks (expanded) 474 if any(k in ct for k in ["benchmark", "evaluation", "leaderboard", "swe-bench", 475 "pass@", "accuracy", "f1 score", "performance on", 476 "state-of-the-art", "sota"]): 477 bucket = "positive" if any(k in ct for k in ["state-of-the-art", "outperforms", "achieves", 478 "best", "surpasses", "exceeds"]) else "nuanced" 479 tensions["benchmarks"][bucket].append(entry) 480 481 # Agents (expanded) 482 if any(k in ct for k in ["agent", "autonomous", "multi-agent", "agentic", 483 "tool use", "planning", "chain-of-thought", 484 "reasoning capability"]): 485 bucket = "positive" if any(k in ct for k in ["solves", "achieves", "succeeds", "capable", 486 "outperforms", "enables", "improves"]) else "nuanced" 487 tensions["agents"][bucket].append(entry) 488 489 # Security arms race (NEW) 490 if any(k in ct for k in ["attack", "defense", "jailbreak", "injection", "adversarial", 491 "vulnerability", "safety", "alignment", "harmful", "toxic", 492 "secure", "exploit", "bypass", "mitigat"]): 493 bucket = "positive" if any(k in ct for k in ["defense", "protect", "mitigat", "detect", 494 "prevent", "secure", "effective", "reduces", 495 "robust"]) else "nuanced" 496 tensions["security"][bucket].append(entry) 497 498 # Code quality (NEW) 499 if any(k in ct for k in ["code quality", "bug", "vulnerability", "error", "defect", 500 "repair", "fix", "correct", "hallucin", "incorrect code", 501 "insecure code", "code generation"]): 502 bucket = "positive" if any(k in ct for k in ["repair", "fix", "correct", "improve", 503 "reduc", "effective", "resolve"]) else "nuanced" 504 tensions["code_quality"][bucket].append(entry) 505 506 # Scaling debate (NEW) 507 if any(k in ct for k in ["scaling", "scale", "cost", "efficient", "latency", 508 "token", "compute", "inference", "smaller model", 509 "distill", "compress"]): 510 bucket = "positive" if any(k in ct for k in ["efficient", "reduc", "cheaper", "faster", 511 "smaller", "compet", "saving"]) else "nuanced" 512 tensions["scaling"][bucket].append(entry) 513 514 cat_scores_pct = {k: round(v * 100, 1) for k, v in cat_scores.items()} 515 516 # DNA strip: compact array of base category scores (0-100, null if N/A) 517 dna = [cat_scores_pct.get(cat) for cat in BASE_CATEGORIES] 518 519 # Slim index entry 520 index_entry = { 521 "id": paper_id, 522 "title": paper_meta.get("title", reg_entry.get("title", paper_id)), 523 "year": year, 524 "venue": venue, 525 "tags": tags, 526 "score": score_pct, 527 "archetype": archetype, 528 "games": games, 529 "arxiv_id": arxiv_id, 530 "doi": doi, 531 "code_url": code_url, 532 "dna": dna, 533 "paper_type": paper_type, 534 "hn_points": hn_data.get("top_points", 0), 535 "engagement": [scan.get("engagement_factors", {}).get(d, {}).get("score") for d in 536 ["practical_relevance", "surprise_contrarian", "fear_safety", 537 "drama_conflict", "demo_ability", "brand_recognition"] 538 ] if scan.get("engagement_factors") else None, 539 } 540 if not is_calibration: 541 papers_index.append(index_entry) 542 543 # Full detail 544 detail = { 545 **index_entry, 546 "category_scores": cat_scores_pct, 547 "claims": [{"claim": c["claim"], "supported": c.get("supported", "")} for c in claims], 548 "red_flags": [{"flag": r["flag"], "detail": r["detail"]} for r in red_flags], 549 "checklist": flatten_checklist(checklist), 550 "key_findings": scan.get("key_findings", ""), 551 "active_modules": scan.get("active_modules", []), 552 "source_url": source_url, 553 "hn_threads": hn_data.get("threads", []), 554 "engagement_factors": scan.get("engagement_factors"), 555 } 556 # Individual JSON gets written for everyone, so /sigint/papers/{slug} 557 # continues to work. Only non-calibration contributes to the aggregates. 558 paper_details[paper_id] = detail 559 if is_calibration: 560 # Carry the registry notes through so the consumer can explain 561 # why each specimen is in the corpus without re-deriving it. 562 cal_detail = dict(detail) 563 cal_detail["calibration_notes"] = reg_entry.get("notes", "") 564 if is_reference: 565 calibration_papers.append(cal_detail) 566 else: # benchmark-eval only (not also reference-benchmark) 567 benchmark_papers.append(cal_detail) 568 else: 569 papers_full.append(detail) 570 571 # --- Dashboard aggregations --- 572 all_scores.sort() 573 n = len(all_scores) 574 median = all_scores[n // 2] if n else 0 575 mean = sum(all_scores) / n if n else 0 576 577 hist_bins = [] 578 for lo in range(0, 100, 5): 579 hi = lo + 5 580 count = sum(1 for s in all_scores if lo <= s < hi) 581 hist_bins.append({"lo": lo, "hi": hi, "count": count}) 582 583 cat_rates = {} 584 for cat in ALL_CATEGORIES: 585 d = cat_pass_counts[cat] 586 if d["applicable"] > 0: 587 cat_rates[cat] = round(d["passed"] / d["applicable"] * 100, 1) 588 589 year_trends = {} 590 for y in sorted(year_scores.keys()): 591 scores = year_scores[y] 592 year_trends[str(y)] = { 593 "n": len(scores), 594 "mean": round(sum(scores) / len(scores), 1), 595 "median": round(sorted(scores)[len(scores) // 2], 1), 596 } 597 598 game_pcts = {g: round(c / total_papers * 100, 1) for g, c in game_counts.items()} 599 repro_count = sum(1 for p in papers_full if p["category_scores"].get("artifacts", 0) == 100) 600 601 # --- Registry pipeline stats --- 602 reg_total = len(registry) 603 v5_opus = 0 604 v5_haiku = 0 605 deprecated_scan = 0 606 not_scanned = 0 607 for e in registry.values(): 608 pid = e["id"] 609 v5_path = PAPERS_DIR / pid / "scan-v5.json" 610 old_path = PAPERS_DIR / pid / "scan.json" 611 if v5_path.exists(): 612 with open(v5_path) as f: 613 v5 = json.load(f) 614 # Check if any answers have source="opus" 615 has_opus = False 616 for cat_data in v5.get("checklist", {}).values(): 617 if isinstance(cat_data, dict): 618 for qd in cat_data.values(): 619 if isinstance(qd, dict) and qd.get("source") == "opus": 620 has_opus = True 621 break 622 if has_opus: 623 break 624 if has_opus: 625 v5_opus += 1 626 else: 627 v5_haiku += 1 628 elif old_path.exists(): 629 deprecated_scan += 1 630 else: 631 not_scanned += 1 632 633 pipeline = { 634 "registry_total": reg_total, 635 "v5_opus": v5_opus, 636 "v5_haiku": v5_haiku, 637 "deprecated_scan": deprecated_scan, 638 "not_scanned": not_scanned, 639 } 640 641 dashboard = { 642 "n": total_papers, 643 "median": round(median, 1), 644 "mean": round(mean, 1), 645 "full_reproducibility_pct": round(repro_count / total_papers * 100, 1) if total_papers else 0, 646 "histogram": hist_bins, 647 "category_rates": cat_rates, 648 "year_trends": year_trends, 649 "game_pcts": game_pcts, 650 "archetype_counts": dict(archetype_counts), 651 "tag_counts": dict(tag_counts), 652 "pipeline": pipeline, 653 } 654 655 # --- Findings aggregations --- 656 657 # 1. Per-question pass rates (with human-readable descriptions) 658 Q_DESCRIPTIONS = { 659 "artifacts.code_released": "Source code publicly released", 660 "artifacts.data_released": "Dataset publicly available", 661 "artifacts.environment_specified": "Environment/dependency specs provided", 662 "artifacts.reproduction_instructions": "Step-by-step reproduction instructions included", 663 "statistical_methodology.confidence_intervals_or_error_bars": "Confidence intervals or error bars on main results", 664 "statistical_methodology.significance_tests": "Statistical significance tests for comparative claims", 665 "statistical_methodology.effect_sizes_reported": "Effect sizes reported, not just p-values", 666 "statistical_methodology.sample_size_justified": "Sample size justified or power analysis discussed", 667 "statistical_methodology.variance_reported": "Variance or std dev reported across runs", 668 "evaluation_design.baselines_included": "Baseline comparisons included", 669 "evaluation_design.baselines_contemporary": "Baselines are contemporary and competitive", 670 "evaluation_design.ablation_study": "Ablation study showing which components matter", 671 "evaluation_design.multiple_metrics": "Multiple evaluation metrics used", 672 "evaluation_design.human_evaluation": "Human evaluation included, not just automated", 673 "evaluation_design.held_out_test_set": "Results on held-out test set, not dev/val", 674 "evaluation_design.per_category_breakdown": "Per-category or per-task breakdowns provided", 675 "evaluation_design.failure_cases_discussed": "Failure cases shown or discussed", 676 "evaluation_design.negative_results_reported": "Negative results reported", 677 "claims_and_evidence.abstract_claims_supported": "All abstract claims supported by results", 678 "claims_and_evidence.causal_claims_justified": "Causal claims backed by adequate study design", 679 "claims_and_evidence.generalization_bounded": "Generalizations bounded to tested setting", 680 "claims_and_evidence.alternative_explanations_discussed": "Alternative explanations discussed", 681 "claims_and_evidence.proxy_outcome_distinction": "Proxy vs outcome distinction acknowledged", 682 "setup_transparency.model_versions_specified": "Exact model versions specified", 683 "setup_transparency.prompts_provided": "Actual prompts/system instructions provided", 684 "setup_transparency.hyperparameters_reported": "Hyperparameters reported (temperature, etc.)", 685 "setup_transparency.scaffolding_described": "Agentic scaffolding described in detail", 686 "setup_transparency.data_preprocessing_documented": "Data preprocessing steps documented", 687 "limitations_and_scope.limitations_section_present": "Dedicated limitations section present", 688 "limitations_and_scope.threats_to_validity_specific": "Specific threats to validity discussed", 689 "limitations_and_scope.scope_boundaries_stated": "Explicit scope boundaries stated", 690 "data_integrity.raw_data_available": "Raw data available for verification", 691 "data_integrity.data_collection_described": "Data collection procedure described", 692 "data_integrity.recruitment_methods_described": "Participant/sample recruitment described", 693 "data_integrity.data_pipeline_documented": "Full data pipeline documented", 694 "conflicts_of_interest.funding_disclosed": "Funding source disclosed", 695 "conflicts_of_interest.affiliations_disclosed": "Author affiliations with evaluated product disclosed", 696 "conflicts_of_interest.funder_independent_of_outcome": "Funder independent of outcome", 697 "conflicts_of_interest.financial_interests_declared": "Financial interests declared", 698 "contamination.training_cutoff_stated": "Model training data cutoff stated", 699 "contamination.train_test_overlap_discussed": "Train/test overlap discussed", 700 "contamination.benchmark_contamination_addressed": "Benchmark contamination addressed", 701 "human_studies.pre_registered": "Study pre-registered", 702 "human_studies.irb_or_ethics_approval": "IRB or ethics approval mentioned", 703 "human_studies.demographics_reported": "Participant demographics reported", 704 "human_studies.inclusion_exclusion_criteria": "Inclusion/exclusion criteria stated", 705 "human_studies.randomization_described": "Randomization procedure described", 706 "human_studies.blinding_described": "Blinding described", 707 "human_studies.attrition_reported": "Participant attrition reported", 708 "cost_and_practicality.inference_cost_reported": "Inference cost or latency reported", 709 "cost_and_practicality.compute_budget_stated": "Total computational budget stated", 710 "experimental_rigor.seed_sensitivity_reported": "Results across multiple random seeds", 711 "experimental_rigor.number_of_runs_stated": "Number of experimental runs stated", 712 "experimental_rigor.hyperparameter_search_budget": "Hyperparameter search budget reported", 713 "experimental_rigor.best_config_selection_justified": "Best config selection justified", 714 "experimental_rigor.multiple_comparison_correction": "Multiple comparison correction applied", 715 "experimental_rigor.self_comparison_bias_addressed": "Self-evaluation bias acknowledged", 716 "experimental_rigor.compute_budget_vs_performance": "Performance reported vs compute budget", 717 "experimental_rigor.benchmark_construct_validity": "Benchmark construct validity discussed", 718 "experimental_rigor.scaffold_confound_addressed": "Scaffolding confound addressed", 719 "data_leakage.temporal_leakage_addressed": "Temporal leakage addressed", 720 "data_leakage.feature_leakage_addressed": "Feature leakage addressed", 721 "data_leakage.non_independence_addressed": "Train/test non-independence addressed", 722 "data_leakage.leakage_detection_method": "Concrete leakage detection method used", 723 "survey_methodology.prisma_or_structured_protocol": "PRISMA or structured review protocol", 724 "survey_methodology.quality_assessment_of_sources": "Quality assessment of source papers", 725 "survey_methodology.publication_bias_discussed": "Publication bias discussed", 726 } 727 728 q_rates = {} 729 for key, d in question_pass_counts.items(): 730 if d["applicable"] > 0: 731 q_rates[key] = { 732 "rate": round(d["passed"] / d["applicable"] * 100, 1), 733 "n": d["applicable"], 734 "desc": Q_DESCRIPTIONS.get(key, ""), 735 } 736 737 # 2. Year trends by category 738 year_cat_trends = {} 739 for y in sorted(year_cat_scores.keys()): 740 year_cat_trends[str(y)] = {} 741 for cat in ALL_CATEGORIES: 742 d = year_cat_scores[y][cat] 743 if d["applicable"] > 0: 744 year_cat_trends[str(y)][cat] = round(d["passed"] / d["applicable"] * 100, 1) 745 746 # 3. Venue & citation scoring 747 venue_stats = {} 748 for v, scores in venue_scores.items(): 749 if len(scores) >= 3: 750 venue_stats[v] = {"n": len(scores), "mean": safe_mean(scores), "median": safe_median(scores)} 751 752 cit_band_stats = {} 753 for band in ["0", "1-50", "51-500", "500+"]: 754 scores = citation_band_scores.get(band, []) 755 if scores: 756 cit_band_stats[band] = {"n": len(scores), "mean": safe_mean(scores), "median": safe_median(scores)} 757 758 # 4. Optimism-rigor inversion 759 optimism_rigor = {} 760 for key, sides in tensions.items(): 761 pos = [c["score"] for c in sides["positive"]] 762 nua = [c["score"] for c in sides["nuanced"]] 763 optimism_rigor[key] = { 764 "positive_n": len(pos), "positive_mean": safe_mean(pos), 765 "nuanced_n": len(nua), "nuanced_mean": safe_mean(nua), 766 "gap": round(safe_mean(nua) - safe_mean(pos), 1), 767 } 768 769 # 5. Quality homophily 770 threshold = 60 771 high_quality_ids = {pid for pid, sc in score_map.items() if sc >= threshold} 772 baseline_pct = round(len(high_quality_ids) / total_papers * 100, 1) if total_papers else 0 773 774 cited_high = 0 775 cited_total = 0 776 for edge in citation_data.get("edges", []): 777 s, t = edge["source"], edge["target"] 778 if s in high_quality_ids and t in score_map: 779 cited_total += 1 780 if score_map[t] >= threshold: 781 cited_high += 1 782 783 homophily = { 784 "threshold": threshold, 785 "baseline_pct": baseline_pct, 786 "high_cite_high_pct": round(cited_high / cited_total * 100, 1) if cited_total else 0, 787 "high_cite_total": cited_total, 788 } 789 790 # 6. Sampling effect (historical checkpoints + current) 791 sampling_effect = { 792 "checkpoints": [ 793 {"n": 135, "median": 53.3}, 794 {"n": 271, "median": 50.6}, 795 {"n": 467, "median": 50.0}, 796 {"n": 745, "median": 48.1}, 797 {"n": 932, "median": 47.1}, 798 {"n": total_papers, "median": round(median, 1)}, 799 ] 800 } 801 802 # 7. Benchmark monoculture 803 bench_mono = {} 804 for y in sorted(benchmark_only_by_year.keys()): 805 d = benchmark_only_by_year[y] 806 if d["total"] > 0: 807 bench_mono[str(y)] = { 808 "benchmark_only": d["benchmark_only"], 809 "total": d["total"], 810 "pct": round(d["benchmark_only"] / d["total"] * 100, 1), 811 } 812 813 # 8. Funding gap 814 funding_gap = {} 815 for group, scores in funding_groups.items(): 816 if scores: 817 funding_gap[group] = {"n": len(scores), "mean": safe_mean(scores), "median": safe_median(scores)} 818 819 # 9. Reproducibility drill-down 820 artifacts_qs = ["code_released", "data_released", "environment_specified", "reproduction_instructions"] 821 repro_detail = {} 822 for q in artifacts_qs: 823 key = f"artifacts.{q}" 824 d = question_pass_counts.get(key, {"passed": 0, "applicable": 0}) 825 if d["applicable"] > 0: 826 repro_detail[q] = {"rate": round(d["passed"] / d["applicable"] * 100, 1), "n": d["applicable"]} 827 repro_detail["full_pass_count"] = repro_count 828 repro_detail["full_pass_pct"] = round(repro_count / total_papers * 100, 1) if total_papers else 0 829 830 # 9b. Reproducibility funnel — cascading filter 831 repro_funnel = [] 832 step_papers = set(p["id"] for p in papers_full) 833 repro_funnel.append({"step": "All papers", "n": len(step_papers)}) 834 for q_name, label in [ 835 ("code_released", "Code released"), 836 ("data_released", "Data released"), 837 ("environment_specified", "Environment specified"), 838 ("reproduction_instructions", "Reproduction instructions"), 839 ]: 840 next_set = set() 841 for p in papers_full: 842 if p["id"] not in step_papers: 843 continue 844 for item in p["checklist"]: 845 if item["category"] == "artifacts" and item["question"] == q_name: 846 if item["applies"] and item["answer"]: 847 next_set.add(p["id"]) 848 elif not item["applies"]: 849 next_set.add(p["id"]) # N/A doesn't filter out 850 break 851 step_papers = next_set 852 repro_funnel.append({"step": label, "n": len(step_papers)}) 853 repro_detail["funnel"] = repro_funnel 854 855 # 9c. Methodology tag treemap 856 tag_treemap = [] 857 tag_score_map = defaultdict(list) 858 for p in papers_full: 859 for t in p["tags"]: 860 tag_score_map[t].append(p["score"]) 861 for t, scores in tag_score_map.items(): 862 tag_treemap.append({ 863 "tag": t, 864 "n": len(scores), 865 "mean": safe_mean(scores), 866 }) 867 tag_treemap.sort(key=lambda x: -x["n"]) 868 869 # 9d. Two cultures / three clusters 870 # Cluster definitions based on correlation analysis 871 cluster_defs = { 872 "Transparency & Artifacts": ["artifacts", "setup_transparency", "data_integrity"], 873 "Statistical & Experimental Rigor": ["statistical_methodology", "experimental_rigor", "claims_and_evidence"], 874 "Contamination Awareness": ["contamination", "data_leakage"], 875 } 876 # Compute mean score per cluster per paper, then inter-cluster correlations 877 cluster_vectors = defaultdict(list) # cluster_name -> [mean_scores] 878 for p in papers_full: 879 cs = p["category_scores"] 880 for cname, cats in cluster_defs.items(): 881 vals = [cs[c] for c in cats if c in cs] 882 if vals: 883 cluster_vectors[cname].append(sum(vals) / len(vals)) 884 else: 885 cluster_vectors[cname].append(None) 886 887 # Two cultures: compute overlap between human_studies and artifacts/setup 888 two_cultures_papers = [] 889 for p in papers_full: 890 cs = p["category_scores"] 891 hs = cs.get("human_studies") 892 art = cs.get("artifacts") 893 if hs is not None and art is not None: 894 two_cultures_papers.append({"human_studies": hs, "artifacts": art, "id": p["id"], "score": p["score"]}) 895 896 # 10. Category correlation matrix 897 # Collect per-paper category score vectors 898 paper_cat_vectors = [] 899 for p in papers_full: 900 cs = p["category_scores"] 901 # Convert percentage back to 0-1 for correlation 902 vec = {cat: cs[cat] / 100.0 for cat in ALL_CATEGORIES if cat in cs} 903 if len(vec) >= 5: 904 paper_cat_vectors.append(vec) 905 906 def pearson(xs, ys): 907 n = len(xs) 908 if n < 10: 909 return None 910 mx = sum(xs) / n 911 my = sum(ys) / n 912 num = sum((x - mx) * (y - my) for x, y in zip(xs, ys)) 913 dx = sum((x - mx) ** 2 for x in xs) ** 0.5 914 dy = sum((y - my) ** 2 for y in ys) ** 0.5 915 if dx == 0 or dy == 0: 916 return None 917 return num / (dx * dy) 918 919 # Only include categories with enough data 920 corr_cats = [c for c in ALL_CATEGORIES 921 if sum(1 for v in paper_cat_vectors if c in v) >= 30] 922 923 corr_matrix = [] 924 for c1 in corr_cats: 925 row = [] 926 for c2 in corr_cats: 927 xs, ys = [], [] 928 for v in paper_cat_vectors: 929 if c1 in v and c2 in v: 930 xs.append(v[c1]) 931 ys.append(v[c2]) 932 r = pearson(xs, ys) 933 row.append({"r": round(r, 3) if r is not None else None, "n": len(xs)}) 934 corr_matrix.append(row) 935 936 correlation = { 937 "categories": corr_cats, 938 "matrix": corr_matrix, 939 } 940 941 # 11. PCA scatter — project papers to 2D from category scores 942 PCA_CATS = [ 943 "artifacts", "statistical_methodology", "evaluation_design", 944 "claims_and_evidence", "setup_transparency", "limitations_and_scope", 945 "data_integrity", "conflicts_of_interest", "cost_and_practicality", 946 ] 947 948 # Collect vectors, impute missing with median 949 pca_raw = [] 950 for p in papers_full: 951 cs = p["category_scores"] 952 vec = {cat: cs[cat] / 100.0 for cat in PCA_CATS if cat in cs} 953 non_none = len(vec) 954 if non_none >= 6: 955 pca_raw.append({"id": p["id"], "scores": vec, "archetype": p["archetype"], "title": p["title"], "score": p["score"]}) 956 957 pca_medians = {} 958 for cat in PCA_CATS: 959 vals = sorted(r["scores"].get(cat, None) for r in pca_raw if r["scores"].get(cat) is not None) 960 pca_medians[cat] = vals[len(vals) // 2] if vals else 0.5 961 962 pca_vecs = [] 963 for r in pca_raw: 964 vec = [r["scores"].get(cat, pca_medians[cat]) for cat in PCA_CATS] 965 pca_vecs.append(vec) 966 967 pca_n = len(pca_vecs) 968 pca_d = len(PCA_CATS) 969 970 # Center 971 pca_means = [sum(v[j] for v in pca_vecs) / pca_n for j in range(pca_d)] 972 pca_centered = [[v[j] - pca_means[j] for j in range(pca_d)] for v in pca_vecs] 973 974 # Covariance 975 pca_cov = [[0.0] * pca_d for _ in range(pca_d)] 976 for i in range(pca_d): 977 for j in range(pca_d): 978 pca_cov[i][j] = sum(row[i] * row[j] for row in pca_centered) / (pca_n - 1) 979 980 # Power iteration for top 2 eigenvectors 981 import random as _rng 982 _rng.seed(42) 983 984 def _power_iter(mat, num_iter=300, deflate=None): 985 dd = len(mat) 986 v = [_rng.gauss(0, 1) for _ in range(dd)] 987 if deflate: 988 for dv in deflate: 989 dot = sum(v[i] * dv[i] for i in range(dd)) 990 v = [v[i] - dot * dv[i] for i in range(dd)] 991 norm = sum(x * x for x in v) ** 0.5 992 v = [x / norm for x in v] 993 for _ in range(num_iter): 994 nv = [sum(mat[i][j] * v[j] for j in range(dd)) for i in range(dd)] 995 if deflate: 996 for dv in deflate: 997 dot = sum(nv[i] * dv[i] for i in range(dd)) 998 nv = [nv[i] - dot * dv[i] for i in range(dd)] 999 norm = sum(x * x for x in nv) ** 0.5 1000 if norm == 0: 1001 break 1002 v = [x / norm for x in nv] 1003 ev = sum(sum(mat[i][j] * v[j] for j in range(dd)) * v[i] for i in range(dd)) 1004 return v, ev 1005 1006 pc1_vec, ev1 = _power_iter(pca_cov) 1007 pc2_vec, ev2 = _power_iter(pca_cov, deflate=[pc1_vec]) 1008 total_var = sum(pca_cov[i][i] for i in range(pca_d)) 1009 1010 # Project papers 1011 pca_points = [] 1012 for i, r in enumerate(pca_raw): 1013 row = pca_centered[i] 1014 x = sum(row[j] * pc1_vec[j] for j in range(pca_d)) 1015 y = sum(row[j] * pc2_vec[j] for j in range(pca_d)) 1016 pca_points.append({ 1017 "id": r["id"], 1018 "x": round(x, 4), 1019 "y": round(y, 4), 1020 "archetype": r["archetype"], 1021 "score": r["score"], 1022 }) 1023 1024 pca_result = { 1025 "points": pca_points, 1026 "categories": PCA_CATS, 1027 "pc1_loadings": [round(v, 3) for v in pc1_vec], 1028 "pc2_loadings": [round(v, 3) for v in pc2_vec], 1029 "pc1_variance_pct": round(ev1 / total_var * 100, 1), 1030 "pc2_variance_pct": round(ev2 / total_var * 100, 1), 1031 } 1032 1033 # 12. HN social attention analysis 1034 hn_papers = [] 1035 for p in papers_full: 1036 hn_pts = p.get("hn_points", 0) 1037 hn_papers.append({"id": p["id"], "title": p["title"], "score": p["score"], "hn_points": hn_pts}) 1038 1039 hn_with_attention = [p for p in hn_papers if p["hn_points"] > 0] 1040 hn_without = [p for p in hn_papers if p["hn_points"] == 0] 1041 1042 # Correlation 1043 hn_corr = None 1044 if len(hn_with_attention) >= 10: 1045 import math as _math 1046 xs = [p["hn_points"] for p in hn_with_attention] 1047 ys = [p["score"] for p in hn_with_attention] 1048 n_hn = len(xs) 1049 mx, my = sum(xs) / n_hn, sum(ys) / n_hn 1050 num = sum((x - mx) * (y - my) for x, y in zip(xs, ys)) 1051 dxx = _math.sqrt(sum((x - mx) ** 2 for x in xs)) 1052 dyy = _math.sqrt(sum((y - my) ** 2 for y in ys)) 1053 hn_corr = round(num / (dxx * dyy), 3) if dxx and dyy else 0 1054 1055 # Hidden gems: high methodology, low attention 1056 hidden_gems = sorted( 1057 [p for p in hn_papers if p["score"] >= 65 and p["hn_points"] <= 5], 1058 key=lambda p: -p["score"] 1059 )[:15] 1060 1061 # Most hyped but weak 1062 overhyped = sorted( 1063 [p for p in hn_with_attention if p["score"] < 40 and p["hn_points"] >= 30], 1064 key=lambda p: -p["hn_points"] 1065 )[:15] 1066 1067 # Engagement factor correlations with HN (v3 papers only) 1068 ENGAGEMENT_DIMS = ["practical_relevance", "surprise_contrarian", "fear_safety", 1069 "drama_conflict", "demo_ability", "brand_recognition"] 1070 engagement_corrs = {} 1071 v3_hn_papers = [] 1072 for p in papers_full: 1073 ef = p.get("engagement_factors") 1074 hn_pts = p.get("hn_points", 0) 1075 if ef and hn_pts > 0: 1076 scores = {dim: ef[dim]["score"] for dim in ENGAGEMENT_DIMS if dim in ef} 1077 if len(scores) == 6: 1078 v3_hn_papers.append({"hn": hn_pts, **scores}) 1079 1080 if len(v3_hn_papers) >= 10: 1081 import math as _math 1082 log_hn = [_math.log(p["hn"] + 1) for p in v3_hn_papers] 1083 for dim in ENGAGEMENT_DIMS: 1084 vals = [p[dim] for p in v3_hn_papers] 1085 n_ef = len(vals) 1086 mx, my = sum(log_hn) / n_ef, sum(vals) / n_ef 1087 num = sum((x - mx) * (y - my) for x, y in zip(log_hn, vals)) 1088 dxx = _math.sqrt(sum((x - mx) ** 2 for x in log_hn)) 1089 dyy = _math.sqrt(sum((y - my) ** 2 for y in vals)) 1090 engagement_corrs[dim] = round(num / (dxx * dyy), 3) if dxx and dyy else 0 1091 1092 # Engagement factor means for high-HN vs low-HN 1093 engagement_split = {} 1094 if v3_hn_papers: 1095 median_hn = sorted(p["hn"] for p in v3_hn_papers)[len(v3_hn_papers) // 2] 1096 high_hn = [p for p in v3_hn_papers if p["hn"] > median_hn] 1097 low_hn = [p for p in v3_hn_papers if p["hn"] <= median_hn] 1098 for dim in ENGAGEMENT_DIMS: 1099 engagement_split[dim] = { 1100 "high_hn_mean": round(sum(p[dim] for p in high_hn) / len(high_hn), 2) if high_hn else 0, 1101 "low_hn_mean": round(sum(p[dim] for p in low_hn) / len(low_hn), 2) if low_hn else 0, 1102 } 1103 1104 # Scatter points for HN vs methodology (log scale, papers with HN > 0) 1105 import math as _math 1106 hn_scatter = [{"id": p["id"], "hn": p["hn_points"], "score": p["score"], 1107 "log_hn": round(_math.log(p["hn_points"] + 1), 2)} 1108 for p in hn_with_attention] 1109 1110 # Tag comparison: avg HN attention + avg methodology per tag 1111 tag_hn_comparison = {} 1112 tag_groups = defaultdict(list) 1113 for p in papers_full: 1114 hn_pts = p.get("hn_points", 0) 1115 for t in p["tags"]: 1116 tag_groups[t].append({"hn": hn_pts, "score": p["score"]}) 1117 for tag, ps in tag_groups.items(): 1118 on_hn = [p for p in ps if p["hn"] > 0] 1119 if len(on_hn) >= 5: 1120 tag_hn_comparison[tag] = { 1121 "n": len(on_hn), 1122 "mean_hn": round(sum(p["hn"] for p in on_hn) / len(on_hn), 1), 1123 "mean_score": round(sum(p["score"] for p in on_hn) / len(on_hn), 1), 1124 } 1125 1126 # Repost signal: quality by number of HN threads 1127 repost_bands = {} 1128 for label, lo, hi in [("1 post", 1, 1), ("2-3", 2, 3), ("4-7", 4, 7), ("8+", 8, 999)]: 1129 band_papers = [p for p in hn_with_attention 1130 if lo <= len(load_hn(p["id"]).get("threads", [])) <= hi] 1131 if band_papers: 1132 repost_bands[label] = { 1133 "n": len(band_papers), 1134 "mean_score": safe_mean([p["score"] for p in band_papers]), 1135 "mean_hn": safe_mean([p["hn_points"] for p in band_papers]), 1136 } 1137 1138 # Controversy signal: comment-to-point ratio vs quality 1139 controversy = {} 1140 comment_papers = [p for p in hn_with_attention if p["hn_points"] >= 10] 1141 if comment_papers: 1142 for p in comment_papers: 1143 hn_d = load_hn(p["id"]) 1144 p["_total_comments"] = sum(t.get("comments", 0) for t in hn_d.get("threads", [])) 1145 p["_cpt"] = p["_total_comments"] / p["hn_points"] if p["hn_points"] else 0 1146 med_cpt = sorted(p["_cpt"] for p in comment_papers)[len(comment_papers) // 2] 1147 high_disc = [p for p in comment_papers if p["_cpt"] > med_cpt] 1148 low_disc = [p for p in comment_papers if p["_cpt"] <= med_cpt] 1149 controversy = { 1150 "high_discussion_mean": safe_mean([p["score"] for p in high_disc]), 1151 "low_discussion_mean": safe_mean([p["score"] for p in low_disc]), 1152 "high_n": len(high_disc), 1153 "low_n": len(low_disc), 1154 } 1155 1156 hn_analysis = { 1157 "total_with_hn": len(hn_with_attention), 1158 "total_without_hn": len(hn_without), 1159 "correlation": hn_corr, 1160 "with_attention_mean": safe_mean([p["score"] for p in hn_with_attention]) if hn_with_attention else 0, 1161 "without_attention_mean": safe_mean([p["score"] for p in hn_without]) if hn_without else 0, 1162 "top_hn": sorted(hn_with_attention, key=lambda p: -p["hn_points"])[:20], 1163 "hidden_gems": hidden_gems, 1164 "overhyped": overhyped, 1165 "scatter": hn_scatter, 1166 "tag_comparison": tag_hn_comparison, 1167 "repost_signal": repost_bands, 1168 "controversy": controversy, 1169 "engagement_correlations": engagement_corrs, 1170 "engagement_split": engagement_split, 1171 "engagement_n": len(v3_hn_papers), 1172 } 1173 1174 findings = { 1175 "question_rates": q_rates, 1176 "year_category_trends": year_cat_trends, 1177 "venue_stats": venue_stats, 1178 "citation_band_stats": cit_band_stats, 1179 "optimism_rigor": optimism_rigor, 1180 "homophily": homophily, 1181 "sampling_effect": sampling_effect, 1182 "benchmark_monoculture": bench_mono, 1183 "funding_gap": funding_gap, 1184 "repro_detail": repro_detail, 1185 "game_pcts": game_pcts, 1186 "correlation": correlation, 1187 "pca": pca_result, 1188 "tag_treemap": tag_treemap, 1189 "two_cultures": two_cultures_papers, 1190 "hn_analysis": hn_analysis, 1191 } 1192 1193 # --- Citation network (built from cited_papers in scan.json) --- 1194 v2_ids = {p["id"] for p in papers_full} 1195 year_map = {p["id"]: p["year"] for p in papers_full} 1196 title_map = {p["id"]: p["title"] for p in papers_full} 1197 1198 # Build title→id index from registry (case-insensitive) 1199 title_to_id = {} 1200 for entry in registry.values(): 1201 t = entry.get("title", "").lower().strip() 1202 if t: 1203 title_to_id[t] = entry["id"] 1204 for p in papers_full: 1205 t = p["title"].lower().strip() 1206 if t: 1207 title_to_id[t] = p["id"] 1208 1209 # Extract directed edges from cited_papers 1210 net_edge_set = set() 1211 for p in papers_full: 1212 src = p["id"] 1213 detail = paper_details.get(src, {}) 1214 for item in detail.get("checklist", []): 1215 pass # checklist doesn't have cited_papers 1216 # Read cited_papers from the original scan.json 1217 scan_path = PAPERS_DIR / src / "scan.json" 1218 if scan_path.exists(): 1219 with open(scan_path) as f: 1220 scan_data = json.load(f) 1221 for cited in scan_data.get("cited_papers", []): 1222 ct = cited.get("title", "").lower().strip() 1223 target = title_to_id.get(ct) 1224 if target and target != src: 1225 net_edge_set.add((src, target)) 1226 1227 net_edges = [[s, t] for s, t in net_edge_set] 1228 in_degree = Counter(t for _, t in net_edge_set) 1229 out_degree = Counter(s for s, _ in net_edge_set) 1230 1231 # Collect all node IDs that appear in edges 1232 all_net_ids = set() 1233 for s, t in net_edge_set: 1234 all_net_ids.add(s) 1235 all_net_ids.add(t) 1236 1237 net_nodes = [] 1238 for nid in sorted(all_net_ids): 1239 reg = registry.get(nid, {}) 1240 net_nodes.append({ 1241 "id": nid, 1242 "title": title_map.get(nid, reg.get("title", nid)), 1243 "score": score_map.get(nid), 1244 "year": year_map.get(nid, reg.get("year")), 1245 "in_degree": in_degree.get(nid, 0), 1246 "out_degree": out_degree.get(nid, 0), 1247 "has_scan": nid in v2_ids, 1248 }) 1249 1250 network = {"nodes": net_nodes, "edges": net_edges} 1251 1252 # --- Network findings --- 1253 # Foundational leaderboard: top 15 most-cited with scores 1254 foundational = [] 1255 for nid, deg in in_degree.most_common(20): 1256 foundational.append({ 1257 "id": nid, 1258 "title": title_map.get(nid, registry.get(nid, {}).get("title", nid)), 1259 "in_degree": deg, 1260 "score": score_map.get(nid), 1261 }) 1262 1263 # Quality contagion: mean score by % of high-quality citations 1264 contagion_threshold = 50 1265 high_q = {pid for pid, sc in score_map.items() if sc >= contagion_threshold} 1266 contagion_bands = {"0%": [], "1-33%": [], "34-66%": [], "67-100%": []} 1267 out_map = defaultdict(list) 1268 for s, t in net_edge_set: 1269 out_map[s].append(t) 1270 for pid in score_map: 1271 cited = [t for t in out_map.get(pid, []) if t in score_map] 1272 if len(cited) < 2: 1273 continue 1274 pct = sum(1 for t in cited if t in high_q) / len(cited) * 100 1275 if pct == 0: 1276 band = "0%" 1277 elif pct <= 33: 1278 band = "1-33%" 1279 elif pct <= 66: 1280 band = "34-66%" 1281 else: 1282 band = "67-100%" 1283 contagion_bands[band].append(score_map[pid]) 1284 1285 quality_contagion = {} 1286 for band_name in ["0%", "1-33%", "34-66%", "67-100%"]: 1287 ss = contagion_bands[band_name] 1288 if ss: 1289 quality_contagion[band_name] = {"n": len(ss), "mean": safe_mean(ss)} 1290 1291 # Rigor diffusion: for top cited papers, mean score of their citers 1292 in_map = defaultdict(list) 1293 for s, t in net_edge_set: 1294 in_map[t].append(s) 1295 rigor_diffusion = [] 1296 for nid, deg in in_degree.most_common(15): 1297 citers = [s for s in in_map[nid] if s in score_map] 1298 rigor_diffusion.append({ 1299 "id": nid, 1300 "title": title_map.get(nid, registry.get(nid, {}).get("title", nid)), 1301 "score": score_map.get(nid), 1302 "in_degree": deg, 1303 "citer_mean": safe_mean([score_map[c] for c in citers]) if citers else None, 1304 "citer_n": len(citers), 1305 }) 1306 1307 findings["network_insights"] = { 1308 "foundational": foundational, 1309 "quality_contagion": quality_contagion, 1310 "rigor_diffusion": rigor_diffusion, 1311 } 1312 1313 # --- Write files --- 1314 OUTPUT_DIR.mkdir(parents=True, exist_ok=True) 1315 papers_detail_dir = OUTPUT_DIR / "papers" 1316 papers_detail_dir.mkdir(parents=True, exist_ok=True) 1317 1318 # Add unscanned registry entries to papers-index 1319 scanned_ids = {p["id"] for p in papers_index} 1320 for entry in registry.values(): 1321 if entry["id"] in scanned_ids: 1322 continue 1323 if entry.get("status") == "excluded": 1324 continue 1325 papers_index.append({ 1326 "id": entry["id"], 1327 "title": entry.get("title", entry["id"]), 1328 "year": entry.get("year"), 1329 "venue": entry.get("venue", ""), 1330 "tags": entry.get("tags", []), 1331 "score": None, 1332 "archetype": None, 1333 "games": [], 1334 "arxiv_id": entry.get("arxiv_id", ""), 1335 "doi": entry.get("doi", ""), 1336 "code_url": None, 1337 "dna": None, 1338 "paper_type": None, 1339 "hn_points": 0, 1340 "engagement": None, 1341 }) 1342 1343 write_json(OUTPUT_DIR / "dashboard.json", dashboard) 1344 write_json(OUTPUT_DIR / "findings.json", findings) 1345 write_json(OUTPUT_DIR / "papers-index.json", papers_index) 1346 write_json(OUTPUT_DIR / "network.json", network) 1347 write_json(OUTPUT_DIR / "tensions.json", tensions) 1348 write_json(OUTPUT_DIR / "calibration.json", {"papers": calibration_papers}) 1349 write_json(OUTPUT_DIR / "benchmarks.json", {"papers": benchmark_papers}) 1350 1351 for slug, detail in paper_details.items(): 1352 write_json(papers_detail_dir / f"{slug}.json", detail) 1353 1354 # Full monolith 1355 explorer = { 1356 "generated": "2026-03-21", 1357 "papers": papers_full, 1358 "agg": dashboard, 1359 "findings": findings, 1360 "tensions": tensions, 1361 "citation_network": network, 1362 } 1363 write_json(OUTPUT_DIR / "explorer.json", explorer) 1364 1365 # Report 1366 dash_size = (OUTPUT_DIR / "dashboard.json").stat().st_size 1367 find_size = (OUTPUT_DIR / "findings.json").stat().st_size 1368 idx_size = (OUTPUT_DIR / "papers-index.json").stat().st_size 1369 net_size = (OUTPUT_DIR / "network.json").stat().st_size 1370 tens_size = (OUTPUT_DIR / "tensions.json").stat().st_size 1371 full_size = (OUTPUT_DIR / "explorer.json").stat().st_size 1372 code_url_count = sum(1 for p in papers_full if p.get("code_url")) 1373 print(f"Papers: {total_papers}") 1374 print(f"Median score: {median:.1f}%") 1375 print(f"Code URLs extracted: {code_url_count}") 1376 print(f"Network: {len(net_nodes)} nodes, {len(net_edges)} edges") 1377 print(f"Files:") 1378 print(f" dashboard.json: {dash_size:>8,} bytes") 1379 print(f" findings.json: {find_size:>8,} bytes") 1380 print(f" papers-index.json: {idx_size:>8,} bytes") 1381 print(f" papers/*.json: {len(paper_details):>5} files") 1382 print(f" network.json: {net_size:>8,} bytes") 1383 print(f" tensions.json: {tens_size:>8,} bytes") 1384 print(f" explorer.json: {full_size:>8,} bytes") 1385 1386 1387 if __name__ == "__main__": 1388 build()