early-findings.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

early-findings.json (25947B)
      1 {
      2   "n_papers": 561,
      3   "per_question": {
      4     "artifacts.code_released": {
      5       "applies_count": 547,
      6       "applies_rate": 0.975,
      7       "yes_count": 295,
      8       "compliance_rate": 0.5393
      9     },
     10     "artifacts.data_released": {
     11       "applies_count": 544,
     12       "applies_rate": 0.9697,
     13       "yes_count": 347,
     14       "compliance_rate": 0.6379
     15     },
     16     "artifacts.environment_specified": {
     17       "applies_count": 511,
     18       "applies_rate": 0.9109,
     19       "yes_count": 48,
     20       "compliance_rate": 0.0939
     21     },
     22     "artifacts.reproduction_instructions": {
     23       "applies_count": 534,
     24       "applies_rate": 0.9519,
     25       "yes_count": 30,
     26       "compliance_rate": 0.0562
     27     },
     28     "statistical_methodology.confidence_intervals_or_error_bars": {
     29       "applies_count": 485,
     30       "applies_rate": 0.8645,
     31       "yes_count": 102,
     32       "compliance_rate": 0.2103
     33     },
     34     "statistical_methodology.significance_tests": {
     35       "applies_count": 474,
     36       "applies_rate": 0.8449,
     37       "yes_count": 92,
     38       "compliance_rate": 0.1941
     39     },
     40     "statistical_methodology.effect_sizes_reported": {
     41       "applies_count": 478,
     42       "applies_rate": 0.852,
     43       "yes_count": 342,
     44       "compliance_rate": 0.7155
     45     },
     46     "statistical_methodology.sample_size_justified": {
     47       "applies_count": 488,
     48       "applies_rate": 0.8699,
     49       "yes_count": 22,
     50       "compliance_rate": 0.0451
     51     },
     52     "statistical_methodology.variance_reported": {
     53       "applies_count": 482,
     54       "applies_rate": 0.8592,
     55       "yes_count": 104,
     56       "compliance_rate": 0.2158
     57     },
     58     "evaluation_design.baselines_included": {
     59       "applies_count": 521,
     60       "applies_rate": 0.9287,
     61       "yes_count": 458,
     62       "compliance_rate": 0.8791
     63     },
     64     "evaluation_design.baselines_contemporary": {
     65       "applies_count": 507,
     66       "applies_rate": 0.9037,
     67       "yes_count": 399,
     68       "compliance_rate": 0.787
     69     },
     70     "evaluation_design.ablation_study": {
     71       "applies_count": 452,
     72       "applies_rate": 0.8057,
     73       "yes_count": 335,
     74       "compliance_rate": 0.7412
     75     },
     76     "evaluation_design.multiple_metrics": {
     77       "applies_count": 491,
     78       "applies_rate": 0.8752,
     79       "yes_count": 415,
     80       "compliance_rate": 0.8452
     81     },
     82     "evaluation_design.human_evaluation": {
     83       "applies_count": 367,
     84       "applies_rate": 0.6542,
     85       "yes_count": 151,
     86       "compliance_rate": 0.4114
     87     },
     88     "evaluation_design.held_out_test_set": {
     89       "applies_count": 361,
     90       "applies_rate": 0.6435,
     91       "yes_count": 266,
     92       "compliance_rate": 0.7368
     93     },
     94     "evaluation_design.per_category_breakdown": {
     95       "applies_count": 539,
     96       "applies_rate": 0.9608,
     97       "yes_count": 481,
     98       "compliance_rate": 0.8924
     99     },
    100     "evaluation_design.failure_cases_discussed": {
    101       "applies_count": 555,
    102       "applies_rate": 0.9893,
    103       "yes_count": 460,
    104       "compliance_rate": 0.8288
    105     },
    106     "evaluation_design.negative_results_reported": {
    107       "applies_count": 545,
    108       "applies_rate": 0.9715,
    109       "yes_count": 482,
    110       "compliance_rate": 0.8844
    111     },
    112     "claims_and_evidence.abstract_claims_supported": {
    113       "applies_count": 561,
    114       "applies_rate": 1.0,
    115       "yes_count": 513,
    116       "compliance_rate": 0.9144
    117     },
    118     "claims_and_evidence.causal_claims_justified": {
    119       "applies_count": 505,
    120       "applies_rate": 0.9002,
    121       "yes_count": 324,
    122       "compliance_rate": 0.6416
    123     },
    124     "claims_and_evidence.generalization_bounded": {
    125       "applies_count": 561,
    126       "applies_rate": 1.0,
    127       "yes_count": 200,
    128       "compliance_rate": 0.3565
    129     },
    130     "claims_and_evidence.alternative_explanations_discussed": {
    131       "applies_count": 534,
    132       "applies_rate": 0.9519,
    133       "yes_count": 189,
    134       "compliance_rate": 0.3539
    135     },
    136     "setup_transparency.model_versions_specified": {
    137       "applies_count": 473,
    138       "applies_rate": 0.8431,
    139       "yes_count": 182,
    140       "compliance_rate": 0.3848
    141     },
    142     "setup_transparency.prompts_provided": {
    143       "applies_count": 415,
    144       "applies_rate": 0.7398,
    145       "yes_count": 268,
    146       "compliance_rate": 0.6458
    147     },
    148     "setup_transparency.hyperparameters_reported": {
    149       "applies_count": 483,
    150       "applies_rate": 0.861,
    151       "yes_count": 278,
    152       "compliance_rate": 0.5756
    153     },
    154     "setup_transparency.scaffolding_described": {
    155       "applies_count": 238,
    156       "applies_rate": 0.4242,
    157       "yes_count": 224,
    158       "compliance_rate": 0.9412
    159     },
    160     "setup_transparency.data_preprocessing_documented": {
    161       "applies_count": 529,
    162       "applies_rate": 0.943,
    163       "yes_count": 403,
    164       "compliance_rate": 0.7618
    165     },
    166     "limitations_and_scope.limitations_section_present": {
    167       "applies_count": 561,
    168       "applies_rate": 1.0,
    169       "yes_count": 358,
    170       "compliance_rate": 0.6381
    171     },
    172     "limitations_and_scope.threats_to_validity_specific": {
    173       "applies_count": 561,
    174       "applies_rate": 1.0,
    175       "yes_count": 301,
    176       "compliance_rate": 0.5365
    177     },
    178     "limitations_and_scope.scope_boundaries_stated": {
    179       "applies_count": 561,
    180       "applies_rate": 1.0,
    181       "yes_count": 219,
    182       "compliance_rate": 0.3904
    183     },
    184     "data_integrity.raw_data_available": {
    185       "applies_count": 535,
    186       "applies_rate": 0.9537,
    187       "yes_count": 216,
    188       "compliance_rate": 0.4037
    189     },
    190     "data_integrity.data_collection_described": {
    191       "applies_count": 541,
    192       "applies_rate": 0.9643,
    193       "yes_count": 469,
    194       "compliance_rate": 0.8669
    195     },
    196     "data_integrity.recruitment_methods_described": {
    197       "applies_count": 112,
    198       "applies_rate": 0.1996,
    199       "yes_count": 50,
    200       "compliance_rate": 0.4464
    201     },
    202     "data_integrity.data_pipeline_documented": {
    203       "applies_count": 536,
    204       "applies_rate": 0.9554,
    205       "yes_count": 400,
    206       "compliance_rate": 0.7463
    207     },
    208     "conflicts_of_interest.funding_disclosed": {
    209       "applies_count": 560,
    210       "applies_rate": 0.9982,
    211       "yes_count": 183,
    212       "compliance_rate": 0.3268
    213     },
    214     "conflicts_of_interest.affiliations_disclosed": {
    215       "applies_count": 561,
    216       "applies_rate": 1.0,
    217       "yes_count": 555,
    218       "compliance_rate": 0.9893
    219     },
    220     "conflicts_of_interest.funder_independent_of_outcome": {
    221       "applies_count": 509,
    222       "applies_rate": 0.9073,
    223       "yes_count": 157,
    224       "compliance_rate": 0.3084
    225     },
    226     "conflicts_of_interest.financial_interests_declared": {
    227       "applies_count": 561,
    228       "applies_rate": 1.0,
    229       "yes_count": 23,
    230       "compliance_rate": 0.041
    231     },
    232     "contamination.training_cutoff_stated": {
    233       "applies_count": 327,
    234       "applies_rate": 0.5829,
    235       "yes_count": 23,
    236       "compliance_rate": 0.0703
    237     },
    238     "contamination.train_test_overlap_discussed": {
    239       "applies_count": 328,
    240       "applies_rate": 0.5847,
    241       "yes_count": 95,
    242       "compliance_rate": 0.2896
    243     },
    244     "contamination.benchmark_contamination_addressed": {
    245       "applies_count": 323,
    246       "applies_rate": 0.5758,
    247       "yes_count": 82,
    248       "compliance_rate": 0.2539
    249     },
    250     "human_studies.pre_registered": {
    251       "applies_count": 79,
    252       "applies_rate": 0.1408,
    253       "yes_count": 1,
    254       "compliance_rate": 0.0127
    255     },
    256     "human_studies.irb_or_ethics_approval": {
    257       "applies_count": 79,
    258       "applies_rate": 0.1408,
    259       "yes_count": 10,
    260       "compliance_rate": 0.1266
    261     },
    262     "human_studies.demographics_reported": {
    263       "applies_count": 80,
    264       "applies_rate": 0.1426,
    265       "yes_count": 34,
    266       "compliance_rate": 0.425
    267     },
    268     "human_studies.inclusion_exclusion_criteria": {
    269       "applies_count": 80,
    270       "applies_rate": 0.1426,
    271       "yes_count": 23,
    272       "compliance_rate": 0.2875
    273     },
    274     "human_studies.randomization_described": {
    275       "applies_count": 29,
    276       "applies_rate": 0.0517,
    277       "yes_count": 14,
    278       "compliance_rate": 0.4828
    279     },
    280     "human_studies.blinding_described": {
    281       "applies_count": 47,
    282       "applies_rate": 0.0838,
    283       "yes_count": 22,
    284       "compliance_rate": 0.4681
    285     },
    286     "human_studies.attrition_reported": {
    287       "applies_count": 77,
    288       "applies_rate": 0.1373,
    289       "yes_count": 15,
    290       "compliance_rate": 0.1948
    291     },
    292     "cost_and_practicality.inference_cost_reported": {
    293       "applies_count": 466,
    294       "applies_rate": 0.8307,
    295       "yes_count": 156,
    296       "compliance_rate": 0.3348
    297     },
    298     "cost_and_practicality.compute_budget_stated": {
    299       "applies_count": 470,
    300       "applies_rate": 0.8378,
    301       "yes_count": 96,
    302       "compliance_rate": 0.2043
    303     }
    304   },
    305   "per_category": {
    306     "artifacts": {
    307       "mean": 0.3297,
    308       "median": 0.25,
    309       "std": 0.2654,
    310       "n_papers_with_applicable": 548
    311     },
    312     "statistical_methodology": {
    313       "mean": 0.2727,
    314       "median": 0.2,
    315       "std": 0.2416,
    316       "n_papers_with_applicable": 490
    317     },
    318     "evaluation_design": {
    319       "mean": 0.7924,
    320       "median": 0.875,
    321       "std": 0.2411,
    322       "n_papers_with_applicable": 556
    323     },
    324     "claims_and_evidence": {
    325       "mean": 0.5735,
    326       "median": 0.5,
    327       "std": 0.2986,
    328       "n_papers_with_applicable": 561
    329     },
    330     "setup_transparency": {
    331       "mean": 0.6194,
    332       "median": 0.6,
    333       "std": 0.3106,
    334       "n_papers_with_applicable": 539
    335     },
    336     "limitations_and_scope": {
    337       "mean": 0.5217,
    338       "median": 0.6667,
    339       "std": 0.4164,
    340       "n_papers_with_applicable": 561
    341     },
    342     "data_integrity": {
    343       "mean": 0.6548,
    344       "median": 0.6667,
    345       "std": 0.3195,
    346       "n_papers_with_applicable": 542
    347     },
    348     "conflicts_of_interest": {
    349       "mean": 0.4186,
    350       "median": 0.25,
    351       "std": 0.238,
    352       "n_papers_with_applicable": 561
    353     },
    354     "contamination": {
    355       "mean": 0.2053,
    356       "median": 0.0,
    357       "std": 0.3347,
    358       "n_papers_with_applicable": 328
    359     },
    360     "human_studies": {
    361       "mean": 0.2555,
    362       "median": 0.2,
    363       "std": 0.2302,
    364       "n_papers_with_applicable": 80
    365     },
    366     "cost_and_practicality": {
    367       "mean": 0.2696,
    368       "median": 0.0,
    369       "std": 0.3697,
    370       "n_papers_with_applicable": 471
    371     }
    372   },
    373   "paper_score_distribution": {
    374     "mean": 0.5093,
    375     "median": 0.5278,
    376     "std": 0.174,
    377     "min": 0.0238,
    378     "max": 0.878,
    379     "q25": 0.4146,
    380     "q75": 0.625
    381   },
    382   "lowest_10": [
    383     {
    384       "slug": "automating-rest-api-2024",
    385       "year": 2024,
    386       "score": 0.0238,
    387       "applicable": 42,
    388       "satisfied": 1,
    389       "title": "Automating REST API Postman Test Cases Using LLM",
    390       "methodology_tags": [
    391         "case-study"
    392       ]
    393     },
    394     {
    395       "slug": "chatofthought-collaborative-multiagent-2025",
    396       "year": 2025,
    397       "score": 0.0256,
    398       "applicable": 39,
    399       "satisfied": 1,
    400       "title": "Chat-of-Thought: Collaborative Multi-Agent System for Generating Domain Specific Information",
    401       "methodology_tags": [
    402         "case-study"
    403       ]
    404     },
    405     {
    406       "slug": "aidriven-software-engineering-2023",
    407       "year": 2023,
    408       "score": 0.027,
    409       "applicable": 37,
    410       "satisfied": 1,
    411       "title": "AI-driven software engineering",
    412       "methodology_tags": [
    413         "qualitative",
    414         "observational"
    415       ]
    416     },
    417     {
    418       "slug": "aiassisted-code-editors-2025",
    419       "year": 2025,
    420       "score": 0.0435,
    421       "applicable": 23,
    422       "satisfied": 1,
    423       "title": "AI-Assisted Code Editors with Real-Time Collaboration: A Comprehensive Review",
    424       "methodology_tags": [
    425         "meta-analysis"
    426       ]
    427     },
    428     {
    429       "slug": "ai-agents-software-2025",
    430       "year": 2025,
    431       "score": 0.0526,
    432       "applicable": 38,
    433       "satisfied": 2,
    434       "title": "AI Agents in Software Engineering Optimizing Software Development Processes and Enhancing Security Management in Learning Management Systems",
    435       "methodology_tags": [
    436         "theoretical"
    437       ]
    438     },
    439     {
    440       "slug": "attacking-llms-ai-2025",
    441       "year": 2025,
    442       "score": 0.0526,
    443       "applicable": 38,
    444       "satisfied": 2,
    445       "title": "Attacking LLMs and AI Agents: Advertisement Embedding Attacks Against Large Language Models",
    446       "methodology_tags": [
    447         "case-study"
    448       ]
    449     },
    450     {
    451       "slug": "breaking-prompt-wall-2025",
    452       "year": 2025,
    453       "score": 0.0556,
    454       "applicable": 36,
    455       "satisfied": 2,
    456       "title": "Breaking the Prompt Wall (I): A Real-World Case Study of Attacking ChatGPT via Lightweight Prompt Injection",
    457       "methodology_tags": [
    458         "case-study"
    459       ]
    460     },
    461     {
    462       "slug": "agentic-ai-software-2025",
    463       "year": 2025,
    464       "score": 0.0667,
    465       "applicable": 15,
    466       "satisfied": 1,
    467       "title": "Agentic AI for Software: thoughts from Software Engineering community",
    468       "methodology_tags": [
    469         "theoretical",
    470         "case-study"
    471       ]
    472     },
    473     {
    474       "slug": "aipowered-code-review-2024",
    475       "year": 2024,
    476       "score": 0.0769,
    477       "applicable": 39,
    478       "satisfied": 3,
    479       "title": "AI-powered Code Review with LLMs: Early Results",
    480       "methodology_tags": [
    481         "case-study"
    482       ]
    483     },
    484     {
    485       "slug": "aipowered-software-development-2025-2",
    486       "year": 2025,
    487       "score": 0.0769,
    488       "applicable": 13,
    489       "satisfied": 1,
    490       "title": "AI-Powered Software Development Life Cycle: From Requirements to Maintenance",
    491       "methodology_tags": [
    492         "theoretical"
    493       ]
    494     }
    495   ],
    496   "highest_10": [
    497     {
    498       "slug": "concrete-roadmap-safety-2025",
    499       "year": 2025,
    500       "score": 0.8333,
    501       "applicable": 12,
    502       "satisfied": 10,
    503       "title": "A Concrete Roadmap towards Safety Cases based on Chain-of-Thought Monitoring",
    504       "methodology_tags": [
    505         "theoretical"
    506       ]
    507     },
    508     {
    509       "slug": "ai-alignment-strategies-2025",
    510       "year": 2025,
    511       "score": 0.8462,
    512       "applicable": 13,
    513       "satisfied": 11,
    514       "title": "AI Alignment Strategies from a Risk Perspective: Independent Safety Mechanisms or Shared Failures?",
    515       "methodology_tags": [
    516         "theoretical"
    517       ]
    518     },
    519     {
    520       "slug": "ai-testing-should-2025",
    521       "year": 2025,
    522       "score": 0.8462,
    523       "applicable": 13,
    524       "satisfied": 11,
    525       "title": "AI Testing Should Account for Sophisticated Strategic Behaviour",
    526       "methodology_tags": [
    527         "theoretical"
    528       ]
    529     },
    530     {
    531       "slug": "alphacode-competition-level-2022",
    532       "year": 2022,
    533       "score": 0.8571,
    534       "applicable": 42,
    535       "satisfied": 36,
    536       "title": "Competition-Level Code Generation with AlphaCode",
    537       "methodology_tags": [
    538         "benchmark-eval"
    539       ]
    540     },
    541     {
    542       "slug": "appworld-controllable-world-2024",
    543       "year": 2024,
    544       "score": 0.8571,
    545       "applicable": 42,
    546       "satisfied": 36,
    547       "title": "AppWorld: A Controllable World of Apps and People for Benchmarking Interactive Coding Agents",
    548       "methodology_tags": [
    549         "benchmark-eval"
    550       ]
    551     },
    552     {
    553       "slug": "agentdojo-dynamic-environment-2024",
    554       "year": 2024,
    555       "score": 0.8611,
    556       "applicable": 36,
    557       "satisfied": 31,
    558       "title": "AgentDojo: A Dynamic Environment to Evaluate Prompt Injection Attacks and Defenses for LLM Agents",
    559       "methodology_tags": [
    560         "benchmark-eval"
    561       ]
    562     },
    563     {
    564       "slug": "attention-pruning-automated-2025",
    565       "year": 2025,
    566       "score": 0.8684,
    567       "applicable": 38,
    568       "satisfied": 33,
    569       "title": "Attention Pruning: Automated Fairness Repair of Language Models via Surrogate Simulated Annealing",
    570       "methodology_tags": [
    571         "benchmark-eval"
    572       ]
    573     },
    574     {
    575       "slug": "bridging-mde-ai-2023",
    576       "year": 2023,
    577       "score": 0.8696,
    578       "applicable": 23,
    579       "satisfied": 20,
    580       "title": "Bridging MDE and AI: A Systematic Review of Domain-Specific Languages and Model-Driven Practices in AI Software Systems Engineering",
    581       "methodology_tags": [
    582         "meta-analysis"
    583       ]
    584     },
    585     {
    586       "slug": "annotation-alignment-comparing-2024",
    587       "year": 2024,
    588       "score": 0.8718,
    589       "applicable": 39,
    590       "satisfied": 34,
    591       "title": "Annotation alignment: Comparing LLM and human annotations of conversational safety",
    592       "methodology_tags": [
    593         "benchmark-eval",
    594         "observational"
    595       ]
    596     },
    597     {
    598       "slug": "assessing-latent-automated-2024",
    599       "year": 2024,
    600       "score": 0.878,
    601       "applicable": 41,
    602       "satisfied": 36,
    603       "title": "Assessing the Latent Automated Program Repair Capabilities of Large Language Models using Round-Trip Translation",
    604       "methodology_tags": [
    605         "benchmark-eval"
    606       ]
    607     }
    608   ],
    609   "lowest_compliance_questions": [
    610     [
    611       "human_studies.pre_registered",
    612       {
    613         "applies_count": 79,
    614         "applies_rate": 0.1408,
    615         "yes_count": 1,
    616         "compliance_rate": 0.0127
    617       }
    618     ],
    619     [
    620       "conflicts_of_interest.financial_interests_declared",
    621       {
    622         "applies_count": 561,
    623         "applies_rate": 1.0,
    624         "yes_count": 23,
    625         "compliance_rate": 0.041
    626       }
    627     ],
    628     [
    629       "statistical_methodology.sample_size_justified",
    630       {
    631         "applies_count": 488,
    632         "applies_rate": 0.8699,
    633         "yes_count": 22,
    634         "compliance_rate": 0.0451
    635       }
    636     ],
    637     [
    638       "artifacts.reproduction_instructions",
    639       {
    640         "applies_count": 534,
    641         "applies_rate": 0.9519,
    642         "yes_count": 30,
    643         "compliance_rate": 0.0562
    644       }
    645     ],
    646     [
    647       "contamination.training_cutoff_stated",
    648       {
    649         "applies_count": 327,
    650         "applies_rate": 0.5829,
    651         "yes_count": 23,
    652         "compliance_rate": 0.0703
    653       }
    654     ],
    655     [
    656       "artifacts.environment_specified",
    657       {
    658         "applies_count": 511,
    659         "applies_rate": 0.9109,
    660         "yes_count": 48,
    661         "compliance_rate": 0.0939
    662       }
    663     ],
    664     [
    665       "human_studies.irb_or_ethics_approval",
    666       {
    667         "applies_count": 79,
    668         "applies_rate": 0.1408,
    669         "yes_count": 10,
    670         "compliance_rate": 0.1266
    671       }
    672     ],
    673     [
    674       "statistical_methodology.significance_tests",
    675       {
    676         "applies_count": 474,
    677         "applies_rate": 0.8449,
    678         "yes_count": 92,
    679         "compliance_rate": 0.1941
    680       }
    681     ],
    682     [
    683       "human_studies.attrition_reported",
    684       {
    685         "applies_count": 77,
    686         "applies_rate": 0.1373,
    687         "yes_count": 15,
    688         "compliance_rate": 0.1948
    689       }
    690     ],
    691     [
    692       "cost_and_practicality.compute_budget_stated",
    693       {
    694         "applies_count": 470,
    695         "applies_rate": 0.8378,
    696         "yes_count": 96,
    697         "compliance_rate": 0.2043
    698       }
    699     ],
    700     [
    701       "statistical_methodology.confidence_intervals_or_error_bars",
    702       {
    703         "applies_count": 485,
    704         "applies_rate": 0.8645,
    705         "yes_count": 102,
    706         "compliance_rate": 0.2103
    707       }
    708     ],
    709     [
    710       "statistical_methodology.variance_reported",
    711       {
    712         "applies_count": 482,
    713         "applies_rate": 0.8592,
    714         "yes_count": 104,
    715         "compliance_rate": 0.2158
    716       }
    717     ],
    718     [
    719       "contamination.benchmark_contamination_addressed",
    720       {
    721         "applies_count": 323,
    722         "applies_rate": 0.5758,
    723         "yes_count": 82,
    724         "compliance_rate": 0.2539
    725       }
    726     ],
    727     [
    728       "human_studies.inclusion_exclusion_criteria",
    729       {
    730         "applies_count": 80,
    731         "applies_rate": 0.1426,
    732         "yes_count": 23,
    733         "compliance_rate": 0.2875
    734       }
    735     ],
    736     [
    737       "contamination.train_test_overlap_discussed",
    738       {
    739         "applies_count": 328,
    740         "applies_rate": 0.5847,
    741         "yes_count": 95,
    742         "compliance_rate": 0.2896
    743       }
    744     ]
    745   ],
    746   "highest_compliance_questions": [
    747     [
    748       "evaluation_design.baselines_contemporary",
    749       {
    750         "applies_count": 507,
    751         "applies_rate": 0.9037,
    752         "yes_count": 399,
    753         "compliance_rate": 0.787
    754       }
    755     ],
    756     [
    757       "evaluation_design.failure_cases_discussed",
    758       {
    759         "applies_count": 555,
    760         "applies_rate": 0.9893,
    761         "yes_count": 460,
    762         "compliance_rate": 0.8288
    763       }
    764     ],
    765     [
    766       "evaluation_design.multiple_metrics",
    767       {
    768         "applies_count": 491,
    769         "applies_rate": 0.8752,
    770         "yes_count": 415,
    771         "compliance_rate": 0.8452
    772       }
    773     ],
    774     [
    775       "data_integrity.data_collection_described",
    776       {
    777         "applies_count": 541,
    778         "applies_rate": 0.9643,
    779         "yes_count": 469,
    780         "compliance_rate": 0.8669
    781       }
    782     ],
    783     [
    784       "evaluation_design.baselines_included",
    785       {
    786         "applies_count": 521,
    787         "applies_rate": 0.9287,
    788         "yes_count": 458,
    789         "compliance_rate": 0.8791
    790       }
    791     ],
    792     [
    793       "evaluation_design.negative_results_reported",
    794       {
    795         "applies_count": 545,
    796         "applies_rate": 0.9715,
    797         "yes_count": 482,
    798         "compliance_rate": 0.8844
    799       }
    800     ],
    801     [
    802       "evaluation_design.per_category_breakdown",
    803       {
    804         "applies_count": 539,
    805         "applies_rate": 0.9608,
    806         "yes_count": 481,
    807         "compliance_rate": 0.8924
    808       }
    809     ],
    810     [
    811       "claims_and_evidence.abstract_claims_supported",
    812       {
    813         "applies_count": 561,
    814         "applies_rate": 1.0,
    815         "yes_count": 513,
    816         "compliance_rate": 0.9144
    817       }
    818     ],
    819     [
    820       "setup_transparency.scaffolding_described",
    821       {
    822         "applies_count": 238,
    823         "applies_rate": 0.4242,
    824         "yes_count": 224,
    825         "compliance_rate": 0.9412
    826       }
    827     ],
    828     [
    829       "conflicts_of_interest.affiliations_disclosed",
    830       {
    831         "applies_count": 561,
    832         "applies_rate": 1.0,
    833         "yes_count": 555,
    834         "compliance_rate": 0.9893
    835       }
    836     ]
    837   ],
    838   "methodology_tags": {
    839     "benchmark-eval": 424,
    840     "case-study": 115,
    841     "qualitative": 68,
    842     "theoretical": 62,
    843     "observational": 50,
    844     "meta-analysis": 31,
    845     "rct": 7
    846   },
    847   "claim_support": {
    848     "moderate": 1243,
    849     "strong": 1163,
    850     "weak": 404,
    851     "unsupported": 75
    852   },
    853   "by_year": {
    854     "2017": {
    855       "n": 1,
    856       "mean": 0.5278,
    857       "median": 0.5278
    858     },
    859     "2018": {
    860       "n": 1,
    861       "mean": 0.55,
    862       "median": 0.55
    863     },
    864     "2019": {
    865       "n": 1,
    866       "mean": 0.6471,
    867       "median": 0.6471
    868     },
    869     "2020": {
    870       "n": 1,
    871       "mean": 0.525,
    872       "median": 0.525
    873     },
    874     "2021": {
    875       "n": 1,
    876       "mean": 0.7073,
    877       "median": 0.7073
    878     },
    879     "2022": {
    880       "n": 18,
    881       "mean": 0.6239,
    882       "median": 0.6585
    883     },
    884     "2023": {
    885       "n": 61,
    886       "mean": 0.5068,
    887       "median": 0.525
    888     },
    889     "2024": {
    890       "n": 129,
    891       "mean": 0.5085,
    892       "median": 0.5278
    893     },
    894     "2025": {
    895       "n": 282,
    896       "mean": 0.4949,
    897       "median": 0.518
    898     },
    899     "2026": {
    900       "n": 66,
    901       "mean": 0.537,
    902       "median": 0.5583
    903     }
    904   },
    905   "by_methodology_tag": {
    906     "benchmark-eval": {
    907       "n": 424,
    908       "mean": 0.5355,
    909       "median": 0.5385
    910     },
    911     "case-study": {
    912       "n": 115,
    913       "mean": 0.4134,
    914       "median": 0.4167
    915     },
    916     "meta-analysis": {
    917       "n": 31,
    918       "mean": 0.408,
    919       "median": 0.3514
    920     },
    921     "observational": {
    922       "n": 50,
    923       "mean": 0.5767,
    924       "median": 0.5973
    925     },
    926     "qualitative": {
    927       "n": 68,
    928       "mean": 0.4469,
    929       "median": 0.5
    930     },
    931     "rct": {
    932       "n": 7,
    933       "mean": 0.5273,
    934       "median": 0.4872
    935     },
    936     "theoretical": {
    937       "n": 62,
    938       "mean": 0.5099,
    939       "median": 0.5556
    940     }
    941   },
    942   "code_released_crosstab": {
    943     "with_code": {
    944       "n": 295,
    945       "mean": 0.5372,
    946       "median": 0.5476
    947     },
    948     "without_code": {
    949       "n": 252,
    950       "mean": 0.4855,
    951       "median": 0.5
    952     }
    953   },
    954   "code_vs_category_scores": {
    955     "statistical_methodology": {
    956       "with_code_mean": 0.3019,
    957       "without_code_mean": 0.2317,
    958       "diff": 0.0702
    959     },
    960     "evaluation_design": {
    961       "with_code_mean": 0.8314,
    962       "without_code_mean": 0.7492,
    963       "diff": 0.0823
    964     },
    965     "claims_and_evidence": {
    966       "with_code_mean": 0.6249,
    967       "without_code_mean": 0.5106,
    968       "diff": 0.1143
    969     },
    970     "setup_transparency": {
    971       "with_code_mean": 0.6973,
    972       "without_code_mean": 0.5286,
    973       "diff": 0.1687
    974     },
    975     "limitations_and_scope": {
    976       "with_code_mean": 0.5763,
    977       "without_code_mean": 0.4683,
    978       "diff": 0.108
    979     },
    980     "data_integrity": {
    981       "with_code_mean": 0.7931,
    982       "without_code_mean": 0.501,
    983       "diff": 0.2921
    984     },
    985     "conflicts_of_interest": {
    986       "with_code_mean": 0.4633,
    987       "without_code_mean": 0.3595,
    988       "diff": 0.1038
    989     },
    990     "contamination": {
    991       "with_code_mean": 0.2435,
    992       "without_code_mean": 0.1284,
    993       "diff": 0.1151
    994     },
    995     "human_studies": {
    996       "with_code_mean": 0.2713,
    997       "without_code_mean": 0.238,
    998       "diff": 0.0333
    999     },
   1000     "cost_and_practicality": {
   1001       "with_code_mean": 0.3138,
   1002       "without_code_mean": 0.2037,
   1003       "diff": 0.1101
   1004     }
   1005   },
   1006   "red_flags_distribution": {
   1007     "No limitations section": 121,
   1008     "No uncertainty quantification": 59,
   1009     "No statistical significance tests": 50,
   1010     "No statistical significance testing": 41,
   1011     "Benchmark contamination risk unaddressed": 36,
   1012     "No contamination analysis": 27,
   1013     "Benchmark contamination not addressed": 24,
   1014     "Contamination risk unaddressed": 19,
   1015     "No variance or uncertainty quantification": 18,
   1016     "Missing hyperparameters": 18,
   1017     "No statistical uncertainty quantification": 17,
   1018     "No statistical rigor": 15,
   1019     "No funding disclosure": 14,
   1020     "No ablation study": 14,
   1021     "Company evaluating its own product": 13,
   1022     "Company evaluating own product": 13,
   1023     "No systematic review methodology": 12,
   1024     "Benchmark contamination risk": 11,
   1025     "No code or data release": 11,
   1026     "No quantitative evaluation": 10
   1027   },
   1028   "score_histogram": {
   1029     "0-10%": 13,
   1030     "10-20%": 18,
   1031     "20-30%": 43,
   1032     "30-40%": 51,
   1033     "40-50%": 105,
   1034     "50-60%": 156,
   1035     "60-70%": 103,
   1036     "70-80%": 53,
   1037     "80-90%": 19
   1038   }
   1039 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs