scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (22986B)
      1 {
      2   "paper": {
      3     "title": "Measuring what Matters: Construct Validity in Large Language Model Benchmarks",
      4     "authors": [
      5       "Andrew M. Bean",
      6       "Ryan Othniel Kearns",
      7       "Angelika Romanou",
      8       "Franziska Sofia Hafner",
      9       "Harry Mayne",
     10       "Jan Batzner",
     11       "Negar Foroutan",
     12       "Chris Schmitz",
     13       "Karolina Korgul",
     14       "Hunar Batra",
     15       "Oishi Deb",
     16       "Emma Beharry",
     17       "Cornelius Emde",
     18       "Thomas Foster",
     19       "Anna Gausen",
     20       "María Grandury",
     21       "Simeng Han",
     22       "Valentin Hofmann",
     23       "Lujain Ibrahim",
     24       "Hazel Kim",
     25       "Hannah Rose Kirk",
     26       "Fangru Lin",
     27       "Gabrielle Kaili-May Liu",
     28       "Lennart Luettgau",
     29       "Jabez Magomere",
     30       "Jonathan Rystrøm",
     31       "Anna Sotnikova",
     32       "Yushi Yang",
     33       "Yilun Zhao",
     34       "Adel Bibi",
     35       "Antoine Bosselut",
     36       "Ronald Clark",
     37       "Arman Cohan",
     38       "Jakob Foerster",
     39       "Yarin Gal",
     40       "Scott A. Hale",
     41       "Inioluwa Deborah Raji",
     42       "Christopher Summerfield",
     43       "Philip H.S. Torr",
     44       "Cozmin Ududec",
     45       "Luc Rocher",
     46       "Adam Mahdi"
     47     ],
     48     "year": 2025,
     49     "venue": "NeurIPS 2025 Track on Datasets and Benchmarks",
     50     "arxiv_id": "2511.04703",
     51     "doi": "10.48550/arXiv.2511.04703"
     52   },
     53   "scan_version": 2,
     54   "active_modules": ["survey_methodology"],
     55   "checklist": {
     56     "artifacts": {
     57       "code_released": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Appendix B states 'the code used to clean the dataset is available on GitHub' and the complete codebook dataset is available on Hugging Face."
     61       },
     62       "data_released": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "Appendix B states 'The complete codebook is available as a dataset on Hugging Face.' The 445-paper annotation dataset is released."
     66       },
     67       "environment_specified": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "No environment specifications, dependency lists, or setup instructions are mentioned for the analysis code."
     71       },
     72       "reproduction_instructions": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No step-by-step reproduction instructions are provided. The methodology is described at a high level but there are no runnable scripts or README with commands."
     76       }
     77     },
     78     "statistical_methodology": {
     79       "confidence_intervals_or_error_bars": {
     80         "applies": true,
     81         "answer": false,
     82         "justification": "All results are reported as point percentages (e.g., '78.2% of reviewed benchmarks provide definitions') without confidence intervals or error bars."
     83       },
     84       "significance_tests": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "The paper is a descriptive systematic review. It does not make comparative claims between systems that would require significance tests."
     88       },
     89       "effect_sizes_reported": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "No experimental comparisons are made. The paper reports descriptive statistics about benchmark practices."
     93       },
     94       "sample_size_justified": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "The corpus of 445 papers results from filtering 46,114 articles, but there is no justification for why this sample size is sufficient or discussion of statistical power for the descriptive claims."
     98       },
     99       "variance_reported": {
    100         "applies": false,
    101         "answer": false,
    102         "justification": "No experimental runs. The paper is a systematic review with descriptive coding."
    103       }
    104     },
    105     "evaluation_design": {
    106       "baselines_included": {
    107         "applies": true,
    108         "answer": false,
    109         "justification": "The paper references prior work on LLM benchmark quality (Reuel et al., Bowman & Dahl, Raji et al.) but does not formally compare its findings against prior surveys or establish baselines for comparison."
    110       },
    111       "baselines_contemporary": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The related work discussed is contemporary: Reuel et al. (2024), Biderman et al. (2024), Miller (2024), Weidinger et al. (2025)."
    115       },
    116       "ablation_study": {
    117         "applies": false,
    118         "answer": false,
    119         "justification": "No system with components to ablate. This is a systematic review."
    120       },
    121       "multiple_metrics": {
    122         "applies": false,
    123         "answer": false,
    124         "justification": "No system evaluation. The paper codes benchmark papers against a codebook."
    125       },
    126       "human_evaluation": {
    127         "applies": false,
    128         "answer": false,
    129         "justification": "No system outputs to evaluate. The 29 expert reviewers are the researchers conducting the coding, not evaluators of a system."
    130       },
    131       "held_out_test_set": {
    132         "applies": false,
    133         "answer": false,
    134         "justification": "No prediction task requiring train/test split."
    135       },
    136       "per_category_breakdown": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "Results are broken down by phenomenon type (Fig 2A), year (Fig 2B), and across multiple codebook dimensions (Fig 3, §4). Detailed per-field inter-rater agreement in Table 9."
    140       },
    141       "failure_cases_discussed": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "The paper discusses limitations of its own methodology: low inter-rater agreement, measured by Brennan-Prediger Kappa (BPK), on interpretive fields (task_ecology BPK=0.146, dataset_sampling_method BPK=0.122), potential false negatives from LLM filtering, and limited reviewers per paper (§6)."
    145       },
    146       "negative_results_reported": {
    147         "applies": true,
    148         "answer": true,
    149         "justification": "The paper reports that inter-rater agreement is only moderate (mean BPK=0.524), with several fields showing poor reliability. It also reports that only 16% of benchmarks use statistical tests and nearly every paper has weaknesses."
    150       }
    151     },
    152     "claims_and_evidence": {
    153       "abstract_claims_supported": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "The abstract claims about 'patterns related to measured phenomena, tasks, and scoring metrics which undermine validity' are supported by the codebook results in §4 and Fig 3."
    157       },
    158       "causal_claims_justified": {
    159         "applies": true,
    160         "answer": false,
    161         "justification": "The paper claims that identified patterns 'undermine the validity of the resulting claims' — this is a causal claim. However, the evidence is descriptive (counting practices) without empirically testing whether these practices actually reduce validity of conclusions."
    162       },
    163       "generalization_bounded": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "§6 explicitly states: 'Our focus on leading conference proceedings... may systematically exclude certain types of impactful benchmarks' including industry benchmarks and domain-specific venues."
    167       },
    168       "alternative_explanations_discussed": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "§6 discusses alternative explanations: distributional shift in language may bias toward recent papers, LLM screening may have introduced false negatives, and limited reviewers per paper reduces robustness."
    172       },
    173       "proxy_outcome_distinction": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "The paper explicitly defines construct validity (§2), distinguishes between their codebook measurements and the underlying quality of benchmarks, and discusses the gap between coding annotations and actual validity (inter-rater agreement analysis in Appendix D)."
    177       }
    178     },
    179     "setup_transparency": {
    180       "model_versions_specified": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "GPT-4o mini is named with a citation [30] but no specific API version or snapshot date is provided. Per schema criteria, marketing names without snapshot dates do not count."
    184       },
    185       "prompts_provided": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "Full prompt text for the GPT-4o mini screening is provided in Table 7 (Appendix C), including system prompt and all three filtering step prompts."
    189       },
    190       "hyperparameters_reported": {
    191         "applies": true,
    192         "answer": false,
    193         "justification": "No temperature, top-p, or other hyperparameters are reported for the GPT-4o mini screening calls."
    194       },
    195       "scaffolding_described": {
    196         "applies": false,
    197         "answer": false,
    198         "justification": "No agentic scaffolding is used. GPT-4o mini is used for simple classification screening."
    199       },
    200       "data_preprocessing_documented": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "The full filtering pipeline is documented with counts and criteria at each stage: 46,114 → 2,189 (keyword) → 938 → 846 → 522 (LLM filtering) → 445 (manual review). Table 7 details each step. Appendix C provides the flowchart."
    204       }
    205     },
    206     "limitations_and_scope": {
    207       "limitations_section_present": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "§6 Discussion contains two substantial paragraphs describing limitations of the approach."
    211       },
    212       "threats_to_validity_specific": {
    213         "applies": true,
    214         "answer": true,
    215         "justification": "Specific threats discussed: LLM filtering may have introduced 'undetected false negative systematic errors,' distributional shift in language usage may bias toward recent papers, 'limiting the number of reviewers per paper, reducing the robustness of the reviews' (§6)."
    216       },
    217       "scope_boundaries_stated": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "§6 explicitly states: the review 'does not capture benchmarks developed and released by industry labs without formal peer review, or those published in specialised domain-specific venues.' Also: 'We primarily review benchmarks prevalent in mainstream academic AI research.'"
    221       }
    222     },
    223     "data_integrity": {
    224       "raw_data_available": {
    225         "applies": true,
    226         "answer": true,
    227         "justification": "The complete annotation dataset is released on Hugging Face (Appendix B). This allows independent verification of the reported statistics."
    228       },
    229       "data_collection_described": {
    230         "applies": true,
    231         "answer": true,
    232         "justification": "§3 describes the keyword search across 6 conferences, the time range (2018-2024 for ML, 2020-2024 for NLP), the keyword criteria ('benchmark' AND ('LLM' OR 'language model')), and the LLM + manual filtering pipeline."
    233       },
    234       "recruitment_methods_described": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "The 29 expert reviewers are mentioned as being 'matched on area of expertise' but no details are provided on how they were recruited, their expertise levels, or selection criteria."
    238       },
    239       "data_pipeline_documented": {
    240         "applies": true,
    241         "answer": true,
    242         "justification": "Full pipeline documented: 46,114 articles → 2,189 (keyword) → 522 (LLM filtering with counts at each stage in Table 7) → 445 (manual review). Appendix C provides a detailed flowchart (Fig 4)."
    243       }
    244     },
    245     "conflicts_of_interest": {
    246       "funding_disclosed": {
    247         "applies": true,
    248         "answer": true,
    249         "justification": "Detailed Acknowledgments and Disclosure of Funding section lists grants and funding sources for individual authors (Clarendon, EPSRC, ESRC, NSF, Swiss NSF, etc.)."
    250       },
    251       "affiliations_disclosed": {
    252         "applies": true,
    253         "answer": true,
    254         "justification": "All 42 author affiliations are listed, spanning 14 institutions (Oxford, EPFL, Yale, UK AI Security Institute, etc.). No author evaluates their own company's product."
    255       },
    256       "funder_independent_of_outcome": {
    257         "applies": true,
    258         "answer": true,
    259         "justification": "Funders are academic bodies (Clarendon Scholarship, EPSRC, ESRC, NSF, Swiss NSF, Royal Society) with no financial stake in the review outcomes."
    260       },
    261       "financial_interests_declared": {
    262         "applies": true,
    263         "answer": false,
    264         "justification": "No competing interests or financial interests statement is provided. One author is affiliated with the UK AI Security Institute, which could have policy interests, but this is not discussed."
    265       }
    266     },
    267     "contamination": {
    268       "training_cutoff_stated": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "This is a systematic review of papers, not an evaluation of a pre-trained model on a benchmark. The GPT-4o mini screening is auxiliary and not the subject of the study's claims."
    272       },
    273       "train_test_overlap_discussed": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "Systematic review; no pre-trained model evaluated on benchmark tasks."
    277       },
    278       "benchmark_contamination_addressed": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "Systematic review; no benchmark evaluation of model capabilities."
    282       }
    283     },
    284     "human_studies": {
    285       "pre_registered": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "The 29 reviewers are co-investigators conducting the coding, not human subjects. The unit of analysis is benchmark papers."
    289       },
    290       "irb_or_ethics_approval": {
    291         "applies": false,
    292         "answer": false,
    293         "justification": "No human participants. Reviewers are researchers, not research subjects."
    294       },
    295       "demographics_reported": {
    296         "applies": false,
    297         "answer": false,
    298         "justification": "No human participants in the study."
    299       },
    300       "inclusion_exclusion_criteria": {
    301         "applies": false,
    302         "answer": false,
    303         "justification": "No human participants."
    304       },
    305       "randomization_described": {
    306         "applies": false,
    307         "answer": false,
    308         "justification": "No human participants."
    309       },
    310       "blinding_described": {
    311         "applies": false,
    312         "answer": false,
    313         "justification": "No human participants."
    314       },
    315       "attrition_reported": {
    316         "applies": false,
    317         "answer": false,
    318         "justification": "No human participants."
    319       }
    320     },
    321     "cost_and_practicality": {
    322       "inference_cost_reported": {
    323         "applies": false,
    324         "answer": false,
    325         "justification": "Survey paper. The GPT-4o mini screening cost is not the paper's method under evaluation."
    326       },
    327       "compute_budget_stated": {
    328         "applies": false,
    329         "answer": false,
    330         "justification": "Survey paper. No significant compute to report."
    331       }
    332     },
    333     "survey_methodology": {
    334       "prisma_or_structured_protocol": {
    335         "applies": true,
    336         "answer": true,
    337         "justification": "The review follows a structured systematic protocol: keyword search → LLM filtering → manual expert review, documented with a PRISMA-style flowchart (Fig 1, Fig 4). Inclusion/exclusion criteria are explicitly stated with counts at each stage."
    338       },
    339       "quality_assessment_of_sources": {
    340         "applies": true,
    341         "answer": true,
    342         "justification": "The entire paper IS a quality assessment of its sources. The codebook with 21 question items across phenomena, tasks, metrics, and claims constitutes a structured quality evaluation of each of the 445 included benchmarks."
    343       },
    344       "publication_bias_discussed": {
    345         "applies": true,
    346         "answer": true,
    347         "justification": "§6 acknowledges venue selection bias: 'Our focus on leading conference proceedings... may systematically exclude certain types of impactful benchmarks' including industry benchmarks and domain-specific venues."
    348       }
    349     }
    350   },
    351   "claims": [
    352     {
    353       "claim": "47.8% of benchmark definitions are contested, addressing phenomena with many possible definitions or no clear definition at all.",
    354       "evidence": "Codebook results in §4 Phenomenon section and Fig 3. 52.2% of definitions widely agreed upon, 47.8% contested among those that provide definitions.",
    355       "supported": "strong"
    356     },
    357     {
    358       "claim": "Only 16.0% of reviewed benchmarks used uncertainty estimates or statistical tests to compare results.",
    359       "evidence": "§4 Metric section: 'Once the responses were scored, 16.0% used uncertainty estimates or statistical tests to compare the results.' Also shown in Fig 3.",
    360       "supported": "strong"
    361     },
    362     {
    363       "claim": "27.0% of reviewed benchmarks incorporated convenience sampling as part of their sampling strategy, with 12.3% using it exclusively.",
    364       "evidence": "§4 Task section reports these figures directly from the codebook annotations.",
    365       "supported": "strong"
    366     },
    367     {
    368       "claim": "Nearly every paper had weaknesses in at least one area across phenomena, tasks, metrics, and claims.",
    369       "evidence": "§6 Discussion states this based on the full annotation dataset. Fig 3 shows that the intersection of all best practices (shaded area) is very small.",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "Only 53.4% of articles presented evidence for the construct validity of their benchmark.",
    374       "evidence": "§4 Claims section and Fig 3.",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "LLM-generated task items are used in 31.2% of benchmarks, with potential non-representative sampling risks.",
    379       "evidence": "§4 Task section. Sampling concerns discussed in §5.3.",
    380       "supported": "moderate"
    381     }
    382   ],
    383   "methodology_tags": ["meta-analysis"],
    384   "key_findings": "A systematic review of 445 LLM benchmark papers from top ML/NLP conferences reveals pervasive construct validity weaknesses. Among benchmarks that define their target phenomenon, 47.8% of the definitions are contested; only 16.0% of benchmarks use uncertainty estimates or statistical tests to compare results; 27.0% incorporate convenience sampling (12.3% exclusively); and only 53.4% present evidence for their own construct validity. The authors provide eight recommendations and an operational checklist covering phenomenon definition, task design, contamination, statistical methods, error analysis, and validity justification. Inter-rater agreement among the 29 expert reviewers was moderate (mean Brennan-Prediger Kappa = 0.524).",
    385   "red_flags": [
    386     {
    387       "flag": "Moderate inter-rater reliability",
    388       "detail": "Mean BPK of 0.524 across 30 codebook fields indicates only moderate agreement. Several key fields have very low reliability: task_ecology (BPK=0.146), dataset_sampling_method (BPK=0.122), phenomenon_contested (BPK=0.317). This undermines confidence in the specific percentages reported."
    389     },
    390     {
    391       "flag": "Single reviewer per paper",
    392       "detail": "Each of the 445 papers was coded by a single primary reviewer, with a second reviewer only mapping responses. Only 46 papers (10%) were double-reviewed. Acknowledged as a limitation but still affects reliability of individual annotations."
    393     },
    394     {
    395       "flag": "LLM screening may introduce systematic bias",
    396       "detail": "GPT-4o mini was used to filter from 2,189 to 522 papers. Validated on only 50 papers (2.3% sample) achieving 80% precision and 89% recall. False negatives from this step are unrecoverable. The authors acknowledge this but the validation sample is small."
    397     }
    398   ],
    399   "cited_papers": [
    400     {
    401       "title": "AI and the Everything in the Whole Wide World Benchmark",
    402       "authors": ["I. D. Raji", "E. M. Bender", "A. Paullada", "E. Denton", "A. Hanna"],
    403       "year": 2021,
    404       "relevance": "Foundational work on benchmark definitions and limitations that this paper builds upon for its construct validity framework."
    405     },
    406     {
    407       "title": "BetterBench: Assessing AI Benchmarks, Uncovering Issues, and Establishing Best Practices",
    408       "authors": ["A. Reuel"],
    409       "year": 2024,
    410       "relevance": "Aggregates best practices for the benchmark lifecycle; this paper extends with construct validity focus."
    411     },
    412     {
    413       "title": "What Will It Take to Fix Benchmarking in Natural Language Understanding?",
    414       "authors": ["S. R. Bowman", "G. Dahl"],
    415       "year": 2021,
    416       "relevance": "Emphasizes the value of construct validity in NLP benchmarks and the need for shared evaluation standards."
    417     },
    418     {
    419       "title": "Lessons from the Trenches on Reproducible Evaluation of Language Models",
    420       "authors": ["S. Biderman"],
    421       "year": 2024,
    422       "relevance": "Addresses reproducibility and shared implementation standards for LLM evaluation."
    423     },
    424     {
    425       "title": "Inadequacies of Large Language Model Benchmarks in the Era of Generative Artificial Intelligence",
    426       "authors": ["T. R. McIntosh"],
    427       "year": 2024,
    428       "relevance": "Identifies inadequacies in LLM benchmarks, directly relevant to methodological quality assessment."
    429     },
    430     {
    431       "title": "Adding Error Bars to Evals: A Statistical Approach to Language Model Evaluations",
    432       "authors": ["E. Miller"],
    433       "year": 2024,
    434       "relevance": "Proposes statistical methods for LLM evaluation, addressing the paper's finding that only 16% of benchmarks use statistical tests."
    435     },
    436     {
    437       "title": "Toward an Evaluation Science for Generative AI Systems",
    438       "authors": ["L. Weidinger"],
    439       "year": 2025,
    440       "relevance": "Calls for shared standards and best practices in LLM evaluation science."
    441     },
    442     {
    443       "title": "AgentBench: Evaluating LLMs as Agents",
    444       "authors": ["X. Liu"],
    445       "year": 2024,
    446       "relevance": "Benchmark for evaluating LLMs as agents, cited as example of composite phenomenon measurement."
    447     },
    448     {
    449       "title": "Datasheets for Datasets",
    450       "authors": ["T. Gebru"],
    451       "year": 2021,
    452       "relevance": "Proposes documentation standards for datasets relevant to benchmark transparency and social responsibility."
    453     },
    454     {
    455       "title": "Don't Make Your LLM an Evaluation Benchmark Cheater",
    456       "authors": ["K. Zhou"],
    457       "year": 2023,
    458       "relevance": "Addresses benchmark contamination, a key concern in the construct validity of LLM evaluations."
    459     },
    460     {
    461       "title": "Are Emergent Abilities of Large Language Models a Mirage?",
    462       "authors": ["R. Schaeffer", "B. Miranda", "S. Koyejo"],
    463       "year": 2023,
    464       "relevance": "Questions interpretation of emergent abilities, illustrating contested phenomena in LLM evaluation."
    465     },
    466     {
    467       "title": "A Careful Examination of Large Language Model Performance on Grade School Arithmetic",
    468       "authors": ["H. Zhang"],
    469       "year": 2024,
    470       "relevance": "Demonstrates contamination effects on GSM8K benchmark, used as example in the paper's recommendations."
    471     }
    472   ]
    473 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs