scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (22789B)
      1 {
      2   "paper": {
      3     "title": "The Benchmarking Epistemology: Construct Validity for Evaluating Machine Learning Models",
      4     "authors": ["Timo Freiesleben", "Sebastian Zezulka"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2510.23191"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No source code or analysis scripts are mentioned or linked anywhere in the paper. The paper is theoretical but could have released supplementary materials."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No dataset or supplementary data is released. The paper analyzes existing benchmarks (ImageNet, WeatherBench, Fragile Families Challenge) through a conceptual framework but does not release any structured data from this analysis."
     20       },
     21       "environment_specified": {
     22         "applies": false,
     23         "answer": false,
     24         "justification": "This is a theoretical/philosophical paper with no computational experiments, so environment specifications are structurally inapplicable."
     25       },
     26       "reproduction_instructions": {
     27         "applies": false,
     28         "answer": false,
     29         "justification": "This is a theoretical/philosophical paper with no experiments to reproduce. The contribution is a conceptual framework and case study analyses."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": false,
     35         "answer": false,
     36         "justification": "The paper does not report its own experimental results. It references confidence intervals from other studies (e.g., Russakovsky et al. 2015 bootstrap-based CIs) but does not produce original quantitative results."
     37       },
     38       "significance_tests": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "No comparative empirical claims are made by the authors. The paper is a theoretical analysis with case studies drawn from existing literature."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "No original empirical results are reported. Effect sizes referenced (e.g., ImageNet error rate drops, FFC scores) come from cited studies."
     47       },
     48       "sample_size_justified": {
     49         "applies": false,
     50         "answer": false,
     51         "justification": "This is a theoretical paper that does not collect data or run experiments, so sample size justification is structurally inapplicable."
     52       },
     53       "variance_reported": {
     54         "applies": false,
     55         "answer": false,
     56         "justification": "No original experiments are run, so variance reporting is structurally inapplicable."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The paper positions its framework relative to prior validity-based perspectives on benchmarks, including Yee (2024), Salaudeen et al. (2025), Schlangen (2021), and Raji et al. (2021). Section 3 explicitly discusses how existing concerns in the literature map onto their framework."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The related work is contemporary: Salaudeen et al. (2025), Ye et al. (2025), Suhr et al. (2025), Yee (2024), and the classical Messick (1995) framework. These represent the current state of thinking on benchmark validity."
     69       },
     70       "ablation_study": {
     71         "applies": false,
     72         "answer": false,
     73         "justification": "This is a theoretical framework paper with no system components to ablate."
     74       },
     75       "multiple_metrics": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "No experiments are conducted, so evaluation metrics are structurally inapplicable."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "This is a theoretical paper that does not produce system outputs requiring human evaluation."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "No experiments are conducted, so held-out test sets are structurally inapplicable."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The framework is demonstrated through three distinct case studies (ImageNet, WeatherBench, Fragile Families Challenge), each examining different types of validity conditions (content, consequential, auxiliary). Table 1 provides a structured breakdown of five validity types with their conditions and supported inferences."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Each case study extensively discusses where validity conditions fail. For ImageNet: adaptivity violations, distribution sensitivity, content validity limitations (Section 4.3). For WeatherBench: lack of consequential validity, inability to capture butterfly effect (Section 5.3). For FFC: auxiliary validity failures, unmeasured predictors (Section 6.3)."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper's central contribution includes identifying when benchmark inferences are NOT valid. Each case study concludes with a 'Constraining the inference' section that explicitly states what cannot be inferred (Sections 4.3, 5.3, 6.3)."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims the paper develops construct validity conditions inspired by psychological measurement theory and examines them through three case studies. Sections 2-3 develop the framework, and Sections 4-6 present the three case studies as claimed. The abstract accurately reflects the paper's content."
    111       },
    112       "causal_claims_justified": {
    113         "applies": false,
    114         "answer": false,
    115         "justification": "The paper makes no causal claims. It develops a conceptual framework for interpreting benchmark scores and analyzes case studies using logical argumentation rather than causal inference."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper is careful about scope. Section 7.1 acknowledges that the validity conditions 'remain incomplete' especially for LLM evaluation. Each case study concludes by explicitly constraining what can and cannot be inferred. The framework is presented as a structured approach, not as a universal solution."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "This is central to the paper's methodology. The FFC case study (Section 6) is entirely organized around ruling out alternative explanations for benchmark results. Section 4.2 discusses adaptivity vs. distribution shift for ImageNet, and Section 6.2 discusses Lundberg et al.'s (2024) qualitative interview findings about unmeasured predictors."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": false,
    131         "answer": false,
    132         "justification": "No ML models are used or evaluated in this paper. It is a theoretical analysis."
    133       },
    134       "prompts_provided": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "No prompting or LLM usage is involved in this paper's methodology."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "No experiments are conducted, so hyperparameter reporting is structurally inapplicable."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. This is a theoretical paper."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": false,
    151         "answer": false,
    152         "justification": "No data is collected or preprocessed. The paper analyzes existing benchmarks through a conceptual lens."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 7 ('Towards a benchmarking epistemology') serves as both a discussion and limitations section. Section 7.1 explicitly states the framework's validity conditions 'remain incomplete' for LLM evaluation. Section 7.3 discusses the limits of benchmarking as an epistemic practice."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section 7.1 identifies specific gaps: the framework does not fully address convergent and discriminant validity for complex constructs like LLM capabilities. Section 7.3 discusses construct underrepresentation and construct irrelevance as specific risks. These are specific to this framework, not boilerplate."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 7.1 explicitly states that 'the proposed validity conditions remain incomplete' and that LLM capability assessment 'calls for additional conditions.' Section 7.2 acknowledges the paper focuses on epistemic roles of benchmarks and deliberately leaves aside their 'equally important social role.' Section 7.3 discusses what cannot be meaningfully measured by benchmarks."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": false,
    175         "answer": false,
    176         "justification": "This is a theoretical paper that does not collect or analyze raw data. The case studies analyze existing published benchmarks."
    177       },
    178       "data_collection_described": {
    179         "applies": false,
    180         "answer": false,
    181         "justification": "No data is collected. The paper is a theoretical/philosophical contribution."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No participants are recruited. This is a theoretical paper."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": false,
    190         "answer": false,
    191         "justification": "No data pipeline exists. This is a theoretical paper analyzing existing benchmarks."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "The Acknowledgements section discloses funding from the Deutsche Forschungsgemeinschaft (DFG) under Germany's Excellence Strategy, the Carl Zeiss Stiftung, and the International Max Planck Research School for Intelligent Systems (IMPRS-IS)."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are clearly stated: Timo Freiesleben at LMU Munich (MCMP & MCML), Sebastian Zezulka at University of Tübingen. No conflicts with evaluated products are apparent since the paper does not evaluate any specific commercial product."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "The funders (DFG, Carl Zeiss Stiftung, IMPRS-IS) are academic research funding bodies with no financial stake in the paper's conclusions about benchmark validity."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is included in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "This paper does not evaluate any pre-trained model on any benchmark. It is a theoretical analysis of benchmarking as an epistemic practice."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "No model evaluation is performed. The paper discusses train/test overlap as a validity concern conceptually (Section 4.2, internal validity) but does not itself face this issue."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "No benchmark evaluation is performed. Contamination is discussed conceptually as part of internal validity conditions but is not a concern for this paper's own methodology."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants. This is a theoretical paper."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants. This is a theoretical paper."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants. This is a theoretical paper."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants. This is a theoretical paper."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants. This is a theoretical paper."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants. This is a theoretical paper."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants. This is a theoretical paper."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "This is a theoretical paper with no computational experiments, so inference cost is structurally inapplicable."
    275       },
    276       "compute_budget_stated": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "This is a theoretical paper with no computational experiments, so compute budget is structurally inapplicable."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "Predictive benchmarks are best understood as measurement tools analogous to psychological and educational tests, and interpreting benchmark scores requires specifying and evaluating conditions of construct validity.",
    286       "evidence": "Section 2 develops the analogy between benchmark components (data instances, models, scores) and psychological test components (test items, test takers, scores). Section 3 adapts the four-step argument-based framework for construct validity from Messick (1995) and Kane (2013). Table 1 provides a structured summary of five validity types with conditions and supported inferences.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "ImageNet benchmark scores cannot be interpreted as indicating expected error rates on image classification in general, but can support the weaker inference that model rankings reflect relative progress.",
    291       "evidence": "Section 4 analyzes internal, external, and content validity for ImageNet. External validity evidence shows robust rank preservation across datasets (Kornblith et al. 2019, r=0.86 for large datasets) but 11-14% absolute performance drops on reproduced data (Recht et al. 2019). Content validity limitations include 120/1000 classes being dog breeds and 20% of images having multiple plausible labels (Shankar et al. 2020). Section 4.3 constrains the inference accordingly.",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "WeatherBench scores cannot be directly used to reflect utility in downstream applications such as energy market planning, despite strong internal and content validity.",
    296       "evidence": "Section 5 identifies that default WeatherBench evaluation metrics do not capture context-specific utilities. GraphCast produces overly smoothed forecasts (Lam et al. 2023), rare events are under-penalized (Rasp et al. 2024), and models are evaluated on point predictions rather than uncertainty estimates needed in practice. Section 5.3 constrains the inference.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "The uniformly low scores in the Fragile Families Challenge cannot support the strong inference that life outcomes are inherently unpredictable, because auxiliary validity conditions (model diversity, data completeness) are not fully met.",
    301       "evidence": "Section 6 analyzes auxiliary validity. Lundberg et al. (2024) interviewed 40 families and identified three sources of prediction error: imperfectly measured features, unmeasured features, and unmeasurable features (events after survey). The small sample size (4,242 families, 12,942 features) limits model fitting. Section 6.3 constrains to the weaker inference that predictive performance is likely to be low, especially for young and marginalized groups.",
    302       "supported": "strong"
    303     }
    304   ],
    305   "methodology_tags": ["theoretical"],
    306   "key_findings": "The paper develops a construct validity framework for interpreting ML benchmark scores, adapted from psychological measurement theory. It identifies five types of validity (internal, external, content, consequential, auxiliary) with specific conditions for each. Through three case studies -- ImageNet, WeatherBench, and the Fragile Families Challenge -- it demonstrates that benchmarks support inferences only to the degree that corresponding validity conditions are met. The framework shows that stronger inferences (e.g., about model capabilities or real-world deployment) require progressively more validity conditions to be satisfied.",
    307   "red_flags": [],
    308   "cited_papers": [
    309     {
    310       "title": "Measurement to Meaning: A Validity-Centered Framework for AI Evaluation",
    311       "authors": ["Olawale Salaudeen", "Anka Reuel", "Ahmed Ahmed", "Suhana Bedi", "Zachary Robertson", "Sudharsan Sundar", "Ben Domingue", "Angelina Wang", "Sanmi Koyejo"],
    312       "year": 2025,
    313       "relevance": "Proposes a claim-centred framework for evaluating general-purpose AI systems using validity concepts, directly related to this survey's concern with methodological rigor in AI evaluation."
    314     },
    315     {
    316       "title": "Are Emergent Abilities of Large Language Models a Mirage?",
    317       "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"],
    318       "year": 2023,
    319       "relevance": "Demonstrates how metric choice can create the appearance of emergent abilities, highlighting construct validity issues in LLM evaluation."
    320     },
    321     {
    322       "title": "AI and the Everything in the Whole Wide World Benchmark",
    323       "authors": ["Deborah Raji", "Emily Denton", "Emily M. Bender", "Alex Hanna", "Amandalynne Paullada"],
    324       "year": 2021,
    325       "relevance": "Argues that only well-specified intensional tasks can be faithfully operationalized by benchmarks, raising fundamental questions about benchmark validity for general-purpose AI."
    326     },
    327     {
    328       "title": "Stop Evaluating AI with Human Tests, Develop Principled, AI-specific Tests instead",
    329       "authors": ["Tom Suhr", "Florian E. Dorner", "Olawale Salaudeen", "Augustin Kelava", "Samira Samadi"],
    330       "year": 2025,
    331       "relevance": "Challenges the practice of using human psychological tests for AI evaluation, proposing AI-specific testing methodologies."
    332     },
    333     {
    334       "title": "The Benchmark Lottery",
    335       "authors": ["Mostafa Dehghani", "Yi Tay", "Alexey A. Gritsenko", "Zhe Zhao", "Neil Houlsby", "Fernando Diaz", "Donald Metzler", "Oriol Vinyals"],
    336       "year": 2021,
    337       "relevance": "Documents how benchmark selection can favor methods already aligned with benchmark idiosyncrasies, a key concern for evaluation methodology."
    338     },
    339     {
    340       "title": "Reduced, Reused and Recycled: The Life of a Dataset in Machine Learning Research",
    341       "authors": ["Bernard Koch", "Emily Denton", "Alex Hanna", "Jacob G. Foster"],
    342       "year": 2021,
    343       "relevance": "Shows concentration of research on a small set of benchmark datasets produced by few institutions, raising concerns about bias and evaluation methodology."
    344     },
    345     {
    346       "title": "Do ImageNet Classifiers Generalize to ImageNet?",
    347       "authors": ["Benjamin Recht", "Rebecca Roelofs", "Ludwig Schmidt", "Vaishaal Shankar"],
    348       "year": 2019,
    349       "relevance": "Empirically tests external validity of ImageNet by recreating the dataset, finding 11-14% performance drops but stable rankings."
    350     },
    351     {
    352       "title": "Targeting the Benchmark: On Methodology in Current Natural Language Processing Research",
    353       "authors": ["David Schlangen"],
    354       "year": 2021,
    355       "relevance": "Introduces the key distinction between intensional task description and extensional benchmark instantiation, foundational for benchmark validity analysis."
    356     },
    357     {
    358       "title": "Are We Learning Yet? A Meta Review of Evaluation Failures Across Machine Learning",
    359       "authors": ["Thomas Liao", "Rohan Taori", "Inioluwa Deborah Raji", "Ludwig Schmidt"],
    360       "year": 2021,
    361       "relevance": "Meta-review documenting evaluation failures in ML research, directly relevant to methodological quality assessment."
    362     },
    363     {
    364       "title": "Position: Why We Must Rethink Empirical Research in Machine Learning",
    365       "authors": ["Moritz Herrmann", "F. Julian D. Lange", "Katharina Eggensperger", "Giuseppe Casalicchio", "Marcel Wever", "Matthias Feurer", "David Rugamer", "Eyke Hullermeier", "Anne-Laure Boulesteix", "Bernd Bischl"],
    366       "year": 2024,
    367       "relevance": "Position paper arguing for rethinking empirical research methodology in ML, showing task-specific performance variation concerns."
    368     },
    369     {
    370       "title": "Evaluating General-Purpose AI with Psychometrics",
    371       "authors": ["Xiting Wang", "Liming Jiang", "Jose Hernandez-Orallo", "David Stillwell", "Luning Sun", "Fang Luo", "Xing Xie"],
    372       "year": 2023,
    373       "relevance": "Proposes using psychometric methods to evaluate AI capabilities, closely related to construct validity framework for AI benchmarks."
    374     },
    375     {
    376       "title": "Large Language Model Psychometrics: A Systematic Review of Evaluation, Validation, and Enhancement",
    377       "authors": ["Haoran Ye", "Jing Jin", "Yuhang Xie", "Xin Zhang", "Guojie Song"],
    378       "year": 2025,
    379       "relevance": "Systematic review of psychometric evaluation approaches for LLMs, directly relevant to methodological quality in AI evaluation."
    380     }
    381   ]
    382 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs