scan.json (16466B)
1 { 2 "paper": { 3 "title": "Questionable practices in machine learning", 4 "authors": ["Gavin Leech", "Juan J Vazquez", "Niclas Kupper", "Misha Yagudin", "Laurence Aitchison"], 5 "year": 2024, 6 "venue": "arXiv", 7 "arxiv_id": "2407.12220" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": false, 14 "justification": "No repository URL or code archive is mentioned in the paper." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "No dataset or structured data artifact is released. The paper is a taxonomy with examples drawn from existing literature." 20 }, 21 "environment_specified": { 22 "applies": false, 23 "answer": false, 24 "justification": "This is a qualitative taxonomy paper with no computational experiments requiring an environment." 25 }, 26 "reproduction_instructions": { 27 "applies": false, 28 "answer": false, 29 "justification": "No experiments to reproduce; this is a conceptual taxonomy paper." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": false, 35 "answer": false, 36 "justification": "The paper does not run experiments or report quantitative results of its own." 37 }, 38 "significance_tests": { 39 "applies": false, 40 "answer": false, 41 "justification": "No comparative quantitative claims are made by the authors from their own experiments." 42 }, 43 "effect_sizes_reported": { 44 "applies": false, 45 "answer": false, 46 "justification": "No original experiments; effect sizes cited are from other papers." 47 }, 48 "sample_size_justified": { 49 "applies": false, 50 "answer": false, 51 "justification": "No empirical study with a sample conducted by the authors." 52 }, 53 "variance_reported": { 54 "applies": false, 55 "answer": false, 56 "justification": "No original experimental runs to report variance over." 
57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "The paper compares to and extends prior work on QRPs, notably Wicherts et al. (2016) from psychology and Biderman et al. (2024) from ML. Section 2 discusses related work extensively." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "Related work includes contemporary references such as Biderman et al. (2024), Kapoor et al. (2024), and Hofman et al. (2023)." 69 }, 70 "ablation_study": { 71 "applies": false, 72 "answer": false, 73 "justification": "No system with components to ablate; this is a taxonomy paper." 74 }, 75 "multiple_metrics": { 76 "applies": false, 77 "answer": false, 78 "justification": "No quantitative evaluation is performed." 79 }, 80 "human_evaluation": { 81 "applies": false, 82 "answer": false, 83 "justification": "No system outputs to evaluate. This is a conceptual taxonomy." 84 }, 85 "held_out_test_set": { 86 "applies": false, 87 "answer": false, 88 "justification": "No dataset or test set used." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "The taxonomy is organized into clear categories: contamination (10 types), cherrypicking (8 types), misreporting (12 types), amplifiers (3 types), and irreproducible research practices (8 types), presented in Tables 1 and 2." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "The entire paper consists of discussing failure cases — questionable practices with concrete examples from published work." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "Section 5.4 Limitations acknowledges the paper does not quantify prevalence or severity of QRPs, and notes the taxonomy is not exhaustive." 
104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "The abstract claims to describe 44 QRPs with examples, which the paper delivers across Sections 3 and 4 with Tables 1 and 2." 111 }, 112 "causal_claims_justified": { 113 "applies": false, 114 "answer": false, 115 "justification": "The paper describes and categorizes practices rather than making causal claims. Causal examples cited (e.g., contamination causing score inflation) reference others' experimental findings." 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": true, 120 "justification": "Section 5.4 explicitly states: 'We do not expect Table 1 to be exhaustive' and 'this work does not quantify the prevalence or severity of the QRPs, so we cannot tell you how much to worry.'" 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": true, 125 "justification": "Section 5.3 discusses root causes from multiple angles (researcher incentives vs. industrialization). The paper also consistently notes when QRPs could be accidental vs. intentional (Table 1 'Accidental?' column)." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": false, 131 "answer": false, 132 "justification": "No models are used or evaluated by the authors." 133 }, 134 "prompts_provided": { 135 "applies": false, 136 "answer": false, 137 "justification": "No prompting is used in this paper." 138 }, 139 "hyperparameters_reported": { 140 "applies": false, 141 "answer": false, 142 "justification": "No experiments conducted requiring hyperparameters." 143 }, 144 "scaffolding_described": { 145 "applies": false, 146 "answer": false, 147 "justification": "No agentic scaffolding used." 
148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": false, 152 "justification": "The paper does not describe a systematic methodology for how the 44 QRPs were identified, collected, or validated. No search strategy, inclusion/exclusion criteria, or literature review protocol is documented." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": true, 159 "justification": "Section 5.4 'Limitations' is a dedicated subsection discussing the paper's shortcomings." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": true, 164 "justification": "Section 5.4 states specific limitations: the taxonomy is not exhaustive, the paper cannot quantify prevalence or severity, and it acknowledges dual-use risk." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": true, 169 "justification": "The introduction states: 'we do not claim that most performance is spurious. Nor do we show the general prevalence of these problems. This paper answers the limited question \"what could make a model's reported performance to some extent spurious?\"'" 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": false, 176 "justification": "No structured dataset of QRPs, examples, or references is released for independent verification." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": false, 181 "justification": "The paper does not describe how the 44 QRPs were systematically identified. Section 2.1 mentions Twitter as a source of leads, but no formal collection methodology is described." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants recruited; this is a literature-based taxonomy." 
187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": false, 191 "justification": "No pipeline from literature search to final taxonomy is documented. The selection and organization of QRPs appears ad hoc." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": true, 198 "justification": "Section 6.1 states: 'GL is funded by the UKRI Centre for Doctoral Training in Interactive Artificial Intelligence (EP/S022937/1).'" 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Author affiliations are listed: Arb Research, University of Bath, University of Bristol." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": true, 208 "justification": "UKRI doctoral training funding has no stake in the outcome of a taxonomy of questionable research practices." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests or financial interests statement is included in the paper." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": false, 219 "answer": false, 220 "justification": "The paper does not evaluate any pre-trained model on a benchmark." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": false, 224 "answer": false, 225 "justification": "No model evaluation on benchmarks performed." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": false, 229 "answer": false, 230 "justification": "No model evaluation on benchmarks performed." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants in this study." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants." 
243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": false, 273 "answer": false, 274 "justification": "This is a taxonomy/survey paper with no computational method to cost." 275 }, 276 "compute_budget_stated": { 277 "applies": false, 278 "answer": false, 279 "justification": "No computation performed." 280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "44 questionable research practices can undermine reported ML results, falling into contamination, cherrypicking, and misreporting categories.", 286 "evidence": "Tables 1 and 2, Sections 3 and 4 enumerate and describe all 44 practices with examples from published work.", 287 "supported": "strong" 288 }, 289 { 290 "claim": "Data contamination causes large changes in benchmark performance — e.g., Gemini 1.0 Ultra increased HumanEval from 74.4% to 89.0% when exposed to the test set once in pre-training.", 291 "evidence": "Section 3.1.1 cites Reid et al. 
(2024) for this specific result.", 292 "supported": "strong" 293 }, 294 { 295 "claim": "The industrialization of AI research creates incentives misaligned with scientific evaluation norms.", 296 "evidence": "Section 5.3.2 discusses how business goals (marketing, investment) diverge from scientific goals (fair comparison, generalization).", 297 "supported": "moderate" 298 }, 299 { 300 "claim": "Subtle prompt and harness differences can cause ~30% performance swings, as shown by the MMLU scoring discrepancy across three evaluation harnesses.", 301 "evidence": "Section 3.2.6 describes how Llama-65b's MMLU score varied by nearly 30% between the EleutherAI harness and the original MMLU/HELM harnesses (Fourrier et al., 2023).", 302 "supported": "strong" 303 } 304 ], 305 "methodology_tags": ["qualitative"], 306 "key_findings": "The paper catalogs 44 questionable research practices (QRPs) in ML evaluation organized into contamination (10 types), cherrypicking (8 types), misreporting (12 types), and amplifiers (3 types), plus 9 irreproducible research practices. It provides concrete examples from published work including Gemini, GPT-4, Phi-3, and Falcon launches. The paper identifies two root causes: researcher self-certification of SOTA results and the industrialization of AI research, and proposes defenses including standardized evaluation harnesses, private benchmarks, and preregistration.", 307 "red_flags": [ 308 { 309 "flag": "No systematic methodology for taxonomy construction", 310 "detail": "The 44 QRPs appear to be collected ad hoc from literature and Twitter rather than through a systematic review process. No search strategy, inclusion criteria, or saturation analysis is described." 311 }, 312 { 313 "flag": "No prevalence or severity estimates", 314 "detail": "The paper acknowledges in Section 5.4 that it cannot quantify how common or damaging these practices are, limiting its utility for assessing the actual state of ML research quality." 
315 } 316 ], 317 "cited_papers": [ 318 { 319 "title": "Lessons from the Trenches on Reproducible Evaluation of Language Models", 320 "authors": ["Stella Biderman"], 321 "year": 2024, 322 "relevance": "Directly addresses methodological problems in LLM evaluation, a core concern of this survey." 323 }, 324 { 325 "title": "AI Agents That Matter", 326 "authors": ["Sayash Kapoor"], 327 "year": 2024, 328 "relevance": "Discusses evaluation methodology issues for AI agents including cost reporting and baseline fairness." 329 }, 330 { 331 "title": "Troubling Trends in Machine Learning Scholarship", 332 "authors": ["Zachary C. Lipton", "Jacob Steinhardt"], 333 "year": 2019, 334 "relevance": "Foundational work on inflated claims and poor methodology in ML research." 335 }, 336 { 337 "title": "Are We Learning Yet? A Meta-Review of Evaluation Failures Across Machine Learning", 338 "authors": ["Thomas Liao"], 339 "year": 2021, 340 "relevance": "Studies the mismatch between benchmarks and real-world problems in ML evaluation." 341 }, 342 { 343 "title": "Holistic Evaluation of Language Models", 344 "authors": ["Percy Liang"], 345 "year": 2022, 346 "relevance": "Major evaluation framework (HELM) for LLMs addressing standardization of evaluation." 347 }, 348 { 349 "title": "How to Avoid Machine Learning Pitfalls: a Guide for Academic Researchers", 350 "authors": ["Michael A. Lones"], 351 "year": 2021, 352 "relevance": "Systematic guide of ML anti-patterns complementary to this paper's QRP taxonomy." 353 }, 354 { 355 "title": "Chatbot Arena: An open platform for evaluating LLMs by human preference", 356 "authors": ["Wei-Lin Chiang"], 357 "year": 2024, 358 "relevance": "Major human-preference evaluation platform discussed as a defense against contamination and reification." 
359 }, 360 { 361 "title": "GPT-4 technical report", 362 "authors": ["Josh Achiam"], 363 "year": 2023, 364 "arxiv_id": "2303.08774", 365 "relevance": "Key reference for reported contamination in frontier LLMs and evaluation methodology." 366 }, 367 { 368 "title": "Evaluating large language models trained on code", 369 "authors": ["Mark Chen"], 370 "year": 2021, 371 "arxiv_id": "2107.03374", 372 "relevance": "Introduces HumanEval benchmark, a central example in contamination discussions." 373 }, 374 { 375 "title": "Quantifying Contamination in Evaluating Code Generation Capabilities of Language Models", 376 "authors": ["Martin Riddell"], 377 "year": 2024, 378 "relevance": "Demonstrates contamination in popular open-source training corpora (The Pile, The Stack) with HumanEval." 379 } 380 ] 381 }