scan.json (20091B)
1 { 2 "paper": { 3 "title": "Leakage and the Reproducibility Crisis in ML-based Science", 4 "authors": ["Sayash Kapoor", "Arvind Narayanan"], 5 "year": 2022, 6 "venue": "arXiv", 7 "arxiv_id": "2207.07048" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": true, 14 "justification": "Code and data for the civil war prediction case study are uploaded to a CodeOcean capsule (https://doi.org/10.24433/CO.4899453.v1), stated in the Materials and Methods section. The model info sheet template is available at https://reproducible.cs.princeton.edu." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The data required to reproduce the civil war prediction case study is included in the CodeOcean capsule. The survey data (Table 1) is presented in the paper itself." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": true, 24 "justification": "The CodeOcean capsule replicates the exact computational environment used, which is the purpose of the platform (Clyburne-Sherin et al., 2019)." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": true, 29 "justification": "The CodeOcean capsule provides a reproducible computational environment. Appendix B provides detailed description of methods for the civil war prediction case study." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": true, 36 "justification": "The paper reports 95% confidence intervals for the civil war prediction results, e.g., '[0.66-0.95]' for the smoothed AUC of Blair & Sambanis (2020). Bootstrapped test set resampling is used." 37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": true, 41 "justification": "The paper performs significance tests comparing model performance, reporting Z-statistics and p-values (e.g., 'Z = 0.64, 1.09, 0.42, 0.67; p = 0.26, 0.14, 0.34, 0.25' in footnote 3). 
Uses smoothed ROC curve comparison test from Robin et al." 42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "The paper reports AUC differences between reported and corrected results (e.g., 'Difference in AUC between Adaboost and Logistic Regression drops from 0.14 to 0.01' for Wang 2019). Figure 1 provides visual comparison of effect sizes." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "The paper does not justify the sample size of the survey (20 papers across 17 fields) or the case study (12 papers in civil war prediction). No power analysis is discussed." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "While confidence intervals are reported for some results (Blair & Sambanis), variance or standard deviation across experimental runs is not systematically reported for the corrected civil war prediction results." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "The civil war prediction case study compares corrected results of complex ML models against Logistic Regression baselines. The paper also compares against the original (incorrect) reported results." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "The baselines are the same models used in the original papers (Random Forests, Adaboost, GBT, Logistic Regression), which is appropriate since the goal is to evaluate the original claims." 69 }, 70 "ablation_study": { 71 "applies": false, 72 "answer": false, 73 "justification": "The paper is a survey and reproducibility study, not a system with components to ablate." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "The civil war prediction case study uses both AUC and accuracy metrics. The paper also discusses metric choice issues across fields (Section 2.5)." 
79 }, 80 "human_evaluation": { 81 "applies": false, 82 "answer": false, 83 "justification": "Human evaluation is not relevant to this paper's claims, which are about data leakage detection and reproducibility verification." 84 }, 85 "held_out_test_set": { 86 "applies": true, 87 "answer": true, 88 "justification": "The civil war prediction case study uses the same held-out test sets as the original papers, and the paper emphasizes proper train-test separation as a core concern." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Table 1 provides per-field breakdown of leakage types across 17 fields. Figure 1 provides per-paper breakdown of reported vs corrected results for 4 civil war prediction papers." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "Section 3.4 discusses limitations of model info sheets. The paper also discusses cases where errors could not be detected by reading papers alone." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "The paper reports that 8 of 12 civil war prediction papers had no errors detected. Section 3.4 explicitly discusses limitations of the proposed solution." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "Abstract claims about 17 fields, 329 affected papers, taxonomy of 8 leakage types, and civil war prediction findings are all supported by Table 1, Section 2.4, and Section 4/Figure 1 respectively." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": true, 115 "justification": "The paper makes causal claims that data leakage caused irreproducible results. This is justified by correcting specific errors and showing results change (Section 4, Figure 1). The causal mechanism is demonstrated by fixing the leakage and observing performance drops." 
116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": true, 120 "justification": "The paper explicitly states the survey results are 'a lower bound of reproducibility issues' (Section 2.1) and that the case study is limited to civil war prediction. It does not overclaim beyond the tested settings." 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": true, 125 "justification": "Section 5 discusses five alternative diagnoses for reproducibility failures beyond leakage (lack of understanding of the limits to prediction, hype, inadequate expertise, lack of standardization, lack of computational reproducibility). Appendix B discusses methodological choices in the corrected results." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": false, 131 "answer": false, 132 "justification": "The paper does not use LLMs or API-based models. It uses classical ML models (Random Forests, Logistic Regression, Adaboost) via R packages, which are specified." 133 }, 134 "prompts_provided": { 135 "applies": false, 136 "answer": false, 137 "justification": "The paper does not use prompting." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": false, 142 "justification": "The paper does not report hyperparameters for the ML models used in the civil war prediction case study (e.g., number of trees in Random Forests, learning rate for Adaboost). It states they follow the original papers' implementations but does not detail the settings." 143 }, 144 "scaffolding_described": { 145 "applies": false, 146 "answer": false, 147 "justification": "No agentic scaffolding is used in this paper."
148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": true, 152 "justification": "Appendix B provides detailed documentation of data preprocessing, including the imputation correction methodology (mice package vs rfImpute), the precise mechanism of leakage, and step-by-step corrections for each paper." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": true, 159 "justification": "Section 3.4 is titled 'Limitations of model info sheets' and discusses three specific limitations. The paper also acknowledges survey limitations in Section 2.1." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": true, 164 "justification": "Section 3.4 discusses specific threats: claims cannot be verified without computational reproducibility, incorrect claims may provide false assurances, and ML expertise is required. Section 2.1 notes the survey is a lower bound due to inconsistent terminology across fields." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 1 explicitly defines scope: 'We focus on reproducibility issues in ML-based science' and distinguishes from ML methods research, ethics research, engineering applications, and modeling contests. The survey is explicitly presented as 'a lower bound' not comprehensive." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": true, 176 "justification": "The CodeOcean capsule (https://doi.org/10.24433/CO.4899453.v1) provides code and data for the civil war prediction case study, enabling independent verification." 
177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "Appendix B.1 describes the paper selection process for the civil war prediction review: search terms used, database searched (Dimensions), time period (Jan 2016 - May 2021), yielding 124 papers narrowed to 12. Section 2.1 describes the cross-disciplinary survey methodology." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants were recruited. The study analyzes published papers." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": true, 191 "justification": "Appendix B.1 documents the pipeline: 124 papers from search → 15 focused on civil war prediction with train-test split → 12 with complete code and data → 4 with identified errors. Each filtering step has counts and criteria." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": false, 198 "justification": "No funding source is disclosed in the paper. There is an acknowledgments section thanking individuals for feedback but no mention of grants or funding agencies." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Authors are identified as from 'Department of Computer Science and Center for Information Technology Policy, Princeton University.' No product being evaluated, so no vendor conflict." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding is disclosed, so independence of funder cannot be assessed. The absence of funding disclosure is noted." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests or financial interests statement is present in the paper." 
214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": false, 219 "answer": false, 220 "justification": "The paper does not evaluate a pre-trained model's capability on any benchmark. It uses classical ML models trained from scratch on specific datasets." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": false, 224 "answer": false, 225 "justification": "No pre-trained model benchmark evaluation is conducted. Train-test overlap in the civil war prediction context is actually the core topic of the paper, but not in the contamination sense." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": false, 229 "answer": false, 230 "justification": "No pre-trained model benchmark evaluation is conducted." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants in this study." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants in this study." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study."
268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": false, 273 "answer": false, 274 "justification": "This is a survey and reproducibility study, not a proposed method with inference costs." 275 }, 276 "compute_budget_stated": { 277 "applies": false, 278 "answer": false, 279 "justification": "This is a survey and reproducibility study. Compute requirements are minimal (classical ML models on small datasets)." 280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "Data leakage is a widespread problem across 17 scientific fields adopting ML methods, collectively affecting 329 papers.", 286 "evidence": "Table 1 surveys 20 papers across 17 fields documenting leakage errors. Each field has at least one type of leakage identified (Section 2.1).", 287 "supported": "strong" 288 }, 289 { 290 "claim": "All 4 civil war prediction papers claiming superior ML model performance over Logistic Regression fail to reproduce due to data leakage.", 291 "evidence": "Figure 1 and Section 4 show that correcting leakage in Muchlinski et al. (2016), Colaresi & Mahmood (2017), Wang (2019), and Kaufman et al. (2019) eliminates the claimed advantage. Detailed corrections in Appendix B.", 292 "supported": "strong" 293 }, 294 { 295 "claim": "Complex ML models don't perform substantively better than decades-old Logistic Regression models for civil war prediction once leakage is corrected.", 296 "evidence": "Figure 1 shows corrected AUC/accuracy values are comparable across model types. Wang (2019) AUC difference drops from 0.14 to 0.01 (Section 4).", 297 "supported": "strong" 298 }, 299 { 300 "claim": "Model info sheets would enable detection of leakage in each civil war prediction paper with errors.", 301 "evidence": "Appendix C demonstrates how model info sheets address each type of leakage found. 
However, this is argued by construction rather than tested empirically.", 302 "supported": "moderate" 303 }, 304 { 305 "claim": "9 of 12 civil war prediction papers with complete code and data included no significance tests or uncertainty quantification.", 306 "evidence": "Table A6 and Section 4, Finding 2. The paper provides specific p-values and confidence intervals to illustrate the problem with Blair & Sambanis (2020).", 307 "supported": "strong" 308 } 309 ], 310 "methodology_tags": ["meta-analysis", "case-study"], 311 "key_findings": "Data leakage is pervasive across 17 scientific fields adopting ML, affecting at least 329 papers. A taxonomy of 8 leakage types is presented, ranging from textbook errors (no test set) to open problems (sampling bias). In civil war prediction, all 4 papers claiming ML superiority over Logistic Regression fail to reproduce once leakage is corrected. The authors propose 'model info sheets' as a reporting standard to detect and prevent leakage.", 312 "red_flags": [], 313 "cited_papers": [ 314 { 315 "title": "Dos and Don'ts of Machine Learning in Computer Security", 316 "authors": ["D. Arp", "E. Quiring", "F. Pendlebury", "A. Warnecke", "F. Pierazzi", "C. Wressnegger", "L. Cavallaro", "K. Rieck"], 317 "year": 2022, 318 "relevance": "Comprehensive study of ML pitfalls in computer security, documenting leakage and reproducibility failures across 30 papers." 319 }, 320 { 321 "title": "Common pitfalls and recommendations for using machine learning to detect and prognosticate for COVID-19 using chest radiographs and CT scans", 322 "authors": ["M. Roberts", "D. Driggs", "M. Thorpe"], 323 "year": 2021, 324 "relevance": "Found all 62 reviewed COVID-19 ML papers had methodological pitfalls, directly relevant to ML evaluation quality." 325 }, 326 { 327 "title": "Reproducibility in machine learning for health research: Still a ways to go", 328 "authors": ["M. B. A. McDermott", "S. Wang", "N. Marinsek", "R. Ranganath", "L. Foschini", "M. 
Ghassemi"], 329 "year": 2021, 330 "relevance": "Documents reproducibility failures in ML-based health research, parallel to this paper's survey scope on ML methodology quality." 331 }, 332 { 333 "title": "Improving Reproducibility in Machine Learning Research (A Report from the NeurIPS 2019 Reproducibility Program)", 334 "authors": ["J. Pineau", "P. Vincent-Lamarre", "K. Sinha"], 335 "year": 2020, 336 "relevance": "Reports on the NeurIPS reproducibility program including checklists for ML research, directly relevant to methodology quality assessment." 337 }, 338 { 339 "title": "Model Cards for Model Reporting", 340 "authors": ["M. Mitchell", "S. Wu", "A. Zaldivar"], 341 "year": 2019, 342 "relevance": "Introduced model cards for transparent ML model reporting, foundational work for the model info sheets proposed in this paper." 343 }, 344 { 345 "title": "The worst of both worlds: A comparative analysis of errors in learning from data in psychology and machine learning", 346 "authors": ["J. Hullman", "S. Kapoor", "P. Nanayakkara", "A. Gelman", "A. Narayanan"], 347 "year": 2022, 348 "arxiv_id": "2203.06498", 349 "relevance": "Cross-disciplinary analysis of statistical and ML methodology errors, directly relevant to methodology quality in ML-based research." 350 }, 351 { 352 "title": "Be careful of when: an empirical study on time-related misuse of issue tracking data", 353 "authors": ["F. Tu", "J. Zhu", "Q. Zheng", "M. Zhou"], 354 "year": 2018, 355 "relevance": "Documents temporal leakage issues in software engineering ML applications, relevant to SE methodology quality." 356 }, 357 { 358 "title": "Shortcut learning in deep neural networks", 359 "authors": ["R. Geirhos", "J.-H. Jacobsen", "C. Michaelis", "R. Zemel", "W. Brendel", "M. Bethge", "F. A. Wichmann"], 360 "year": 2020, 361 "relevance": "Identifies shortcut learning as a fundamental problem in ML evaluation, closely related to data leakage and benchmark validity."
362 }, 363 { 364 "title": "Measuring the predictability of life outcomes with a scientific mass collaboration", 365 "authors": ["M. J. Salganik"], 366 "year": 2020, 367 "relevance": "Demonstrates limits of ML prediction for social outcomes, supporting the paper's argument about overoptimism in ML-based science." 368 } 369 ] 370 }