scan.json (25875B)
1 { 2 "paper": { 3 "title": "Lessons from the Trenches on Reproducible Evaluation of Language Models", 4 "authors": ["Stella Biderman", "Hailey Schoelkopf", "Lintang Sutawika", "Leo Gao", "Jonathan Tow", "Baber Abbasi", "Alham Fikri Aji", "Pawan Sasanka Ammanamanchi", "Sidney Black", "Jordan Clive", "Anthony DiPofi", "Julen Etxaniz", "Benjamin Fattori", "Jessica Zosa Forde", "Charles Foster", "Jeffrey Hsu", "Mimansa Jaiswal", "Wilson Y. Lee", "Haonan Li", "Charles Lovering", "Niklas Muennighoff", "Ellie Pavlick", "Jason Phang", "Aviya Skowron", "Samson Tan", "Xiangru Tang", "Kevin A. Wang", "Genta Indra Winata", "François Yvon", "Andy Zou"], 5 "year": 2024, 6 "venue": "arXiv preprint", 7 "arxiv_id": "2405.14782", 8 "doi": "10.48550/arXiv.2405.14782" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval", "qualitative"], 13 "key_findings": "The paper identifies key challenges in LM evaluation including the 'Key Problem' of semantic equivalence, sensitivity to minor implementation details, and lack of reproducibility. Case studies demonstrate that prompt format changes can cause score swings of 10-20+ percentage points on ARC and MMLU. The lm-eval library addresses these via standardized task implementations, version tracking, standard error reporting, and configurable task definitions. Best practices include sharing exact prompts and code, avoiding copying results across papers, providing model outputs, and reporting uncertainty.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The paper describes lm-eval as an open source library and references the repository (Gao et al., 2021, with Zenodo DOI 10.5281/zenodo.5371628). The library is publicly available." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The benchmarks used are publicly available standard datasets (ARC, MMLU, etc.) 
accessed through the HuggingFace Datasets library. Task configurations are part of the open-source lm-eval codebase." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "No requirements.txt, Dockerfile, or environment specification is mentioned in the paper. The paper describes lm-eval's design but does not specify the environment needed to reproduce the case study results." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "While YAML configuration files are shown in Appendix B.2 and the library is described, no step-by-step instructions for reproducing the specific case study results (Table 1, etc.) are provided." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": true, 41 "justification": "Table 1 reports 95% confidence intervals via standard error of the mean for all results (e.g., '38.0 ± 2.78%'). The paper also advocates for this practice (Section 3, Section 4.2)." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "Despite advocating for statistical significance testing (Section 3 'Measure and Report Uncertainty'), the paper does not perform significance tests on its own comparative results in Table 1." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "Table 1 provides absolute scores with baseline context for different prompt styles, allowing readers to assess effect magnitudes (e.g., Mistral-7B: 58.6% MMLU-style vs 48.3% Hybrid, a 10.3pp difference)." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "The paper uses 5 models in its case study (Table 1) without justifying why these specific models or this number were chosen." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "Results appear to be single-run evaluations. 
The confidence intervals reported are derived from the standard error of the metric (sampling variance), not from variance across multiple experimental runs." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "The case studies compare multiple models (GPT-NeoX-20B, Llama-2-7B, Falcon-7B, Mistral-7B, Mixtral-8x7B) across prompt styles, serving as baselines for each other." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "Models used include Mistral-7B and Mixtral-8x7B (2023-2024), which were contemporary at the time of writing." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "Table 1 and Section 5.2 serve as ablation studies on prompt format: comparing Cloze vs MMLU-style for ARC, and MMLU-style vs Hybrid for MMLU, isolating the effect of prompt formatting." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "The paper discusses and implements multiple metrics: accuracy, acc_norm (byte-length normalized), acc_mutual_info, perplexity, bits per byte, word-level perplexity (Section 4.1, Appendix A)." 84 }, 85 "human_evaluation": { 86 "applies": false, 87 "answer": false, 88 "justification": "This is an infrastructure/methodology paper about evaluation tools. Human evaluation of the tool's outputs is not relevant to its claims about standardization and reproducibility." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "Results are reported on standard test splits of ARC and MMLU benchmarks. The YAML configs show explicit test_split: test." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Table 1 breaks down results per model and per prompt style. Table 2 surveys architectures across multiple individual benchmarks. Results are not just aggregated."
99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Section 2 extensively discusses failure modes: sensitivity to prompts, reproducibility failures, the Key Problem, benchmark validity issues. Section 5.2 shows cases where prompt changes cause dramatic performance shifts." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "The paper extensively reports negative findings: difficulty reproducing results from other papers, cases where evaluation methodology breaks down, and problems with current practices (Section 2, Section 5.2)." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The abstract claims about challenges in LM evaluation, best practices, and the lm-eval library are all substantiated in the paper's three main sections (2, 3, 4) with examples and case studies." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "The paper's causal claims are primarily about prompt format affecting scores. Table 1 demonstrates this through controlled experiments varying only the prompt style while holding models constant, which is adequate for this causal claim." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": true, 125 "justification": "The paper is careful about scope: it focuses on autoregressive language models (Section 2.1 explicitly notes the Key Problem does not apply to all domains), and case studies are presented as illustrative examples rather than universal claims." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": true, 130 "justification": "Section 2 extensively discusses alternative explanations for evaluation differences: tokenization effects, prompt sensitivity, normalization choices, benchmark validity. 
Section 5.2 notes that confidence intervals alone cannot resolve methodological differences." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "Section 2.2 explicitly discusses benchmark validity — the distinction between benchmark scores (proxy) and real-world capabilities (outcome). The paper acknowledges benchmarks are proxies and discusses construct validity concerns." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": false, 142 "justification": "Table 1 uses model names like 'Mistral-7B' and 'Mixtral-8x7B' without specifying exact versions, revision hashes, or snapshot dates. References point to papers but not specific model checkpoints." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": true, 147 "justification": "Full YAML configuration files including complete prompt templates are provided in Appendix B.2 for both ARC and MMLU variants, with Jinja templates showing the exact text used." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": false, 152 "justification": "No temperature, top-p, or other sampling/generation hyperparameters are reported for the evaluations in Table 1, despite the paper being about evaluation methodology." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "No agentic scaffolding is used. The paper evaluates models directly through the lm-eval harness using loglikelihood-based scoring." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Appendix A provides detailed documentation of tokenization handling, whitespace shifting, loglikelihood computation, perplexity calculation methods, and sliding window approaches. YAML configs show exact data source and split configurations." 
163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": false, 169 "justification": "There is no dedicated limitations section in the paper. The conclusion (Section 6) is brief and does not discuss limitations of the library or the paper's analysis." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": false, 174 "justification": "No specific threats to validity of the paper's own analyses are discussed. While Section 2.2 discusses validity of benchmarks in general, the paper does not reflect on limitations of its own case studies or recommendations." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": true, 179 "justification": "Section 2.1 explicitly notes the Key Problem does not impact domains with verifiers (coding, math, games). Section 2.2 states lm-eval focuses on measurement consistency rather than construct validity. Section 4 clarifies lm-eval does not prescribe benchmarks." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "The paper does not release raw evaluation outputs, per-sample results, or logs from the case study experiments despite advocating for this practice in Section 3 ('Always provide model outputs')." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "The evaluation procedure is described in detail: which benchmarks, which models, which prompt formats, and how scores are computed (Section 5.2, Appendix A, Appendix B.2)." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. The study evaluates language models on standard benchmarks." 
197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "Appendix A provides extensive documentation of the full data pipeline from tokenization through loglikelihood computation to final metric calculation, including normalization options and scoring approaches." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding acknowledgments or grant information is provided in the paper." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliations are clearly listed: EleutherAI, Stability AI, MBZUAI, Brown University, Amazon, Yale, CMU, etc. Multiple industry and academic affiliations are transparent." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding is disclosed, so independence cannot be assessed. Several authors are from EleutherAI, which develops and maintains lm-eval — the tool being promoted in this paper." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests statement is present. Authors from Amazon, Stability AI, and other companies that develop LLMs are evaluating infrastructure related to LLM evaluation." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "The paper evaluates several pretrained models (GPT-NeoX-20B, Llama-2, Falcon, Mistral, Mixtral) on ARC and MMLU without stating any training data cutoff dates." 
231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "Despite the paper discussing contamination as a general concern and even mentioning benchmark contamination issues (Section 2.4), it does not address potential train/test overlap for the specific models evaluated in its own case studies." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "ARC (2018) and MMLU (2020) were published before all evaluated models were trained. The paper does not discuss whether these benchmarks may have been in the training data of the models used in Table 1." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": false, 284 "justification": "No inference costs, API costs, or evaluation compute time are reported for the case study experiments despite the paper noting that evaluation is expensive (Section 2.3.3)." 
285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "No computational budget (GPU hours, hardware used) is reported for running the case study evaluations." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "Results in Table 1 appear to be single-run evaluations. No seed sensitivity analysis is performed despite the paper discussing the importance of multi-seed evaluation (Section 3)." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "The number of runs for Table 1 results is not stated. Standard errors reported are of the metric (sampling variance), not from multiple runs." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": false, 305 "answer": false, 306 "justification": "No hyperparameter search was performed — the paper evaluates pretrained models with fixed configurations, not tuning any parameters." 307 }, 308 "best_config_selection_justified": { 309 "applies": false, 310 "answer": false, 311 "justification": "No configuration selection was needed — the paper compares predetermined prompt formats, not selecting among them for best performance." 312 }, 313 "multiple_comparison_correction": { 314 "applies": false, 315 "answer": false, 316 "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The authors of lm-eval are evaluating lm-eval as a contribution to evaluation rigor. This self-evaluation bias is not acknowledged despite the paper being about evaluation methodology." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": false, 325 "answer": false, 326 "justification": "The paper compares prompt formats, not compute-intensive approaches. 
Compute differences between configurations are negligible." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": true, 331 "justification": "Section 2.2 explicitly discusses benchmark validity and construct validity, referencing Subramonian et al. (2023), Raji et al. (2021), Saphra et al. (2023), and Davis (2023). The paper acknowledges that benchmarks may not measure what is claimed." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "No scaffolding is used. Models are evaluated directly via loglikelihood scoring." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "Despite Section 2.4 and Figure 1 discussing how benchmarks predate modern models, the paper does not address temporal leakage for its own experiments (e.g., whether Mistral-7B's training data included ARC/MMLU solutions)." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether the evaluation setup leaks information to models through the prompt format or few-shot examples." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of whether benchmark examples share structure or sources with model training data." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No concrete leakage detection or prevention method is applied to the paper's own evaluations." 
359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "Minor implementation details like prompt format significantly impact LM evaluation scores", 365 "evidence": "Table 1 shows that on ARC, Mistral-7B scores 72.4% and Falcon-7B scores 25.9% with MMLU-style prompting, versus 50.1% and 40.2% respectively with cloze-style prompting; MMLU scores for Mistral-7B shift from 58.6% to 48.3% between MMLU-style and Hybrid formats (Section 5.2)", 366 "supported": "strong" 367 }, 368 { 369 "claim": "Copying results across papers with different evaluation setups leads to nonsensical comparisons", 370 "evidence": "Section 5.2 demonstrates that different prompt styles produce substantially different scores, making cross-paper comparisons unreliable. Section 3 recommends against copying results from other papers (citing Marie et al. 2021)", 371 "supported": "strong" 372 }, 373 { 374 "claim": "lm-eval has been widely adopted by the community for standardized evaluation", 375 "evidence": "Table 2 surveys 12 recent architecture releases and shows most use lm-eval for evaluation. Section 5.3 mentions adoption for the Open LLM Leaderboard (Beeching et al., 2023) and community benchmark contributions", 376 "supported": "strong" 377 }, 378 { 379 "claim": "Standard error reporting and confidence intervals are important but insufficient for addressing evaluation methodology differences", 380 "evidence": "Section 5.2 states: 'better statistical reporting such as the use of confidence intervals which we report, does not resolve these issues–while it gives a sense of how reliable a given measurement is, it cannot tell us how much model performance will vary across different measurement settings'", 381 "supported": "moderate" 382 } 383 ], 384 "red_flags": [ 385 { 386 "flag": "Authors evaluate their own tool", 387 "detail": "EleutherAI authors present lm-eval, a tool they built and maintain, as the solution to evaluation challenges. The paper reads partly as advocacy for their tool without acknowledging self-evaluation bias."
388 }, 389 { 390 "flag": "Does not follow own recommendations", 391 "detail": "The paper recommends sharing model outputs, reporting uncertainty across runs, and performing statistical significance testing, but does not do any of these for its own case study results. Table 1 shows single-run results with only sampling standard errors." 392 }, 393 { 394 "flag": "No contamination analysis despite discussing it", 395 "detail": "The paper discusses benchmark contamination as a general concern (Section 2.4) and temporal misalignment (Figure 1), but does not analyze whether the models in its own experiments may have been contaminated on ARC/MMLU." 396 } 397 ], 398 "cited_papers": [ 399 { 400 "title": "The benchmark lottery", 401 "authors": ["Mostafa Dehghani", "Yi Tay", "Alexey A. Gritsenko", "Zhe Zhao", "Neil Houlsby", "Fernando Diaz", "Donald Metzler", "Oriol Vinyals"], 402 "year": 2021, 403 "relevance": "Studies inconsistencies and biases in evaluation practices and their influence on research direction." 404 }, 405 { 406 "title": "Quantifying language models' sensitivity to spurious features in prompt design", 407 "authors": ["Melanie Sclar", "Yejin Choi", "Yulia Tsvetkov", "Alane Suhr"], 408 "year": 2023, 409 "relevance": "Demonstrates sensitivity of LMs to minor prompt variations, core evidence for evaluation reproducibility concerns." 410 }, 411 { 412 "title": "When benchmarks are targets: Revealing the sensitivity of large language model leaderboards", 413 "authors": ["Norah Alzahrani"], 414 "year": 2024, 415 "relevance": "Explores LLM sensitivity to evaluation setup including prompt format, directly relevant to benchmark contamination and evaluation rigor." 
416 }, 417 { 418 "title": "Scientific credibility of machine translation research: A meta-evaluation of 769 papers", 419 "authors": ["Benjamin Marie", "Atsushi Fujita", "Raphael Rubino"], 420 "year": 2021, 421 "relevance": "Meta-evaluation study finding widespread methodological issues in MT research, parallel to LLM evaluation concerns." 422 }, 423 { 424 "title": "Holistic evaluation of language models", 425 "authors": ["Percy Liang"], 426 "year": 2023, 427 "relevance": "Major LLM benchmarking framework (HELM) that takes a complementary approach to lm-eval for standardized evaluation." 428 }, 429 { 430 "title": "Judging LLM-as-a-judge with MT-Bench and Chatbot Arena", 431 "authors": ["Lianmin Zheng"], 432 "year": 2023, 433 "arxiv_id": "2306.05685", 434 "relevance": "Introduces LLM-as-judge evaluation and Chatbot Arena, key alternative evaluation paradigm for LLMs." 435 }, 436 { 437 "title": "Measuring massive multitask language understanding", 438 "authors": ["Dan Hendrycks", "Collin Burns", "Steven Basart", "Andy Zou", "Mantas Mazeika", "Dawn Song", "Jacob Steinhardt"], 439 "year": 2020, 440 "relevance": "Introduces MMLU benchmark, one of the most widely used LLM evaluations, central to the paper's case studies." 441 }, 442 { 443 "title": "Language models are few-shot learners", 444 "authors": ["Tom B Brown"], 445 "year": 2020, 446 "arxiv_id": "2005.14165", 447 "relevance": "GPT-3 paper that established in-context learning evaluation paradigm, foundational for current LM evaluation practices." 448 }, 449 { 450 "title": "Are emergent abilities of large language models a mirage?", 451 "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"], 452 "year": 2023, 453 "relevance": "Questions emergent abilities claims, relevant to evaluation methodology and measurement choices affecting conclusions." 
454 }, 455 { 456 "title": "Beyond the imitation game: Quantifying and extrapolating the capabilities of language models", 457 "authors": ["Aarohi Srivastava"], 458 "year": 2022, 459 "arxiv_id": "2206.04615", 460 "relevance": "BIG-bench benchmark suite, major evaluation infrastructure effort complementary to lm-eval." 461 } 462 ] 463 }