scan.json (31285B)
1 { 2 "paper": { 3 "title": "MMLU-CF: A Contamination-free Multi-task Language Understanding Benchmark", 4 "authors": [ 5 "Qihao Zhao", 6 "Yangyu Huang", 7 "Tengchao Lv", 8 "Lei Cui", 9 "Furu Wei", 10 "Qinzheng Sun", 11 "Ying Xin", 12 "Shaoguang Mao", 13 "Xin Zhang", 14 "Qiufeng Yin", 15 "Scarlett Li" 16 ], 17 "year": 2024, 18 "venue": "arXiv", 19 "arxiv_id": "2412.15194", 20 "doi": "10.48550/arXiv.2412.15194" 21 }, 22 "checklist": { 23 "artifacts": { 24 "code_released": { 25 "applies": true, 26 "answer": false, 27 "justification": "No source code repository URL is provided in the paper. They mention a 'project homepage' for evaluation submission but no actual code link." 28 }, 29 "data_released": { 30 "applies": true, 31 "answer": false, 32 "justification": "The validation set is described as 'open-source' but the paper uses future tense ('we will publicly release the validation set') and provides no download URL. The test set is explicitly closed-source." 33 }, 34 "environment_specified": { 35 "applies": true, 36 "answer": false, 37 "justification": "No environment specifications, requirements files, or dependency information is provided. They mention using OpenCompass for evaluation but provide no setup details." 38 }, 39 "reproduction_instructions": { 40 "applies": true, 41 "answer": false, 42 "justification": "No step-by-step reproduction instructions are provided. The paper describes the dataset construction pipeline conceptually but gives no runnable instructions for reproducing the evaluation." 43 } 44 }, 45 "statistical_methodology": { 46 "confidence_intervals_or_error_bars": { 47 "applies": true, 48 "answer": false, 49 "justification": "Table 1 reports only point estimates (e.g., '73.4%') with no confidence intervals or error bars. No uncertainty quantification is provided for any result." 
50 }, 51 "significance_tests": { 52 "applies": true, 53 "answer": false, 54 "justification": "Multiple comparative claims are made (e.g., 'GPT-4o emerges as the strongest model', 'Qwen2.5-72B-instruct outperforms its peers') based solely on comparing point estimates. No statistical significance tests are used." 55 }, 56 "effect_sizes_reported": { 57 "applies": true, 58 "answer": true, 59 "justification": "Effect sizes are provided in context: MMLU vs MMLU-CF scores are reported side-by-side (e.g., GPT-4o drops from 88.0% to 73.4%), and the Δ column shows absolute differences between test and validation sets. The ablation in Table 2 also shows incremental performance changes." 60 }, 61 "sample_size_justified": { 62 "applies": true, 63 "answer": false, 64 "justification": "The benchmark contains 10,000 test and 10,000 validation questions, but no justification is given for why this specific size was chosen. No power analysis or sample size rationale is discussed." 65 }, 66 "variance_reported": { 67 "applies": true, 68 "answer": false, 69 "justification": "All results appear to be from single runs. No standard deviations, variance across seeds, or repeated-run statistics are reported anywhere in the paper." 70 } 71 }, 72 "evaluation_design": { 73 "baselines_included": { 74 "applies": true, 75 "answer": true, 76 "justification": "MMLU scores are provided as a baseline comparison for all models in Table 1, enabling direct assessment of the contamination-free benchmark's impact." 77 }, 78 "baselines_contemporary": { 79 "applies": true, 80 "answer": true, 81 "justification": "The evaluation includes contemporary models as of late 2024: GPT-4o, Gemini-1.5-Flash, Qwen2.5, Llama-3.3, Phi-4. These represent the state of the art at the time of publication." 
82 }, 83 "ablation_study": { 84 "applies": true, 85 "answer": true, 86 "justification": "Table 2 presents an ablation study of the three decontamination rules (rephrase, shuffle choices, random replace) applied incrementally, showing each rule's contribution to performance change on GPT-4o, GPT-3.5-Turbo, and Llama-3.1-8b." 87 }, 88 "multiple_metrics": { 89 "applies": true, 90 "answer": false, 91 "justification": "The sole evaluation metric is accuracy (%). While both 5-shot and 0-shot settings are used, these are different evaluation configurations of the same metric, not different metrics (e.g., no F1, calibration, per-question analysis)." 92 }, 93 "human_evaluation": { 94 "applies": true, 95 "answer": false, 96 "justification": "No human evaluation of model outputs is conducted. LLMs (GPT-4o, Gemini, Claude) are used to check question quality in the construction pipeline, but no humans evaluate the evaluation results." 97 }, 98 "held_out_test_set": { 99 "applies": true, 100 "answer": true, 101 "justification": "The benchmark explicitly separates test (10,000 questions, closed-source) and validation (10,000 questions, open-source) sets. Section 4.5 discusses the properties of this partitioning." 102 }, 103 "per_category_breakdown": { 104 "applies": true, 105 "answer": true, 106 "justification": "Table 3 provides per-discipline breakdowns across 14 fields (Math, Physics, Chemistry, Law, etc.) for six major models on the 5-shot test set." 107 }, 108 "failure_cases_discussed": { 109 "applies": true, 110 "answer": false, 111 "justification": "The paper notes that 'models perform worst in Computer Science' (Appendix A.1) but provides no detailed error analysis, qualitative failure examples, or discussion of where the benchmark approach breaks down." 112 }, 113 "negative_results_reported": { 114 "applies": true, 115 "answer": false, 116 "justification": "Every decontamination rule is presented as effective (Table 2 shows monotonic performance decreases). 
No unsuccessful approaches, failed rules, or abandoned methods are discussed." 117 } 118 }, 119 "claims_and_evidence": { 120 "abstract_claims_supported": { 121 "applies": true, 122 "answer": true, 123 "justification": "The abstract claims GPT-4o achieves 73.4% (5-shot) and 71.9% (0-shot), which Table 1 confirms. The claim that these scores indicate contamination-free evaluation is supported by the decontamination pipeline and ablation study." 124 }, 125 "causal_claims_justified": { 126 "applies": true, 127 "answer": true, 128 "justification": "The ablation study in Table 2 uses controlled incremental addition of decontamination rules (adding Rule 1, then Rule 2, then Rule 3), which is an adequate design for the causal claim that these rules reduce contamination-inflated scores." 129 }, 130 "generalization_bounded": { 131 "applies": true, 132 "answer": false, 133 "justification": "The title and abstract claim 'Contamination-free' as an absolute property, but the decontamination rules can only reduce contamination, not guarantee its elimination. Rephrasing questions doesn't prevent a model from recognizing the underlying concept. The claim overstates what the methodology can actually prove." 134 }, 135 "alternative_explanations_discussed": { 136 "applies": true, 137 "answer": false, 138 "justification": "The performance drops from MMLU to MMLU-CF are attributed to contamination, but alternative explanations are not discussed. The questions may be inherently harder because they come from different, broader sources. Rule 3 (replacing choices with 'None of the other choices') increases difficulty independent of contamination. These confounds are not addressed." 139 }, 140 "proxy_outcome_distinction": { 141 "applies": true, 142 "answer": false, 143 "justification": "The paper frames MCQ accuracy as measuring 'understanding' and 'problem-solving abilities' without discussing the gap between MCQ performance and actual language understanding. 
No discussion of whether MCQ accuracy is a valid proxy for the claimed construct." 144 } 145 }, 146 "setup_transparency": { 147 "model_versions_specified": { 148 "applies": true, 149 "answer": true, 150 "justification": "API model versions are specified with snapshot dates: GPT-4o (v2024-10-1), GPT-4o-mini (v2024-10-1), GPT-4-Turbo (v2024-2-15), GPT-3.5-Turbo (v2024-2-15). Open-source models use standard versioned names (e.g., Qwen2.5-72B-instruct, Llama-3.3-70B-instruct)." 151 }, 152 "prompts_provided": { 153 "applies": true, 154 "answer": true, 155 "justification": "Figure 1 shows the MCQ evaluation prompt ('There is a single choice question. Answer the question by replying A, B, C or D.'). Table 4 provides the full LLMs checking prompt. The difficulty rating prompt is quoted in Section 3.2." 156 }, 157 "hyperparameters_reported": { 158 "applies": true, 159 "answer": false, 160 "justification": "No hyperparameters (temperature, top-p, max tokens) are reported for any of the evaluation runs or the LLM checking pipeline. These settings significantly affect model output." 161 }, 162 "scaffolding_described": { 163 "applies": false, 164 "answer": false, 165 "justification": "No agentic scaffolding is used. Models are evaluated directly via standard 5-shot and 0-shot prompting through OpenCompass." 166 }, 167 "data_preprocessing_documented": { 168 "applies": true, 169 "answer": true, 170 "justification": "Section 3.2 and Figure 3 document the full 5-step pipeline with counts at each stage: 200B+ documents → 2.7M MCQs → 1.66M after cleaning → 50K after difficulty sampling → 20K after LLM checking → 10K test + 10K validation. Filtering criteria are described at each stage." 
171 } 172 }, 173 "limitations_and_scope": { 174 "limitations_section_present": { 175 "applies": true, 176 "answer": true, 177 "justification": "Section 6 'Limitations' provides a dedicated limitations discussion covering potential remaining errors and the scope limitation to MCQs and language modalities." 178 }, 179 "threats_to_validity_specific": { 180 "applies": true, 181 "answer": false, 182 "justification": "Section 6 offers mostly generic limitations: 'some errors may remain' and 'primarily focuses on multiple-choice questions and language modalities.' No specific threats like 'our use of GPT-4o for difficulty rating may introduce systematic bias' or 'web-sourced questions may contain errors despite LLM checking.'" 183 }, 184 "scope_boundaries_stated": { 185 "applies": true, 186 "answer": true, 187 "justification": "Section 6 explicitly states: 'this dataset primarily focuses on multiple-choice questions and language modalities. However, other aspects of large models' capabilities, such as math and code reasoning, multi-modal understanding (e.g., image and audio), and specific domain expertise, still require evaluation with similarly unbiased and contamination-free benchmarks.'" 188 } 189 }, 190 "data_integrity": { 191 "raw_data_available": { 192 "applies": true, 193 "answer": false, 194 "justification": "The test set is explicitly closed-source. The validation set is described as forthcoming ('we will publicly release') but no download URL is provided. Raw source data (the 2.7M extracted questions) is not available." 195 }, 196 "data_collection_described": { 197 "applies": true, 198 "answer": true, 199 "justification": "Section 3.2 describes collection from 200+ billion documents across 3,000+ public website domains, using rule-based extraction to obtain 2.7 million MCQs spanning 14 fields." 
200 }, 201 "recruitment_methods_described": { 202 "applies": true, 203 "answer": true, 204 "justification": "Data was sourced from public open-source websites using rule-based MCQ extraction across 3,000+ domains. The sourcing method and domain diversity are described in Section 3.2, Step 1." 205 }, 206 "data_pipeline_documented": { 207 "applies": true, 208 "answer": true, 209 "justification": "Figure 3 and Section 3.2 document the full pipeline: 200B+ documents → 2.7M (rule-based extraction) → 1.66M (cleaning) → 50K (difficulty sampling) → 20K (LLM quality checking) → 10K test + 10K validation (contamination-free processing). Counts and criteria at each stage are provided." 210 } 211 }, 212 "conflicts_of_interest": { 213 "funding_disclosed": { 214 "applies": true, 215 "answer": false, 216 "justification": "No funding source is disclosed anywhere in the paper. No acknowledgments section mentioning grants or sponsors." 217 }, 218 "affiliations_disclosed": { 219 "applies": true, 220 "answer": true, 221 "justification": "All authors are listed as affiliated with Microsoft Research. This is clearly stated on the first page." 222 }, 223 "funder_independent_of_outcome": { 224 "applies": true, 225 "answer": false, 226 "justification": "All authors work at Microsoft Research. Microsoft has a major investment in OpenAI, whose GPT-4o model emerges as the top performer on MMLU-CF. This financial relationship creates a non-independent interest in outcomes favorable to GPT-4o." 227 }, 228 "financial_interests_declared": { 229 "applies": true, 230 "answer": false, 231 "justification": "No competing interests statement is present. No disclosure of patents, equity, or financial interests related to the evaluated models or the benchmark itself." 
232 } 233 }, 234 "contamination": { 235 "training_cutoff_stated": { 236 "applies": true, 237 "answer": false, 238 "justification": "Despite the paper being fundamentally about contamination, no training data cutoff dates are stated for any of the 40+ evaluated models. This information would be crucial for assessing whether MMLU-CF questions predate model training." 239 }, 240 "train_test_overlap_discussed": { 241 "applies": true, 242 "answer": true, 243 "justification": "The entire paper is about preventing train/test overlap. Three decontamination rules (rephrasing, shuffling, random replacement) are specifically designed to break any memorized associations. Section 4.5 also introduces the Δ metric to monitor for future contamination." 244 }, 245 "benchmark_contamination_addressed": { 246 "applies": true, 247 "answer": true, 248 "justification": "Benchmark contamination is the central contribution. The paper employs three decontamination rules, sources from diverse domains (3000+), keeps the test set closed-source, and demonstrates through ablation (Table 2) and Figure 1 that MMLU suffers from contamination while MMLU-CF mitigates it." 249 } 250 }, 251 "human_studies": { 252 "pre_registered": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants in this study. The paper evaluates LLMs on a benchmark dataset." 256 }, 257 "irb_or_ethics_approval": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants in this study." 261 }, 262 "demographics_reported": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants in this study." 266 }, 267 "inclusion_exclusion_criteria": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants in this study." 271 }, 272 "randomization_described": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants in this study." 
276 }, 277 "blinding_described": { 278 "applies": false, 279 "answer": false, 280 "justification": "No human participants in this study." 281 }, 282 "attrition_reported": { 283 "applies": false, 284 "answer": false, 285 "justification": "No human participants in this study." 286 } 287 }, 288 "cost_and_practicality": { 289 "inference_cost_reported": { 290 "applies": true, 291 "answer": false, 292 "justification": "No inference costs, API costs, or wall-clock times are reported for any of the 40+ model evaluations despite using both open-source and commercial API models." 293 }, 294 "compute_budget_stated": { 295 "applies": true, 296 "answer": false, 297 "justification": "No compute budget is stated for the evaluation runs, the LLM-based quality checking pipeline, or the difficulty sampling process, all of which required substantial API calls." 298 } 299 }, 300 "experimental_rigor": { 301 "seed_sensitivity_reported": { 302 "applies": true, 303 "answer": false, 304 "justification": "No mention of multiple random seeds or seed sensitivity analysis. All results appear to be from single evaluation runs." 305 }, 306 "number_of_runs_stated": { 307 "applies": true, 308 "answer": false, 309 "justification": "The number of evaluation runs per model is never stated. It is unclear whether results are from single runs or averaged across multiple runs." 310 }, 311 "hyperparameter_search_budget": { 312 "applies": true, 313 "answer": false, 314 "justification": "No hyperparameter search budget is reported. The difficulty threshold (level 6 center) and quality score cutoff (>4) appear tuned but no search process is documented." 315 }, 316 "best_config_selection_justified": { 317 "applies": true, 318 "answer": true, 319 "justification": "The evaluation uses standard configurations (5-shot and 0-shot) that are conventional in LLM benchmarking, with both settings reported transparently in Table 1." 
320 }, 321 "multiple_comparison_correction": { 322 "applies": true, 323 "answer": false, 324 "justification": "Over 40 models are compared with no statistical tests at all, let alone corrections for multiple comparisons." 325 }, 326 "self_comparison_bias_addressed": { 327 "applies": true, 328 "answer": false, 329 "justification": "Microsoft Research authors evaluate models including GPT-4o (Microsoft/OpenAI partnership) and Phi-4 (Microsoft). No discussion of potential self-evaluation bias despite clear organizational interest." 330 }, 331 "compute_budget_vs_performance": { 332 "applies": true, 333 "answer": false, 334 "justification": "Models are categorized by parameter count (Mini/Small/Medium/Large) but no analysis of compute budget vs. performance is provided. Evaluation compute costs are not reported." 335 }, 336 "benchmark_construct_validity": { 337 "applies": true, 338 "answer": false, 339 "justification": "The paper does not discuss whether MCQ accuracy actually measures 'language understanding' or 'problem-solving abilities' as claimed. No analysis of construct validity for the benchmark's claims." 340 }, 341 "scaffold_confound_addressed": { 342 "applies": true, 343 "answer": true, 344 "justification": "All models are evaluated through the OpenCompass platform (Section 4.1) with standardized 5-shot and 0-shot prompting, ensuring consistent evaluation scaffolding across all models." 345 } 346 }, 347 "data_leakage": { 348 "temporal_leakage_addressed": { 349 "applies": true, 350 "answer": true, 351 "justification": "The paper's core contribution addresses temporal leakage: questions are rephrased (Rule 1) to break memorization of previously seen questions, and the test set is kept closed-source to prevent future temporal leakage." 352 }, 353 "feature_leakage_addressed": { 354 "applies": true, 355 "answer": false, 356 "justification": "No explicit discussion of feature leakage. 
While the MCQ format is standard, the paper does not discuss whether the evaluation setup could leak answer information through context or formatting cues." 357 }, 358 "non_independence_addressed": { 359 "applies": true, 360 "answer": true, 361 "justification": "The three decontamination rules (rephrase, shuffle, replace) are specifically designed to break dependencies between MMLU-CF questions and any similar questions in model training data. Section 3.2 also uses GPT-4o for redundancy detection on semantically identical questions." 362 }, 363 "leakage_detection_method": { 364 "applies": true, 365 "answer": true, 366 "justification": "Multiple concrete methods are used: (1) Three decontamination rules as prevention, (2) GPT-4o redundancy detection for semantic deduplication, (3) The prompt-based memorization test in Figure 1 showing models can recall MMLU choices but not MMLU-CF choices, (4) Δ metric between test/validation as ongoing contamination monitoring." 367 } 368 } 369 }, 370 "scan_version": 3, 371 "active_modules": ["experimental_rigor", "data_leakage"], 372 "claims": [ 373 { 374 "claim": "GPT-4o achieves only 73.4% (5-shot) and 71.9% (0-shot) on MMLU-CF, significantly lower than 88.0% on MMLU.", 375 "evidence": "Table 1 shows GPT-4o scores: MMLU 88.0%, MMLU-CF 5-shot test 73.4%, 0-shot test 71.9%. A 14.6 percentage point drop.", 376 "supported": "strong" 377 }, 378 { 379 "claim": "Three decontamination rules cumulatively reduce model performance, demonstrating contamination in standard benchmarks.", 380 "evidence": "Table 2 ablation: GPT-4o drops from 79.8% (no rules) to 73.4% (all rules). GPT-3.5-Turbo drops from 65.3% to 58.2%. Llama-3.1-8b drops from 63.8% to 57.1%. 
Each rule added incrementally reduces scores.", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "Test and validation sets have similar difficulty, with most Δ values below 1.0%.", 385 "evidence": "Table 1 shows about 60% of Δ values less than 0.5% and 96% below 1.0% across all models in both 5-shot and 0-shot settings.", 386 "supported": "strong" 387 }, 388 { 389 "claim": "MMLU-CF sources from a broader domain (3,000+ website domains) than previous benchmarks to reduce contamination risk.", 390 "evidence": "Section 3.2 states questions come from 'over 3,000 different website domains' extracted from '200+ billion documents,' compared to MMLU which 'relied on a few sources.'", 391 "supported": "moderate" 392 }, 393 { 394 "claim": "Smaller models are more sensitive to decontamination rules than larger models.", 395 "evidence": "Table 2 shows larger absolute drops for GPT-3.5-Turbo (-7.1pp) and Llama-3.1-8b (-6.7pp) compared to GPT-4o (-6.4pp) when all three rules are applied. Section 4.6 interprets this as smaller models being 'more sensitive.'", 396 "supported": "weak" 397 }, 398 { 399 "claim": "Some LLMs have memorized MMLU questions and can reproduce the exact choices when given only the question text.", 400 "evidence": "Figure 1 demonstrates that when given only MMLU questions as prompts, certain LLMs produce choices identical to the MMLU test set, while they produce guessed (different) choices for MMLU-CF questions.", 401 "supported": "moderate" 402 } 403 ], 404 "methodology_tags": ["benchmark-eval"], 405 "key_findings": "MMLU-CF is a contamination-free MCQ benchmark of 20,000 questions (10K closed test + 10K open validation) sourced from 3,000+ web domains with three decontamination rules. GPT-4o scores 73.4% (5-shot), a 14.6pp drop from 88.0% on MMLU, suggesting substantial contamination in standard benchmarks. 
An ablation study shows each decontamination rule (rephrasing, shuffling choices, random replacement) contributes to performance reduction, with cumulative effects strongest on smaller models. Test and validation set scores are highly consistent (96% of Δ values within 1.0pp), enabling contamination monitoring.", 406 "red_flags": [ 407 { 408 "flag": "Conflict of interest: Microsoft Research evaluates Microsoft-partnered models", 409 "detail": "All authors are from Microsoft Research. GPT-4o (OpenAI/Microsoft partnership) and Phi-4 (Microsoft) are among the evaluated models, with GPT-4o emerging as the top performer. No conflict of interest statement is provided." 410 }, 411 { 412 "flag": "Confound between contamination and difficulty in Rule 3", 413 "detail": "Rule 3 replaces a random choice with 'None of the other choices' at 50% probability, which increases question difficulty independent of contamination. The ablation (Table 2) shows the largest performance drop when Rule 3 is added, but this cannot be cleanly attributed to decontamination versus added difficulty." 414 }, 415 { 416 "flag": "No statistical significance testing", 417 "detail": "Over 40 models are compared and ranked based solely on point estimates from apparently single runs. No confidence intervals, significance tests, or variance measures are reported, making it impossible to assess whether observed differences are meaningful." 418 }, 419 { 420 "flag": "Overclaimed property: 'Contamination-free'", 421 "detail": "The title claims 'Contamination-free' but the methods can only reduce contamination. Rephrasing questions does not prevent a model from recognizing underlying concepts. A model trained on medical knowledge will still answer medical MCQs correctly regardless of rephrasing — this is knowledge, not memorization." 
422 }, 423 { 424 "flag": "Using LLMs to validate a benchmark for LLMs", 425 "detail": "GPT-4o is used for difficulty rating, quality checking, and redundancy detection in the construction pipeline. The same model (and model family) is then evaluated on the resulting benchmark. Any systematic biases in GPT-4o's quality judgments would be baked into the benchmark." 426 } 427 ], 428 "cited_papers": [ 429 { 430 "title": "Measuring Massive Multitask Language Understanding", 431 "authors": ["Dan Hendrycks", "Collin Burns", "Steven Basart", "Andy Zou", "Mantas Mazeika", "Dawn Song", "Jacob Steinhardt"], 432 "relevance": "The original MMLU benchmark that MMLU-CF is designed to replace; foundational benchmark for LLM evaluation." 433 }, 434 { 435 "title": "MMLU-Pro: A More Robust and Challenging Multi-task Language Understanding Benchmark", 436 "authors": ["Yubo Wang", "Xueguang Ma", "Ge Zhang"], 437 "year": 2024, 438 "arxiv_id": "2406.01574", 439 "relevance": "Enhanced MMLU variant with reasoning-focused questions and expanded choices; directly related benchmark in the MMLU family." 440 }, 441 { 442 "title": "A Careful Examination of Large Language Model Performance on Grade School Arithmetic", 443 "authors": ["Hugh Zhang", "Jeff Da", "Dean Lee"], 444 "year": 2024, 445 "arxiv_id": "2405.00332", 446 "relevance": "GSM1K contamination-free math benchmark that revealed memorization in LLMs; shares MMLU-CF's motivation." 447 }, 448 { 449 "title": "LiveBench: A Challenging, Contamination-Free LLM Benchmark", 450 "authors": ["Colin White", "Samuel Dooley", "Manley Roberts"], 451 "year": 2024, 452 "arxiv_id": "2406.19314", 453 "relevance": "Contamination-free benchmark using frequently updated questions; alternative approach to the same contamination problem." 
454 }, 455 { 456 "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code", 457 "authors": ["Naman Jain", "King Han", "Alex Gu"], 458 "year": 2024, 459 "arxiv_id": "2403.07974", 460 "relevance": "Contamination-free coding benchmark with continuous collection; relevant to AI code generation evaluation." 461 }, 462 { 463 "title": "Investigating Data Contamination in Modern Benchmarks for Large Language Models", 464 "authors": ["Chunyuan Deng", "Yilun Zhao", "Xiangru Tang", "Mark Gerstein", "Arman Cohan"], 465 "year": 2024, 466 "relevance": "Systematic study of data contamination in LLM benchmarks; foundational work on the contamination problem MMLU-CF addresses." 467 }, 468 { 469 "title": "Rethinking Benchmark and Contamination for Language Models with Rephrased Samples", 470 "authors": ["Shuo Yang", "Wei-Lin Chiang", "Lianmin Zheng", "Joseph E. Gonzalez", "Ion Stoica"], 471 "year": 2023, 472 "arxiv_id": "2311.04850", 473 "relevance": "Decontaminator approach using rephrased samples; directly inspired MMLU-CF's decontamination rules." 474 }, 475 { 476 "title": "Quantifying Memorization Across Neural Language Models", 477 "authors": ["Nicholas Carlini", "Daphne Ippolito", "Matthew Jagielski", "Katherine Lee", "Florian Tramer", "Chiyuan Zhang"], 478 "year": 2023, 479 "relevance": "Foundational study of LLM memorization; provides theoretical basis for MMLU-CF's approach to preventing memorized answers." 480 }, 481 { 482 "title": "Detecting Pretraining Data from Large Language Models", 483 "authors": ["Weijia Shi", "Anirudh Ajith", "Mengzhou Xia"], 484 "year": 2023, 485 "arxiv_id": "2310.16789", 486 "relevance": "WIKIMIA dynamic benchmark for detecting pretraining data contamination; related approach to the contamination detection problem." 
487 }, 488 { 489 "title": "Inference-time Decontamination: Reusing Leaked Benchmarks for Large Language Model Evaluation", 490 "authors": ["Qin Zhu", "Qingyuan Cheng", "Runyu Peng"], 491 "year": 2024, 492 "arxiv_id": "2406.13990", 493 "relevance": "Proposes inference-time approaches to address benchmark contamination; complementary technique to MMLU-CF's construction-time decontamination." 494 }, 495 { 496 "title": "Changing Answer Order Can Decrease MMLU Accuracy", 497 "authors": ["Vipul Gupta", "David Pantoja", "Candace Ross", "Adina Williams", "Megan Ung"], 498 "year": 2024, 499 "arxiv_id": "2406.19470", 500 "relevance": "Demonstrates that choice order affects LLM accuracy on MMLU; directly motivates MMLU-CF's Rule 2 (shuffle choices)." 501 }, 502 { 503 "title": "Are We Done with MMLU?", 504 "authors": ["Aryo Pradipta Gema", "Joshua Ong Jun Leang"], 505 "year": 2024, 506 "arxiv_id": "2406.04127", 507 "relevance": "Critical examination of MMLU's continued validity as a benchmark; provides context for MMLU-CF's creation." 508 } 509 ], 510 "engagement_factors": { 511 "practical_relevance": { 512 "score": 2, 513 "justification": "Practitioners evaluating LLMs could use MMLU-CF for more reliable model comparisons, though the test set is closed-source and the validation set was not yet released." 514 }, 515 "surprise_contrarian": { 516 "score": 2, 517 "justification": "Shows GPT-4o drops 14.6pp from MMLU to MMLU-CF, directly challenging the reliability of widely-cited MMLU leaderboard rankings." 518 }, 519 "fear_safety": { 520 "score": 1, 521 "justification": "Raises concerns that LLM evaluations may be unreliable due to contamination, but does not demonstrate direct safety or security risks." 522 }, 523 "drama_conflict": { 524 "score": 2, 525 "justification": "Implies that MMLU scores — widely used for model marketing — are inflated by contamination, with some models potentially having been maliciously trained on benchmark data." 
526 }, 527 "demo_ability": { 528 "score": 1, 529 "justification": "The validation set was announced as forthcoming but not yet available; evaluation requires submission through a project homepage." 530 }, 531 "brand_recognition": { 532 "score": 2, 533 "justification": "From Microsoft Research, evaluates well-known models (GPT-4o, Gemini, Llama), and builds on the famous MMLU benchmark." 534 } 535 } 536 }