scan.json (24095B)
1 { 2 "paper": { 3 "title": "NLP Evaluation in trouble: On the Need to Measure LLM Data Contamination for each Benchmark", 4 "authors": [ 5 "Oscar Sainz", 6 "Jon Ander Campos", 7 "Iker García-Ferrero", 8 "Julen Etxaniz", 9 "Oier Lopez de Lacalle", 10 "Eneko Agirre" 11 ], 12 "year": 2023, 13 "venue": "Conference on Empirical Methods in Natural Language Processing", 14 "arxiv_id": "2310.18018", 15 "doi": "10.48550/arXiv.2310.18018" 16 }, 17 "scan_version": 3, 18 "active_modules": [], 19 "methodology_tags": ["theoretical", "case-study"], 20 "key_findings": "This position paper argues that LLM evaluation on standard NLP benchmarks is compromised by data contamination, where models have been trained on test data. The authors define three contamination types (guideline, raw text, annotation) and identify contamination at three training stages (pre-training, fine-tuning, post-deployment). Empirical demonstrations show ChatGPT, WizardCoder, and GitHub Copilot can reproduce CoNLL03 training data verbatim, confirming contamination. The paper proposes community measures including automatic contamination detection, a contamination registry (the LM Contamination Index), and peer-review mechanisms to flag compromised conclusions.", 21 "checklist": { 22 "artifacts": { 23 "code_released": { 24 "applies": true, 25 "answer": false, 26 "justification": "No source code repository is provided in the paper. The LM Contamination Index website is referenced but no analysis scripts or contamination detection code are released." 27 }, 28 "data_released": { 29 "applies": true, 30 "answer": true, 31 "justification": "The paper references the publicly available LM Contamination Index at hitz-zentroa.github.io/lm-contamination/ (footnote 1), a registry of contamination cases that is part of this work's contribution." 32 }, 33 "environment_specified": { 34 "applies": true, 35 "answer": false, 36 "justification": "No environment specifications are provided for reproducing the contamination demonstrations shown in the appendix." 37 }, 38 "reproduction_instructions": { 39 "applies": true, 40 "answer": false, 41 "justification": "No reproduction instructions are provided. The appendix shows prompts and outputs but does not describe how to systematically reproduce the contamination demonstrations." 42 } 43 }, 44 "statistical_methodology": { 45 "confidence_intervals_or_error_bars": { 46 "applies": false, 47 "answer": false, 48 "justification": "Position paper with no quantitative experiments producing numerical results that would require confidence intervals." 49 }, 50 "significance_tests": { 51 "applies": false, 52 "answer": false, 53 "justification": "Position paper with no comparative quantitative claims requiring significance testing." 54 }, 55 "effect_sizes_reported": { 56 "applies": false, 57 "answer": false, 58 "justification": "Position paper with no quantitative experiments for which effect sizes would be applicable." 59 }, 60 "sample_size_justified": { 61 "applies": false, 62 "answer": false, 63 "justification": "Position paper with no experimental sample to justify." 64 }, 65 "variance_reported": { 66 "applies": false, 67 "answer": false, 68 "justification": "Position paper with no experimental runs for which variance would be reported." 69 } 70 }, 71 "evaluation_design": { 72 "baselines_included": { 73 "applies": false, 74 "answer": false, 75 "justification": "Position paper with illustrative demonstrations, not a system evaluation requiring baselines." 76 }, 77 "baselines_contemporary": { 78 "applies": false, 79 "answer": false, 80 "justification": "No system evaluation is conducted; baselines are not applicable to a position paper." 81 }, 82 "ablation_study": { 83 "applies": false, 84 "answer": false, 85 "justification": "No system with components to ablate; this is a position paper." 86 }, 87 "multiple_metrics": { 88 "applies": false, 89 "answer": false, 90 "justification": "No system evaluation with metrics; this is a position paper with qualitative demonstrations." 91 }, 92 "human_evaluation": { 93 "applies": false, 94 "answer": false, 95 "justification": "No system outputs to evaluate; the paper argues a position rather than presenting a system." 96 }, 97 "held_out_test_set": { 98 "applies": false, 99 "answer": false, 100 "justification": "No experimental evaluation requiring train/test splits; this is a position paper." 101 }, 102 "per_category_breakdown": { 103 "applies": false, 104 "answer": false, 105 "justification": "No quantitative results to break down by category; this is a position paper." 106 }, 107 "failure_cases_discussed": { 108 "applies": false, 109 "answer": false, 110 "justification": "No system or method being evaluated that would have failure cases in the experimental sense." 111 }, 112 "negative_results_reported": { 113 "applies": false, 114 "answer": false, 115 "justification": "No experiments conducted that could produce negative results." 116 } 117 }, 118 "claims_and_evidence": { 119 "abstract_claims_supported": { 120 "applies": true, 121 "answer": true, 122 "justification": "The abstract claims that contamination causes overestimation and wrong scientific conclusions. The paper supports these with cited evidence (GPT-3 contamination bug in Brown et al. 2020, GPT-4's BIG-bench contamination in OpenAI 2023, empirical demonstrations in Appendix A) and logical argumentation throughout Sections 1-5." 123 }, 124 "causal_claims_justified": { 125 "applies": true, 126 "answer": true, 127 "justification": "The paper's central causal claim — that training on test data causes overestimation of performance — is well-supported by the established mechanism (memorization → inflated scores) and concrete examples such as GPT-3's acknowledged contamination bug (Brown et al. 2020) and the BIG-bench mixing into GPT-4 training (OpenAI 2023, Section 2)." 128 }, 129 "generalization_bounded": { 130 "applies": true, 131 "answer": true, 132 "justification": "The paper scopes its claims to 'evaluating LLMs on standard academic benchmarks' (Section 7) and explicitly states 'there could exist other issues in current evaluations, but, they are out of the scope of this position paper.' The title and claims consistently focus on NLP benchmark evaluation." 133 }, 134 "alternative_explanations_discussed": { 135 "applies": true, 136 "answer": false, 137 "justification": "The paper briefly raises one alternative in Section 5.2 — that 'the lack of memorization of a benchmark ensures that the LLM was not trained on that benchmark' and that performance might still be unaffected — but labels this 'currently speculation' without substantive discussion. No other alternative explanations for the observed phenomena are considered." 138 }, 139 "proxy_outcome_distinction": { 140 "applies": true, 141 "answer": true, 142 "justification": "The paper's central argument is precisely about the distinction between the proxy (benchmark scores from contaminated evaluations) and the outcome (actual model capability). Section 1 explicitly argues that contaminated scores 'overestimate' true performance and lead to wrong conclusions about model quality." 143 } 144 }, 145 "setup_transparency": { 146 "model_versions_specified": { 147 "applies": true, 148 "answer": false, 149 "justification": "The appendix demonstrations use 'ChatGPT', 'WizardCoder', and 'GitHub Copilot' without specifying exact model versions, snapshot dates, or API versions. No version information is provided for any model used in the demonstrations." 150 }, 151 "prompts_provided": { 152 "applies": true, 153 "answer": true, 154 "justification": "The actual prompts used in the contamination demonstrations are shown in Figures 1, 2, and 3 in Appendix A. For example, Figure 1 shows: 'Please, generate the first instances of the CoNLL03 dataset train split in BIO format.'" 155 }, 156 "hyperparameters_reported": { 157 "applies": true, 158 "answer": false, 159 "justification": "No temperature, sampling parameters, or other hyperparameters are reported for the LLM API calls used in the appendix demonstrations." 160 }, 161 "scaffolding_described": { 162 "applies": false, 163 "answer": false, 164 "justification": "No agentic scaffolding is used; the demonstrations are simple single-turn prompts to LLMs." 165 }, 166 "data_preprocessing_documented": { 167 "applies": true, 168 "answer": false, 169 "justification": "The paper does not document how the contamination demonstrations were selected or how the specific prompts were developed. There is no description of the process for choosing CoNLL03 or the three specific models." 170 } 171 }, 172 "limitations_and_scope": { 173 "limitations_section_present": { 174 "applies": true, 175 "answer": true, 176 "justification": "Section 7 is titled 'Limitations' and discusses the scope of the paper and the early-stage nature of proposed solutions." 177 }, 178 "threats_to_validity_specific": { 179 "applies": true, 180 "answer": false, 181 "justification": "The limitations section contains generic statements: 'we are aware that these are early-stage solutions and that the proposed effort is really challenging' and 'there could exist other issues in current evaluations.' No specific threats to the validity of the paper's own arguments or demonstrations are identified." 182 }, 183 "scope_boundaries_stated": { 184 "applies": true, 185 "answer": true, 186 "justification": "Section 7 explicitly states: 'we address the problem of data contamination that occurs when evaluating LLMs on standard academic benchmarks' and that 'there could exist other issues in current evaluations, but, they are out of the scope of this position paper.'" 187 } 188 }, 189 "data_integrity": { 190 "raw_data_available": { 191 "applies": true, 192 "answer": false, 193 "justification": "The appendix shows model outputs as figures but the raw model responses are not available for independent verification. Only selected portions of outputs are shown ('The output was shortened for commodity')." 194 }, 195 "data_collection_described": { 196 "applies": true, 197 "answer": false, 198 "justification": "The paper does not describe how the contamination demonstrations were conducted — when the models were queried, how many attempts were made, or whether the shown outputs are representative or cherry-picked." 199 }, 200 "recruitment_methods_described": { 201 "applies": false, 202 "answer": false, 203 "justification": "No human participants in this study. The paper demonstrates model behavior through direct prompting." 204 }, 205 "data_pipeline_documented": { 206 "applies": true, 207 "answer": false, 208 "justification": "No data pipeline is documented. The process from prompting models to selecting which outputs to show in the appendix is not described." 209 } 210 }, 211 "conflicts_of_interest": { 212 "funding_disclosed": { 213 "applies": true, 214 "answer": true, 215 "justification": "The acknowledgements section lists funding from the Basque Government (IT-1805-22), the Spanish Government (ILENIA project), and individual doctoral grants (PRE_2023_2_0137, PRE_2022_2_0208, PRE_2023_2_0060)." 216 }, 217 "affiliations_disclosed": { 218 "applies": true, 219 "answer": true, 220 "justification": "Author affiliations are clearly listed: five authors at HiTZ Center - University of the Basque Country, and one author (Jon Ander Campos) at Cohere, an LLM company." 221 }, 222 "funder_independent_of_outcome": { 223 "applies": true, 224 "answer": true, 225 "justification": "Funding is from government research agencies (Basque Government, Spanish Government) that have no financial stake in the contamination findings. While one author is at Cohere, Cohere is not listed as a funder." 226 }, 227 "financial_interests_declared": { 228 "applies": true, 229 "answer": false, 230 "justification": "No competing interests or financial interests statement is present in the paper. One author works at Cohere, an LLM company whose products could be affected by contamination discussions, but no declaration of competing interests is made." 231 } 232 }, 233 "contamination": { 234 "training_cutoff_stated": { 235 "applies": false, 236 "answer": false, 237 "justification": "The paper does not evaluate a pre-trained model's capability on any benchmark. It studies contamination itself by testing whether models can reproduce benchmark data, not measuring model performance on the benchmark." 238 }, 239 "train_test_overlap_discussed": { 240 "applies": false, 241 "answer": false, 242 "justification": "The paper's entire topic is train/test overlap, but it does not itself evaluate model capability on benchmarks. The contamination checklist items are designed for papers that use benchmark evaluation, not papers studying contamination as a phenomenon." 243 }, 244 "benchmark_contamination_addressed": { 245 "applies": false, 246 "answer": false, 247 "justification": "Same as above — the paper IS about benchmark contamination as its research topic, but it does not evaluate models on benchmarks in a way that would require addressing contamination of its own evaluation." 248 } 249 }, 250 "human_studies": { 251 "pre_registered": { 252 "applies": false, 253 "answer": false, 254 "justification": "No human participants in this study." 255 }, 256 "irb_or_ethics_approval": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human participants in this study." 260 }, 261 "demographics_reported": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants in this study." 265 }, 266 "inclusion_exclusion_criteria": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants in this study." 270 }, 271 "randomization_described": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants in this study." 275 }, 276 "blinding_described": { 277 "applies": false, 278 "answer": false, 279 "justification": "No human participants in this study." 280 }, 281 "attrition_reported": { 282 "applies": false, 283 "answer": false, 284 "justification": "No human participants in this study." 285 } 286 }, 287 "cost_and_practicality": { 288 "inference_cost_reported": { 289 "applies": false, 290 "answer": false, 291 "justification": "Position paper; no method with computational cost to report." 292 }, 293 "compute_budget_stated": { 294 "applies": false, 295 "answer": false, 296 "justification": "Position paper; no significant compute budget required." 297 } 298 } 299 }, 300 "claims": [ 301 { 302 "claim": "Data contamination causes overestimation of LLM performance on benchmarks, causing contaminated models to be preferred over non-contaminated counterparts.", 303 "evidence": "Supported by cited examples: GPT-3 authors acknowledged a bug that contaminated several benchmarks (Brown et al. 2020, Section 2), OpenAI stated BIG-bench was mixed into GPT-4 training (OpenAI 2023, Section 2), and GSM-8K/MATH training data was deliberately included (Section 2).", 304 "supported": "strong" 305 }, 306 { 307 "claim": "Papers using contaminated LLMs may draw wrong scientific conclusions about their hypotheses, invalidating alternative hypotheses that could be true.", 308 "evidence": "Section 2 and Appendix A identify specific papers evaluating ChatGPT/GPT-3/Codex on CoNLL03 (Wei et al. 2023, Li et al. 2023a, Han et al. 2023, Li et al. 2023b), a benchmark shown to be contaminated. The argument is logical but no quantitative analysis of how contamination affected specific conclusions is provided.", 309 "supported": "moderate" 310 }, 311 { 312 "claim": "ChatGPT, WizardCoder, and GitHub Copilot have memorized and can reproduce the CoNLL03 dataset.", 313 "evidence": "Appendix A, Figures 1-3 show all three models generating the first lines of CoNLL03 training data in BIO format when prompted, producing near-verbatim reproductions.", 314 "supported": "strong" 315 }, 316 { 317 "claim": "LLM memorization capabilities can be used to detect data contamination in closed models where training data is unavailable.", 318 "evidence": "Section 5.2 proposes using extractability (adapted from Carlini et al. 2023) as a contamination metric, demonstrated anecdotally in Appendix A for three models. However, the paper acknowledges this remains speculative and notes 'further research on this topic is necessary.'", 319 "supported": "moderate" 320 }, 321 { 322 "claim": "The community needs automatic measures, a contamination registry, and peer-review mechanisms to address data contamination.", 323 "evidence": "Section 6 proposes four action items. The LM Contamination Index (footnote 1) is presented as an initial implementation. This is a normative proposal rather than an empirical claim.", 324 "supported": "moderate" 325 } 326 ], 327 "red_flags": [ 328 { 329 "flag": "Anecdotal empirical evidence", 330 "detail": "The contamination demonstrations are limited to a single benchmark (CoNLL03) on three models. Despite arguing that contamination is widespread, no systematic quantification of contamination extent across benchmarks or models is provided." 331 }, 332 { 333 "flag": "Possible selection bias in demonstrations", 334 "detail": "The paper shows cases where models successfully reproduce benchmark data but does not describe how many attempts were made, whether any models failed to reproduce the data, or whether the CoNLL03 example was selected because it works particularly well." 335 }, 336 { 337 "flag": "Lack of reproducibility details for demonstrations", 338 "detail": "The appendix demonstrations lack model versions, dates of API access, temperature settings, and number of attempts. Outputs are described as 'shortened for commodity' without full outputs being available." 339 } 340 ], 341 "cited_papers": [ 342 { 343 "title": "Extracting training data from large language models", 344 "authors": ["Nicholas Carlini", "Florian Tramèr", "Eric Wallace", "Matthew Jagielski", "Ariel Herbert-Voss", "Katherine Lee", "Adam Roberts", "Tom Brown", "Dawn Song", "Úlfar Erlingsson", "Alina Oprea", "Colin Raffel"], 345 "year": 2021, 346 "relevance": "Foundational work on extracting memorized training data from LLMs, directly relevant to data leakage and contamination detection." 347 }, 348 { 349 "title": "Quantifying memorization across neural language models", 350 "authors": ["Nicholas Carlini", "Daphne Ippolito", "Matthew Jagielski", "Katherine Lee", "Florian Tramer", "Chiyuan Zhang"], 351 "year": 2023, 352 "relevance": "Defines extractability of memorized training examples, the metric this paper adapts for measuring data contamination levels." 353 }, 354 { 355 "title": "Data contamination: From memorization to exploitation", 356 "authors": ["Inbal Magar", "Roy Schwartz"], 357 "year": 2022, 358 "relevance": "Directly studies data contamination in LLMs, showing progression from memorization to exploitation of leaked data." 359 }, 360 { 361 "title": "Documenting large webtext corpora: A case study on the colossal clean crawled corpus", 362 "authors": ["Jesse Dodge", "Maarten Sap", "Ana Marasović", "William Agnew", "Gabriel Ilharco", "Dirk Groeneveld", "Margaret Mitchell", "Matt Gardner"], 363 "year": 2021, 364 "relevance": "Showed that the C4 pre-training corpus contained test splits of several NLP benchmarks, providing direct evidence of contamination." 365 }, 366 { 367 "title": "Stop uploading test data in plain text: Practical strategies for mitigating data contamination by evaluation benchmarks", 368 "authors": ["Alon Jacovi", "Avi Caciularu", "Omer Goldman", "Yoav Goldberg"], 369 "year": 2023, 370 "relevance": "Proposes preventative strategies to avoid benchmark contamination, complementary to this paper's detection-focused approach." 371 }, 372 { 373 "title": "Language models are few-shot learners", 374 "authors": ["Tom B. Brown", "Benjamin Mann", "Nick Ryder"], 375 "year": 2020, 376 "relevance": "GPT-3 paper that acknowledged a bug in their filtering script causing benchmark contamination during training." 377 }, 378 { 379 "title": "GPT-4 technical report", 380 "authors": ["OpenAI"], 381 "year": 2023, 382 "relevance": "Acknowledged that BIG-bench was inadvertently mixed into GPT-4 training and that MATH/GSM-8K training data was included to improve mathematical reasoning." 383 }, 384 { 385 "title": "Did chatgpt cheat on your test?", 386 "authors": ["Oscar Sainz", "Jon Ander Campos", "Iker García-Ferrero", "Julen Etxaniz", "Eneko Agirre"], 387 "year": 2023, 388 "relevance": "Prior work by the same authors showing ChatGPT generates portions of popular NLP benchmarks, providing evidence of contamination." 389 }, 390 { 391 "title": "Evaluating large language models trained on code", 392 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 393 "year": 2021, 394 "relevance": "Codex paper — one of the LLMs identified as potentially contaminated when evaluated on CoNLL03 by downstream papers." 395 }, 396 { 397 "title": "The ROOTS search tool: Data transparency for LLMs", 398 "authors": ["Aleksandra Piktus", "Christopher Akiki", "Paulo Villegas", "Hugo Laurençon"], 399 "year": 2023, 400 "relevance": "Tool for searching pre-training data to detect contamination; found BLOOM should not be evaluated on XNLI due to contamination." 401 }, 402 { 403 "title": "Memorization vs. generalization: Quantifying data leakage in NLP performance evaluation", 404 "authors": ["Aparna Elangovan", "Jiayuan He", "Karin Verspoor"], 405 "year": 2021, 406 "relevance": "Studies data leakage scenarios in NLP evaluation, quantifying the impact of memorization on performance estimates." 407 }, 408 { 409 "title": "Data portraits: Recording foundation model training data", 410 "authors": ["Marc Marone", "Benjamin Van Durme"], 411 "year": 2023, 412 "relevance": "Tool for auditing foundation model training data, useful for contamination detection in open LLMs." 413 } 414 ], 415 "engagement_factors": { 416 "practical_relevance": { 417 "score": 2, 418 "justification": "Proposes a contamination taxonomy and detection framework useful for researchers evaluating LLMs, but not a directly usable tool." 419 }, 420 "surprise_contrarian": { 421 "score": 2, 422 "justification": "Argues that standard NLP evaluation is 'in trouble' and published results may be invalid, challenging confidence in benchmark leaderboards." 423 }, 424 "fear_safety": { 425 "score": 1, 426 "justification": "Raises concerns about unreliable scientific conclusions but focuses on evaluation integrity rather than AI safety or security." 427 }, 428 "drama_conflict": { 429 "score": 2, 430 "justification": "Names specific models (ChatGPT, GPT-4, Copilot) as contaminated and identifies published papers with potentially invalid conclusions." 431 }, 432 "demo_ability": { 433 "score": 1, 434 "justification": "References the LM Contamination Index website but provides no pip-installable tool or interactive demo." 435 }, 436 "brand_recognition": { 437 "score": 2, 438 "justification": "Discusses ChatGPT, GPT-4, GitHub Copilot, and LLaMA; published at EMNLP, a top NLP venue." 439 } 440 } 441 }