scan.json (31686B)
1 { 2 "paper": { 3 "title": "Quantifying Contamination in Evaluating Code Generation Capabilities of Language Models", 4 "authors": ["Martin Riddell", "Ansong Ni", "Arman Cohan"], 5 "year": 2024, 6 "venue": "Annual Meeting of the Association for Computational Linguistics", 7 "arxiv_id": "2403.04811", 8 "doi": "10.48550/arXiv.2403.04811" 9 }, 10 "scan_version": 3, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval", "observational"], 13 "key_findings": "Between 3.6% and 20.8% of MBPP and HumanEval benchmark solutions appear in the PILE and STACK pretraining corpora via surface-level and semantic-level matching. Models perform dramatically better on contaminated subsets (up to 50+ percentage point gap between top-10% and bottom-10% similarity bins). De-contaminated accuracy drops by up to 82.7% for some model-benchmark pairs, and much of the apparent performance gap between model families is attributable to differential contamination rather than capability differences.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The paper provides a GitHub link in footnote 1: 'Code and data available at https://github.com/yale-nlp/code-llm-contamination' and states 'We release all resulting files from our matching pipeline for future research.'" 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The matching results are released via the same GitHub link. Additionally, the benchmarks (MBPP, HumanEval) and pretraining corpora (PILE, STACK) are publicly available datasets." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "No environment specification (requirements.txt, Dockerfile, conda environment) is mentioned in the paper. They reference the rapidfuzz library (footnote 2) and Dolos toolkit, but no comprehensive dependency list is provided." 
30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No step-by-step reproduction instructions are provided in the paper. The methodology is described at a high level but lacks specific commands or scripts to replicate the matching pipeline." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "All results in Tables 1-4 are reported as point estimates without confidence intervals or error bars. No uncertainty quantification is provided for any accuracy or similarity score." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper claims models perform 'significantly better' on contaminated subsets but provides no statistical significance tests (no p-values, t-tests, or other formal tests). Comparisons are based solely on numerical differences." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "Effect sizes are reported with context: performance gaps with baselines (e.g., Tab 2 shows 72% vs 22% accuracy for top/bottom 10%), relative accuracy degradation percentages in Tab 1 (e.g., '-82.7%'), and the absolute performance gap metric (∆⇕)." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "No justification for sample sizes. The 500 MBPP test questions and 164 HumanEval questions are used as-is without power analysis or discussion of whether these sizes support the claimed statistical patterns." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "No variance or standard deviation is reported. Model outputs are taken from L2CEval as single-run results. No multiple-seed analysis or spread measures are provided." 
62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "The paper compares multiple model families (Pythia, CodeGen-NL, StarCoderBase) across multiple sizes, and compares contaminated vs. de-contaminated performance. Original accuracy serves as the baseline for measuring contamination effects." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "The models studied (StarCoderBase, Pythia, CodeGen-NL) were current open-source models at the time of writing. The selection criteria are clearly justified: models must have publicly available training data." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "The paper ablates on surface-level vs. semantic-level similarity (Fig 3, Fig 11), different similarity thresholds (Tab 1 with scores of 100, >90, >80), model sizes (Fig 5, Tab 4), and contamination subsets vs. complements (Tab 3a)." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Multiple metrics are used: surface-level Levenshtein similarity, semantic-level Dolos similarity, aggregated similarity scores, top-1 and top-10 scores, pass@1 accuracy, de-contaminated accuracy, and performance gap (∆⇕)." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": false, 88 "justification": "No human evaluation is included. The similarity matching and accuracy evaluation are entirely automated. Manual inspection is only mentioned for threshold justification (footnote 5) but not as a formal evaluation." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "Results are reported on the standard test splits of MBPP (500 questions) and HumanEval (164 questions). Model outputs are from L2CEval's standard evaluation protocol." 
94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Extensive breakdowns are provided: per-model (12 models), per-benchmark (MBPP vs HumanEval), per-threshold, per-similarity-bin (Fig 5), per-subset seen/unseen (Tab 3a), and per-corpus (PILE vs STACK)." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Section 4.3 presents a case study of examples where models fail despite having seen similar solutions 10+ times in training, with two representative examples (Fig 7, Fig 8) and discussion of why failure occurs." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "Table 3a shows that Pythia and CodeGen-NL models actually perform worse on the STACK-seen subset than on unseen questions (e.g., CodeGen-NL-16B: 11.5% on seen vs. 21.7% on unseen for MBPP). The case study also reports models failing on highly contaminated examples." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Abstract claims of 'substantial overlap' are supported by Fig 1 and Section 4.1 (3.6-20.8% exact matches). Claims of 'significantly better' performance on contaminated subsets are supported by Tab 2 (50+ point gaps). Analysis of model size, difficulty, and question length are in Section 4.2." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": false, 120 "justification": "The paper makes causal claims ('models perform significantly better when similar solutions are seen during training') from observational data. While Section 4.2 attempts to decouple memorization and difficulty by cross-referencing subsets across model families, this does not establish causation. Confounds such as problem type, coding style, or algorithmic patterns shared between training data and benchmarks are not controlled for." 
121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": true, 125 "justification": "The paper explicitly bounds its claims to MBPP and HumanEval benchmarks, three model series, and two corpora. The Limitations section (§6) acknowledges that only gold solutions were searched, compute constraints limited the search scope, and more models/benchmarks are needed." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": true, 130 "justification": "Section 4.2 explicitly addresses the alternative explanation that contaminated questions might simply be easier, using cross-model subset analysis to show this is not the case. The effect of program length is analyzed in Fig 6 to rule out length as a confound." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "The paper is careful to distinguish between its similarity scores (proxy) and actual contamination (outcome). It acknowledges in Limitations that searching only gold solutions provides a 'minimum number of questions' exposed, that false positives exist in semantic matching, and that multiple correct solutions mean their measurements are lower bounds." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": true, 142 "justification": "Exact model names and sizes are specified: Pythia-1.4B/2.8B/6.9B/12B, CodeGen-NL-350M/2B/6B/16B, StarCoderBase-1B/3B/7B/15.5B (Section 3.1). These are specific named model checkpoints from known training runs." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": false, 147 "justification": "The paper uses model outputs from L2CEval (Ni et al., 2023) but does not provide the exact prompt formats used for benchmark evaluation. The reader is directed to L2CEval without the actual prompt text being included." 
148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": false, 152 "justification": "No generation hyperparameters (temperature, top-p, max tokens) are reported for the model evaluations. The outputs are taken from L2CEval without specifying these settings. For the matching pipeline, the top-500 threshold is justified (footnote 5) but other parameters (sliding window step size, etc.) are not specified." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "No agentic scaffolding is used. The paper evaluates base models directly on benchmarks and runs a similarity matching pipeline." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "The data pipeline is well documented in Section 2: sliding window character-by-character scan, Levenshtein similarity computation, top-500 selection, Dolos semantic matching, and aggregation via max(S_surface, S_semantic). Section 3.1 specifies the GitHub split (95.16 GiB) for PILE and 60.40 GB Python split for STACK." 163 }, 164 "limitations_and_scope": { 165 "limitations_section_present": { 166 "applies": true, 167 "answer": true, 168 "justification": "Section 6 'Limitations' provides substantive discussion across four specific subsections: multiple correct solutions, compute costs limiting search, false positives, and scarcity of open data." 169 }, 170 "threats_to_validity_specific": { 171 "applies": true, 172 "answer": true, 173 "justification": "The limitations are specific to this study: searching only gold solutions means contamination is a lower bound, compute constraints limited search to GitHub/Python splits only, Dolos semantic matching can produce false positives (with examples referenced in §A.4), and only two benchmarks with gold solutions could be studied."
175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": true, 179 "justification": "The paper explicitly states it covers only MBPP and HumanEval, only three model series with open training data, only gold solutions (not all correct solutions), and only specific splits of training data. It notes 'We hope more open models with open data will become available in the future.'" 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": true, 186 "justification": "The paper releases all matching pipeline results via GitHub (footnote 1). The underlying benchmarks (MBPP, HumanEval) and training corpora (PILE, STACK) are publicly available. Appendix A.4 provides lists of all perfect matches." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section 2 describes the matching pipeline in detail: character-by-character sliding window for surface matching, tree-sitter AST tokenization for semantic matching via Dolos, and aggregation. Section 3 describes model selection criteria and data sources." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. The study uses publicly available benchmarks and pretraining corpora." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "The full pipeline is documented: benchmark gold solutions → sliding window Levenshtein matching over training data → top 500 surface matches per question → Dolos semantic matching → aggregated score = max(surface, semantic). Thresholds and filtering criteria are specified." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "The Acknowledgements section thanks specific individuals but does not mention any funding sources, grants, or financial support." 
209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "All authors are disclosed as being from the Department of Computer Science, Yale University. They are not affiliated with any of the companies whose models they evaluate." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding source is disclosed, so independence cannot be assessed. The authors are academic researchers with no apparent financial stake in the models evaluated." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial interests statement is present in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": true, 230 "justification": "Rather than stating cutoff dates, the paper goes further: it uses the actual training data (PILE GitHub split, STACK Python split) and directly searches through it. The training datasets are fully specified and publicly available (Section 3.1)." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": true, 235 "justification": "Train/test overlap is the central focus of the paper. Section 4.1 precisely quantifies the overlap using both surface-level and semantic-level matching, finding 3.6-20.8% exact matches." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": true, 240 "justification": "Benchmark contamination is the paper's primary contribution. They note that STACK went through 'a string-matching-based decontamination process for MBPP and HumanEval, but we are still able to find traces of contamination' (footnote 7), demonstrating that standard decontamination is insufficient." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study. 
It is a computational analysis of training data overlap." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants. The study analyzes publicly available datasets and models." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": false, 284 "justification": "No inference cost or latency is reported for the model evaluations. Footnote 4 estimates '5.2×10^5 CPU hours to search just the Python files from the STACK for MBPP' but this is used to justify search limitations, not as systematic cost reporting." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "The total computational budget for the matching pipeline is not stated. Footnote 4 provides a theoretical estimate for full search, and they mention limiting to specific splits to reduce cost, but the actual compute spent is not reported." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "No seed sensitivity analysis is reported. Model outputs are taken as single runs from L2CEval. The matching pipeline is deterministic, but model generation is not." 
297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "The number of evaluation runs is not stated. Results appear to be from single-run evaluations via L2CEval." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "No formal hyperparameter search budget is reported. The top-500 threshold is justified by manual inspection (footnote 5: '95% of them have a similarity score < 72'), but the overall search budget for pipeline parameters is not disclosed." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": true, 311 "justification": "Table 1 reports results at multiple thresholds (100, >90, >80) rather than selecting only the most favorable one. The aggregation method (max of surface and semantic) is motivated by the complementary examples in Figs 2 and 4." 312 }, 313 "multiple_comparison_correction": { 314 "applies": true, 315 "answer": false, 316 "justification": "The paper makes numerous comparisons across models, benchmarks, thresholds, and subsets without any correction for multiple comparisons. No formal statistical tests are used at all, let alone corrected ones." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The authors do not discuss potential biases in their own evaluation methodology. While they acknowledge false positives in the Limitations section, they do not discuss how their pipeline design choices might systematically bias contamination estimates." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": false, 325 "answer": false, 326 "justification": "The paper does not propose a competing method where compute budget vs. performance tradeoffs are relevant. It is an analysis study measuring contamination." 
327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": true, 331 "justification": "The entire paper questions whether MBPP and HumanEval benchmark scores actually measure model capability vs. memorization. This is a direct investigation of construct validity for code generation benchmarks." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "No scaffolding is involved. Models are evaluated directly on benchmarks without agentic scaffolding." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": true, 343 "justification": "The paper directly addresses temporal leakage by searching pretraining corpora for benchmark solutions. They note that MBPP and HumanEval predate the training data collection, creating contamination risk." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "The paper does not discuss whether the benchmark evaluation setup itself leaks information (e.g., whether docstrings or test cases in HumanEval prompts contain answer hints). The focus is entirely on training data overlap." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": true, 353 "justification": "The paper directly measures non-independence between training and test data through sliding-window similarity matching over the searched corpus splits (the GitHub split of PILE and the Python split of STACK). Tables 3b and 5-13 provide exact counts and examples of overlapping instances." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": true, 358 "justification": "The paper proposes and applies a concrete two-stage detection method: Levenshtein surface-level matching via sliding window, followed by Dolos AST-based semantic matching. This is the paper's core contribution."
359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "3.6% to 20.8% of MBPP and HumanEval benchmark solutions are found in pretraining corpora as exact matches", 365 "evidence": "Fig 1 and Section 4.1: 12.2% of HumanEval in PILE, 18.9% in STACK; 3.6% of MBPP in PILE, 20.8% in STACK (using aggregated similarity score = 100).", 366 "supported": "strong" 367 }, 368 { 369 "claim": "Models perform significantly better on questions where similar solutions were seen during training", 370 "evidence": "Table 2: StarCoderBase-15.5B shows 72% accuracy on top-10% similarity questions vs. 22% on bottom-10% for MBPP (50pp gap). Pythia-12B shows 56.3% vs 0% on HumanEval (56.3pp gap).", 371 "supported": "strong" 372 }, 373 { 374 "claim": "Removing contaminated examples causes substantial accuracy drops, and narrows performance gaps between model families", 375 "evidence": "Table 1: Pythia-12B accuracy drops from 9.8% to 1.7% (-82.7%) on HumanEval at threshold >80. The StarCoderBase-Pythia gap shrinks from 23.8% to 13.9% after de-contamination.", 376 "supported": "strong" 377 }, 378 { 379 "claim": "Larger models are better at both memorization and generalization", 380 "evidence": "Figure 5 shows across all three model families that larger models consistently achieve higher accuracy at every similarity bin, indicating improvement in both exploiting seen data and solving unseen problems.", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "The performance advantage on contaminated examples is not explained by question difficulty", 385 "evidence": "Table 3a: CodeGen-NL-16B has 19.6% overall accuracy but only 11.5% on the 104 MBPP questions seen by StarCoderBase, indicating these questions are not inherently easier. 
Models not trained on STACK perform worse on the STACK-seen subset.", 386 "supported": "moderate" 387 }, 388 { 389 "claim": "STACK's string-matching decontamination process was insufficient for removing benchmark overlap", 390 "evidence": "Footnote 7 notes the STACK underwent decontamination, yet the study still finds 20.8% of MBPP and 18.9% of HumanEval solutions in STACK, demonstrating semantic-level contamination that surface decontamination missed.", 391 "supported": "strong" 392 } 393 ], 394 "red_flags": [ 395 { 396 "flag": "No statistical significance tests", 397 "detail": "The paper uses the word 'significantly' when describing performance differences but provides no formal statistical tests. All comparisons are based on raw numerical differences without p-values, confidence intervals, or any inferential statistics." 398 }, 399 { 400 "flag": "No uncertainty quantification", 401 "detail": "All accuracy results are single-run point estimates from L2CEval with no error bars, standard deviations, or confidence intervals. The stability of the reported performance gaps across different evaluation conditions is unknown." 402 }, 403 { 404 "flag": "Causal claims from observational data", 405 "detail": "The paper implies contamination causes better performance, but the study design is observational. While the cross-model difficulty analysis (Section 4.2) is clever, it does not fully control for confounds such as shared algorithmic patterns or coding style correlations." 406 }, 407 { 408 "flag": "Limited search scope understates contamination", 409 "detail": "The search was limited to the GitHub split of PILE and the Python split of STACK due to compute constraints. Similar solutions may exist in other programming languages or non-code text, meaning the reported contamination rates are lower bounds, but the magnitude of the underestimate is unknown." 
410 } 411 ], 412 "cited_papers": [ 413 { 414 "title": "Program synthesis with large language models", 415 "authors": ["Jacob Austin", "Augustus Odena", "Maxwell Nye", "Maarten Bosma", "Henryk Michalewski", "David Dohan", "Ellen Jiang", "Carrie Cai", "Michael Terry", "Quoc Le", "Charles Sutton"], 416 "year": 2021, 417 "relevance": "Introduces MBPP, one of the two primary benchmarks studied for contamination in this paper." 418 }, 419 { 420 "title": "Evaluating large language models trained on code", 421 "authors": ["Mark Chen", "Jerry Tworek"], 422 "year": 2021, 423 "relevance": "Introduces HumanEval, the other primary code generation benchmark studied for contamination." 424 }, 425 { 426 "title": "Starcoder: may the source be with you!", 427 "authors": ["Raymond Li", "Loubna Ben Allal"], 428 "year": 2023, 429 "relevance": "StarCoderBase models are a primary subject of the contamination study, trained on the STACK corpus." 430 }, 431 { 432 "title": "Pythia: A suite for analyzing large language models across training and scaling", 433 "authors": ["Stella Biderman", "Hailey Schoelkopf"], 434 "year": 2023, 435 "relevance": "Pythia model suite is one of three model families studied for training data contamination effects." 436 }, 437 { 438 "title": "Codegen: An open large language model for code with multi-turn program synthesis", 439 "authors": ["Erik Nijkamp", "Bo Pang"], 440 "year": 2023, 441 "relevance": "CodeGen-NL model series is studied for contamination effects on code generation benchmarks." 442 }, 443 { 444 "title": "The pile: An 800gb dataset of diverse text for language modeling", 445 "authors": ["Leo Gao", "Stella Biderman"], 446 "year": 2020, 447 "relevance": "The PILE pretraining corpus is one of two training datasets searched for benchmark contamination." 
448 }, 449 { 450 "title": "The stack: 3 tb of permissively licensed source code", 451 "authors": ["Denis Kocetkov", "Raymond Li"], 452 "year": 2022, 453 "relevance": "The STACK pretraining corpus is the other training dataset searched for benchmark contamination." 454 }, 455 { 456 "title": "Quantifying memorization across neural language models", 457 "authors": ["Nicholas Carlini", "Daphne Ippolito"], 458 "year": 2023, 459 "relevance": "Studies memorization in language models, directly relevant to understanding LLM training data leakage." 460 }, 461 { 462 "title": "Detecting pretraining data from large language models", 463 "authors": ["Weijia Shi", "Anirudh Ajith", "Mengzhou Xia"], 464 "year": 2023, 465 "relevance": "Proposes methods for detecting contamination in black-box models using output probabilities." 466 }, 467 { 468 "title": "Time travel in llms: Tracing data contamination in large language models", 469 "authors": ["Shahriar Golchin", "Mihai Surdeanu"], 470 "year": 2023, 471 "relevance": "Studies data contamination in LLMs for natural language tasks, complementary to this paper's code focus." 472 }, 473 { 474 "title": "L2ceval: Evaluating language-to-code generation capabilities of large language models", 475 "authors": ["Ansong Ni", "Pengcheng Yin", "Yilun Zhao", "Martin Riddell"], 476 "year": 2023, 477 "arxiv_id": "2309.17446", 478 "relevance": "Evaluation framework and model outputs used in this paper to obtain code generation performance numbers." 479 }, 480 { 481 "title": "Generalization or memorization: Data contamination and trustworthy evaluation for large language models", 482 "authors": ["Yihong Dong", "Xue Jiang", "Huanyu Liu", "Zhi Jin", "Ge Li"], 483 "year": 2024, 484 "arxiv_id": "2402.15938", 485 "relevance": "Identifies contamination through output distribution peakedness for black-box LLMs, a complementary detection approach."
486 }, 487 { 488 "title": "Codex hacks hackerrank: Memorization issues and a framework for code synthesis evaluation", 489 "authors": ["Anjan Karmakar", "Julian Aron Prenner", "Marco D'Ambros", "Romain Robbes"], 490 "year": 2022, 491 "relevance": "Demonstrates memorization patterns in code LLMs and proposes evaluation frameworks for code synthesis." 492 }, 493 { 494 "title": "CodeIPPrompt: Intellectual property infringement assessment of code language models", 495 "authors": ["Zhiyuan Yu", "Yuhao Wu", "Ning Zhang"], 496 "year": 2023, 497 "relevance": "Uses Dolos plagiarism detection tool (also used in this paper) to assess code LLM intellectual property issues." 498 } 499 ], 500 "engagement_factors": { 501 "practical_relevance": { 502 "score": 2, 503 "justification": "Practitioners evaluating LLMs on code benchmarks should know about contamination rates; the released pipeline could be used for contamination checking." 504 }, 505 "surprise_contrarian": { 506 "score": 2, 507 "justification": "Quantifies that up to 20.8% of popular benchmark solutions appear in training data and that STACK's decontamination process was insufficient, challenging face-value benchmark results." 508 }, 509 "fear_safety": { 510 "score": 1, 511 "justification": "Raises concerns about reliability of LLM evaluations but does not demonstrate a novel security vulnerability or existential risk." 512 }, 513 "drama_conflict": { 514 "score": 2, 515 "justification": "Directly challenges the validity of widely-cited benchmark results (MBPP, HumanEval) and shows that reported model performance gaps may be largely due to contamination." 516 }, 517 "demo_ability": { 518 "score": 1, 519 "justification": "Code and data are released on GitHub, but the pipeline requires significant compute to run and is not a simple tool someone can quickly try." 
520 }, 521 "brand_recognition": { 522 "score": 1, 523 "justification": "Yale NLP is a respected academic group; the paper evaluates well-known models (StarCoder, Pythia) and benchmarks (HumanEval, MBPP) but is not from a top-tier industry lab." 524 } 525 } 526 }