scan.json (31699B)
1 { 2 "paper": { 3 "title": "DyCodeEval: Dynamic Benchmarking of Reasoning Capabilities in Code Large Language Models Under Data Contamination", 4 "authors": [ 5 "Simin Chen", 6 "Pranav Pusarla", 7 "Baishakhi Ray" 8 ], 9 "year": 2025, 10 "venue": "International Conference on Machine Learning", 11 "arxiv_id": "2503.04149", 12 "doi": "10.48550/arXiv.2503.04149" 13 }, 14 "scan_version": 3, 15 "active_modules": ["experimental_rigor", "data_leakage"], 16 "methodology_tags": ["benchmark-eval"], 17 "key_findings": "DyCodeEval generates semantically diverse yet complexity-equivalent programming problem variants using a four-agent pipeline, mitigating data contamination effects in Code LLM benchmarking. Controlled contamination experiments on 3 small models show static benchmarks inflate Pass@1 by up to 2-4x while dynamic benchmarks remain stable. In-the-wild evaluation of 18 models flags QWEN2.5-CODER-7B as potentially contaminated on both HumanEval and MBPP. The proposed DyPass@K metric is robust to contamination and produces consistent results across 10 random trials (low std dev).", 18 "checklist": { 19 "artifacts": { 20 "code_released": { 21 "applies": true, 22 "answer": false, 23 "justification": "The paper provides a project webpage URL (https://codekaleidoscope.github.io/dycodeeval.html) but does not explicitly state that source code is released. The URL is a project page, not a repository link." 24 }, 25 "data_released": { 26 "applies": true, 27 "answer": true, 28 "justification": "The paper uses two publicly available seed datasets: HumanEval (Chen et al., 2021) and MBPP-Sanitized (Austin et al., 2021), both standard public benchmarks described in §4.1 and Appendix B." 
29 }, 30 "environment_specified": { 31 "applies": true, 32 "answer": false, 33 "justification": "The paper mentions using vLLM for open-source models and commercial APIs for closed-source models but provides no requirements.txt, Dockerfile, or detailed environment specifications beyond these tool mentions." 34 }, 35 "reproduction_instructions": { 36 "applies": true, 37 "answer": false, 38 "justification": "No step-by-step reproduction instructions are provided. The pipeline is described conceptually but there are no concrete commands or scripts to replicate the experiments." 39 } 40 }, 41 "statistical_methodology": { 42 "confidence_intervals_or_error_bars": { 43 "applies": true, 44 "answer": true, 45 "justification": "The stability experiment (§4.5, Fig. 6) reports mean and standard deviation across 10 runs. The in-the-wild analysis (§4.3, Fig. 5) shows 95% confidence interval regression bands." 46 }, 47 "significance_tests": { 48 "applies": true, 49 "answer": false, 50 "justification": "No statistical significance tests are used. Comparative claims across models and conditions are made by comparing raw Pass@1 scores and visual regression analysis without formal hypothesis testing." 51 }, 52 "effect_sizes_reported": { 53 "applies": true, 54 "answer": true, 55 "justification": "The paper reports absolute Pass@1 scores at each contamination level (0%, 25%, 50%, 75%, 100%) providing baseline and treatment context. Tables 2-3 compare Pass@K and DyPass@K with specific values." 56 }, 57 "sample_size_justified": { 58 "applies": true, 59 "answer": false, 60 "justification": "No justification is given for the choice of seed datasets (164 HumanEval, 427 MBPP problems), the number of models tested, or the number of scenarios/contexts (50 each). No power analysis is discussed." 61 }, 62 "variance_reported": { 63 "applies": true, 64 "answer": true, 65 "justification": "Section 4.5 and Fig. 
6 explicitly report mean and standard deviation of Pass@1 scores across 10 generated benchmark datasets for the stability analysis." 66 } 67 }, 68 "evaluation_design": { 69 "baselines_included": { 70 "applies": true, 71 "answer": true, 72 "justification": "Table 1 compares DyCodeEval against multiple baselines for diversity: Token Mutation, Char Mutation, Func Mutation, Insert Line, CommSyntax, and PPM. Static benchmarking serves as the baseline for contamination experiments." 73 }, 74 "baselines_contemporary": { 75 "applies": true, 76 "answer": true, 77 "justification": "Baselines include PPM (Chen et al., 2024) and ReCode mutations (Wang et al., 2023), which are recent methods. LiveCodeBench (Jain et al., 2024) is also discussed as a related approach." 78 }, 79 "ablation_study": { 80 "applies": true, 81 "answer": false, 82 "justification": "The system has four distinct agents (Scenario Proposer, Context Generator, Prompt Rewriter, Validator) but no ablation study removes or modifies individual components to measure their contribution. Section 4.6 only tests swapping the foundation LLM." 83 }, 84 "multiple_metrics": { 85 "applies": true, 86 "answer": true, 87 "justification": "The paper uses Pass@1, Pass@K, BLEU-4, cosine semantic similarity, and the proposed DyPass@K metric across different evaluation dimensions." 88 }, 89 "human_evaluation": { 90 "applies": true, 91 "answer": true, 92 "justification": "Appendix D describes a human verification step where two graduate-level students independently reviewed 60 randomly sampled problem pairs (30 per dataset) for consistency, achieving 95% agreement after discussion." 93 }, 94 "held_out_test_set": { 95 "applies": true, 96 "answer": true, 97 "justification": "For the contamination experiment (§4.2), the contaminated model is evaluated on both the leaked benchmark and a separate non-leaked benchmark. The dynamic benchmarks are newly generated each time, ensuring no overlap with training data." 
98 }, 99 "per_category_breakdown": { 100 "applies": true, 101 "answer": true, 102 "justification": "Results are broken down by model (18 models), contamination level (0/25/50/75/100%), dataset (HumanEval vs MBPP), internal vs external diversity (Table 1), and Pass@K values (k=3,5,10 in Tables 2-3)." 103 }, 104 "failure_cases_discussed": { 105 "applies": true, 106 "answer": true, 107 "justification": "The paper discusses the DEEPSEEK-CODER anomaly where 25% contamination drops performance below baseline (§4.2). The limitations section notes cases where generated problems contain excessive information that may confuse readers." 108 }, 109 "negative_results_reported": { 110 "applies": true, 111 "answer": true, 112 "justification": "The consistency rate drops from 95% to 83% when using CLAUDE-3.5-HAIKU instead of SONNET as the foundation model (§4.6). The DEEPSEEK-CODER performance degradation at 25% contamination is an unexpected negative finding." 113 } 114 }, 115 "claims_and_evidence": { 116 "abstract_claims_supported": { 117 "applies": true, 118 "answer": true, 119 "justification": "The abstract's three main claims — effective assessment under contamination (§4.2), diverse problem generation (§4.4, Table 1), and robust/consistent benchmarking (§4.5, Fig. 6) — are all supported by corresponding experimental sections." 120 }, 121 "causal_claims_justified": { 122 "applies": true, 123 "answer": true, 124 "justification": "The main causal claim ('data contamination creates a false sense of code reasoning capability') is supported by a controlled intervention design (§4.2) where the authors deliberately introduce contamination at varying levels and measure its effect on both static and dynamic benchmarks." 125 }, 126 "generalization_bounded": { 127 "applies": true, 128 "answer": false, 129 "justification": "The title claims results for 'Code Large Language Models' broadly, but experiments are limited to Python code generation on two benchmarks (HumanEval and MBPP). 
The contamination experiment uses only 3 small models (1B-3B parameters). No discussion of whether findings extend to other languages or larger models." 130 }, 131 "alternative_explanations_discussed": { 132 "applies": true, 133 "answer": false, 134 "justification": "The paper hypothesizes about the DEEPSEEK-CODER anomaly (§4.2) but does not systematically discuss alternative explanations for the main findings, such as whether benchmark difficulty differences could explain performance gaps, or whether the regression outlier detection could produce false positives." 135 }, 136 "proxy_outcome_distinction": { 137 "applies": true, 138 "answer": false, 139 "justification": "The paper measures Pass@1 on code generation benchmarks but repeatedly claims to measure 'reasoning capabilities.' Pass@1 on HumanEval/MBPP is a proxy for reasoning ability, but the paper does not acknowledge this gap or discuss what reasoning capability actually entails beyond benchmark performance." 140 } 141 }, 142 "setup_transparency": { 143 "model_versions_specified": { 144 "applies": true, 145 "answer": false, 146 "justification": "The paper specifies model families and sizes (e.g., LLAMA-3.2-1B, DEEPSEEK-CODER-1.3B) but the critical foundation model 'CLAUDE-3.5-SONNET' used for benchmark generation lacks a version date or API snapshot identifier. Multiple Claude-3.5-Sonnet versions exist with different behaviors." 147 }, 148 "prompts_provided": { 149 "applies": true, 150 "answer": true, 151 "justification": "Appendix C provides full prompt templates for all four agents (Scenario Proposer, Context Generator, Prompt Rewriter, and two Validation prompts) along with worked examples showing filled-in values. A reader could reconstruct the prompts." 152 }, 153 "hyperparameters_reported": { 154 "applies": true, 155 "answer": true, 156 "justification": "Section 4.1 states: temperature 0.8 for generation, temperature 0 for validation. 
The number of scenarios (50) and contexts per scenario (50) are also specified." 157 }, 158 "scaffolding_described": { 159 "applies": true, 160 "answer": true, 161 "justification": "The four-agent pipeline (Scenario Proposer, Context Generator, Prompt Rewriter, Validator) is described in detail in §3.2 with a workflow diagram (Fig. 2), inter-agent data flow, type inference algorithm (Alg. 1), and retry logic when validation fails." 162 }, 163 "data_preprocessing_documented": { 164 "applies": true, 165 "answer": true, 166 "justification": "Appendix B describes how MBPP-Sanitized was refined by 'adding function headers and converting natural language instructions into function docstrings.' The contamination fine-tuning process and data splits (0/25/50/75/100%) are documented in §4.2." 167 } 168 }, 169 "limitations_and_scope": { 170 "limitations_section_present": { 171 "applies": true, 172 "answer": true, 173 "justification": "Section 6 (Conclusion) contains a substantive limitations paragraph identifying two specific limitations: (1) computational cost requiring large LLMs for high consistency rates, and (2) excessive information in generated prompts that may confuse readers." 174 }, 175 "threats_to_validity_specific": { 176 "applies": true, 177 "answer": true, 178 "justification": "The limitations are specific to this study: the consistency rate drops from 95% to 83% with a smaller foundation LLM (§4.6), and generated questions sometimes contain excessive information. These are concrete, study-specific observations." 179 }, 180 "scope_boundaries_stated": { 181 "applies": true, 182 "answer": false, 183 "justification": "The paper does not explicitly state what the results do NOT show. There is no discussion of limitations to specific languages (Python only), model sizes (only 1B-3B for contamination experiments), or task types (only NL-to-code generation)."
184 } 185 }, 186 "data_integrity": { 187 "raw_data_available": { 188 "applies": true, 189 "answer": false, 190 "justification": "The generated benchmark datasets, fine-tuned model weights, and raw evaluation outputs are not provided for independent verification. Only aggregate results (Pass@1 scores, diversity metrics) are reported." 191 }, 192 "data_collection_described": { 193 "applies": true, 194 "answer": true, 195 "justification": "The benchmark generation pipeline is described in detail (§3.2). Seed datasets are described in Appendix B. The contamination fine-tuning process is documented in §4.2 with specific percentages." 196 }, 197 "recruitment_methods_described": { 198 "applies": true, 199 "answer": false, 200 "justification": "The human verification (Appendix D) mentions 'two graduate-level students' but provides no details about their qualifications, how they were selected, or potential biases in their evaluation." 201 }, 202 "data_pipeline_documented": { 203 "applies": true, 204 "answer": true, 205 "justification": "The four-stage pipeline is documented with the type inference algorithm (Alg. 1), prompt templates (Appendix C), validation criteria (two-step validation), and the iterative retry process. The 95% human agreement rate provides a quality metric." 206 } 207 }, 208 "conflicts_of_interest": { 209 "funding_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "Acknowledgements section states: 'This work was supported in part by CCF 2313055, CCF 2107405, CAREER 2025082, and FAI: 2040961.'" 213 }, 214 "affiliations_disclosed": { 215 "applies": true, 216 "answer": true, 217 "justification": "All authors are disclosed as being from Columbia University Department of Computer Science. They evaluate models from external companies (Meta, DeepSeek, Alibaba, Anthropic) and use Anthropic's Claude as their foundation model." 
218 }, 219 "funder_independent_of_outcome": { 220 "applies": true, 221 "answer": true, 222 "justification": "Funding is from NSF grants (CCF, CAREER, FAI programs), which are government research grants independent of any LLM vendor's commercial interests." 223 }, 224 "financial_interests_declared": { 225 "applies": true, 226 "answer": false, 227 "justification": "No competing interests or financial interests statement is present in the paper." 228 } 229 }, 230 "contamination": { 231 "training_cutoff_stated": { 232 "applies": true, 233 "answer": false, 234 "justification": "The paper does not state the training data cutoff dates for any of the 18 evaluated models, despite the paper being specifically about data contamination. This makes it difficult to assess the baseline contamination status of the in-the-wild models." 235 }, 236 "train_test_overlap_discussed": { 237 "applies": true, 238 "answer": true, 239 "justification": "The entire paper is about train/test overlap. Section 4.2 creates controlled contamination experiments, and §4.3 uses regression analysis to detect potential contamination in wild models (flagging QWEN2.5-CODER-7B as an outlier)." 240 }, 241 "benchmark_contamination_addressed": { 242 "applies": true, 243 "answer": true, 244 "justification": "Benchmark contamination is the paper's core concern. It explicitly discusses how HumanEval and MBPP are likely in many models' training data and proposes dynamic generation as a mitigation strategy." 245 } 246 }, 247 "human_studies": { 248 "pre_registered": { 249 "applies": false, 250 "answer": false, 251 "justification": "This is a benchmark evaluation paper with no human participants in the research sense. The two annotators in Appendix D perform validation, not a human subjects study." 252 }, 253 "irb_or_ethics_approval": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human subjects study is conducted. The paper evaluates Code LLMs on programming benchmarks." 
257 }, 258 "demographics_reported": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants study. The two annotators in Appendix D are described only as 'graduate-level students.'" 262 }, 263 "inclusion_exclusion_criteria": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human subjects study is conducted." 267 }, 268 "randomization_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human subjects study is conducted." 272 }, 273 "blinding_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human subjects study is conducted." 277 }, 278 "attrition_reported": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human subjects study is conducted." 282 } 283 }, 284 "cost_and_practicality": { 285 "inference_cost_reported": { 286 "applies": true, 287 "answer": false, 288 "justification": "Computational cost is mentioned as a limitation in §6 but no actual costs, token counts, API spend, or wall-clock times are reported. Section 4.6 discusses using a cheaper model but does not quantify the savings." 289 }, 290 "compute_budget_stated": { 291 "applies": true, 292 "answer": false, 293 "justification": "No GPU hours, total API spend, or hardware specifications are reported despite the method requiring extensive LLM calls (Claude-3.5-Sonnet for generating 50 scenarios × 50 contexts × hundreds of problems plus validation)." 294 } 295 }, 296 "experimental_rigor": { 297 "seed_sensitivity_reported": { 298 "applies": true, 299 "answer": true, 300 "justification": "Section 4.5 runs DyCodeEval 10 times and reports mean and standard deviation of Pass@1 scores, quantifying how sensitive the results are to the randomness in the generation process (the reported standard deviations are low)."
301 }, 302 "number_of_runs_stated": { 303 "applies": true, 304 "answer": true, 305 "justification": "Section 4.5 explicitly states 'we run DyCodeEval 10 times and measure the Pass@1 scores across these 10 generated benchmark datasets.'" 306 }, 307 "hyperparameter_search_budget": { 308 "applies": true, 309 "answer": false, 310 "justification": "No hyperparameter search is described. The choice of 50 scenarios, 50 contexts, temperature 0.8, and temperature 0 for validation appear to be fixed without justification or search." 311 }, 312 "best_config_selection_justified": { 313 "applies": true, 314 "answer": false, 315 "justification": "The paper does not explain how the configuration (50 scenarios, 50 contexts, temperature settings) was selected. No validation-based selection or sensitivity analysis across configurations is provided." 316 }, 317 "multiple_comparison_correction": { 318 "applies": true, 319 "answer": false, 320 "justification": "The paper compares 18 models across two datasets with multiple metrics but performs no statistical tests at all, let alone corrections for multiple comparisons." 321 }, 322 "self_comparison_bias_addressed": { 323 "applies": true, 324 "answer": false, 325 "justification": "The authors propose DyCodeEval and evaluate it against baselines they implemented, without acknowledging author-evaluation bias or having independent evaluation." 326 }, 327 "compute_budget_vs_performance": { 328 "applies": true, 329 "answer": false, 330 "justification": "The dynamic benchmark generation requires extensive LLM calls (Claude-3.5-Sonnet) compared to static benchmarking, but this compute difference is not quantified or discussed in relation to the evaluation benefits." 331 }, 332 "benchmark_construct_validity": { 333 "applies": true, 334 "answer": true, 335 "justification": "The paper's central thesis addresses construct validity: static benchmarks may measure memorization rather than reasoning under contamination. 
The dynamic approach is explicitly designed to test whether models are 'genuinely reasoning to solve it' (§5)." 336 }, 337 "scaffold_confound_addressed": { 338 "applies": false, 339 "answer": false, 340 "justification": "The evaluated Code LLMs are tested via straightforward prompt-to-code generation without scaffolding. No agentic scaffolding is involved in the model evaluation step." 341 } 342 }, 343 "data_leakage": { 344 "temporal_leakage_addressed": { 345 "applies": true, 346 "answer": true, 347 "justification": "The paper's core contribution addresses temporal leakage: static benchmarks published before model training can leak into training data. DyCodeEval generates fresh variants to avoid this. Section 1 explicitly discusses this concern." 348 }, 349 "feature_leakage_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "The paper does not discuss whether the evaluation setup leaks answer information through context. The generated prompts provide rich contextual information that could potentially provide hints not available in the original problems." 353 }, 354 "non_independence_addressed": { 355 "applies": true, 356 "answer": false, 357 "justification": "All generated problems derive from the same seed problems (HumanEval and MBPP), sharing underlying algorithmic structure. The paper does not discuss whether this structural non-independence affects the validity of using generated problems as independent test items." 358 }, 359 "leakage_detection_method": { 360 "applies": true, 361 "answer": true, 362 "justification": "Section 4.3 uses regression analysis with 95% confidence intervals to detect potential contamination in wild models (flagging outliers like QWEN2.5-CODER-7B). The collision analysis (§3.3, Theorems 3.1-3.3) provides theoretical bounds on the probability of generating identical problems." 
363 } 364 } 365 }, 366 "claims": [ 367 { 368 "claim": "Data contamination creates a false sense of code reasoning capability under static benchmarks, with contaminated models achieving inflated Pass@1 scores on leaked benchmarks.", 369 "evidence": "Section 4.2, Fig. 4: Fine-tuned models with 100% leaked data show substantially higher Pass@1 on contaminated benchmarks but not on non-contaminated ones. Three models tested at 0/25/50/75/100% contamination levels.", 370 "supported": "strong" 371 }, 372 { 373 "claim": "DyCodeEval's dynamic benchmarking mitigates the impact of data contamination, preventing artificially inflated scores.", 374 "evidence": "Section 4.2, Fig. 4 (second row): Dynamic benchmark scores remain stable regardless of contamination level, unlike static benchmarks. Demonstrated on LLAMA-3.2-1B, LLAMA-3.2-3B, and DEEPSEEK-CODER-1.3B.", 375 "supported": "moderate" 376 }, 377 { 378 "claim": "QWEN2.5-CODER-7B is potentially contaminated on both HumanEval and MBPP datasets.", 379 "evidence": "Section 4.3, Fig. 5: QWEN2.5-CODER-7B consistently falls outside the 95% confidence interval of the in-the-wild regression area for both datasets.", 380 "supported": "weak" 381 }, 382 { 383 "claim": "DyCodeEval generates semantically diverse programming problems, significantly outperforming baseline mutation methods.", 384 "evidence": "Section 4.4, Table 1: DyCodeEval achieves BLEU-4 of 0.17-0.27 (internal) and 0.02-0.17 (external) compared to baselines ranging 0.57-1.00. Semantic similarity drops to 0.59-0.74 vs 0.84-1.00 for baselines.", 385 "supported": "strong" 386 }, 387 { 388 "claim": "DyCodeEval produces stable benchmarking results despite its inherent randomness.", 389 "evidence": "Section 4.5, Fig. 
6: Variance in Pass@1 scores is minimal across 10 independent runs, with standard deviations much smaller than mean values.", 390 "supported": "strong" 391 }, 392 { 393 "claim": "DyPass@K is more robust to contamination than Pass@K, accurately reflecting reasoning capability even under data leakage.", 394 "evidence": "Section 5, Tables 2-3: For contaminated LLAMA-3.2-1B, Pass@3 jumps from 0.22 to 0.82 while DyPass@3 drops from 0.17 to 0.13. Non-contaminated models show consistent scores across both metrics.", 395 "supported": "moderate" 396 } 397 ], 398 "red_flags": [ 399 { 400 "flag": "Contamination experiment limited to tiny models", 401 "detail": "The controlled contamination experiment (§4.2) uses only 3 models of 1B-3B parameters. These very small models may be more susceptible to overfitting on leaked data than larger, more capable models, limiting the generalizability of the contamination findings." 402 }, 403 { 404 "flag": "Contamination accusation without ground truth", 405 "detail": "The paper claims QWEN2.5-CODER-7B is 'potentially contaminated' based solely on being a regression outlier (§4.3). There is no independent verification, no access to Qwen's training data, and no discussion of alternative explanations for the outlier behavior (e.g., different training objectives, data curation strategies)." 406 }, 407 { 408 "flag": "No statistical significance tests", 409 "detail": "All comparative claims across 18 models and multiple conditions rely on raw score comparisons without any formal statistical testing. Differences may not be statistically meaningful, especially given the variance shown in §4.5." 410 }, 411 { 412 "flag": "Very small human evaluation sample", 413 "detail": "The human verification (Appendix D) uses only 2 annotators reviewing 60 problem pairs. This is a minimal sample for validating an entire automated benchmarking approach, and inter-annotator agreement before discussion is not reported separately." 
414 }, 415 { 416 "flag": "Foundation model evaluates its own outputs", 417 "detail": "Claude-3.5-Sonnet serves as both the generation engine (creating benchmark problems) and the validation agent (checking consistency). Self-validation by the same model family may miss systematic biases in the generation." 418 } 419 ], 420 "cited_papers": [ 421 { 422 "title": "Evaluating large language models trained on code", 423 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 424 "year": 2021, 425 "arxiv_id": "2107.03374", 426 "relevance": "Introduces HumanEval, one of the most widely used code generation benchmarks and a seed dataset in this study." 427 }, 428 { 429 "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code", 430 "authors": ["Naman Jain", "King Han", "Alex Gu"], 431 "year": 2024, 432 "arxiv_id": "2403.07974", 433 "relevance": "Proposes contamination-free code benchmarking through continuous collection of new problems, a key baseline approach this paper aims to improve upon." 434 }, 435 { 436 "title": "PPM: Automated generation of diverse programming problems for benchmarking code generation models", 437 "authors": ["Simin Chen", "Xin Feng", "Xinyu Han"], 438 "year": 2024, 439 "relevance": "Prior work by the same first author on automated benchmark generation using manually defined operators, used as a diversity comparison baseline." 440 }, 441 { 442 "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation", 443 "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"], 444 "year": 2023, 445 "relevance": "Introduces EvalPlus, identifying limitations of HumanEval/MBPP test coverage and proposing more rigorous code generation evaluation." 446 }, 447 { 448 "title": "SWE-bench: Can language models resolve real-world GitHub issues?", 449 "authors": ["Carlos E. 
Jimenez", "John Yang", "Alexander Wettig"], 450 "year": 2024, 451 "relevance": "Major real-world code benchmark for evaluating LLMs on GitHub issue resolution, relevant to benchmarking methodology and contamination concerns." 452 }, 453 { 454 "title": "DeepSeek-Coder: When the large language model meets programming—the rise of code intelligence", 455 "authors": ["Daya Guo", "Qihao Zhu", "Dejian Yang"], 456 "year": 2024, 457 "arxiv_id": "2401.14196", 458 "relevance": "Describes the DeepSeek-Coder model family evaluated in this study, relevant to code LLM development and capability assessment." 459 }, 460 { 461 "title": "ReCode: Robustness evaluation of code generation models", 462 "authors": ["Shiqi Wang", "Zheng Li", "Haifeng Qian"], 463 "year": 2023, 464 "doi": "10.18653/V1/2023.ACL-LONG.773", 465 "relevance": "Proposes robustness-based mutations for code generation evaluation, used as a baseline for diversity comparison in this study." 466 }, 467 { 468 "title": "DyVal: Dynamic evaluation of large language models for reasoning tasks", 469 "authors": ["Kaijie Zhu", "Jiaao Chen", "Jindong Wang"], 470 "year": 2024, 471 "arxiv_id": "2309.17167", 472 "relevance": "Uses DAG structures for dynamic LLM benchmarking, a key related approach for contamination-resistant evaluation." 473 }, 474 { 475 "title": "Recent advances in large language model benchmarks against data contamination: From static to dynamic evaluation", 476 "authors": ["Simin Chen", "Yanxin Chen", "Zelin Li"], 477 "year": 2025, 478 "arxiv_id": "2502.17521", 479 "relevance": "Survey of contamination-free benchmarking approaches by the same first author, providing context on the contamination problem landscape." 
480 }, 481 { 482 "title": "Generalization or memorization: Data contamination and trustworthy evaluation for large language models", 483 "authors": ["Yicheng Dong", "Xiangjue Jiang", "Hanlin Liu"], 484 "year": 2024, 485 "arxiv_id": "2402.15938", 486 "relevance": "Studies data contamination effects on LLM evaluation trustworthiness, directly relevant to understanding contamination's impact on benchmarking." 487 }, 488 { 489 "title": "Language models are few-shot learners", 490 "authors": ["Tom Brown", "Benjamin Mann", "Nick Ryder"], 491 "year": 2020, 492 "relevance": "GPT-3 paper that first raised data contamination concerns in LLM benchmarking at scale." 493 }, 494 { 495 "title": "Program synthesis with large language models", 496 "authors": ["Jacob Austin", "Augustus Odena", "Maxwell I. Nye"], 497 "year": 2021, 498 "arxiv_id": "2108.07732", 499 "relevance": "Introduces the MBPP benchmark used as one of two seed datasets in this study." 500 } 501 ], 502 "engagement_factors": { 503 "practical_relevance": { 504 "score": 2, 505 "justification": "DyCodeEval provides a usable framework for contamination-resistant benchmarking, though setup requires an LLM agent pipeline." 506 }, 507 "surprise_contrarian": { 508 "score": 1, 509 "justification": "Data contamination in benchmarks is a known concern; the specific finding about QWEN2.5-CODER-7B adds a mildly surprising result." 510 }, 511 "fear_safety": { 512 "score": 1, 513 "justification": "Raises concerns about unreliable LLM evaluations due to contamination but does not demonstrate novel attacks or safety threats." 514 }, 515 "drama_conflict": { 516 "score": 2, 517 "justification": "Publicly flagging QWEN2.5-CODER-7B as potentially contaminated and arguing that 'static benchmarks create a false sense of accuracy' has moderate controversy potential." 518 }, 519 "demo_ability": { 520 "score": 1, 521 "justification": "A project webpage exists but no readily installable tool or live demo is provided in the paper." 
522 }, 523 "brand_recognition": { 524 "score": 1, 525 "justification": "Columbia University authors, published at ICML. Evaluates well-known models but is not from a major AI lab." 526 } 527 } 528 }