scan.json (30531B)
1 { 2 "paper": { 3 "title": "Top Leaderboard Ranking = Top Coding Proficiency, Always? EvoEval: Evolving Coding Benchmarks via LLM", 4 "authors": [ 5 "Chunqiu Steven Xia", 6 "Yinlin Deng", 7 "Lingming Zhang" 8 ], 9 "year": 2024, 10 "venue": "arXiv.org", 11 "arxiv_id": "2403.19114", 12 "doi": "10.48550/arXiv.2403.19114" 13 }, 14 "scan_version": 2, 15 "active_modules": ["experimental_rigor", "data_leakage"], 16 "methodology_tags": ["benchmark-eval"], 17 "key_findings": "EvoEval, a suite of 828 benchmark problems evolved from HumanEval using GPT-4, reveals an average 39.4% performance drop across 51 LLMs compared to standard HumanEval, with per-model drops ranging from 19.6% to 47.7% and significant ranking changes among top models. Instruction-following LLMs are particularly brittle to rephrasing and subtle changes (7.6% drop vs <1% for base models on SUBTLE), while all models struggle with compositional generalization — the best model (GPT-4) can only compose 53.8% of individually-solved subproblems. The findings suggest potential overfitting to existing benchmarks and highlight gaps in tool-use capabilities of instruction-following models.", 18 "checklist": { 19 "artifacts": { 20 "code_released": { 21 "applies": true, 22 "answer": true, 23 "justification": "The paper states 'We have open-sourced our benchmarks, tools, and complete LLM generations at https://github.com/evo-eval/evoeval' in both the abstract and conclusion." 24 }, 25 "data_released": { 26 "applies": true, 27 "answer": true, 28 "justification": "The benchmarks (828 problems with test cases and groundtruths) and complete LLM generations are released at the same GitHub repository." 29 }, 30 "environment_specified": { 31 "applies": true, 32 "answer": false, 33 "justification": "No requirements.txt, Dockerfile, or detailed environment setup section is provided. The paper mentions using vLLM for some models and building on EvalPlus but doesn't specify library versions or environment details." 
34 }, 35 "reproduction_instructions": { 36 "applies": true, 37 "answer": false, 38 "justification": "The paper does not include step-by-step reproduction instructions. It describes the methodology and points to the repository but lacks a dedicated 'Reproducing Results' section with specific commands." 39 } 40 }, 41 "statistical_methodology": { 42 "confidence_intervals_or_error_bars": { 43 "applies": true, 44 "answer": false, 45 "justification": "All results in Table 2 and throughout are reported as point estimates (pass@1 scores) with no confidence intervals or error bars." 46 }, 47 "significance_tests": { 48 "applies": true, 49 "answer": false, 50 "justification": "The paper makes numerous comparative claims (e.g., 'LLMs consistently perform worse', ranking changes) based solely on comparing pass@1 numbers without any statistical significance tests." 51 }, 52 "effect_sizes_reported": { 53 "applies": true, 54 "answer": true, 55 "justification": "The paper consistently reports percentage drops with baseline context, e.g., 'GPT-4 solve close to 85% of all HUMANEVAL problems but fall almost below 50% pass@1 when evaluated on the DIFFICULT problems' and 'on average 39.4% decrease' with per-benchmark breakdowns (Section 5.1)." 56 }, 57 "sample_size_justified": { 58 "applies": true, 59 "answer": false, 60 "justification": "No justification is given for using 100 problems per semantic-altering benchmark and 164 per semantic-preserving benchmark. The paper provides no power analysis or discussion of whether these sizes are sufficient." 61 }, 62 "variance_reported": { 63 "applies": true, 64 "answer": false, 65 "justification": "The paper uses greedy decoding (temperature=0) producing single deterministic results per problem. No variance across runs, seeds, or sampling strategies is reported." 66 } 67 }, 68 "evaluation_design": { 69 "baselines_included": { 70 "applies": true, 71 "answer": true, 72 "justification": "HumanEval and HumanEval+ serve as baselines throughout. 
All EvoEval performance is compared against these established benchmarks (Table 2, Figures 4-7)." 73 }, 74 "baselines_contemporary": { 75 "applies": true, 76 "answer": true, 77 "justification": "The 51 evaluated LLMs include the most contemporary models at time of writing: GPT-4-Turbo, Claude-3, Gemini, DeepSeek-Coder, StarCoder2 (Table 5)." 78 }, 79 "ablation_study": { 80 "applies": true, 81 "answer": true, 82 "justification": "The paper includes ablation-like comparisons: TOOL USE vs TOOL USE-MAIN ONLY (Section 5.4, isolating helper function effects), COMBINE vs COMBINE-NAIVE (Section 5.2, isolating composition complexity), and DECOMPOSE analysis (Section 5.3). Each transformation type tests a specific capability dimension." 83 }, 84 "multiple_metrics": { 85 "applies": true, 86 "answer": true, 87 "justification": "Beyond pass@1, the paper reports composition percentage (Table 3), decomposition percentage and recomposition percentage (Table 4), and normalized pass@1 for ranking analysis (Figure 5)." 88 }, 89 "human_evaluation": { 90 "applies": true, 91 "answer": false, 92 "justification": "Evaluation is entirely automated through differential testing against groundtruth outputs (Section 4, Appendix A.2). No human evaluation of LLM-generated code quality or correctness is performed." 93 }, 94 "held_out_test_set": { 95 "applies": true, 96 "answer": true, 97 "justification": "EvoEval benchmarks are newly generated problems that were not available during any model's training, by design serving as held-out test data. The paper explicitly argues this avoids contamination from existing benchmarks." 98 }, 99 "per_category_breakdown": { 100 "applies": true, 101 "answer": true, 102 "justification": "Table 2 shows per-benchmark breakdown for all 51 LLMs across 5 semantic-altering benchmarks. Figures 4-7 provide additional per-model, per-benchmark visualizations. Tables 3-4 provide detailed composition/decomposition breakdowns." 
103 }, 104 "failure_cases_discussed": { 105 "applies": true, 106 "answer": true, 107 "justification": "Appendix C (Figures 21-27) shows concrete GPT-4 failure examples for each benchmark type with detailed analysis of why the generated solution is incorrect." 108 }, 109 "negative_results_reported": { 110 "applies": true, 111 "answer": true, 112 "justification": "The paper's core finding IS a negative result: significant performance degradation on EvoEval vs HumanEval. Specific negative findings include instruction-following models' brittleness to rephrasing, poor compositional generalization (<54% for best model), and poor tool-use integration." 113 } 114 }, 115 "claims_and_evidence": { 116 "abstract_claims_supported": { 117 "applies": true, 118 "answer": true, 119 "justification": "Abstract claims — 'significant drop in performance (on average 39.4%)' supported by Table 2, 'decrease can range from 19.6% to 47.7%' supported by per-model analysis, 'drastic ranking changes' supported by Figure 5, 'brittleness of instruction-following models' supported by Figure 7 and Section 5.1." 120 }, 121 "causal_claims_justified": { 122 "applies": true, 123 "answer": false, 124 "justification": "The paper makes causal-adjacent claims about 'potential overfitting' and 'indicating potential memorization or contamination of prior evaluation benchmarks' (Section 5.1). While the instruction-following vs base model comparison on SUBTLE provides suggestive evidence, alternative explanations (e.g., format sensitivity unrelated to memorization, GPT-4 generation bias) are not discussed." 125 }, 126 "generalization_bounded": { 127 "applies": true, 128 "answer": false, 129 "justification": "The paper tests only Python problems derived from HumanEval but makes broad claims about 'coding proficiency' and 'program synthesis ability of LLMs' (title and abstract). The scope is not bounded to Python or HumanEval-style function synthesis." 
130 }, 131 "alternative_explanations_discussed": { 132 "applies": true, 133 "answer": false, 134 "justification": "The paper attributes performance drops to data leakage and overfitting without considering alternatives: evolved problems may be inherently harder in ways unrelated to leakage, GPT-4-generated problems may have systematic biases favoring/disfavoring certain models, or instruction-following sensitivity may reflect format calibration rather than memorization." 135 }, 136 "proxy_outcome_distinction": { 137 "applies": true, 138 "answer": false, 139 "justification": "The paper measures pass@1 (functional correctness on test cases) but frames results as measuring 'coding proficiency' and 'program synthesis ability' (title, abstract). No discussion of the gap between passing specific test cases and actual coding proficiency." 140 } 141 }, 142 "setup_transparency": { 143 "model_versions_specified": { 144 "applies": true, 145 "answer": true, 146 "justification": "Table 5 (Appendix A.1) provides exact model IDs for all 51 models, including API endpoint names (e.g., 'gpt-4-0125-preview', 'claude-3-opus-20240229') and HuggingFace model names (e.g., 'deepseek-ai/deepseek-coder-33b-instruct')." 147 }, 148 "prompts_provided": { 149 "applies": true, 150 "answer": true, 151 "justification": "Appendix B provides the exact transformation prompts for all 7 benchmarks plus refinement and I/O fixing prompts (Figures 11-20). Appendix A.2 shows an example input prompt for GPT-4 (Figure 10) and describes the input format for base vs instruction-following models." 152 }, 153 "hyperparameters_reported": { 154 "applies": true, 155 "answer": true, 156 "justification": "Section 4 states 'greedy decoding (i.e., producing a deterministic sample per each problem with temperature = 0).' Appendix A.2 reports evaluation timeout settings (Tmax=1000ms, f=4) and exact-match thresholds (10^-6 for floating point)." 
157 }, 158 "scaffolding_described": { 159 "applies": false, 160 "answer": false, 161 "justification": "No agentic scaffolding is used. The evaluation is simple zero-shot prompting — models receive a function header with docstring and autocomplete (base models) or generate a complete solution (instruction-following models)." 162 }, 163 "data_preprocessing_documented": { 164 "applies": true, 165 "answer": true, 166 "justification": "Section 2 documents the full pipeline: targeted transformation → problem refinement with self-consistency → test generation → manual examination and test augmentation. The sanitization script for parsing LLM outputs is described in Appendix A.2." 167 } 168 }, 169 "limitations_and_scope": { 170 "limitations_section_present": { 171 "applies": true, 172 "answer": false, 173 "justification": "The paper has no dedicated limitations or threats-to-validity section. The conclusion mentions future work directions but does not substantively discuss limitations of the current study." 174 }, 175 "threats_to_validity_specific": { 176 "applies": true, 177 "answer": false, 178 "justification": "No specific threats to validity are discussed anywhere in the paper. There is no consideration of threats such as GPT-4 bias in problem generation, Python-only scope, or the limited problem origin (all from HumanEval)." 179 }, 180 "scope_boundaries_stated": { 181 "applies": true, 182 "answer": false, 183 "justification": "The paper does not explicitly state what the results do NOT show. There are no boundary statements about the limitations of Python-only testing, HumanEval-derived problems, or the generalizability of the findings." 184 } 185 }, 186 "data_integrity": { 187 "raw_data_available": { 188 "applies": true, 189 "answer": true, 190 "justification": "Complete LLM generations, benchmarks, and tools are released at the GitHub repository, enabling independent verification of all reported results." 
191 }, 192 "data_collection_described": { 193 "applies": true, 194 "answer": true, 195 "justification": "Section 2 describes in detail how benchmark problems were generated from HumanEval seeds using GPT-4 with targeted transformation prompts, including the refinement and self-consistency pipeline." 196 }, 197 "recruitment_methods_described": { 198 "applies": false, 199 "answer": false, 200 "justification": "No human participants. Data sources are the standard HumanEval benchmark and GPT-4-generated transformations." 201 }, 202 "data_pipeline_documented": { 203 "applies": true, 204 "answer": true, 205 "justification": "The full pipeline is documented in Section 2 and Figure 2: original problem → targeted transformation → refinement with self-consistency → manual examination → test augmentation. Table 1 shows problem counts and statistics at each stage." 206 } 207 }, 208 "conflicts_of_interest": { 209 "funding_disclosed": { 210 "applies": true, 211 "answer": false, 212 "justification": "No funding sources are disclosed. The acknowledgment section (Section 8) thanks individuals for help and feedback but mentions no grants or funding agencies." 213 }, 214 "affiliations_disclosed": { 215 "applies": true, 216 "answer": true, 217 "justification": "Author affiliations are clearly stated: all three authors are from University of Illinois Urbana-Champaign with email addresses provided." 218 }, 219 "funder_independent_of_outcome": { 220 "applies": true, 221 "answer": false, 222 "justification": "No funding is disclosed, so independence of funding cannot be assessed. The paper does not state whether the work was unfunded." 223 }, 224 "financial_interests_declared": { 225 "applies": true, 226 "answer": false, 227 "justification": "There is no competing interests or financial disclosure statement in the paper." 
228 } 229 }, 230 "contamination": { 231 "training_cutoff_stated": { 232 "applies": true, 233 "answer": false, 234 "justification": "Training data cutoff dates are not stated for any of the 51 evaluated LLMs. While EvoEval is designed to mitigate contamination, the training cutoffs are not explicitly reported." 235 }, 236 "train_test_overlap_discussed": { 237 "applies": true, 238 "answer": true, 239 "justification": "The entire paper is motivated by train/test overlap concerns. Section 1 discusses how 'example solutions can be readily found on the web and thus potentially in training data' and cites work showing 'substantial overlap between benchmark solutions and open-source training corpuses' [39]." 240 }, 241 "benchmark_contamination_addressed": { 242 "applies": true, 243 "answer": true, 244 "justification": "EvoEval is specifically designed to address benchmark contamination by creating new problems that did not exist before. The paper argues these evolved problems cannot have been seen during training (Section 1). Results on SUBTLE further demonstrate contamination effects." 245 } 246 }, 247 "human_studies": { 248 "pre_registered": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants in this study." 252 }, 253 "irb_or_ethics_approval": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in this study." 257 }, 258 "demographics_reported": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in this study." 262 }, 263 "inclusion_exclusion_criteria": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in this study." 267 }, 268 "randomization_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in this study." 272 }, 273 "blinding_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in this study." 
277 }, 278 "attrition_reported": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants in this study." 282 } 283 }, 284 "cost_and_practicality": { 285 "inference_cost_reported": { 286 "applies": true, 287 "answer": false, 288 "justification": "No inference costs, API costs, or per-example costs are reported despite evaluating 51 LLMs across 828+ problems, including commercial API calls to GPT-4, Claude-3, and Gemini." 289 }, 290 "compute_budget_stated": { 291 "applies": true, 292 "answer": false, 293 "justification": "No total computational budget is stated — no GPU hours, total API spend, or hardware specifications for running inference on the open-source models." 294 } 295 }, 296 "experimental_rigor": { 297 "seed_sensitivity_reported": { 298 "applies": true, 299 "answer": false, 300 "justification": "Only greedy decoding (temperature=0) is used, producing single deterministic results. No exploration of sensitivity to temperature, sampling strategy, or random seeds." 301 }, 302 "number_of_runs_stated": { 303 "applies": true, 304 "answer": true, 305 "justification": "Section 4 explicitly states 'greedy decoding (i.e., producing a deterministic sample per each problem with temperature = 0)', making clear each result comes from a single run." 306 }, 307 "hyperparameter_search_budget": { 308 "applies": true, 309 "answer": false, 310 "justification": "No hyperparameter search is discussed. Temperature=0 is used without justification for why this setting was chosen or whether other settings were explored." 311 }, 312 "best_config_selection_justified": { 313 "applies": true, 314 "answer": false, 315 "justification": "Greedy decoding is used as the sole configuration without justification for why it was selected over other temperature or sampling settings." 
316 }, 317 "multiple_comparison_correction": { 318 "applies": false, 319 "answer": false, 320 "justification": "No statistical tests are performed in the paper, so multiple comparison correction is not applicable." 321 }, 322 "self_comparison_bias_addressed": { 323 "applies": true, 324 "answer": false, 325 "justification": "GPT-4 is used to generate the benchmark problems and is also evaluated on them, creating a potential bias. The paper does not acknowledge or discuss this conflict — GPT-4-generated problems might systematically favor or disfavor GPT-4 models." 326 }, 327 "compute_budget_vs_performance": { 328 "applies": true, 329 "answer": false, 330 "justification": "The paper compares models ranging from 2b to 70b+ parameters and proprietary models of unknown size without discussing compute differences. Performance is not analyzed relative to compute budget." 331 }, 332 "benchmark_construct_validity": { 333 "applies": true, 334 "answer": true, 335 "justification": "The paper's core thesis addresses construct validity: it argues existing benchmarks don't validly measure coding proficiency due to contamination and limited scope. The t-SNE visualization (Figure 3) demonstrates distributional differences, and multiple transformation types test different capability dimensions." 336 }, 337 "scaffold_confound_addressed": { 338 "applies": false, 339 "answer": false, 340 "justification": "No scaffolding is involved. Models are evaluated via direct prompt-to-completion without any agentic framework." 341 } 342 }, 343 "data_leakage": { 344 "temporal_leakage_addressed": { 345 "applies": true, 346 "answer": true, 347 "justification": "The paper's core motivation is temporal leakage: 'prior benchmarks contain only a very limited set of problems... many benchmarks are prone to data leakage where example solutions can be readily found on the web' (Section 1). EvoEval is designed to create post-training problems to mitigate this." 
348 }, 349 "feature_leakage_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "Feature leakage is not explicitly discussed. The paper doesn't address whether information in problem descriptions (derived from HumanEval) could provide signal through textual similarity to training data, beyond the direct contamination concern." 353 }, 354 "non_independence_addressed": { 355 "applies": true, 356 "answer": false, 357 "justification": "The paper does not discuss whether evolved problems maintain structural similarity to HumanEval originals that could enable transfer from training data. All EvoEval problems derive from HumanEval seeds, creating potential non-independence between training examples and test problems." 358 }, 359 "leakage_detection_method": { 360 "applies": true, 361 "answer": false, 362 "justification": "No concrete leakage detection method is applied. While the paper cites [39] on quantifying contamination and designs EvoEval to prevent leakage, it does not use canary strings, membership inference, n-gram overlap analysis, or other detection methods to verify that EvoEval is actually contamination-free." 363 } 364 } 365 }, 366 "claims": [ 367 { 368 "claim": "Compared to high performance on standard benchmarks like HumanEval, there is a significant drop in performance (on average 39.4%) when using EvoEval across 51 LLMs.", 369 "evidence": "Table 2 shows per-benchmark average drops: DIFFICULT 58.7%, CREATIVE 50.2%, SUBTLE 5.0%, COMBINE 78.1%, TOOL USE 4.9%. Per-model drops range from 19.6% to 47.7% (Section 5.1).", 370 "supported": "strong" 371 }, 372 { 373 "claim": "EvoEval causes drastic ranking changes among top-performing LLMs compared to HumanEval leaderboards.", 374 "evidence": "Figure 5 shows ranking changes across benchmarks. While top models differ by <10% on HumanEval, the difference on EvoEval averages over 20%. 
GPT-4 beats GPT-4-Turbo on DIFFICULT, CREATIVE, and COMBINE despite lower HumanEval scores (Table 2, Section 5.1).", 375 "supported": "strong" 376 }, 377 { 378 "claim": "Instruction-following LLMs are significantly more sensitive to subtle changes and rephrasing of problem descriptions than base models.", 379 "evidence": "Figure 7 shows instruction-following LLMs drop on average 3.4% (VERBOSE) and 4.0% (CONCISE) while base models improve by 0.5% and 2.1%. On SUBTLE, instruction-following LLMs drop 7.6% vs <1% for base models (Section 5.1).", 380 "supported": "strong" 381 }, 382 { 383 "claim": "Current state-of-the-art LLMs fail to effectively compose known programming concepts to solve combined problems.", 384 "evidence": "Table 3 shows composition percentages: GPT-4 achieves only 53.8%, GPT-4-Turbo 48.1%, Claude-3 43.2%. On COMBINE-NAIVE (simpler sequential composition), best is GPT-4 at 75.2% (Section 5.2).", 385 "supported": "strong" 386 }, 387 { 388 "claim": "Base (non-instruction-following) models benefit more from auxiliary helper functions than instruction-following models.", 389 "evidence": "Section 5.4 reports 122.0% average improvement for base models vs 60.4% for instruction-following models when helper functions are provided (TOOL USE vs TOOL USE-MAIN ONLY). Figure 9 shows cases where base models outperform instruction-following counterparts with helpers.", 390 "supported": "moderate" 391 }, 392 { 393 "claim": "High scores on existing benchmarks are potentially due to overfitting or data leakage rather than genuine coding capability.", 394 "evidence": "Supported by the SUBTLE experiment showing instruction-following models drop significantly on minimally-changed problems, and by citing [39] showing overlap between benchmark solutions and training corpora. 
However, alternative explanations are not considered (Section 1, Section 5.1).", 395 "supported": "moderate" 396 } 397 ], 398 "red_flags": [ 399 { 400 "flag": "GPT-4 used to generate benchmark AND evaluated on it", 401 "detail": "GPT-4 is used to generate all EvoEval problems, groundtruth solutions, and test cases. GPT-4 and GPT-4-Turbo are then evaluated on these problems. This creates a potential systematic bias — problems generated by GPT-4 may favor or disfavor GPT-4's reasoning patterns in ways not applicable to other model families. The paper does not acknowledge this conflict." 402 }, 403 { 404 "flag": "No error bars or statistical tests", 405 "detail": "All comparisons across 51 LLMs and 7 benchmarks rely on raw pass@1 point estimates. Claims of 'significant drop' and 'drastic ranking changes' are stated without any statistical test. With only 100 problems per semantic-altering benchmark and 164 per semantic-preserving benchmark, small ranking differences may not be statistically meaningful." 406 }, 407 { 408 "flag": "No limitations section", 409 "detail": "The paper lacks any dedicated discussion of limitations, threats to validity, or scope boundaries. Key unaddressed limitations include Python-only evaluation, all problems derived from a single seed benchmark (HumanEval), and potential GPT-4 generation bias." 410 }, 411 { 412 "flag": "Broad claims from narrow evaluation scope", 413 "detail": "The paper draws broad conclusions about 'coding proficiency' and 'program synthesis ability' from Python-only, function-level problems all derived from HumanEval. These are framed as general findings about LLM capabilities without bounding to the tested domain." 414 } 415 ], 416 "cited_papers": [ 417 { 418 "title": "Evaluating large language models trained on code", 419 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 420 "year": 2021, 421 "arxiv_id": "2107.03374", 422 "relevance": "Introduces HumanEval, the foundational code generation benchmark that EvoEval evolves from and that dominates LLM code evaluation." 
423 }, 424 { 425 "title": "Program synthesis with large language models", 426 "authors": ["Jacob Austin", "Augustus Odena", "Maxwell Nye"], 427 "year": 2021, 428 "relevance": "Introduces MBPP, another widely-used code generation benchmark discussed as suffering from the same contamination issues as HumanEval." 429 }, 430 { 431 "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation", 432 "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"], 433 "year": 2023, 434 "relevance": "Introduces EvalPlus/HumanEval+ with augmented test suites for more rigorous code generation evaluation, directly built upon by EvoEval." 435 }, 436 { 437 "title": "SWE-bench: Can language models resolve real-world GitHub issues?", 438 "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig"], 439 "year": 2024, 440 "relevance": "Real-world software engineering benchmark that addresses some limitations of HumanEval by using real GitHub issues." 441 }, 442 { 443 "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code", 444 "authors": ["Naman Jain", "King Han", "Alex Gu"], 445 "year": 2024, 446 "relevance": "Another contamination-free code benchmark using live programming contest problems, addressing similar concerns as EvoEval." 447 }, 448 { 449 "title": "Quantifying contamination in evaluating code generation capabilities of language models", 450 "authors": ["Martin Riddell", "Ansong Ni", "Arman Cohan"], 451 "year": 2024, 452 "arxiv_id": "2403.04811", 453 "relevance": "Directly quantifies the overlap between benchmark solutions and training corpora, providing evidence for the contamination problem EvoEval addresses." 
454 }, 455 { 456 "title": "Leak, cheat, repeat: Data contamination and evaluation malpractices in closed-source LLMs", 457 "authors": ["Simone Balloccu", "Patrícia Schmidtová", "Mateusz Lango", "Ondřej Dušek"], 458 "year": 2024, 459 "arxiv_id": "2402.03927", 460 "relevance": "Documents data contamination and evaluation malpractice in closed-source LLMs, including deliberate benchmark gaming." 461 }, 462 { 463 "title": "Magicoder: Source code is all you need", 464 "authors": ["Yuxiang Wei", "Zhe Wang", "Jiawei Liu"], 465 "year": 2023, 466 "arxiv_id": "2312.02120", 467 "relevance": "Proposes OSS-Instruct for synthesizing instruction data from code, relevant to understanding how training data composition affects benchmark performance." 468 }, 469 { 470 "title": "WizardCoder: Empowering code large language models with Evol-Instruct", 471 "authors": ["Ziyang Luo", "Can Xu", "Pu Zhao"], 472 "year": 2023, 473 "arxiv_id": "2306.08568", 474 "relevance": "Uses Evol-Instruct to create complex training data for code LLMs — a training-time approach paralleling EvoEval's test-time evolution approach." 475 }, 476 { 477 "title": "DeepSeek-Coder: When the large language model meets programming", 478 "authors": ["Daya Guo", "Qihao Zhu", "Dejian Yang"], 479 "year": 2024, 480 "arxiv_id": "2401.14196", 481 "relevance": "Major open-source code LLM family evaluated extensively in EvoEval across multiple model sizes and instruction variants." 482 }, 483 { 484 "title": "Code Llama: Open foundation models for code", 485 "authors": ["Baptiste Rozière", "Jonas Gehring", "Fabian Gloeckle"], 486 "year": 2023, 487 "arxiv_id": "2308.12950", 488 "relevance": "Open-source code LLM family evaluated in EvoEval, showing significant instruction-following sensitivity." 
489 }, 490 { 491 "title": "MultiPL-E: A scalable and polyglot approach to benchmarking neural code generation", 492 "authors": ["Federico Cassano", "John Gouwar", "Daniel Nguyen"], 493 "year": 2023, 494 "relevance": "Extends HumanEval and MBPP to 18 languages, addressing the language diversity limitation that EvoEval also indirectly highlights." 495 }, 496 { 497 "title": "ClassEval: A manually-crafted benchmark for evaluating LLMs on class-level code generation", 498 "authors": ["Xueying Du", "Mingwei Liu", "Kaixin Wang"], 499 "year": 2023, 500 "arxiv_id": "2308.01861", 501 "relevance": "Benchmark for class-level code generation, addressing the function-level limitation of HumanEval that EvoEval also builds upon." 502 } 503 ] 504 }