scan.json (33543B)
1 { 2 "paper": { 3 "title": "EvoCodeBench: An Evolving Code Generation Benchmark Aligned with Real-World Code Repositories", 4 "authors": [ 5 "Jia Li", 6 "Ge Li", 7 "Xuanming Zhang", 8 "Yihong Dong", 9 "Zhi Jin" 10 ], 11 "year": 2024, 12 "venue": "arXiv.org", 13 "arxiv_id": "2404.00599", 14 "doi": "10.48550/arXiv.2404.00599" 15 }, 16 "scan_version": 3, 17 "active_modules": ["experimental_rigor", "data_leakage"], 18 "methodology_tags": ["benchmark-eval"], 19 "key_findings": "EvoCodeBench-2403 reveals a dramatic performance drop for LLMs compared to existing benchmarks: gpt-4's highest Pass@1 is only 20.73% (vs 80% on HumanEval), demonstrating that standalone function benchmarks overestimate real-world coding ability. Code context from local files improves gpt-4's Pass@1 by up to 152%, confirming the importance of repository context. Manual error analysis of 50 failed gpt-4 cases finds 58% are logic errors and 40% are due to missing cross-file context. The benchmark's code and dependency distributions match those observed across 500 real-world repositories.", 20 "checklist": { 21 "artifacts": { 22 "code_released": { 23 "applies": true, 24 "answer": true, 25 "justification": "GitHub repository provided in footnote 1: https://github.com/seketeam/EvoCodeBench. The abstract states 'We release EvoCodeBench, all prompts, and LLMs' completions for further community analysis.'" 26 }, 27 "data_released": { 28 "applies": true, 29 "answer": true, 30 "justification": "EvoCodeBench-2403 (275 samples from 25 repositories) is released publicly along with all prompts and model completions, as stated in the abstract and Section 1." 31 }, 32 "environment_specified": { 33 "applies": true, 34 "answer": false, 35 "justification": "The paper mentions using pip3 and Pytest for test execution but provides no requirements.txt, Dockerfile, or detailed environment specification with library versions for reproducing the experiments." 
36 }, 37 "reproduction_instructions": { 38 "applies": true, 39 "answer": false, 40 "justification": "While code and data are released, the paper itself does not contain step-by-step reproduction instructions. No README with commands, scripts, or 'Reproducing Results' section is described." 41 } 42 }, 43 "statistical_methodology": { 44 "confidence_intervals_or_error_bars": { 45 "applies": true, 46 "answer": false, 47 "justification": "All results in Table 4 are point estimates (Pass@k and Recall@k values). No confidence intervals, error bars, or uncertainty measures are reported for any result." 48 }, 49 "significance_tests": { 50 "applies": true, 51 "answer": false, 52 "justification": "The paper claims multiple performance differences between LLMs (e.g., 'gpt-4 achieves the highest Pass@k') and between settings (e.g., '104% and 152% improvement') based solely on comparing numbers, with no statistical significance tests." 53 }, 54 "effect_sizes_reported": { 55 "applies": true, 56 "answer": true, 57 "justification": "The paper reports percentage improvements with baseline context: 'Pass@1 of gpt-4 is improved by 104% and 152%' from specific baseline values. Table 4 provides absolute scores allowing readers to compute effect magnitudes across all models and settings." 58 }, 59 "sample_size_justified": { 60 "applies": true, 61 "answer": false, 62 "justification": "The benchmark contains 275 samples from 25 repositories. No justification is given for why this number is sufficient, no power analysis is provided, and no discussion of whether 275 samples provides adequate statistical power for the comparisons made." 63 }, 64 "variance_reported": { 65 "applies": true, 66 "answer": false, 67 "justification": "No standard deviation, variance, or spread measures are reported. For k=1, greedy decoding produces a single deterministic output. 
For k>1, 20 samples are generated per requirement but the unbiased Pass@k estimator yields only a point estimate with no reported spread across runs." 68 } 69 }, 70 "evaluation_design": { 71 "baselines_included": { 72 "applies": true, 73 "answer": true, 74 "justification": "Ten LLMs are compared against each other, and results are contrasted with performance on prior benchmarks (e.g., gpt-4's 88.4% Pass@1 on HumanEval vs 7.27% on EvoCodeBench without context)." 75 }, 76 "baselines_contemporary": { 77 "applies": true, 78 "answer": true, 79 "justification": "The evaluated models include gpt-4-turbo-1106, DeepSeek Coder (Nov 2023), StarCoder 2 (Feb 2024), and Gemma (Feb 2024), all released within months of the paper. These were state-of-the-art at the time." 80 }, 81 "ablation_study": { 82 "applies": true, 83 "answer": true, 84 "justification": "Three experimental settings (without context, local file completion, local file infilling) serve as controlled ablations of context availability. Additionally, the RAG experiment (Section 5, Table 6) ablates the effect of retrieved similar functions." 85 }, 86 "multiple_metrics": { 87 "applies": true, 88 "answer": true, 89 "justification": "Two metrics are used: Pass@k for functional correctness (test-case-based) and Recall@k for reference dependency recall. Both are reported at k=1,3,5,10." 90 }, 91 "human_evaluation": { 92 "applies": true, 93 "answer": true, 94 "justification": "Section 5 includes human evaluation of auto-generated requirements: two developers write requirements and two others evaluate them vs. gpt-4's output on 50 functions (Cohen's Kappa = 0.92). Section 4.4 includes manual analysis of 50 error cases. Section 5 also includes manual annotation of dependencies in 50 programs to gauge Recall@k bias." 95 }, 96 "held_out_test_set": { 97 "applies": true, 98 "answer": true, 99 "justification": "EvoCodeBench-2403 is used purely for evaluation. No model tuning or development decisions are made on this data.
The benchmark is collected from repositories created after the training cutoffs of evaluated models." 100 }, 101 "per_category_breakdown": { 102 "applies": true, 103 "answer": true, 104 "justification": "Results are broken down by program type (standalone vs non-standalone, Figure 4), dependency type (intra-class, intra-file, cross-file, Figure 5), and context setting (three settings in Table 4)." 105 }, 106 "failure_cases_discussed": { 107 "applies": true, 108 "answer": true, 109 "justification": "Section 4.4 'Error Analyses': 'we manually analyze 50 error cases of gpt-4 in the Local File (Infilling) setting. We found that most of the cases (29 cases) failed due to implementation logic errors. 20 cases failed since the necessary contexts were missing... one case failed because of the vague requirement.'" 110 }, 111 "negative_results_reported": { 112 "applies": true, 113 "answer": true, 114 "justification": "The paper's central finding is that LLMs perform poorly: gpt-4's highest Pass@1 is only 20.73%. The paper explicitly frames low performance as a key result and discusses limitations of current LLMs. Section 4.5 summarizes the challenges that remain unsolved." 115 } 116 }, 117 "claims_and_evidence": { 118 "abstract_claims_supported": { 119 "applies": true, 120 "answer": true, 121 "justification": "The abstract claims gpt-4's highest Pass@1 is 20.73%, supported by Table 4 (Local File Infilling). Claims about alignment with real-world distributions are supported by Table 2 comparing EvoCodeBench-2403 statistics with 500 real repositories (27% standalone, 73% non-standalone, avg 3.46 vs 3.22 dependencies)." 122 }, 123 "causal_claims_justified": { 124 "applies": true, 125 "answer": true, 126 "justification": "The paper claims code contexts improve performance ('Pass@1 of gpt-4 is improved by 104% and 152%'). 
This causal claim is supported by controlled comparison: the same models are evaluated under three conditions (no context, completion, infilling) with only the context variable changing, which constitutes adequate single-variable manipulation." 127 }, 128 "generalization_bounded": { 129 "applies": true, 130 "answer": false, 131 "justification": "The title claims alignment with 'Real-World Code Repositories' and the abstract says results 'reveal the coding abilities of these LLMs in real-world repositories,' but the benchmark is Python-only with English requirements from 25 repositories. The paper acknowledges this in Section 8 but the abstract and title do not qualify the scope." 132 }, 133 "alternative_explanations_discussed": { 134 "applies": true, 135 "answer": false, 136 "justification": "The paper attributes performance differences primarily to context availability and model design (instruction tuning vs. standard LM training) but does not consider other potential explanations such as prompt formatting effects, repository difficulty variance, or whether the specific 25 repositories are representative. The observation about GPT family's higher Pass@k but lower Recall@k receives only speculation, not substantive analysis." 137 }, 138 "proxy_outcome_distinction": { 139 "applies": true, 140 "answer": true, 141 "justification": "The paper measures Pass@k (functional correctness via test cases) and Recall@k (dependency recall via static analysis). These are presented as specific metrics, not over-framed as broader constructs. The paper discusses the bias in Recall@k explicitly (Section 5), noting the parser may miss runtime-determined dependencies and quantifying the bias at 0.16." 142 } 143 }, 144 "setup_transparency": { 145 "model_versions_specified": { 146 "applies": true, 147 "answer": true, 148 "justification": "Table 3 specifies exact versions: gpt-4-turbo-1106, gpt-3.5-turbo-1106. 
Open-source models are identified by name and parameter count (e.g., DeepSeek Coder 33B/6.7B, StarCoder 2 15B/7B, CodeLLaMa 13B/7B). Appendix B.1 provides further details including release dates and training data scope." 149 }, 150 "prompts_provided": { 151 "applies": true, 152 "answer": true, 153 "justification": "Full prompt templates are provided in Figures 6-10 and Appendix B.2, covering all four settings (without context, local file completion, local file infilling, similar functions). The paper also states 'We release... all prompts' via the GitHub repository." 154 }, 155 "hyperparameters_reported": { 156 "applies": true, 157 "answer": true, 158 "justification": "Section 4.3: 'k=1, we use the greedy search... k>1, we use the nucleus sampling with a temperature 0.4 and sample 20 programs per requirement. We set the top-p to 0.95 and the max generation length to 500.'" 159 }, 160 "scaffolding_described": { 161 "applies": false, 162 "answer": false, 163 "justification": "No agentic scaffolding is used. LLMs are directly prompted with requirements, signatures, and optional code contexts. No tools, retry logic, or multi-step workflows are involved." 164 }, 165 "data_preprocessing_documented": { 166 "applies": true, 167 "answer": true, 168 "justification": "Section 3 describes the five-stage pipeline: repository selection (criteria: open-source Python, recent, non-fork, >50 stars, unit tests), function parsing (excluding trivial functions), test construction (pip3 + Pytest), deduplication (Jaccard similarity on code files and imports), and requirement annotation (gpt-4 with few-shot prompts)." 
169 } 170 }, 171 "limitations_and_scope": { 172 "limitations_section_present": { 173 "applies": true, 174 "answer": true, 175 "justification": "Section 8 'Limitations' is a dedicated section with six specific limitation points covering monolingual scope, auto-generated requirement quality, Recall@k bias, limited LLMs evaluated, limited context exploration, and hyperparameter sensitivity." 176 }, 177 "threats_to_validity_specific": { 178 "applies": true, 179 "answer": true, 180 "justification": "Section 8 discusses specific threats: 'EvoCodeBench is a monolingual benchmark (requirements in English and code in Python),' 'auto-generated requirements... may lack necessary details (e.g., hyper-parameters),' Recall@k bias quantified at 0.16, 'limited computing budgets' constrained the number of LLMs evaluated, and 'we do not carefully tune hyper-parameters and prompts.'" 181 }, 182 "scope_boundaries_stated": { 183 "applies": true, 184 "answer": true, 185 "justification": "Section 8 explicitly states what was not tested: other programming languages, other natural languages, more LLMs, cross-file context extraction methods, and hyperparameter tuning. Each is identified as a specific gap with future work plans." 186 } 187 }, 188 "data_integrity": { 189 "raw_data_available": { 190 "applies": true, 191 "answer": true, 192 "justification": "The benchmark data (275 samples with requirements, reference code, reference dependencies, repositories, and test cases) and all LLM completions are released via the GitHub repository." 193 }, 194 "data_collection_described": { 195 "applies": true, 196 "answer": true, 197 "justification": "Section 3 describes the full collection procedure: GitHub crawling with criteria (open-source Python, created 2023-10 to 2024-2, non-fork, >50 stars, explicit unit tests), function extraction with static analysis, test case extraction using Pytest, deduplication, and LLM-based requirement annotation." 
198 }, 199 "recruitment_methods_described": { 200 "applies": false, 201 "answer": false, 202 "justification": "The main data source is public GitHub repositories selected by automated criteria. No human participants are recruited for the main study. The small supplementary requirement evaluation (Section 5) involves hired developers but is not a human subjects study." 203 }, 204 "data_pipeline_documented": { 205 "applies": true, 206 "answer": true, 207 "justification": "Section 3 documents the six-stage pipeline: repository selection → function parsing → test construction → deduplication → requirement annotation → benchmark construction. Table 7 provides details on the 25 selected repositories including creation date, stars, file counts, and sample counts." 208 } 209 }, 210 "conflicts_of_interest": { 211 "funding_disclosed": { 212 "applies": true, 213 "answer": false, 214 "justification": "No funding sources, grants, or acknowledgments section is present in the paper." 215 }, 216 "affiliations_disclosed": { 217 "applies": true, 218 "answer": true, 219 "justification": "All five authors are clearly identified as affiliated with 'School of Computer Science, Peking University.' No product being evaluated is affiliated with the authors." 220 }, 221 "funder_independent_of_outcome": { 222 "applies": true, 223 "answer": false, 224 "justification": "Cannot assess funder independence because no funding source is disclosed. The authors are academic researchers at Peking University with no apparent commercial stake, but the lack of disclosure prevents verification." 225 }, 226 "financial_interests_declared": { 227 "applies": true, 228 "answer": false, 229 "justification": "No competing interests or financial interests statement is present in the paper." 230 } 231 }, 232 "contamination": { 233 "training_cutoff_stated": { 234 "applies": true, 235 "answer": true, 236 "justification": "The paper states: 'the latest LLM's training data is up to 2023-9.' 
Appendix B.1 provides specific cutoffs: gpt-4-1106 training data up to April 2023, gpt-3.5-turbo-1106 training data up to September 2021." 237 }, 238 "train_test_overlap_discussed": { 239 "applies": true, 240 "answer": true, 241 "justification": "This is a core design feature. Section 2.4: 'EvoCodeBench-2403 is collected from real-world repositories that were created from 2023-10 to 2024-2' — after all evaluated models' training cutoffs. The evolving benchmark design is explicitly motivated by avoiding data leakage." 242 }, 243 "benchmark_contamination_addressed": { 244 "applies": true, 245 "answer": true, 246 "justification": "The paper addresses contamination through temporal separation: repositories were created after training cutoffs. Section 1 lists 'Avoiding Data Leaking' as a key benchmark feature. The evolving update schedule (every ~6 months) is designed to maintain this property as new models emerge." 247 } 248 }, 249 "human_studies": { 250 "pre_registered": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human subjects study. The paper evaluates LLMs on a code generation benchmark. The small requirement quality evaluation (Section 5) with hired developers is a supplementary validation, not a human subjects study." 254 }, 255 "irb_or_ethics_approval": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human subjects study requiring IRB approval. The paper is a benchmark evaluation of LLMs." 259 }, 260 "demographics_reported": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants in the main study. The supplementary requirement evaluation (Section 5) hires developers but does not characterize them beyond their role." 264 }, 265 "inclusion_exclusion_criteria": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human subjects study. Data is collected from public GitHub repositories with automated selection criteria." 
269 }, 270 "randomization_described": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human subjects experimental study. LLMs are evaluated on a fixed benchmark." 274 }, 275 "blinding_described": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human subjects experimental study requiring blinding." 279 }, 280 "attrition_reported": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human subjects study with participants who could attrit." 284 } 285 }, 286 "cost_and_practicality": { 287 "inference_cost_reported": { 288 "applies": true, 289 "answer": false, 290 "justification": "The requirement annotation cost is reported ($0.54 for gpt-4, Table 5), but no inference costs are reported for the main LLM evaluation experiments (20 samples per requirement × 275 samples × 10 models × 3 settings)." 291 }, 292 "compute_budget_stated": { 293 "applies": true, 294 "answer": false, 295 "justification": "No GPU hours, total API spend, or hardware specifications are reported for the main experiments. Section 8 acknowledges 'limited computing budgets' but does not quantify them." 296 } 297 }, 298 "experimental_rigor": { 299 "seed_sensitivity_reported": { 300 "applies": true, 301 "answer": false, 302 "justification": "For k=1, greedy decoding is deterministic (no seed variance). For k>1, nucleus sampling with temperature 0.4 is used to generate 20 samples, but no analysis of sensitivity to random seeds or across independent runs is reported." 303 }, 304 "number_of_runs_stated": { 305 "applies": true, 306 "answer": true, 307 "justification": "Section 4.3: 'When k=1, we use the greedy search and generate a single program per requirement. When k>1, we use the nucleus sampling... and sample 20 programs per requirement.' The number of generated programs is explicit." 
308 }, 309 "hyperparameter_search_budget": { 310 "applies": true, 311 "answer": false, 312 "justification": "Section 8 explicitly states: 'Due to the limited computing budgets, we do not carefully tune hyper-parameters and prompts.' No search budget is reported because no search was conducted, but the lack of tuning is transparently acknowledged." 313 }, 314 "best_config_selection_justified": { 315 "applies": true, 316 "answer": true, 317 "justification": "The paper uses a single fixed configuration for all models (temperature=0.4, top-p=0.95, max_length=500) without selecting a 'best' configuration. This uniform approach avoids cherry-picking, and the paper states: 'We ensure all LLMs are evaluated under the same experimental settings.'" 318 }, 319 "multiple_comparison_correction": { 320 "applies": true, 321 "answer": false, 322 "justification": "Ten LLMs are compared across three settings and four k values with no statistical tests performed at all, let alone correction for multiple comparisons." 323 }, 324 "self_comparison_bias_addressed": { 325 "applies": true, 326 "answer": false, 327 "justification": "The authors created the benchmark and evaluate LLMs on it. They do not discuss potential biases in benchmark construction (e.g., whether the 25 selected repositories or 275 samples favor certain model architectures or coding patterns)." 328 }, 329 "compute_budget_vs_performance": { 330 "applies": true, 331 "answer": false, 332 "justification": "Models ranging from 7B to 33B parameters and closed-source models of unknown size are compared without discussion of compute costs. No performance-vs-compute analysis is provided." 
333 }, 334 "benchmark_construct_validity": { 335 "applies": true, 336 "answer": true, 337 "justification": "The paper extensively discusses what makes a good benchmark (Section 1, Table 1) and validates EvoCodeBench's construct validity by comparing code/dependency distributions against 500 real-world repositories (Table 2: 27% standalone matches real-world 27%, avg 3.46 dependencies vs 3.22). The contrast with HumanEval (100% standalone) directly addresses validity gaps." 338 }, 339 "scaffold_confound_addressed": { 340 "applies": false, 341 "answer": false, 342 "justification": "No scaffolding is used. All models are prompted directly with the same prompt templates. The evaluation is at the model level, not at a tool/scaffold level." 343 } 344 }, 345 "data_leakage": { 346 "temporal_leakage_addressed": { 347 "applies": true, 348 "answer": true, 349 "justification": "Core design feature: 'EvoCodeBench-2403 is collected from real-world repositories that were created from 2023-10 to 2024-2,' explicitly after the latest model's training cutoff (2023-9). The evolving benchmark design ensures temporal separation." 350 }, 351 "feature_leakage_addressed": { 352 "applies": true, 353 "answer": false, 354 "justification": "The local file settings provide code context from the same file as the target function. The paper does not discuss whether this context might inadvertently contain patterns that leak information about the expected solution (e.g., similar functions in the same file that closely match the reference code)." 355 }, 356 "non_independence_addressed": { 357 "applies": true, 358 "answer": true, 359 "justification": "Section 3 step 4: 'we perform repository-level deduplication based on the Jaccard similarities in code files and imports. 
The former removes duplicate repositories in text surfaces, and the latter removes repositories in domains that are too similar.'" 360 }, 361 "leakage_detection_method": { 362 "applies": true, 363 "answer": true, 364 "justification": "The paper uses temporal splits as a concrete leakage prevention method: repositories are selected based on creation dates after model training cutoffs. The automatic pipeline for updating EvoCodeBench from latest repositories is designed to maintain this temporal barrier for future model evaluations." 365 } 366 } 367 }, 368 "claims": [ 369 { 370 "claim": "GPT-4's highest Pass@1 on EvoCodeBench-2403 is only 20.73%, compared to 80% on HumanEval.", 371 "evidence": "Table 4 shows gpt-4 achieves Pass@1 of 20.73% in Local File (Infilling), 17.45% in Local File (Completion), and 7.27% without context. HumanEval comparison cited in Section 4.4.", 372 "supported": "strong" 373 }, 374 { 375 "claim": "Code contexts from local files improve gpt-4's Pass@1 by 104% (completion) and 152% (infilling) over the no-context setting.", 376 "evidence": "Table 4: gpt-4 Pass@1 goes from 7.27% (no context) to 17.45% (completion) and 20.73% (infilling), which by direct calculation are relative gains of about 140% and 185% respectively. The paper states 104% and 152% in Section 4.4, though the math from Table 4 yields different percentages — the paper may be using different base numbers or rounding.", 377 "supported": "moderate" 378 }, 379 { 380 "claim": "EvoCodeBench-2403's code distribution (27% standalone, 73% non-standalone) is consistent with that of 500 real-world repositories.", 381 "evidence": "Table 2 shows EvoCodeBench-2403 has 27% standalone / 73% non-standalone, matching exactly the distribution of 500 real-world repositories (also 27% / 73%).
Average dependencies: 3.46 vs 3.22.", 382 "supported": "strong" 383 }, 384 { 385 "claim": "Most LLM failures on EvoCodeBench are due to logic errors (58%) and missing contexts (40%).", 386 "evidence": "Section 4.4: manual analysis of 50 gpt-4 error cases in Local File (Infilling) found 29 logic errors, 20 missing context cases, and 1 vague requirement case.", 387 "supported": "moderate" 388 }, 389 { 390 "claim": "Auto-generated requirements (by gpt-4) are comparable to human-written requirements in 92% of cases.", 391 "evidence": "Table 5 and Section 5: On 50 randomly selected functions, gpt-4 and human developers tied on 41 functions, gpt-4 won 5, humans won 4, so gpt-4's requirement was judged as good as or better than the human one on 46 of 50 functions (92%). Cohen's Kappa between two evaluators is 0.92.", 392 "supported": "moderate" 393 }, 394 { 395 "claim": "Retrieval-augmented generation with similar function names improves Pass@1 for gpt-4 from 8.31% to 12.29%.", 396 "evidence": "Table 6 shows gpt-4 Pass@1 improves from 8.31% (without context) to 12.29% (similar functions), and gpt-3.5 from 6.64% to 11.62%. Recall@1 shows larger gains.", 397 "supported": "moderate" 398 } 399 ], 400 "red_flags": [ 401 { 402 "flag": "No statistical tests or uncertainty quantification", 403 "detail": "All comparisons across 10 LLMs and 3 settings rely on point estimates. No error bars, confidence intervals, or significance tests are reported despite making numerous comparative claims (e.g., 'gpt-4 achieves the highest Pass@k'). With 275 samples, random variation could account for some observed differences." 404 }, 405 { 406 "flag": "Small benchmark with potential selection bias", 407 "detail": "275 samples from only 25 repositories. While the code/dependency distributions match real-world statistics, 25 repositories from a 5-month window (Oct 2023 - Feb 2024) with >50 stars represent a narrow slice of Python development. The selection criteria (high stars, good tests) may bias toward well-maintained projects unrepresentative of typical repositories."
408 }, 409 { 410 "flag": "Percentage improvement calculation appears inconsistent", 411 "detail": "The paper states gpt-4's Pass@1 'is improved by 104% and 152%' from without-context to completion and infilling settings. From Table 4: 7.27% → 17.45% is actually a 140% increase, and 7.27% → 20.73% is a 185% increase. The stated figures may use different baseline numbers but the discrepancy is unexplained." 412 }, 413 { 414 "flag": "No compute costs reported for main experiments", 415 "detail": "Evaluating 10 LLMs × 275 samples × 20 generations (for k>1) × 3 settings represents substantial compute. No API costs, GPU hours, or wall-clock times are reported, making it impossible to assess the practical feasibility of the evaluation." 416 } 417 ], 418 "cited_papers": [ 419 { 420 "title": "Evaluating large language models trained on code", 421 "authors": ["Mark Chen", "Jerry Tworek"], 422 "year": 2021, 423 "relevance": "Introduced HumanEval, the most widely used code generation benchmark and primary comparison point for EvoCodeBench's claim of improved alignment with real-world coding." 424 }, 425 { 426 "title": "SWE-bench: Can language models resolve real-world GitHub issues?", 427 "authors": ["Carlos E. Jimenez", "John Yang"], 428 "year": 2023, 429 "arxiv_id": "2310.06770", 430 "relevance": "Repository-level repair benchmark that focuses on issue resolution rather than code generation, contrasted with EvoCodeBench's code generation focus." 431 }, 432 { 433 "title": "CoderEval: A benchmark of pragmatic code generation with generative pre-trained models", 434 "authors": ["Hao Yu", "Bo Shen"], 435 "year": 2023, 436 "arxiv_id": "2302.00288", 437 "relevance": "Most directly comparable prior work: a code generation benchmark from real repositories with non-standalone programs, but lacking real code distribution and robust dependency metrics." 
438 }, 439 { 440 "title": "ClassEval: A manually-crafted benchmark for evaluating LLMs on class-level code generation", 441 "authors": ["Xueying Du", "Mingwei Liu"], 442 "year": 2023, 443 "arxiv_id": "2308.01861", 444 "relevance": "Class-level code generation benchmark with 100 hand-crafted Python classes, compared as a predecessor that lacks real-world repository alignment." 445 }, 446 { 447 "title": "CrossCodeEval: A diverse and multilingual benchmark for cross-file code completion", 448 "authors": ["Yangruibo Ding", "Zijian Wang"], 449 "year": 2023, 450 "arxiv_id": "2310.11248", 451 "relevance": "Cross-file code completion benchmark addressing similar repository-level challenges but focused on completion rather than generation from requirements." 452 }, 453 { 454 "title": "RepoBench: Benchmarking repository-level code auto-completion systems", 455 "authors": ["Tianyang Liu", "Canwen Xu"], 456 "year": 2023, 457 "arxiv_id": "2306.03091", 458 "relevance": "Repository-level code completion benchmark, contrasted with EvoCodeBench's code generation focus and comprehensive annotations." 459 }, 460 { 461 "title": "DeepSeek-Coder: When the large language model meets programming", 462 "authors": ["Daya Guo", "Qihao Zhu"], 463 "year": 2024, 464 "arxiv_id": "2401.14196", 465 "relevance": "One of the evaluated code LLMs, achieving competitive results on EvoCodeBench (Pass@1 19.64% for 33B in infilling setting)." 466 }, 467 { 468 "title": "StarCoder 2 and the Stack v2: The next generation", 469 "authors": ["Anton Lozhkov", "Raymond Li"], 470 "year": 2024, 471 "arxiv_id": "2402.19173", 472 "relevance": "Evaluated code LLM trained on Stack v2; its training data cutoff is used to determine EvoCodeBench's temporal boundary for contamination avoidance." 
473 }, 474 { 475 "title": "Code Llama: Open foundation models for code", 476 "authors": ["Baptiste Rozière", "Jonas Gehring"], 477 "year": 2023, 478 "arxiv_id": "2308.12950", 479 "relevance": "Evaluated code LLM series; performance on EvoCodeBench demonstrates the gap between standalone benchmark scores and real-world repository coding." 480 }, 481 { 482 "title": "Program synthesis with large language models", 483 "authors": ["Jacob Austin", "Augustus Odena"], 484 "year": 2021, 485 "arxiv_id": "2108.07732", 486 "relevance": "Introduced the MBPP benchmark of standalone Python programming problems. The unbiased Pass@k estimator used as the primary evaluation metric in EvoCodeBench originates from Chen et al. (2021), 'Evaluating large language models trained on code', not from this paper." 487 }, 488 { 489 "title": "Repository-level prompt generation for large language models of code", 490 "authors": ["Disha Shrivastava", "Hugo Larochelle"], 491 "year": 2023, 492 "relevance": "Prior work on extracting repository contexts for code generation, inspiring EvoCodeBench's context extraction approach." 493 }, 494 { 495 "title": "RepoCoder: Repository-level code completion through iterative retrieval and generation", 496 "authors": ["Fengji Zhang", "Bei Chen"], 497 "year": 2023, 498 "relevance": "Repository-level code completion via iterative retrieval, related to EvoCodeBench's RAG experiments with similar function names." 499 } 500 ], 501 "engagement_factors": { 502 "practical_relevance": { 503 "score": 2, 504 "justification": "Released benchmark and evaluation framework can be used by researchers and practitioners to evaluate code LLMs on realistic tasks, though it requires significant setup." 505 }, 506 "surprise_contrarian": { 507 "score": 1, 508 "justification": "The finding that gpt-4 drops from 80% to 20.73% on real-world code is notable but broadly expected by practitioners who know HumanEval is easy." 509 }, 510 "fear_safety": { 511 "score": 0, 512 "justification": "No AI safety or security concerns raised; purely a benchmark evaluation paper."
513 }, 514 "drama_conflict": { 515 "score": 1, 516 "justification": "Implicitly argues that existing benchmarks like HumanEval are misleading about LLM coding abilities, but presents this as a gap to fill rather than a controversy." 517 }, 518 "demo_ability": { 519 "score": 2, 520 "justification": "Code and data released on GitHub; researchers can download and run evaluations, though it requires setting up repositories and test environments." 521 }, 522 "brand_recognition": { 523 "score": 1, 524 "justification": "From Peking University (well-known in CS); evaluates recognizable models (gpt-4, DeepSeek Coder) but not from a major AI lab." 525 } 526 } 527 }