scan.json (25106B)
1 { 2 "paper": { 3 "title": "COFFE: A Code Efficiency Benchmark for Code Generation", 4 "authors": ["Yun Peng", "Jun Wan", "Yichen Li", "Xiaoxue Ren"], 5 "year": 2025, 6 "venue": "Proc. ACM Softw. Eng. (FSE)", 7 "arxiv_id": "2502.02827", 8 "doi": "10.1145/3715727" 9 }, 10 "checklist": { 11 "artifacts": { 12 "code_released": { 13 "applies": true, 14 "answer": true, 15 "justification": "Section 10 (Data Availability) states: 'The code and data of STGen and COFFE are available at https://github.com/JohnnyPeng18/Coffe.' A specific GitHub URL is provided." 16 }, 17 "data_released": { 18 "applies": true, 19 "answer": true, 20 "justification": "The benchmark data (COFFE) is released alongside the code at https://github.com/JohnnyPeng18/Coffe, as stated in Section 10. The benchmark is built upon publicly available benchmarks (HumanEval, MBPP, Code Contests, APPS)." 21 }, 22 "environment_specified": { 23 "applies": true, 24 "answer": false, 25 "justification": "Section 4.5 mentions 'Linux machine with Ubuntu 20.04.4 LTS' and specific hardware (Intel Xeon Platinum 8358P CPU, 128 cores, 2 TB memory) along with libraries Coverage.py and Cirron, but no requirements.txt, Dockerfile, or detailed dependency versions are provided in the paper." 26 }, 27 "reproduction_instructions": { 28 "applies": true, 29 "answer": false, 30 "justification": "The paper does not include step-by-step reproduction instructions. While a GitHub repository is referenced, the paper itself does not provide commands to run or a 'Reproducing Results' section." 31 } 32 }, 33 "statistical_methodology": { 34 "confidence_intervals_or_error_bars": { 35 "applies": true, 36 "answer": false, 37 "justification": "The main results in Table 6 report only point estimates (efficient@1, pass@1, speedup) without confidence intervals or error bars. Table 4 reports RSD but not confidence intervals on the main performance metrics." 
38 }, 39 "significance_tests": { 40 "applies": true, 41 "answer": false, 42 "justification": "The paper makes comparative claims (e.g., Finding 1-4 comparing LLMs, STGen vs. baselines) based solely on comparing numeric values without any statistical significance tests." 43 }, 44 "effect_sizes_reported": { 45 "applies": true, 46 "answer": true, 47 "justification": "The paper reports percentage improvements with baseline context throughout. For example, STGen improves RSD by '43.10% and 32.08% on Llama3.1 and GPT-4o' (Section 5.2), and efficient@1 values are compared with pass@1 values with explicit deltas (Table 6 column 'Δ')." 48 }, 49 "sample_size_justified": { 50 "applies": true, 51 "answer": false, 52 "justification": "No justification is given for why 20 code solutions are sampled per problem, why 12 measurement repetitions are used, or why the specific number of problems in each benchmark split was chosen. The sample of 20 solutions per problem for RSD evaluation is stated but not justified." 53 }, 54 "variance_reported": { 55 "applies": true, 56 "answer": true, 57 "justification": "Relative Standard Deviation (RSD) is reported extensively. Table 4 shows RSD for CPU instruction count vs. execution time stability. Table 5 shows RSD for distinguishability of test cases. The paper explicitly discusses variance in measurements." 58 } 59 }, 60 "evaluation_design": { 61 "baselines_included": { 62 "applies": true, 63 "answer": true, 64 "justification": "Three baselines are included for STGen evaluation: Instruction Prompting, Few-shot Prompting, and Generator-based Prompting (Section 4.3, Table 5). Original correctness test cases serve as a baseline for distinguishability evaluation." 65 }, 66 "baselines_contemporary": { 67 "applies": true, 68 "answer": true, 69 "justification": "The baselines are adapted from recent test case generation methods (Wang et al. 2024 [79], Ouédraogo et al. 2024 [66], Liu et al. 2024 [49]). 
The 14 evaluated LLMs include recent models like GPT-4o, Claude 3.5 Sonnet, Gemini 1.5 Pro, DeepSeek V2, and Llama3.1." 70 }, 71 "ablation_study": { 72 "applies": true, 73 "answer": true, 74 "justification": "The comparison between STGen (with contracts) and baselines (without contracts) effectively serves as an ablation showing the contribution of contracts. The comparison of expression/generator test cases vs. raw test cases also demonstrates component contributions. Table 5 shows the impact of each format." 75 }, 76 "multiple_metrics": { 77 "applies": true, 78 "answer": true, 79 "justification": "Multiple metrics are used throughout: accuracy, line coverage, RSD for STGen evaluation (Table 5); pass@1, efficient@1, and speedup for LLM evaluation (Table 6); RSD and Pearson correlation for CPU instruction count validation (Table 4)." 80 }, 81 "human_evaluation": { 82 "applies": false, 83 "answer": false, 84 "justification": "This is a benchmark evaluation paper measuring automated code generation efficiency. Human evaluation of the system's outputs is not relevant — the evaluation is inherently automated (pass/fail on test cases and CPU instruction count measurements)." 85 }, 86 "held_out_test_set": { 87 "applies": true, 88 "answer": true, 89 "justification": "The benchmark uses established test splits from existing benchmarks (HumanEval, MBPP sanitized, Code Contests, APPS). The evaluation of LLMs on COFFE uses these test problems. The stressful test cases generated by STGen are new and independent of LLM training data." 90 }, 91 "per_category_breakdown": { 92 "applies": true, 93 "answer": true, 94 "justification": "Results are broken down by function-level vs. file-level code generation throughout (Tables 5, 6). Per-model results are provided for all 14 LLMs. Results are also broken down by model size family (e.g., CodeLlama 7B/13B/34B)." 
95 }, 96 "failure_cases_discussed": { 97 "applies": true, 98 "answer": true, 99 "justification": "The paper discusses cases where LLMs generate code slower than ground truth (speedup < 1.0), the significant performance drop from pass@1 to efficient@1, and notes that Llama3-8B achieves 0% efficient@1 at file level. Section 7 discusses measurement errors and limitations of baselines." 100 }, 101 "negative_results_reported": { 102 "applies": true, 103 "answer": true, 104 "justification": "Several negative findings are reported: larger LLMs do not significantly outperform smaller ones in efficient code generation (Finding 4); most LLMs achieve speedup < 1.0 at file level; the few-shot baseline achieves only 65.17% accuracy at file level (Table 5)." 105 } 106 }, 107 "claims_and_evidence": { 108 "abstract_claims_supported": { 109 "applies": true, 110 "answer": true, 111 "justification": "The abstract claims about STGen's 99% accuracy, 96% line coverage, CPU instruction count stability (1000x smaller RSD), and four findings from evaluating 14 LLMs are all supported by specific results in Tables 4, 5, and 6." 112 }, 113 "causal_claims_justified": { 114 "applies": true, 115 "answer": true, 116 "justification": "The paper makes causal claims about contracts improving test case generation accuracy. This is supported by controlled comparison: STGen (with contracts) vs. baselines (without contracts) on the same problems using the same LLM (GPT-4o). The ablation-style design adequately supports these claims." 117 }, 118 "generalization_bounded": { 119 "applies": true, 120 "answer": true, 121 "justification": "Section 7.2 explicitly states that results are limited to Python and 'the code generation performance of LLMs on other programming languages such as C++ and Java may be different.' The title refers to 'code generation' broadly but the threats section appropriately bounds this." 
122 }, 123 "alternative_explanations_discussed": { 124 "applies": true, 125 "answer": true, 126 "justification": "Section 7 (Threats to Validity) discusses specific alternative explanations: measurement errors from code optimization techniques (Sec 7.1), potential performance changes from modifying baselines, and the limitation to Python that could affect findings for other languages (Sec 7.2)." 127 } 128 }, 129 "setup_transparency": { 130 "model_versions_specified": { 131 "applies": true, 132 "answer": true, 133 "justification": "Section 4.4 specifies exact API engine names: 'gpt-3.5-turbo', 'gpt-4o', 'DeepSeek-V2-0628', 'DeepSeek-V2-0724', 'claude-3-5-sonnet-20240620', 'gemini-1.5-pro'. Open-source models are specified with sizes (e.g., CodeLlama 7B/13B/34B, Phi3 3.8B)." 134 }, 135 "prompts_provided": { 136 "applies": true, 137 "answer": false, 138 "justification": "The paper describes the prompting approach at a high level (e.g., 'STGen prompts LLMs with contract, verified generated test cases as demonstrations') but does not provide the actual prompt text used for STGen, contract generation, or code generation experiments." 139 }, 140 "hyperparameters_reported": { 141 "applies": true, 142 "answer": false, 143 "justification": "Temperature 0 is mentioned only for problem selection sampling, not for the main code generation evaluation. No temperature, top-p, or other sampling parameters are stated for the main LLM evaluation in RQ3. The threshold of 5 for Phase III is stated, but key LLM generation parameters are missing." 144 }, 145 "scaffolding_described": { 146 "applies": true, 147 "answer": true, 148 "justification": "The STGen pipeline is described in detail across Sections 3.2.1-3.2.4: three phases (contract generation, test case generation, test case-contract validation), with iterative feedback, LLM judge checker, and the flow between phases is shown in Figure 2." 
149 }, 150 "data_preprocessing_documented": { 151 "applies": true, 152 "answer": true, 153 "justification": "Section 3.1 documents the data pipeline in detail: starting benchmarks, problem validation (removing conflicting solutions, file operations, inconsistencies), problem selection (removing problems no LLM can solve), with exact counts at each stage (Table 1 shows Ori., Val., Sel. columns)." 154 } 155 }, 156 "limitations_and_scope": { 157 "limitations_section_present": { 158 "applies": true, 159 "answer": true, 160 "justification": "Section 7 'Threats to Validity' is a dedicated section with two subsections: 7.1 'Threats to Internal Validity' and 7.2 'Threats to External Validity', spanning approximately one full page." 161 }, 162 "threats_to_validity_specific": { 163 "applies": true, 164 "answer": true, 165 "justification": "Section 7.1 discusses specific threats: measurement errors from code optimization techniques, the lack of existing stressful test case generation baselines requiring modifications, and mitigation strategies (Docker isolation, 12 runs with outlier removal). Section 7.2 discusses Python-specific limitations." 166 }, 167 "scope_boundaries_stated": { 168 "applies": true, 169 "answer": true, 170 "justification": "Section 7.2 explicitly states: 'we mainly focus on the evaluation of Python code generation in this paper' and acknowledges results may differ for C++ and Java. Section 2 states the focus on function-level and file-level (not repo-level) code generation, explaining why repo-level is excluded." 171 } 172 }, 173 "data_integrity": { 174 "raw_data_available": { 175 "applies": true, 176 "answer": true, 177 "justification": "The COFFE benchmark data is available at https://github.com/JohnnyPeng18/Coffe (Section 10), which should include the problems, test cases, and generated solutions that underlie the reported results." 
178 }, 179 "data_collection_described": { 180 "applies": true, 181 "answer": true, 182 "justification": "Section 3.1 describes the data collection in detail: problems come from HumanEval, MBPP (sanitized version), Code Contests, and APPS test splits. The selection criteria, validation process, and filtering steps are all documented with specific counts." 183 }, 184 "recruitment_methods_described": { 185 "applies": false, 186 "answer": false, 187 "justification": "No human participants are involved. The data sources are standard public benchmarks (HumanEval, MBPP, Code Contests, APPS)." 188 }, 189 "data_pipeline_documented": { 190 "applies": true, 191 "answer": true, 192 "justification": "The full pipeline is documented in Section 3.1: original problems (Table 1 'Ori.' column) → validation removing conflicts/file operations (Table 1 'Val.' column) → selection removing unsolvable problems (Table 1 'Sel.' column). For APPS: 5000 → 3106 → 300 with exact removal counts (1894 conflicts, 2223 unsolvable)." 193 } 194 }, 195 "conflicts_of_interest": { 196 "funding_disclosed": { 197 "applies": true, 198 "answer": true, 199 "justification": "The Acknowledgment section states: 'This work is supported by the National Nature Science Foundation of China (No. 62302437).'" 200 }, 201 "affiliations_disclosed": { 202 "applies": true, 203 "answer": true, 204 "justification": "Author affiliations are clearly stated: The Chinese University of Hong Kong and The State Key Laboratory of Blockchain and Data Security, Zhejiang University. None of the authors appear to be affiliated with the LLM companies being evaluated." 205 }, 206 "funder_independent_of_outcome": { 207 "applies": true, 208 "answer": true, 209 "justification": "The funder is the National Natural Science Foundation of China, a government funding agency with no commercial stake in the performance of any specific LLM being evaluated." 
210 }, 211 "financial_interests_declared": { 212 "applies": true, 213 "answer": false, 214 "justification": "No competing interests or financial interests statement is present in the paper." 215 } 216 }, 217 "contamination": { 218 "training_cutoff_stated": { 219 "applies": true, 220 "answer": false, 221 "justification": "The paper evaluates 14 LLMs on benchmarks that could be in their training data (HumanEval published 2021, MBPP published 2021) but does not state training data cutoff dates for any of the models." 222 }, 223 "train_test_overlap_discussed": { 224 "applies": true, 225 "answer": false, 226 "justification": "The paper uses HumanEval and MBPP, which are well-known benchmarks published in 2021 and widely available online. No discussion of whether evaluated models may have seen these problems during training. This is particularly concerning since the paper builds on these benchmarks." 227 }, 228 "benchmark_contamination_addressed": { 229 "applies": true, 230 "answer": false, 231 "justification": "HumanEval and MBPP have been publicly available since 2021, and Code Contests since 2022. All models evaluated were trained after these dates. The paper does not address the contamination risk at all, despite this being a well-known concern for these benchmarks." 232 } 233 }, 234 "human_studies": { 235 "pre_registered": { 236 "applies": false, 237 "answer": false, 238 "justification": "No human participants are involved in this study." 239 }, 240 "irb_or_ethics_approval": { 241 "applies": false, 242 "answer": false, 243 "justification": "No human participants are involved in this study." 244 }, 245 "demographics_reported": { 246 "applies": false, 247 "answer": false, 248 "justification": "No human participants are involved in this study." 249 }, 250 "inclusion_exclusion_criteria": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants are involved in this study." 
254 }, 255 "randomization_described": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants are involved in this study." 259 }, 260 "blinding_described": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants are involved in this study." 264 }, 265 "attrition_reported": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants are involved in this study." 269 } 270 }, 271 "cost_and_practicality": { 272 "inference_cost_reported": { 273 "applies": true, 274 "answer": false, 275 "justification": "The paper uses multiple API-based LLMs (GPT-4o, Claude 3.5 Sonnet, Gemini 1.5 Pro, DeepSeek V2) and generates 20 solutions per problem across 14 models but does not report API costs, tokens consumed, or wall-clock time for the evaluation." 276 }, 277 "compute_budget_stated": { 278 "applies": true, 279 "answer": false, 280 "justification": "The hardware is described (Intel Xeon Platinum 8358P, 128 cores, 2 TB memory) but total computational budget — GPU hours, total API spend, or total experiment time — is not reported despite extensive API usage across 14 models." 281 } 282 } 283 }, 284 "claims": [ 285 { 286 "claim": "CPU instruction count is 1000x more stable than execution time for measuring code efficiency, with RSD of 0.003%-0.005% vs. 2.37%-5.65%.", 287 "evidence": "Table 4 (Section 5.1) shows RSD comparisons across four benchmarks. CPU instruction count achieves RSD of 0.003-0.005% while execution time achieves 2.37-5.65%. Pearson correlation between the two is 0.96-1.00.", 288 "supported": "strong" 289 }, 290 { 291 "claim": "STGen achieves ~99% accuracy in stressful test case generation, outperforming baselines by up to 51.77%.", 292 "evidence": "Table 5 (Section 5.2) shows STGen accuracy of 98.64% (function-level) and 98.91% (file-level) vs. 
baselines ranging from 65.17% to 94.86%.", 293 "supported": "strong" 294 }, 295 { 296 "claim": "Stressful test cases generated by STGen better distinguish code solution efficiency, improving RSD by up to 43.10% over correctness test cases.", 297 "evidence": "Table 5 (Section 5.2) shows RSD improvement: function-level from 19.05% to 27.26% (Llama3.1) and from 21.35% to 28.20% (GPT-4o).", 298 "supported": "strong" 299 }, 300 { 301 "claim": "Current LLMs' performance drops significantly from correct to efficient code generation, with best efficient@1 of 46.97% vs. best pass@1 of 79.90% at function level.", 302 "evidence": "Table 6 (Section 5.3) shows the comparison across all 14 LLMs. DeepSeek V2 Coder achieves best efficient@1 of 46.97% with pass@1 of 79.90%. The delta column shows 31-100% performance drops.", 303 "supported": "strong" 304 }, 305 { 306 "claim": "Larger LLMs do not significantly outperform smaller LLMs in efficient code generation despite performing better in correct code generation.", 307 "evidence": "Section 5.3 Finding 4: CodeLlama-34b achieves efficient@1 of 40.37% close to Llama3.1-405b's 39.58% despite being 10x smaller. Similar patterns observed at file level.", 308 "supported": "moderate" 309 } 310 ], 311 "methodology_tags": ["benchmark-eval"], 312 "key_findings": "COFFE is a benchmark for evaluating the time efficiency of LLM-generated code, using CPU instruction count (1000x more stable than execution time) and a novel efficient@k metric combining correctness and efficiency. STGen, a contract-guided stressful test case generation approach, achieves ~99% accuracy and better distinguishes code solution efficiency. Evaluation of 14 LLMs reveals that efficient code generation is significantly more challenging than correct code generation (best efficient@1 of 46.97% vs. 
best pass@1 of 79.90%), and larger model size does not proportionally improve code efficiency.", 313 "red_flags": [ 314 { 315 "flag": "No benchmark contamination discussion", 316 "detail": "The benchmark builds on HumanEval (2021) and MBPP (2021), which are widely available online and likely in the training data of most evaluated models. The paper does not discuss this contamination risk at all. While the efficiency measurement adds a novel dimension, the correctness pass@1 results could be inflated by memorization." 317 }, 318 { 319 "flag": "Missing hyperparameters for main evaluation", 320 "detail": "Temperature 0 is mentioned only for problem selection, not for the main RQ3 evaluation of 14 LLMs. Sampling parameters (temperature, top-p, max tokens) for the main code generation experiments are not reported, making results potentially non-reproducible." 321 }, 322 { 323 "flag": "No statistical significance tests", 324 "detail": "All comparative claims (Findings 1-4, STGen vs. baselines) are based on raw numeric comparisons without any statistical testing. The claim that larger models do not significantly outperform smaller ones (Finding 4) would particularly benefit from formal testing." 325 }, 326 { 327 "flag": "No inference cost reporting", 328 "detail": "The evaluation involves generating 20 solutions per problem across 14 models (including 4 commercial API-based models) on 756 problems, plus STGen test generation costs, but no API costs or total compute time are reported." 329 } 330 ], 331 "cited_papers": [ 332 { 333 "title": "Evaluating Large Language Models Trained on Code", 334 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 335 "year": 2021, 336 "arxiv_id": "2107.03374", 337 "relevance": "Introduces HumanEval benchmark and Codex, foundational for code generation evaluation." 338 }, 339 { 340 "title": "EffiBench: Benchmarking the Efficiency of Automatically Generated Code", 341 "authors": ["Dong Huang", "Jie M. 
Zhang", "Yuhao Qing", "Heming Cui"], 342 "year": 2024, 343 "arxiv_id": "2402.02037", 344 "relevance": "First benchmark for evaluating code efficiency of LLM-generated code; direct precursor to COFFE." 345 }, 346 { 347 "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?", 348 "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig"], 349 "year": 2023, 350 "arxiv_id": "2310.06770", 351 "relevance": "Major benchmark for evaluating LLMs on real-world software engineering tasks." 352 }, 353 { 354 "title": "Evaluating Language Models for Efficient Code Generation", 355 "authors": ["Jiawei Liu", "Songrun Xie", "Junhao Wang"], 356 "year": 2024, 357 "arxiv_id": "2408.06450", 358 "relevance": "Directly evaluates LLM code efficiency using existing benchmarks; motivates the need for COFFE." 359 }, 360 { 361 "title": "Learning Performance-Improving Code Edits", 362 "authors": ["Alexander G Shypula", "Aman Madaan", "Yimeng Zeng"], 363 "year": 2024, 364 "relevance": "Proposes methods for LLM-based performance optimization of code; uses execution time measurement that COFFE replaces with CPU instruction count." 365 }, 366 { 367 "title": "Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation", 368 "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"], 369 "year": 2023, 370 "relevance": "EvalPlus framework for rigorous LLM code generation evaluation; provides the MBPP+ benchmark used in COFFE." 371 }, 372 { 373 "title": "PerfCodeGen: Improving Performance of LLM Generated Code with Execution Feedback", 374 "authors": ["Yun Peng", "Akhilesh Deepak Gotmare", "Michael Lyu"], 375 "year": 2024, 376 "arxiv_id": "2412.03578", 377 "relevance": "Uses execution feedback to improve LLM code efficiency; by the same first author." 378 }, 379 { 380 "title": "SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering", 381 "authors": ["John Yang", "Carlos E. 
Jimenez", "Alexander Wettig"], 382 "year": 2024, 383 "arxiv_id": "2405.15793", 384 "relevance": "Agentic approach to automated software engineering; relevant to evaluating LLM code generation capabilities." 385 }, 386 { 387 "title": "AgentCoder: Multi-Agent-based Code Generation with Iterative Testing and Optimisation", 388 "authors": ["Dong Huang", "Qingwen Bu", "Jie M. Zhang"], 389 "year": 2023, 390 "arxiv_id": "2312.13010", 391 "relevance": "Multi-agent framework for code generation with iterative testing, relevant to agentic AI programming." 392 }, 393 { 394 "title": "Code Llama: Open Foundation Models for Code", 395 "authors": ["Baptiste Rozière", "Jonas Gehring", "Fabian Gloeckle"], 396 "year": 2023, 397 "arxiv_id": "2308.12950", 398 "relevance": "Open-source code LLM family evaluated in COFFE; important baseline for code generation." 399 }, 400 { 401 "title": "DeepSeek-Coder: When the Large Language Model Meets Programming", 402 "authors": ["Daya Guo", "Qihao Zhu", "Dejian Yang"], 403 "year": 2024, 404 "arxiv_id": "2401.14196", 405 "relevance": "Code-specialized LLM achieving top performance on COFFE's efficient@1 metric." 406 } 407 ] 408 }