scan-v4.json (32586B)
1 { 2 "scan_version": 4, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "EffiLearner: Enhancing Efficiency of Generated Code via Self-Optimization", 6 "authors": [ 7 "Dong Huang", 8 "Jianbo Dai", 9 "Han Weng", 10 "Puzhen Wu", 11 "Yuhao Qing", 12 "Heming Cui", 13 "Zhijiang Guo", 14 "Jie M. Zhang" 15 ], 16 "year": 2024, 17 "venue": "Neural Information Processing Systems", 18 "arxiv_id": "2405.15189", 19 "doi": "10.52202/079017-2684" 20 }, 21 "checklist": { 22 "claims_and_evidence": { 23 "abstract_claims_supported": { 24 "applies": true, 25 "answer": true, 26 "justification": "Abstract claims about StarCoder2-15B (87.1% ET reduction from 0.93s to 0.12s, 90.8% TMU reduction from 22.02 to 2.03 Mb*s) and DeepSeek-6.7B-Ins (85.8% MU reduction) are exactly matched by Table 1.", 27 "source": "opus" 28 }, 29 "causal_claims_justified": { 30 "applies": true, 31 "answer": true, 32 "justification": "The causal claim that overhead profiles cause efficiency improvement is supported by the ablation in Table 3, which compares EFFI-LEARNER against controlled variants (no feedback, result-only feedback, single-profiler feedback). This is adequate controlled single-variable manipulation.", 33 "source": "opus" 34 }, 35 "generalization_bounded": { 36 "applies": true, 37 "answer": false, 38 "justification": "The title ('Enhancing Efficiency of Generated Code') and abstract make broad claims about 'LLM-generated code' without bounding to Python. All experiments are Python-only. The limitation (Appendix A.1) acknowledges this but the title and abstract do not.", 39 "source": "opus" 40 }, 41 "alternative_explanations_discussed": { 42 "applies": true, 43 "answer": false, 44 "justification": "No alternative explanations are discussed for why EFFI-LEARNER works. 
For example, the LLMs may be memorizing known efficient algorithms rather than using the profiling feedback, or the open test cases may be biasing optimization toward specific inputs.", 45 "source": "opus" 46 }, 47 "proxy_outcome_distinction": { 48 "applies": true, 49 "answer": true, 50 "justification": "The paper measures execution time and memory usage and claims improvements in execution time and memory usage. The claims match the granularity of measurements — no broader framing gap exists (they don't claim 'better code quality' or 'improved software engineering').", 51 "source": "opus" 52 } 53 }, 54 "limitations_and_scope": { 55 "limitations_section_present": { 56 "applies": true, 57 "answer": true, 58 "justification": "Appendix A.1 contains a dedicated 'Limitations' section discussing three specific limitations: time-consuming multi-iteration process, increased token consumption, and Python-only evaluation.", 59 "source": "opus" 60 }, 61 "threats_to_validity_specific": { 62 "applies": true, 63 "answer": true, 64 "justification": "Appendix A.1 discusses specific threats: (1) the multi-iteration process is time-consuming for complex tasks, (2) overhead profiles consume additional tokens, (3) effectiveness is evaluated only on Python, so 'performance in different programming languages or environments may vary.'", 65 "source": "opus" 66 }, 67 "scope_boundaries_stated": { 68 "applies": true, 69 "answer": true, 70 "justification": "Appendix A.1 explicitly states: 'the effectiveness of EffiLearner has been primarily evaluated on Python. 
Therefore, its performance in different programming languages or environments may vary, underscoring the need for further testing and validation.'", 71 "source": "opus" 72 } 73 }, 74 "conflicts_of_interest": { 75 "funding_disclosed": { 76 "applies": true, 77 "answer": true, 78 "justification": "Section 6 discloses funding: National Key R&D Program of China (2022ZD0160201), HK RGC RIF (R7030-22), HK ITF (GHP/169/20SZ), Huawei Flagship Research Grant 2023, HK RGC GRF (17208223 & 17204424), and HKU-CAS Joint Laboratory.", 79 "source": "opus" 80 }, 81 "affiliations_disclosed": { 82 "applies": true, 83 "answer": true, 84 "justification": "Author affiliations are clearly listed: University of Hong Kong, University of Edinburgh, Beijing University of Posts and Telecommunications, University College Dublin, University of Cambridge, King's College London, Shanghai AI Laboratory.", 85 "source": "opus" 86 }, 87 "funder_independent_of_outcome": { 88 "applies": true, 89 "answer": true, 90 "justification": "Funders include government grants and Huawei. The paper evaluates third-party models (GPT-4, Claude, StarCoder, CodeLlama, etc.) — none are Huawei products. The funders have no direct financial stake in which models perform better with EFFI-LEARNER.", 91 "source": "opus" 92 }, 93 "financial_interests_declared": { 94 "applies": true, 95 "answer": false, 96 "justification": "No competing interests statement or financial interest disclosure is present in the paper. 
The Huawei funding is acknowledged but there is no explicit declaration that authors have no competing interests.", 97 "source": "opus" 98 } 99 }, 100 "scope_and_framing": { 101 "key_terms_defined": { 102 "applies": true, 103 "answer": true, 104 "justification": "Key efficiency terms (ET, NET, MU, NMU, TMU, NTMU) receive formal mathematical definitions in Appendix A.5; the self-optimization pipeline is defined through its three components in Section 3.", 105 "source": "haiku" 106 }, 107 "intended_contribution_clear": { 108 "applies": true, 109 "answer": true, 110 "justification": "The paper explicitly claims to propose 'the first method that significantly improves the efficiency of code generated by a wide range of LLMs' using overhead profile feedback.", 111 "source": "haiku" 112 }, 113 "engagement_with_prior_work": { 114 "applies": true, 115 "answer": true, 116 "justification": "Section 2 situates the work relative to Self-Edit, Self-Debug, Self-Refine, ALGO, and PIE, explicitly distinguishing overhead-profile feedback from correctness-focused feedback approaches.", 117 "source": "haiku" 118 } 119 } 120 }, 121 "type_checklist": { 122 "empirical": { 123 "artifacts": { 124 "code_released": { 125 "applies": true, 126 "answer": true, 127 "justification": "The abstract states 'The source code of EFFI-LEARNER was released in https://github.com/huangd1999/EffiLearner' and the NeurIPS checklist confirms code was uploaded in supplementary files.", 128 "source": "opus" 129 }, 130 "data_released": { 131 "applies": true, 132 "answer": true, 133 "justification": "The paper uses publicly available benchmarks: EffiBench, HumanEval, MBPP, and EvalPlus (HumanEval-Plus, MBPP-Plus). 
These are all standard public datasets.", 134 "source": "opus" 135 }, 136 "environment_specified": { 137 "applies": true, 138 "answer": false, 139 "justification": "Section 4.2 specifies hardware (Intel Xeon Platinum 8336C CPU, 8× NVIDIA A100-SXM GPUs, 2.0TiB memory) but does not provide software dependency specifications (requirements.txt, library versions, Python version) in the paper itself.", 140 "source": "opus" 141 }, 142 "reproduction_instructions": { 143 "applies": true, 144 "answer": false, 145 "justification": "The paper provides a GitHub link and describes the framework pipeline (Section 3), but does not include step-by-step reproduction instructions with specific commands to replicate the main experiments.", 146 "source": "opus" 147 } 148 }, 149 "statistical_methodology": { 150 "confidence_intervals_or_error_bars": { 151 "applies": true, 152 "answer": false, 153 "justification": "All tables (Tables 1-9) report only point estimates with percentage improvements. No confidence intervals or error bars are provided for any metric.", 154 "source": "opus" 155 }, 156 "significance_tests": { 157 "applies": true, 158 "answer": false, 159 "justification": "The paper repeatedly claims EFFI-LEARNER 'significantly' improves efficiency (e.g., Section 4.3) but no statistical significance tests are used. Comparisons are based solely on raw number differences.", 160 "source": "opus" 161 }, 162 "effect_sizes_reported": { 163 "applies": true, 164 "answer": true, 165 "justification": "Tables report absolute values before and after optimization with percentage reductions (e.g., 'ET decreases from 0.93 (s) to 0.12 (s) which reduces 87.1% execution time'). 
Normalized metrics (NET, NMU, NTMU) provide relative comparison to canonical solutions.", 166 "source": "opus" 167 }, 168 "sample_size_justified": { 169 "applies": true, 170 "answer": false, 171 "justification": "The number of evaluation problems is determined by the benchmark sizes (EffiBench, HumanEval, MBPP) without any justification for whether these sample sizes are sufficient for the claims made.", 172 "source": "opus" 173 }, 174 "variance_reported": { 175 "applies": true, 176 "answer": false, 177 "justification": "No variance, standard deviation, or spread measures are reported. The NeurIPS checklist (Q7) claims greedy decoding makes results 'consistent,' but execution time and memory measurements are inherently variable across runs and no measurement variance is reported.", 178 "source": "opus" 179 } 180 }, 181 "evaluation_design": { 182 "baselines_included": { 183 "applies": true, 184 "answer": true, 185 "justification": "Table 3 compares against Unsupervised Self-Refine, Result-Aware Self-Refine, Memory Profiler only, and Execution Time Profiler only. Table 7 compares with Self-Edit, Critic, PIE, and Supersonic.", 186 "source": "opus" 187 }, 188 "baselines_contemporary": { 189 "applies": true, 190 "answer": true, 191 "justification": "Baselines include Self-Refine (NeurIPS 2023), Reflexion (NeurIPS 2023), PIE (ICLR 2024), Supersonic, Self-Edit (ACL 2023), and CRITIC (2023). These are recent and relevant approaches.", 192 "source": "opus" 193 }, 194 "ablation_study": { 195 "applies": true, 196 "answer": true, 197 "justification": "Table 3 ablates the contribution of each profiler component (memory profiler only, execution time profiler only, combined). 
Table 2 ablates the number of self-optimization steps (0-5).", 198 "source": "opus" 199 }, 200 "multiple_metrics": { 201 "applies": true, 202 "answer": true, 203 "justification": "Six metrics are used: Execution Time (ET), Normalized ET (NET), Max Memory Usage (MU), Normalized MU (NMU), Total Memory Usage (TMU), and Normalized TMU (NTMU), defined in Appendix A.5.", 204 "source": "opus" 205 }, 206 "human_evaluation": { 207 "applies": true, 208 "answer": false, 209 "justification": "All evaluation is fully automated through execution time profiling and memory profiling. No human evaluation of code quality, readability, or optimization strategies is performed.", 210 "source": "opus" 211 }, 212 "held_out_test_set": { 213 "applies": true, 214 "answer": true, 215 "justification": "Section 4.1: 'we utilize the open test cases to calculate the efficiency metrics during the self-optimization process, while private test cases provided by EffiBench were used for the final result evaluation.' Similarly, EvalPlus private tests are used for HumanEval/MBPP.", 216 "source": "opus" 217 }, 218 "per_category_breakdown": { 219 "applies": true, 220 "answer": true, 221 "justification": "Results are broken down per model (22 models in Tables 1, 5, 8, 9), per benchmark (EffiBench, HumanEval, MBPP), and per optimization step (Table 2). Individual case studies are also provided.", 222 "source": "opus" 223 }, 224 "failure_cases_discussed": { 225 "applies": true, 226 "answer": true, 227 "justification": "Section 4.6 'Error Analysis' discusses a case (FindMedianSortedArrays) where optimization provides minimal improvement due to already-optimal O(log(min(m,n))) complexity, with detailed code examples in Appendix Figures 12-18.", 228 "source": "opus" 229 }, 230 "negative_results_reported": { 231 "applies": true, 232 "answer": true, 233 "justification": "Table 1 shows StarCoder2-15B MU increases by 5.0% after optimization. 
Table 3 shows Unsupervised Self-Refine and Result-Aware Self-Refine dramatically worsen performance (e.g., 518.8% TMU increase). Table 6 shows pass@1 decreases. GPT-4 MU and TMU improvements are modest.", 234 "source": "opus" 235 } 236 }, 237 "setup_transparency": { 238 "model_versions_specified": { 239 "applies": true, 240 "answer": false, 241 "justification": "Some models are versioned (GPT-3.5-Turbo-0301), but others use marketing names without versions: 'GPT-4' (no snapshot date), 'Claude-3-Sonnet' (no snapshot), 'GPT-4-Turbo' (no date). Section 4.2 says 'detailed versions are demonstrated in supplementary file' but the paper itself lacks this for key models.", 242 "source": "opus" 243 }, 244 "prompts_provided": { 245 "applies": true, 246 "answer": true, 247 "justification": "Figure 3 and Appendix A.3 provide the full prompt template used in the self-optimization stage. The fill values (task description, test case, code, overhead analysis) are programmatically determined from known benchmarks and profiling output. Complete worked examples are shown in Appendix Figures 4-18.", 248 "source": "opus" 249 }, 250 "hyperparameters_reported": { 251 "applies": true, 252 "answer": false, 253 "justification": "The NeurIPS checklist (Q7) mentions 'greedy-decoding strategy' but the main paper does not specify temperature, top-p, max tokens, or other generation parameters. No hyperparameter table is provided.", 254 "source": "opus" 255 }, 256 "scaffolding_described": { 257 "applies": true, 258 "answer": true, 259 "justification": "The EFFI-LEARNER pipeline is described in detail in Section 3 with Figure 2: Code Generation → Overhead Profiling (line_profiler for time, memory_profiler for memory) → Code Refinement. 
The iterative loop, profiling tools, and feedback mechanism are clearly documented.", 260 "source": "opus" 261 }, 262 "data_preprocessing_documented": { 263 "applies": true, 264 "answer": true, 265 "justification": "Section 4.2: 'We first collect the generated code from each LLM and evaluate its correctness using open test cases. Only the code that passes all test cases is considered for efficiency evaluation.' The filtering pipeline from generation to evaluation is clearly documented.", 266 "source": "opus" 267 } 268 }, 269 "data_integrity": { 270 "raw_data_available": { 271 "applies": true, 272 "answer": false, 273 "justification": "Only aggregated results (averages across benchmarks) are reported in tables. Individual per-problem efficiency measurements, profiling outputs, and generated code are not released for independent verification.", 274 "source": "opus" 275 }, 276 "data_collection_described": { 277 "applies": true, 278 "answer": true, 279 "justification": "Sections 4.1-4.2 describe the data collection: benchmarks used (EffiBench, HumanEval, MBPP), how code is generated (each LLM with greedy decoding), how profiling is performed (line_profiler, memory_profiler on open test cases), and how correctness is validated.", 280 "source": "opus" 281 }, 282 "recruitment_methods_described": { 283 "applies": false, 284 "answer": false, 285 "justification": "No human participants. The study uses standard public benchmarks (EffiBench, HumanEval, MBPP) as data sources.", 286 "source": "opus" 287 }, 288 "data_pipeline_documented": { 289 "applies": true, 290 "answer": true, 291 "justification": "The pipeline is documented: LLM generates code → correctness check on open test cases → profiling with line_profiler and memory_profiler → profile-guided refinement → repeat up to 5 iterations → final evaluation on private test cases. 
Filtering criteria are stated (only correct code evaluated).", 292 "source": "opus" 293 } 294 }, 295 "contamination": { 296 "training_cutoff_stated": { 297 "applies": true, 298 "answer": false, 299 "justification": "No training data cutoff dates are stated for any of the 22 evaluated models. This is important since HumanEval (2021) and MBPP (2021) could be in the training data of newer models.", 300 "source": "opus" 301 }, 302 "train_test_overlap_discussed": { 303 "applies": true, 304 "answer": false, 305 "justification": "No discussion of whether benchmark problems appeared in any model's training data. HumanEval and MBPP are widely known public benchmarks that could have been memorized by models trained after 2021.", 306 "source": "opus" 307 }, 308 "benchmark_contamination_addressed": { 309 "applies": true, 310 "answer": false, 311 "justification": "HumanEval (published 2021), MBPP (published 2021), and EffiBench (published 2024) are public benchmarks. Models trained after these dates may have seen the problems. No contamination analysis is performed.", 312 "source": "opus" 313 } 314 }, 315 "human_studies": { 316 "pre_registered": { 317 "applies": false, 318 "answer": false, 319 "justification": "No human participants in this study. It is a benchmark evaluation of LLM code generation efficiency.", 320 "source": "opus" 321 }, 322 "irb_or_ethics_approval": { 323 "applies": false, 324 "answer": false, 325 "justification": "No human participants. 
The study evaluates LLMs on public coding benchmarks.", 326 "source": "opus" 327 }, 328 "demographics_reported": { 329 "applies": false, 330 "answer": false, 331 "justification": "No human participants.", 332 "source": "opus" 333 }, 334 "inclusion_exclusion_criteria": { 335 "applies": false, 336 "answer": false, 337 "justification": "No human participants.", 338 "source": "opus" 339 }, 340 "randomization_described": { 341 "applies": false, 342 "answer": false, 343 "justification": "No human participants.", 344 "source": "opus" 345 }, 346 "blinding_described": { 347 "applies": false, 348 "answer": false, 349 "justification": "No human participants.", 350 "source": "opus" 351 }, 352 "attrition_reported": { 353 "applies": false, 354 "answer": false, 355 "justification": "No human participants.", 356 "source": "opus" 357 } 358 }, 359 "cost_and_practicality": { 360 "inference_cost_reported": { 361 "applies": true, 362 "answer": false, 363 "justification": "The approach requires up to 5 iterative LLM calls per problem plus profiling, but no inference costs (API spend, tokens consumed, wall-clock time per problem) are reported. The limitations section acknowledges 'overhead profiles consume more tokens' but does not quantify it.", 364 "source": "opus" 365 }, 366 "compute_budget_stated": { 367 "applies": true, 368 "answer": false, 369 "justification": "Section 4.2 describes the hardware (8× A100 GPUs, 128-core CPU) but does not state total GPU hours, API costs, or training/inference time for the full experimental campaign across 22 models and 3 benchmarks.", 370 "source": "opus" 371 } 372 }, 373 "experimental_rigor": { 374 "seed_sensitivity_reported": { 375 "applies": true, 376 "answer": false, 377 "justification": "No seed sensitivity analysis is reported. 
The NeurIPS checklist claims greedy decoding makes results 'consistent,' but execution time and memory profiling measurements vary across runs, and no sensitivity analysis is performed.", 378 "source": "opus" 379 }, 380 "number_of_runs_stated": { 381 "applies": true, 382 "answer": false, 383 "justification": "The number of profiling runs per problem is not stated. It is unclear whether efficiency metrics are from a single profiling run or averaged across multiple runs.", 384 "source": "opus" 385 }, 386 "hyperparameter_search_budget": { 387 "applies": true, 388 "answer": false, 389 "justification": "No hyperparameter search budget is reported for the prompt design, number of iterations, or profiling configuration. The iteration count (5) is explored in ablation but the prompt design process is undocumented.", 390 "source": "opus" 391 }, 392 "best_config_selection_justified": { 393 "applies": true, 394 "answer": true, 395 "justification": "Table 2 reports results at all optimization steps (0-5) rather than cherry-picking the best. The 5-iteration configuration is used consistently across all models, not selected per-model.", 396 "source": "opus" 397 }, 398 "multiple_comparison_correction": { 399 "applies": false, 400 "answer": false, 401 "justification": "No statistical tests are performed, so multiple comparison correction is not applicable. The paper makes comparisons via raw number differences only.", 402 "source": "opus" 403 }, 404 "self_comparison_bias_addressed": { 405 "applies": true, 406 "answer": false, 407 "justification": "The authors implement their own baselines (Unsupervised Self-Refine, Result-Aware Self-Refine) and compare against their own system without acknowledging author-evaluation bias. 
No independent evaluation is performed.", 408 "source": "opus" 409 }, 410 "compute_budget_vs_performance": { 411 "applies": true, 412 "answer": false, 413 "justification": "EFFI-LEARNER requires up to 5 additional LLM calls plus profiling per problem, substantially more compute than the baselines. This compute overhead is acknowledged qualitatively in limitations but never quantified or compared against the efficiency gains.", 414 "source": "opus" 415 }, 416 "benchmark_construct_validity": { 417 "applies": true, 418 "answer": false, 419 "justification": "No discussion of whether EffiBench, HumanEval, or MBPP adequately measure real-world code efficiency. The benchmarks consist of small algorithmic problems — whether efficiency gains transfer to real-world codebases is not addressed.", 420 "source": "opus" 421 }, 422 "scaffold_confound_addressed": { 423 "applies": true, 424 "answer": true, 425 "justification": "The EFFI-LEARNER scaffold is applied uniformly across all models using the same prompt template (Figure 3), profiling tools, and iteration count. Cross-model comparisons use the same framework, controlling for scaffold effects.", 426 "source": "opus" 427 } 428 }, 429 "data_leakage": { 430 "temporal_leakage_addressed": { 431 "applies": true, 432 "answer": false, 433 "justification": "HumanEval (2021) and MBPP (2021) were published years before most evaluated models were trained. Models may have memorized efficient solutions to these problems. No temporal leakage analysis is provided.", 434 "source": "opus" 435 }, 436 "feature_leakage_addressed": { 437 "applies": true, 438 "answer": false, 439 "justification": "During self-optimization, the LLM receives detailed per-line profiling data for specific test cases. This could lead to optimization overfitting to those specific inputs rather than general efficiency. 
This form of information leakage is not discussed.", 440 "source": "opus" 441 }, 442 "non_independence_addressed": { 443 "applies": true, 444 "answer": false, 445 "justification": "No discussion of whether the open test cases used for optimization and the private test cases used for evaluation share structural similarities that could inflate results.", 446 "source": "opus" 447 }, 448 "leakage_detection_method": { 449 "applies": true, 450 "answer": false, 451 "justification": "No concrete leakage detection or prevention methods are employed. No canary strings, membership inference, n-gram overlap analysis, or temporal splits are used.", 452 "source": "opus" 453 } 454 } 455 } 456 }, 457 "claims": [ 458 { 459 "claim": "EFFI-LEARNER reduces execution time of StarCoder2-15B-generated code by 87.1% on EffiBench", 460 "evidence": "Table 1 shows ET decreasing from 0.93s to 0.12s for StarCoder2-15B after EFFI-LEARNER optimization", 461 "supported": "strong" 462 }, 463 { 464 "claim": "EFFI-LEARNER reduces total memory usage of StarCoder2-15B by 90.8% on EffiBench", 465 "evidence": "Table 1 shows TMU decreasing from 22.02 Mb*s to 2.03 Mb*s for StarCoder2-15B", 466 "supported": "strong" 467 }, 468 { 469 "claim": "Overhead profile feedback is necessary; unsupervised self-refine without profiles actively degrades efficiency", 470 "evidence": "Table 3 shows Unsupervised Self-Refine increases ET by 51.9% and TMU by 518.8% for CodeLlama-70B, while EFFI-LEARNER reduces ET by 9.6% and TMU by 92.9%", 471 "supported": "strong" 472 }, 473 { 474 "claim": "EFFI-LEARNER is model-agnostic, generalizing across 22+ diverse LLMs", 475 "evidence": "Tables 1, 5, 8, 9 show improvements across OpenCodeInterpreter, DeepSeek, CodeLlama, XwinCoder, StarCoder, WizardCoder, GPT-3.5, GPT-4, Claude-3 variants; gains vary widely in magnitude", 476 "supported": "moderate" 477 }, 478 { 479 "claim": "Most efficiency gains occur in the first optimization step with diminishing returns thereafter", 480 
"evidence": "Table 2 shows CodeLlama-70B MU drops 75.9% at step 1 with only 0.0% additional reduction by step 5; GPT-3.5-Turbo shows similar diminishing-returns pattern", 481 "supported": "strong" 482 }, 483 { 484 "claim": "EFFI-LEARNER reduces correctness (pass@1) by only 0-0.5%", 485 "evidence": "Table 6 shows pass@1 decreases ranging from 0.0% (CodeLlama-7b, XwinCoder-34B) to 0.5% (OpenCodeInterpreter-DS-1.3B, starcoder2-3b) across 16 models", 486 "supported": "moderate" 487 } 488 ], 489 "methodology_tags": [ 490 "benchmark-eval" 491 ], 492 "key_findings": "EFFI-LEARNER demonstrates that feeding line-level execution time and memory profiler outputs back to LLMs enables substantial iterative code efficiency improvement, achieving up to 87.1% execution time and 90.8% total memory reductions across 22 models on EffiBench. Critically, the granularity of the profile matters: summary metrics and unsupervised self-refinement actively degrade efficiency, while detailed line-level profiles consistently improve it. Most gains occur in the first optimization iteration with diminishing returns thereafter, and correctness is only marginally affected (0-0.5% pass@1 reduction).", 493 "red_flags": [ 494 { 495 "flag": "No statistical significance tests", 496 "detail": "All comparative claims are made without significance testing; improvements are point estimates with no confidence intervals or variance quantification across tasks." 497 }, 498 { 499 "flag": "Headline numbers cherry-pick best case", 500 "detail": "The 87.1% ET reduction applies to StarCoder2-15B, which was an outlier (NET=7.58, 7x slower than canonical); many models show far smaller gains (e.g., 4.5% ET for Magicoder-S-DS-6.7B)." 501 }, 502 { 503 "flag": "Benchmark contamination not addressed", 504 "detail": "HumanEval (2021) and MBPP are known to appear in training data of GPT-4, CodeLlama, etc.; initial code quality and optimization headroom may be inflated by memorization." 
505 }, 506 { 507 "flag": "Inference cost not quantified", 508 "detail": "The paper acknowledges multi-iteration LLM calls add tokens and time but provides no quantified cost analysis, making cost-benefit assessment impossible." 509 }, 510 { 511 "flag": "Environment not reproducible from paper", 512 "detail": "No requirements.txt, Dockerfile, or library version specifications; profiler library versions (line_profiler, memory_profiler) could affect measured timings." 513 } 514 ], 515 "cited_papers": [ 516 { 517 "title": "EffiBench: Benchmarking the Efficiency of Automatically Generated Code", 518 "relevance": "Primary evaluation benchmark providing canonical solutions for normalized efficiency comparisons" 519 }, 520 { 521 "title": "Evaluating Large Language Models Trained on Code (HumanEval/Codex)", 522 "relevance": "Core code generation benchmark used for generalizability evaluation" 523 }, 524 { 525 "title": "Program Synthesis with Large Language Models (MBPP)", 526 "relevance": "Second standard benchmark used for cross-dataset generalization testing" 527 }, 528 { 529 "title": "Self-Refine: Iterative Refinement with Self-Feedback", 530 "relevance": "Key baseline showing that generic self-refinement without profiles fails for efficiency" 531 }, 532 { 533 "title": "Learning Performance-Improving Code Edits (PIE)", 534 "relevance": "Most directly competitive prior work on LLM code efficiency; used as baseline in Appendix Table 7" 535 }, 536 { 537 "title": "Is Your Code Generated by ChatGPT Really Correct? 
Rigorous Evaluation via EvalPlus", 538 "relevance": "Provides HumanEval-Plus and MBPP-Plus private test cases used for final correctness evaluation" 539 }, 540 { 541 "title": "Teaching Large Language Models to Self-Debug", 542 "relevance": "Related work on execution feedback for code correction (correctness-focused rather than efficiency-focused)" 543 }, 544 { 545 "title": "Reflexion: Language Agents with Verbal Reinforcement Learning", 546 "relevance": "Related approach to iterative LLM improvement via verbal feedback; used as a baseline concept" 547 } 548 ], 549 "engagement_factors": { 550 "practical_relevance": { 551 "score": 3, 552 "justification": "Directly applicable to any team using LLM-generated code; the open-source framework requires no model retraining and works with any LLM API." 553 }, 554 "surprise_contrarian": { 555 "score": 1, 556 "justification": "The core idea is intuitive; the notable finding is that unsupervised self-refine actively degrades efficiency while detailed profiles fix it." 557 }, 558 "fear_safety": { 559 "score": 0, 560 "justification": "No safety or AI risk concerns; the paper focuses purely on computational efficiency." 561 }, 562 "drama_conflict": { 563 "score": 0, 564 "justification": "No controversy or conflict with other groups; straightforward engineering contribution." 565 }, 566 "demo_ability": { 567 "score": 3, 568 "justification": "Code is released on GitHub and works with any LLM API; practitioners can apply it to their own code generation workflows immediately." 569 }, 570 "brand_recognition": { 571 "score": 1, 572 "justification": "The authors' institutions (HKU, Cambridge, and King's College London) are reputable but are not the top-tier AI labs that maximize HN attention." 
573 } 574 }, 575 "hn_data": { 576 "threads": [ 577 { 578 "hn_id": "42258289", 579 "title": "A Survey on Employing Large Language Models for Text-to-SQL Tasks", 580 "points": 2, 581 "comments": 0, 582 "url": "https://news.ycombinator.com/item?id=42258289", 583 "created_at": "2024-11-27T18:17:00Z" 584 }, 585 { 586 "hn_id": "39253748", 587 "title": "A Comprehensive (Bottom-Up) Study on the Security of Arm Cortex-M Systems", 588 "points": 2, 589 "comments": 0, 590 "url": "https://news.ycombinator.com/item?id=39253748", 591 "created_at": "2024-02-04T19:56:25Z" 592 }, 593 { 594 "hn_id": "39521805", 595 "title": "Statistical Games", 596 "points": 1, 597 "comments": 0, 598 "url": "https://news.ycombinator.com/item?id=39521805", 599 "created_at": "2024-02-27T09:03:19Z" 600 } 601 ], 602 "top_points": 2, 603 "total_points": 5, 604 "total_comments": 0 605 } 606 }