scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (34232B)
      1 {
      2   "paper": {
      3     "title": "How Efficient is LLM-Generated Code? A Rigorous & High-Standard Benchmark",
      4     "authors": [
      5       "Ruizhong Qiu",
      6       "Weiliang Will Zeng",
      7       "James Ezick",
      8       "Christopher Lott",
      9       "Hanghang Tong"
     10     ],
     11     "year": 2024,
     12     "venue": "International Conference on Learning Representations (ICLR 2025)",
     13     "arxiv_id": "2406.06647",
     14     "doi": "10.48550/arXiv.2406.06647"
     15   },
     16   "scan_version": 3,
     17   "active_modules": ["experimental_rigor", "data_leakage"],
     18   "methodology_tags": ["benchmark-eval", "theoretical"],
     19   "key_findings": "LLMs fall far short of expert-level code efficiency: the strongest commercial model GPT-4 achieves only eff@1=0.454 despite pass@1=0.831 on the ENAMEL benchmark of 142 HumanEval problems with expert-written reference solutions. The efficiency gap stems from LLMs' inability to design advanced algorithms (eff@100=0.483 for ChatGPT on algorithm design subset) and near-total unawareness of implementation optimization. Encouraging efficiency via prompting barely helps, and even revealing the optimal algorithm in the prompt fails to elicit efficient code from LLMs.",
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "The paper states 'Our benchmark is publicly available at https://github.com/q-rz/enamel' in the abstract and provides a working GitHub URL."
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The benchmark problems (from HumanEval/HumanEval+), expert reference solutions, and test case generators are released via the GitHub repository. Some generated code samples are re-used from the publicly available EvalPlus project."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "§C.1 specifies hardware ('8 NVIDIA A100 80GB GPUs', Google Cloud Ubuntu 20.04.6 LTS, Intel Xeon @ 2.20GHz, Python 3.10.12) but provides no requirements.txt, Dockerfile, or library version specifications sufficient to recreate the software environment."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper describes the evaluation methodology in detail (§2-3, §C.1) and releases the benchmark, but does not include step-by-step reproduction instructions, scripts, or a 'Reproducing Results' section in the paper itself."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The main results in Table 3 report only point estimates for eff@k and pass@k. While Table 11 shows standard deviations for one model (Llama 3 70B Instruct) to demonstrate the Rao-Blackwellization benefit, no confidence intervals or error bars accompany the main reported results across 30 LLMs."
     48       },
     49       "significance_tests": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The paper ranks 30 LLMs and makes comparative claims (e.g., 'GPT-4 Turbo has higher eff@1 than GPT-4') but no statistical significance tests are applied to any comparisons."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "The paper reports absolute eff@k and pass@k values with clear reference baselines (expert solutions = 1.000, HumanEval canonical = 0.455, HumanEval+ canonical = 0.513 in Table 2), allowing readers to assess effect magnitude. Differences between correctness and efficiency are reported numerically."
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "The paper uses 142 problems (explaining the exclusion of 22 trivial ones) and 100-200 code samples per problem, citing 'financial and computational constraints' (§C.1). No power analysis or formal sample size justification is provided."
     63       },
     64       "variance_reported": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The main results (Table 3) are single-point estimates. Table 11 shows standard deviations for one model to demonstrate the estimator benefit, but variance across experimental conditions or models is not systematically reported. Execution time variance is reduced via the Hodges-Lehmann estimator (R=6 repeats) but not reported to the reader."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "The paper compares against HumanEval and HumanEval+ canonical solutions (Table 2), and compares the ENAMEL benchmark against EffiBench and Mercury benchmarks (Table 9). The classic speedup metric is compared against eff@k (Table 7)."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Baselines include contemporary benchmarks EffiBench (2024) and Mercury (2024), and the evaluation covers 30 LLMs including recent models like GPT-4 Turbo, Llama 3, Claude 3, and Mixtral (all 2024)."
     80       },
     81       "ablation_study": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "The paper provides ablation-style analyses: algorithm design vs implementation optimization subsets (Table 4/6), hyperparameter sensitivity (Table 10), random vs expert test generators (Table 8), different efficiency metrics (Table 7), and Rao-Blackwellization benefit (Table 11)."
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "The paper reports both eff@k (efficiency) and pass@k (correctness) at multiple sample sizes k=1, 10, 100, under both greedy and sampling decoding."
     90       },
     91       "human_evaluation": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "All evaluation is automated through execution time measurement and test case correctness. No human evaluation of code quality, benchmark validity, or LLM outputs is reported."
     95       },
     96       "held_out_test_set": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "The LLMs are evaluated as pre-trained models with no fine-tuning on the ENAMEL benchmark. The expert reference solutions and strong test generators are newly created and were not available to the models during training."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Table 4/6 provides breakdowns on algorithm design (20 problems) and implementation optimization (75 problems) subsets. Figure 2 shows the per-problem difficulty distribution of pass_i@1 vs eff_i@1 across all 142 problems."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Tables 13-15 show detailed failure case studies where Llama 3 70B and Mixtral 8x22B fail to generate efficient algorithms even with Self-Refine prompting or explicit algorithm hints. Table 8 shows a case study where the Fermat primality test appears correct under random tests but fails under expert test cases."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Table 12 shows that encouraging efficiency via prompting 'can barely enhance the efficiency of generated code.' Tables 13-15 show that revealing the optimal algorithm in the prompt still fails to produce efficient code. These are explicit negative results."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "Abstract claims are supported: 'LLMs still fall short of generating expert-level efficient code' is supported by Table 3 (GPT-4 eff@1=0.454); 'LLMs struggle in designing advanced algorithms and are barely aware of implementation optimization' is supported by Tables 4/6. All specific numbers in the abstract match the results."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The main causal claim — 'such deficiency is because current LLMs struggle in designing advanced algorithms' — is supported by controlled subset analysis (algorithm design vs implementation optimization subsets in Table 4) and the prompting experiments in §C.8 which rule out the alternative that prompting alone could address the issue."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "Claims are generally bounded to 'under our benchmark ENAMEL.' Limitations (§D.2) explicitly acknowledge scope: 'This work considers standalone programming problems,' notes the inability to guarantee optimality of references, and identifies Python-only limitation. The title ('How Efficient is LLM-Generated Code?') is broader than the tested setting, but is framed as a question rather than a claim."
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": true,
    136         "justification": "§C.8 tests and rules out the alternative explanation that prompting could fix efficiency (Self-Refine and algorithm-revealing prompts both fail). §C.5 discusses that LLMs have seen LeetCode solutions but not expert solutions. The paper considers that 'naïve algorithms are easier to be generated correctly but are less efficient' (§4.1) as an explanatory factor."
    137       },
    138       "proxy_outcome_distinction": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "The paper measures execution time under level-based evaluation and defines a formal eff@k metric. The measurement (actual code execution time normalized by expert reference) directly measures code efficiency, which is the claimed outcome. No proxy gap exists between measurement and claim."
    142       }
    143     },
    144     "setup_transparency": {
    145       "model_versions_specified": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "§C.1 specifies exact model versions: 'claude-3-opus-20240229', 'claude-3-sonnet-20240229', 'claude-3-haiku-20240307', 'gpt-4-1106-preview', 'gpt-4-0613'. Open-source models are identified by full names (e.g., 'Llama 3 70B Instruct')."
    149       },
    150       "prompts_provided": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "The prompts are the standard HumanEval function signatures and docstrings, which are publicly available. The paper explicitly references HumanEval (Chen et al., 2021) and HumanEval+ (Liu et al., 2023a) as prompt sources. Example prompts are shown (e.g., Fig. 1). The benchmark GitHub repository contains the full prompt set."
    154       },
    155       "hyperparameters_reported": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "§C.1 reports: temperature 0.8, top_p 0.95 for sampling; α=2, R=6, h1=h2=3, h3=4, M0=8, M1=M2=M3=4 for evaluation. Table 10 provides sensitivity analysis for α, h1, h2, h3."
    159       },
    160       "scaffolding_described": {
    161         "applies": false,
    162         "answer": false,
    163         "justification": "No agentic scaffolding is used. The evaluation involves direct code generation from prompts without any multi-turn, tool-use, or agent workflow."
    164       },
    165       "data_preprocessing_documented": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "§3.1 documents problem selection: 164 HumanEval problems → exclude 22 with Θ(1) time complexity → 142 problems. §C.1 describes code generation (re-use from EvalPlus for some models, self-generated for others). The evaluation pipeline (levels 0-3, correctness filtering, efficiency scoring) is documented in §2."
    169       }
    170     },
    171     "limitations_and_scope": {
    172       "limitations_section_present": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "§D 'Concluding Remarks' includes §D.1 'Scalability of Benchmark Development' and §D.2 'Other Limitations & Future Work' as dedicated subsections with substantive discussion of multiple limitations."
    176       },
    177       "threats_to_validity_specific": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "§D.2 identifies specific threats: standalone problems only (not complex software development), inability to guarantee optimality of reference solutions ('the efficiency score can be greater than 1'), no space efficiency evaluation, no advanced prompting techniques, and difficulty of automatic time complexity measurement."
    181       },
    182       "scope_boundaries_stated": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "§D.2 explicitly states: 'This work considers standalone programming problems,' 'this work focuses on benchmarking code efficiency without more advanced prompting techniques,' and notes specific things not tested (space efficiency, multi-file dependencies, automatic complexity measurement)."
    186       }
    187     },
    188     "data_integrity": {
    189       "raw_data_available": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "The benchmark (problems, reference solutions, test generators) is released at the GitHub URL. However, the raw experimental data — generated code samples from 30 LLMs and their per-problem execution times — is not explicitly stated to be released. Partial re-use of EvalPlus code samples is mentioned but full raw data for independent verification of reported numbers is not confirmed."
    193       },
    194       "data_collection_described": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "§C.1 describes code collection: re-use from EvalPlus for some models, self-generation with temperature 0.8, top_p 0.95, 100-200 samples per problem. Evaluation on Google Cloud VMs with specific hardware. §3 describes reference solution and test generator creation by a human expert."
    198       },
    199       "recruitment_methods_described": {
    200         "applies": false,
    201         "answer": false,
    202         "justification": "No human participants. The data sources are standard LLM APIs and the publicly available HumanEval/HumanEval+ benchmark."
    203       },
    204       "data_pipeline_documented": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "The pipeline is fully documented: problem selection (§3.1), reference solution creation (§3.2), test generator curation (§3.3), code generation (§C.1), level-based evaluation with correctness filtering (§2.1), efficiency scoring (§2.2), and metric estimation via Rao-Blackwellization (§2.3, Algorithm 1)."
    208       }
    209     },
    210     "conflicts_of_interest": {
    211       "funding_disclosed": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "The acknowledgments section states: 'This work is supported by NSF (2134079), and NIFA (2020-67021-32799), and IBM-Illinois Discovery Accelerator Institute.'"
    215       },
    216       "affiliations_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "Author affiliations are clearly stated: University of Illinois Urbana-Champaign (Qiu, Tong) and Qualcomm AI Research (Zeng, Ezick, Lott). The footnote notes 'Qualcomm AI Research is an initiative of Qualcomm Technologies, Inc.'"
    220       },
    221       "funder_independent_of_outcome": {
    222         "applies": true,
    223         "answer": true,
    224         "justification": "The funders (NSF, NIFA, IBM-Illinois) are government agencies or an academic institute with no direct financial stake in which LLM ranks highest in code efficiency. The paper evaluates third-party models (GPT-4, Claude 3, Llama 3, etc.) rather than products of the funders."
    225       },
    226       "financial_interests_declared": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No competing interests or financial interests statement is present. Authors from Qualcomm AI Research may have interests related to code efficiency evaluation, but no disclosure statement addresses this."
    230       }
    231     },
    232     "contamination": {
    233       "training_cutoff_stated": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "The paper does not state the training data cutoff dates for any of the 30 evaluated models. This is relevant because HumanEval (published 2021) could be in the training data of post-2021 models."
    237       },
    238       "train_test_overlap_discussed": {
    239         "applies": true,
    240         "answer": true,
    241         "justification": "§C.5 and §D.1 explicitly discuss this: 'LLMs (i) have seen the public solutions on LeetCode (editorials and community solutions) but (ii) have never seen our expert-written efficient solutions.' They acknowledge HumanEval solutions are in training data and argue their expert solutions provide novel reference points."
    242       },
    243       "benchmark_contamination_addressed": {
    244         "applies": true,
    245         "answer": true,
    246         "justification": "§D.1 discusses contamination directly: 'We did not use problems or solutions from online judges because their public solutions are already in LLMs' pretraining corpuses.' They argue that while HumanEval problems may be contaminated for correctness, their novel expert solutions and test generators mitigate contamination for efficiency evaluation."
    247       }
    248     },
    249     "human_studies": {
    250       "pre_registered": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in this study. The paper evaluates LLMs on a code efficiency benchmark."
    254       },
    255       "irb_or_ethics_approval": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants. The 'human expert' who wrote reference solutions and test generators is an author, not a research subject."
    259       },
    260       "demographics_reported": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in this study."
    264       },
    265       "inclusion_exclusion_criteria": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in this study."
    269       },
    270       "randomization_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in this study."
    274       },
    275       "blinding_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in this study."
    279       },
    280       "attrition_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants in this study."
    284       }
    285     },
    286     "cost_and_practicality": {
    287       "inference_cost_reported": {
    288         "applies": true,
    289         "answer": false,
    290         "justification": "No API costs, token consumption, or per-example inference costs are reported. The paper mentions using Google Cloud VMs and A100 GPUs but does not quantify inference costs."
    291       },
    292       "compute_budget_stated": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "The paper mentions hardware (8 NVIDIA A100 80GB GPUs for generation, Google Cloud VMs for evaluation) but does not state total GPU hours, API spend, or overall computational budget."
    296       }
    297     },
    298     "experimental_rigor": {
    299       "seed_sensitivity_reported": {
    300         "applies": true,
    301         "answer": false,
    302         "justification": "No seed sensitivity analysis is reported. While the Rao-Blackwellized estimator reduces variance from sampling, the paper does not report results across multiple random seeds or assess seed sensitivity."
    303       },
    304       "number_of_runs_stated": {
    305         "applies": true,
    306         "answer": true,
    307         "justification": "§C.1 explicitly states: R=6 repeats per test case for execution time estimation, 100-200 code samples per problem depending on model size, and greedy vs sampling decoding modes."
    308       },
    309       "hyperparameter_search_budget": {
    310         "applies": true,
    311         "answer": true,
    312         "justification": "The paper uses fixed default hyperparameters (α=2, h1=3, h2=3, h3=4) without search, and provides sensitivity analysis in Table 10. LLM generation uses standard settings (temperature 0.8, top_p 0.95). No tuning was performed, so no search budget applies."
    313       },
    314       "best_config_selection_justified": {
    315         "applies": true,
    316         "answer": true,
    317         "justification": "All 30 models' results are reported comprehensively in Table 3 under fixed default settings. No cherry-picking — results for all models under both greedy and sampling modes are shown. Hyperparameters are fixed defaults, not selected to favor any outcome."
    318       },
    319       "multiple_comparison_correction": {
    320         "applies": false,
    321         "answer": false,
    322         "justification": "No formal statistical significance tests are performed, so multiple comparison correction is not applicable."
    323       },
    324       "self_comparison_bias_addressed": {
    325         "applies": true,
    326         "answer": false,
    327         "justification": "The authors compare their ENAMEL benchmark against EffiBench and Mercury (Table 9), claiming ENAMEL is 'more challenging,' but do not acknowledge potential bias in this self-comparison. They also evaluate their expert solutions against HumanEval+ canonical solutions without discussing author-evaluation bias."
    328       },
    329       "compute_budget_vs_performance": {
    330         "applies": true,
    331         "answer": false,
    332         "justification": "The paper compares models ranging from 1B to 70B parameters alongside commercial models with undisclosed sizes, without normalizing for compute budget or discussing the compute-performance tradeoff."
    333       },
    334       "benchmark_construct_validity": {
    335         "applies": true,
    336         "answer": true,
    337         "justification": "§2 extensively discusses construct validity: §2.1 explains why level-based evaluation differentiates algorithms, §2.2 addresses right-censored execution time, §C.3 compares eff@k vs speedup metrics. §C.4 demonstrates that their expert test generators correctly identify wrong-but-efficient code (Table 8). §C.5 compares difficulty against other benchmarks."
    338       },
    339       "scaffold_confound_addressed": {
    340         "applies": false,
    341         "answer": false,
    342         "justification": "No scaffolding is involved — direct code generation from prompts without any agentic workflow."
    343       }
    344     },
    345     "data_leakage": {
    346       "temporal_leakage_addressed": {
    347         "applies": true,
    348         "answer": true,
    349         "justification": "§C.5 and §D.1 discuss that HumanEval problems and LeetCode solutions existed before model training. They argue their expert solutions are novel ('have never seen our expert-written efficient solutions'), partially mitigating temporal leakage for efficiency evaluation."
    350       },
    351       "feature_leakage_addressed": {
    352         "applies": true,
    353         "answer": false,
    354         "justification": "The paper does not discuss whether the evaluation setup could leak answer information through the prompts or context. HumanEval docstrings sometimes contain hints about solution approaches, which is not addressed."
    355       },
    356       "non_independence_addressed": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "The paper does not discuss whether the 142 HumanEval problems are independent of each other or whether similar problems in training data could affect results."
    360       },
    361       "leakage_detection_method": {
    362         "applies": true,
    363         "answer": false,
    364         "justification": "No concrete leakage detection methods (canary strings, n-gram overlap, membership inference) are applied. The argument that expert solutions are novel is conceptual rather than empirically verified."
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "LLMs still fall short of generating expert-level efficient code: even the strongest commercial LLM GPT-4 has eff@1=0.454 despite pass@1=0.831.",
    371       "evidence": "Table 3 shows eff@k and pass@k for 30 LLMs. GPT-4 greedy: eff@1=0.454, pass@1=0.831. GPT-4 Turbo greedy: eff@1=0.470, pass@1=0.796. Most LLMs cannot reach eff@1>0.3.",
    372       "supported": "strong"
    373     },
    374     {
    375       "claim": "eff@k is consistently much lower than pass@k across all LLMs, model sizes, and sample sizes k.",
    376       "evidence": "Table 3 systematically shows this gap across all 30 models. For example, Llama 3 70B Instruct: eff@1=0.438 vs pass@1=0.747; Claude 3 Sonnet: eff@1=0.365 vs pass@1=0.677.",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "LLMs struggle in designing advanced algorithms: even ChatGPT has eff@100=0.483 on the algorithm design subset.",
    381       "evidence": "Table 4/6 shows results on 20 hard algorithm design problems. ChatGPT eff@100=0.483, Llama 3 70B eff@100=0.359. All models have low eff@1 on this subset.",
    382       "supported": "moderate"
    383     },
    384     {
    385       "claim": "LLMs are barely aware of implementation optimization: improvement with larger sample sizes is mainly from random sampling generating diverse implementations.",
    386       "evidence": "Table 4/6 on 75 implementation optimization problems. Phind Code Llama V2: eff@1=0.351 but eff@100=0.732. The large improvement from k=1 to k=100 suggests random diversity rather than intentional optimization.",
    387       "supported": "moderate"
    388     },
    389     {
    390       "claim": "Encouraging LLMs to generate 'the most efficient algorithm' via prompting barely enhances code efficiency.",
    391       "evidence": "Table 12 shows Llama 3 70B eff@1 drops from 0.421 to 0.418 with efficiency-encouraging prompt; Mixtral 8x22B increases only from 0.408 to 0.426. Tested on only 2 LLMs with greedy decoding.",
    392       "supported": "moderate"
    393     },
    394     {
    395       "claim": "Many canonical solutions in HumanEval and HumanEval+ are wrong or exceed the time limit under strong test cases.",
    396       "evidence": "Table 2: 11 HumanEval and 4 HumanEval+ canonical solutions found wrong; 34 HumanEval and 27 HumanEval+ exceed time limits. Specific examples in Table 1 and Appendix E.",
    397       "supported": "strong"
    398     },
    399     {
    400       "claim": "The Rao-Blackwellized eff@k estimator is unbiased and achieves lower variance than the vanilla estimator.",
    401       "evidence": "Theorem 1 (§2.3, proof in §B) provides formal proofs of unbiasedness and variance reduction. Table 11 empirically confirms: vanilla std=0.20 vs Rao-Blackwellized std=0.02 for k=1.",
    402       "supported": "strong"
    403     },
    404     {
    405       "claim": "The classic speedup metric overestimates efficiency due to right-censored execution time, producing unreasonable rankings.",
    406       "evidence": "Table 7 shows that under speedup, Mixtral 8x22B and Llama 3 70B appear to outperform GPT-4, while eff@1 gives a more reasonable ranking with GPT-4 Turbo and GPT-4 at top.",
    407       "supported": "strong"
    408     }
    409   ],
    410   "red_flags": [
    411     {
    412       "flag": "Single unnamed human expert",
    413       "detail": "All reference solutions and test case generators were created by a single 'human expert' who is likely one of the authors. There is no inter-rater reliability check, no second expert validation, and the expert's identity is not disclosed. The entire benchmark's quality standard depends on this one person's judgment."
    414     },
    415     {
    416       "flag": "No statistical significance tests",
    417       "detail": "Rankings of 30 LLMs and comparative claims (e.g., 'GPT-4 Turbo has higher eff@1 than GPT-4') are based solely on point estimates without any significance testing. Given the variance in code generation, some ranking differences may not be meaningful."
    418     },
    419     {
    420       "flag": "Uneven evaluation across models",
    421       "detail": "Some models are evaluated only with greedy decoding (GPT-4, GPT-4 Turbo, Claude 3 Opus) while others have sampling results with 100-200 samples. The most expensive commercial models have the least thorough evaluation, making comparisons at higher k impossible."
    422     },
    423     {
    424       "flag": "Narrow domain generalization",
    425       "detail": "All 142 problems are standalone Python functions from HumanEval — a single language, single-function granularity. The title 'How Efficient is LLM-Generated Code?' implies broader applicability than this narrow domain supports. Real-world efficiency involves multi-file, multi-language, system-level optimization."
    426     }
    427   ],
    428   "cited_papers": [
    429     {
    430       "title": "Evaluating large language models trained on code",
    431       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    432       "year": 2021,
    433       "arxiv_id": "2107.03374",
    434       "relevance": "Introduces HumanEval benchmark and pass@k metric, which ENAMEL extends to efficiency evaluation."
    435     },
    436     {
    437       "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation",
    438       "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"],
    439       "year": 2023,
    440       "relevance": "Introduces EvalPlus/HumanEval+ with stronger test cases; ENAMEL re-uses some of their generated code samples and builds on their problem set to evaluate efficiency."
    441     },
    442     {
    443       "title": "EffiBench: Benchmarking the efficiency of automatically generated code",
    444       "authors": ["Dong Huang", "Jie M. Zhang", "Yuhao Qing", "Heming Cui"],
    445       "year": 2024,
    446       "arxiv_id": "2402.02037",
    447       "relevance": "Contemporary code efficiency benchmark that ENAMEL compares against; uses GPT-generated test cases rather than expert-written ones."
    448     },
    449     {
    450       "title": "Mercury: An efficiency benchmark for LLM code synthesis",
    451       "authors": ["Mingzhe Du", "Anh Tuan Luu", "Bin Ji", "See-Kiong Ng"],
    452       "year": 2024,
    453       "arxiv_id": "2402.07844",
    454       "relevance": "Another code efficiency benchmark using LeetCode problems; ENAMEL compares against it and argues for expert-written rather than crowd-sourced solutions."
    455     },
    456     {
    457       "title": "On evaluating the efficiency of source code generated by LLMs",
    458       "authors": ["Changan Niu", "Ting Zhang", "Chuanyi Li", "Bin Luo", "Vincent Ng"],
    459       "year": 2024,
    460       "relevance": "Early work on LLM code efficiency evaluation; ENAMEL addresses its limitations including use of only one code sample and reliance on existing HumanEval test cases."
    461     },
    462     {
    463       "title": "Program synthesis with large language models",
    464       "authors": ["Jacob Austin", "Augustus Odena", "Maxwell Nye"],
    465       "year": 2021,
    466       "arxiv_id": "2108.07732",
    467       "relevance": "Introduces MBPP code generation benchmark; discussed as a correctness-focused benchmark that overlooks efficiency."
    468     },
    469     {
    470       "title": "Measuring coding challenge competence with APPS",
    471       "authors": ["Dan Hendrycks", "Steven Basart", "Saurav Kadavath"],
    472       "year": 2021,
    473       "relevance": "Introduces APPS benchmark for code generation; cited as another correctness-focused evaluation framework."
    474     },
    475     {
    476       "title": "Competition-level code generation with AlphaCode",
    477       "authors": ["Yujia Li", "David Choi", "Junyoung Chung"],
    478       "year": 2022,
    479       "relevance": "DeepMind's competition-level code generation system; relevant as a major code LLM trained on online judge data, raising contamination concerns."
    480     },
    481     {
    482       "title": "Code Llama: Open foundation models for code",
    483       "authors": ["Baptiste Roziere", "Jonas Gehring", "Fabian Gloeckle"],
    484       "year": 2023,
    485       "arxiv_id": "2308.12950",
    486       "relevance": "Major open-source code LLM family evaluated in the benchmark; multiple sizes (7B-70B) provide model scaling analysis."
    487     },
    488     {
    489       "title": "Self-Refine: Iterative refinement with self-feedback",
    490       "authors": ["Aman Madaan", "Niket Tandon", "Prakhar Gupta"],
    491       "year": 2024,
    492       "relevance": "Prompting technique tested in §C.8 for improving code efficiency; shown to be ineffective for generating efficient algorithms."
    493     },
    494     {
    495       "title": "Can large language models write parallel code?",
    496       "authors": ["Daniel Nichols", "Joshua H. Davis", "Zhaojun Xie"],
    497       "year": 2024,
    498       "relevance": "Evaluates the efficiency of LLM-generated parallel code; one of the first works to assess code efficiency rather than correctness alone."
    499     },
    500     {
    501       "title": "DevBench: A comprehensive benchmark for software development",
    502       "authors": ["Bowen Li", "Wenhan Wu", "Ziwei Tang"],
    503       "year": 2024,
    504       "arxiv_id": "2403.08604",
    505       "relevance": "Referenced as a potential extension target for generalizing ENAMEL's methodology to complex multi-file software development."
    506     },
    507     {
    508       "title": "MultiPL-E: A scalable and extensible approach to benchmarking neural code generation",
    509       "authors": ["Federico Cassano", "John Gouwar", "Daniel Nguyen"],
    510       "year": 2022,
    511       "arxiv_id": "2208.08227",
    512       "relevance": "Multilingual code generation benchmark; cited as a correctness-focused framework that does not address efficiency."
    513     }
    514   ],
    515   "engagement_factors": {
    516     "practical_relevance": {
    517       "score": 2,
    518       "justification": "The ENAMEL benchmark is a usable tool for evaluating LLM code efficiency, relevant to LLM developers and benchmark practitioners, but not directly applicable to day-to-day software engineering."
    519     },
    520     "surprise_contrarian": {
    521       "score": 2,
    522       "justification": "The finding that GPT-4 achieves only eff@1=0.454 despite pass@1=0.831 challenges the assumption that high correctness implies high-quality code, revealing a significant blind spot in current evaluation."
    523     },
    524     "fear_safety": {
    525       "score": 0,
    526       "justification": "No safety, security, or risk implications; the paper is about code efficiency evaluation."
    527     },
    528     "drama_conflict": {
    529       "score": 1,
    530       "justification": "Mild 'existing benchmarks are inadequate' angle — 11 HumanEval canonical solutions are wrong and 34 exceed time limits under expert tests — but framed constructively rather than confrontationally."
    531     },
    532     "demo_ability": {
    533       "score": 2,
    534       "justification": "The benchmark is publicly available on GitHub (github.com/q-rz/enamel), though running the full evaluation requires significant compute resources."
    535     },
    536     "brand_recognition": {
    537       "score": 2,
    538       "justification": "Published at ICLR 2025, evaluates well-known models (GPT-4, Claude 3, Llama 3), with Qualcomm AI Research and UIUC affiliations."
    539     }
    540   }
    541 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs