scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30438B)
      1 {
      2   "paper": {
      3     "title": "EFFI-LEARNER: Enhancing Efficiency of Generated Code via Self-Optimization",
      4     "authors": [
      5       "Dong Huang",
      6       "Jianbo Dai",
      7       "Han Weng",
      8       "Puzhen Wu",
      9       "Yuhao Qing",
     10       "Heming Cui",
     11       "Zhijiang Guo",
     12       "Jie M. Zhang"
     13     ],
     14     "year": 2024,
     15     "venue": "Neural Information Processing Systems",
     16     "arxiv_id": "2405.15189"
     17   },
     18   "scan_version": 2,
     19   "active_modules": ["experimental_rigor", "data_leakage"],
     20   "methodology_tags": ["benchmark-eval"],
     21   "key_findings": "EFFI-LEARNER improves LLM-generated code efficiency by feeding execution time and memory usage profiles back to the LLM for iterative self-optimization. Across 22 LLMs on EffiBench, HumanEval, and MBPP, the approach achieves substantial efficiency gains (e.g., 87.1% execution time reduction for StarCoder2-15B on EffiBench) with only 0-0.5% pass@1 degradation. The overhead profile feedback is critical — without it, naive self-refinement often makes code worse (up to 523% TMU increase for CodeLlama-70B). Most efficiency gains occur in the first 1-2 optimization steps, with diminishing returns thereafter.",
     22   "checklist": {
     23     "artifacts": {
     24       "code_released": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The abstract states 'The source code of EFFI-LEARNER was released in https://github.com/huangd1999/EffiLearner' and the NeurIPS checklist confirms code was uploaded in supplementary files."
     28       },
     29       "data_released": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "The paper uses publicly available benchmarks: EffiBench, HumanEval, MBPP, and EvalPlus (HumanEval-Plus, MBPP-Plus). These are all standard public datasets."
     33       },
     34       "environment_specified": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "Section 4.2 specifies hardware (Intel Xeon Platinum 8336C CPU, 8× NVIDIA A100-SXM GPUs, 2.0TiB memory) but does not provide software dependency specifications (requirements.txt, library versions, Python version) in the paper itself."
     38       },
     39       "reproduction_instructions": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The paper provides a GitHub link and describes the framework pipeline (Section 3), but does not include step-by-step reproduction instructions with specific commands to replicate the main experiments."
     43       }
     44     },
     45     "statistical_methodology": {
     46       "confidence_intervals_or_error_bars": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "All tables (Tables 1-9) report only point estimates with percentage improvements. No confidence intervals or error bars are provided for any metric."
     50       },
     51       "significance_tests": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "The paper repeatedly claims EFFI-LEARNER 'significantly' improves efficiency (e.g., Section 4.3) but no statistical significance tests are used. Comparisons are based solely on raw number differences."
     55       },
     56       "effect_sizes_reported": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Tables report absolute values before and after optimization with percentage reductions (e.g., 'ET decreases from 0.93 (s) to 0.12 (s) which reduces 87.1% execution time'). Normalized metrics (NET, NMU, NTMU) provide relative comparison to canonical solutions."
     60       },
     61       "sample_size_justified": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "The number of evaluation problems is determined by the benchmark sizes (EffiBench, HumanEval, MBPP) without any justification for whether these sample sizes are sufficient for the claims made."
     65       },
     66       "variance_reported": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "No variance, standard deviation, or spread measures are reported. The NeurIPS checklist (Q7) claims greedy decoding makes results 'consistent,' but execution time and memory measurements are inherently variable across runs and no measurement variance is reported."
     70       }
     71     },
     72     "evaluation_design": {
     73       "baselines_included": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Table 3 compares against Unsupervised Self-Refine, Result-Aware Self-Refine, Memory Profiler only, and Execution Time Profiler only. Table 7 compares with Self-Edit, Critic, PIE, and Supersonic."
     77       },
     78       "baselines_contemporary": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Baselines include Self-Refine (NeurIPS 2023), Reflexion (NeurIPS 2023), PIE (ICLR 2024), Supersonic, Self-Edit (ACL 2023), and CRITIC (2023). These are recent and relevant approaches."
     82       },
     83       "ablation_study": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Table 3 ablates the contribution of each profiler component (memory profiler only, execution time profiler only, combined). Table 2 ablates the number of self-optimization steps (0-5)."
     87       },
     88       "multiple_metrics": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Six metrics are used: Execution Time (ET), Normalized ET (NET), Max Memory Usage (MU), Normalized MU (NMU), Total Memory Usage (TMU), and Normalized TMU (NTMU), defined in Appendix A.5."
     92       },
     93       "human_evaluation": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "All evaluation is fully automated through execution time profiling and memory profiling. No human evaluation of code quality, readability, or optimization strategies is performed."
     97       },
     98       "held_out_test_set": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Section 4.1: 'we utilize the open test cases to calculate the efficiency metrics during the self-optimization process, while private test cases provided by EffiBench were used for the final result evaluation.' Similarly, EvalPlus private tests are used for HumanEval/MBPP."
    102       },
    103       "per_category_breakdown": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Results are broken down per model (22 models in Tables 1, 5, 8, 9), per benchmark (EffiBench, HumanEval, MBPP), and per optimization step (Table 2). Individual case studies are also provided."
    107       },
    108       "failure_cases_discussed": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section 4.6 'Error Analysis' discusses a case (FindMedianSortedArrays) where optimization provides minimal improvement due to already-optimal O(log(min(m,n))) complexity, with detailed code examples in Appendix Figures 12-18."
    112       },
    113       "negative_results_reported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Table 1 shows StarCoder2-15B MU increases by 5.0% after optimization. Table 3 shows Unsupervised Self-Refine and Result-Aware Self-Refine dramatically worsen performance (e.g., 518.8% TMU increase). Table 6 shows pass@1 decreases. GPT-4 MU and TMU improvements are modest."
    117       }
    118     },
    119     "claims_and_evidence": {
    120       "abstract_claims_supported": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "Abstract claims about StarCoder2-15B (87.1% ET reduction from 0.93s to 0.12s, 90.8% TMU reduction from 22.02 to 2.03 Mb*s) and DeepSeek-6.7B-Ins (85.8% MU reduction) are exactly matched by Table 1."
    124       },
    125       "causal_claims_justified": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "The causal claim that overhead profiles cause efficiency improvement is supported by the ablation in Table 3, which compares EFFI-LEARNER against controlled variants (no feedback, result-only feedback, single-profiler feedback). This is adequate controlled single-variable manipulation."
    129       },
    130       "generalization_bounded": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The title ('Enhancing Efficiency of Generated Code') and abstract make broad claims about 'LLM-generated code' without bounding to Python. All experiments are Python-only. The Limitations section (Appendix A.1) acknowledges this, but the title and abstract do not."
    134       },
    135       "alternative_explanations_discussed": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "No alternative explanations are discussed for why EFFI-LEARNER works. For example, the LLMs may be recalling memorized efficient algorithms rather than using the profiling feedback, or the open test cases may be biasing optimization toward specific inputs."
    139       },
    140       "proxy_outcome_distinction": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "The paper measures execution time and memory usage and claims improvements in execution time and memory usage. The claims match the granularity of measurements — no broader framing gap exists (they don't claim 'better code quality' or 'improved software engineering')."
    144       }
    145     },
    146     "setup_transparency": {
    147       "model_versions_specified": {
    148         "applies": true,
    149         "answer": false,
    150         "justification": "Some models are versioned (GPT-3.5-Turbo-0301), but others use marketing names without versions: 'GPT-4' (no snapshot date), 'Claude-3-Sonnet' (no snapshot), 'GPT-4-Turbo' (no date). Section 4.2 says 'detailed versions are demonstrated in supplementary file' but the paper itself lacks this for key models."
    151       },
    152       "prompts_provided": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "Figure 3 and Appendix A.3 provide the full prompt template used in the self-optimization stage. The fill values (task description, test case, code, overhead analysis) are programmatically determined from known benchmarks and profiling output. Complete worked examples are shown in Appendix Figures 4-18."
    156       },
    157       "hyperparameters_reported": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "The NeurIPS checklist (Q7) mentions 'greedy-decoding strategy' but the main paper does not specify temperature, top-p, max tokens, or other generation parameters. No hyperparameter table is provided."
    161       },
    162       "scaffolding_described": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "The EFFI-LEARNER pipeline is described in detail in Section 3 with Figure 2: Code Generation → Overhead Profiling (line_profiler for time, memory_profiler for memory) → Code Refinement. The iterative loop, profiling tools, and feedback mechanism are clearly documented."
    166       },
    167       "data_preprocessing_documented": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Section 4.2: 'We first collect the generated code from each LLM and evaluate its correctness using open test cases. Only the code that passes all test cases is considered for efficiency evaluation.' The filtering pipeline from generation to evaluation is clearly documented."
    171       }
    172     },
    173     "limitations_and_scope": {
    174       "limitations_section_present": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Appendix A.1 contains a dedicated 'Limitations' section discussing three specific limitations: time-consuming multi-iteration process, increased token consumption, and Python-only evaluation."
    178       },
    179       "threats_to_validity_specific": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Appendix A.1 discusses specific threats: (1) the multi-iteration process is time-consuming for complex tasks, (2) overhead profiles consume additional tokens, (3) effectiveness is evaluated only on Python, so 'performance in different programming languages or environments may vary.'"
    183       },
    184       "scope_boundaries_stated": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "Appendix A.1 explicitly states: 'the effectiveness of EffiLearner has been primarily evaluated on Python. Therefore, its performance in different programming languages or environments may vary, underscoring the need for further testing and validation.'"
    188       }
    189     },
    190     "data_integrity": {
    191       "raw_data_available": {
    192         "applies": true,
    193         "answer": false,
    194         "justification": "Only aggregated results (averages across benchmarks) are reported in tables. Individual per-problem efficiency measurements, profiling outputs, and generated code are not released for independent verification."
    195       },
    196       "data_collection_described": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Section 4.1-4.2 describes the data collection: benchmarks used (EffiBench, HumanEval, MBPP), how code is generated (each LLM with greedy decoding), how profiling is performed (line_profiler, memory_profiler on open test cases), and how correctness is validated."
    200       },
    201       "recruitment_methods_described": {
    202         "applies": false,
    203         "answer": false,
    204         "justification": "No human participants. The study uses standard public benchmarks (EffiBench, HumanEval, MBPP) as data sources."
    205       },
    206       "data_pipeline_documented": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "The pipeline is documented: LLM generates code → correctness check on open test cases → profiling with line_profiler and memory_profiler → profile-guided refinement → repeat up to 5 iterations → final evaluation on private test cases. Filtering criteria are stated (only correct code evaluated)."
    210       }
    211     },
    212     "conflicts_of_interest": {
    213       "funding_disclosed": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "Section 6 discloses funding: National Key R&D Program of China (2022ZD0160201), HK RGC RIF (R7030-22), HK ITF (GHP/169/20SZ), Huawei Flagship Research Grant 2023, HK RGC GRF (17208223 & 17204424), and HKU-CAS Joint Laboratory."
    217       },
    218       "affiliations_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "Author affiliations are clearly listed: University of Hong Kong, University of Edinburgh, Beijing University of Posts and Telecommunications, University College Dublin, University of Cambridge, King's College London, Shanghai AI Laboratory."
    222       },
    223       "funder_independent_of_outcome": {
    224         "applies": true,
    225         "answer": true,
    226         "justification": "Funders include government grants and Huawei. The paper evaluates third-party models (GPT-4, Claude, StarCoder, CodeLlama, etc.) — none are Huawei products. The funders have no direct financial stake in which models perform better with EFFI-LEARNER."
    227       },
    228       "financial_interests_declared": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No competing interests statement or financial interest disclosure is present in the paper. The Huawei funding is acknowledged but there is no explicit declaration that authors have no competing interests."
    232       }
    233     },
    234     "contamination": {
    235       "training_cutoff_stated": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "No training data cutoff dates are stated for any of the 22 evaluated models. This is important since HumanEval (2021) and MBPP (2021) could be in the training data of newer models."
    239       },
    240       "train_test_overlap_discussed": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "No discussion of whether benchmark problems appeared in any model's training data. HumanEval and MBPP are widely known public benchmarks that could have been memorized by models trained after 2021."
    244       },
    245       "benchmark_contamination_addressed": {
    246         "applies": true,
    247         "answer": false,
    248         "justification": "HumanEval (published 2021), MBPP (published 2021), and EffiBench (published 2024) are public benchmarks. Models trained after these dates may have seen the problems. No contamination analysis is performed."
    249       }
    250     },
    251     "human_studies": {
    252       "pre_registered": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this study. It is a benchmark evaluation of LLM code generation efficiency."
    256       },
    257       "irb_or_ethics_approval": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants. The study evaluates LLMs on public coding benchmarks."
    261       },
    262       "demographics_reported": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants."
    266       },
    267       "inclusion_exclusion_criteria": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants."
    271       },
    272       "randomization_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants."
    276       },
    277       "blinding_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants."
    281       },
    282       "attrition_reported": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants."
    286       }
    287     },
    288     "cost_and_practicality": {
    289       "inference_cost_reported": {
    290         "applies": true,
    291         "answer": false,
    292         "justification": "The approach requires up to 5 iterative LLM calls per problem plus profiling, but no inference costs (API spend, tokens consumed, wall-clock time per problem) are reported. The limitations section acknowledges 'overhead profiles consume more tokens' but does not quantify it."
    293       },
    294       "compute_budget_stated": {
    295         "applies": true,
    296         "answer": false,
    297         "justification": "Section 4.2 describes the hardware (8× A100 GPUs, 128-core CPU) but does not state total GPU hours, API costs, or training/inference time for the full experimental campaign across 22 models and 3 benchmarks."
    298       }
    299     },
    300     "experimental_rigor": {
    301       "seed_sensitivity_reported": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "No seed sensitivity analysis is reported. The NeurIPS checklist claims greedy decoding makes results 'consistent,' but execution time and memory profiling measurements vary across runs, and no sensitivity analysis is performed."
    305       },
    306       "number_of_runs_stated": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "The number of profiling runs per problem is not stated. It is unclear whether efficiency metrics are from a single profiling run or averaged across multiple runs."
    310       },
    311       "hyperparameter_search_budget": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "No hyperparameter search budget is reported for the prompt design, number of iterations, or profiling configuration. The iteration count (5) is explored in ablation but the prompt design process is undocumented."
    315       },
    316       "best_config_selection_justified": {
    317         "applies": true,
    318         "answer": true,
    319         "justification": "Table 2 reports results at all optimization steps (0-5) rather than cherry-picking the best. The 5-iteration configuration is used consistently across all models, not selected per-model."
    320       },
    321       "multiple_comparison_correction": {
    322         "applies": false,
    323         "answer": false,
    324         "justification": "No statistical tests are performed, so multiple comparison correction is not applicable. The paper makes comparisons via raw number differences only."
    325       },
    326       "self_comparison_bias_addressed": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "The authors implement their own baselines (Unsupervised Self-Refine, Result-Aware Self-Refine) and compare against their own system without acknowledging author-evaluation bias. No independent evaluation is performed."
    330       },
    331       "compute_budget_vs_performance": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "EFFI-LEARNER requires up to 5 additional LLM calls plus profiling per problem, substantially more compute than the baselines. This compute overhead is acknowledged qualitatively in limitations but never quantified or compared against the efficiency gains."
    335       },
    336       "benchmark_construct_validity": {
    337         "applies": true,
    338         "answer": false,
    339         "justification": "No discussion of whether EffiBench, HumanEval, or MBPP adequately measure real-world code efficiency. The benchmarks consist of small algorithmic problems — whether efficiency gains transfer to real-world codebases is not addressed."
    340       },
    341       "scaffold_confound_addressed": {
    342         "applies": true,
    343         "answer": true,
    344         "justification": "The EFFI-LEARNER scaffold is applied uniformly across all models using the same prompt template (Figure 3), profiling tools, and iteration count. Cross-model comparisons use the same framework, controlling for scaffold effects."
    345       }
    346     },
    347     "data_leakage": {
    348       "temporal_leakage_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "HumanEval (2021) and MBPP (2021) were published years before most evaluated models were trained. Models may have memorized efficient solutions to these problems. No temporal leakage analysis is provided."
    352       },
    353       "feature_leakage_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "During self-optimization, the LLM receives detailed per-line profiling data for specific test cases. This could lead to optimization overfitting to those specific inputs rather than general efficiency. This form of information leakage is not discussed."
    357       },
    358       "non_independence_addressed": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "No discussion of whether the open test cases used for optimization and the private test cases used for evaluation share structural similarities that could inflate results."
    362       },
    363       "leakage_detection_method": {
    364         "applies": true,
    365         "answer": false,
    366         "justification": "No concrete leakage detection or prevention methods are employed. No canary strings, membership inference, n-gram overlap analysis, or temporal splits are used."
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "EFFI-LEARNER significantly improves LLM-generated code efficiency across diverse models, achieving up to 87.1% execution time reduction (StarCoder2-15B) and 90.8% total memory usage reduction.",
    373       "evidence": "Table 1 shows efficiency metrics before and after EFFI-LEARNER for 6 representative models on EffiBench. StarCoder2-15B ET: 0.93→0.12s, TMU: 22.02→2.03 Mb*s. Tables 5, 8, 9 extend to additional models and benchmarks.",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "Overhead profile feedback is essential for effective code optimization; without it, naive self-refinement often degrades efficiency.",
    378       "evidence": "Table 3: For CodeLlama-70B, Unsupervised Self-Refine increases TMU by 518.8% and Result-Aware Self-Refine increases TMU by 523.2%, while EFFI-LEARNER reduces TMU by 92.9%. Similar pattern for GPT-3.5-Turbo-0301.",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "Most efficiency improvement occurs in the first self-optimization step, with diminishing returns in subsequent steps.",
    383       "evidence": "Table 2: For CodeLlama-70B, MU drops 75.9% after step 1 and stays flat through step 5. For GPT-3.5-Turbo-0301, TMU drops 91.3% after step 1, reaching 92.1% by step 5.",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "EFFI-LEARNER generalizes across benchmarks (EffiBench, HumanEval, MBPP) and diverse model families (16 open-source, 6 closed-source).",
    388       "evidence": "Tables 1, 4, 5, 8, 9 show efficiency improvements across all three benchmarks and 22 models. Improvements vary in magnitude but are consistently positive for most model-benchmark combinations.",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "EFFI-LEARNER's impact on code correctness is minimal, with pass@1 decreasing by only 0-0.5%.",
    393       "evidence": "Table 6 shows pass@1 before and after optimization on EffiBench for 16 open-source models. Maximum decrease is 0.5% (OpenCodeInterpreter-DS-1.3B: 5.8%→5.4%). Some models show no change (CodeLlama-7b: 7.0%→7.0%).",
    394       "supported": "moderate"
    395     }
    396   ],
    397   "red_flags": [
    398     {
    399       "flag": "No error bars or uncertainty quantification",
    400       "detail": "Efficiency measurements (execution time, memory usage) are inherently variable across runs, yet all results are reported as single point estimates with no variance, confidence intervals, or standard deviations. The justification that greedy decoding makes results 'consistent' applies only to code generation, not to runtime profiling."
    401     },
    402     {
    403       "flag": "Benchmark contamination unaddressed",
    404       "detail": "HumanEval and MBPP were published in 2021. Models trained after 2021 may have memorized efficient solutions to these problems. If models have seen canonical solutions, the 'self-optimization' may simply be recalling memorized efficient implementations rather than genuinely optimizing from profiles."
    405     },
    406     {
    407       "flag": "Selective reporting emphasis in abstract",
    408       "detail": "The abstract highlights StarCoder2-15B's 87.1% ET reduction — the single most dramatic result. Other models show much more modest improvements (e.g., GPT-4 ET only decreases 9.7%, and GPT-4 TMU only decreases 37.8%). The abstract's numbers represent best-case, not typical performance."
    409     },
    410     {
    411       "flag": "No statistical tests despite 'significant' claims",
    412       "detail": "The word 'significantly' is used repeatedly (e.g., 'EFFI-LEARNER significantly enhances the efficiency') but no statistical significance tests are performed. All comparisons are raw number differences across 22 models without any formal testing."
    413     },
    414     {
    415       "flag": "Inference cost of iterative approach not quantified",
    416       "detail": "EFFI-LEARNER calls the LLM up to 5 additional times per problem plus local profiling. For expensive closed-source models (GPT-4), this could multiply inference costs 6x. The paper acknowledges token overhead in limitations but never quantifies the cost-efficiency tradeoff."
    417     }
    418   ],
    419   "cited_papers": [
    420     {
    421       "title": "Evaluating large language models trained on code",
    422       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    423       "year": 2021,
    424       "arxiv_id": "2107.03374",
    425       "relevance": "Introduces the HumanEval benchmark and Codex, foundational for LLM code generation evaluation."
    426     },
    427     {
    428       "title": "Program synthesis with large language models",
    429       "authors": ["Jacob Austin", "Augustus Odena", "Maxwell Nye"],
    430       "year": 2021,
    431       "arxiv_id": "2108.07732",
    432       "relevance": "Introduces the MBPP benchmark used to evaluate LLM code generation capabilities."
    433     },
    434     {
    435       "title": "EffiBench: Benchmarking the efficiency of automatically generated code",
    436       "authors": ["Dong Huang", "Jie M. Zhang", "Yuhao Qing", "Heming Cui"],
    437       "year": 2024,
    438       "arxiv_id": "2402.02037",
    439       "relevance": "Primary benchmark for evaluating code efficiency, with defined metrics (ET, NET, MU, NMU, TMU, NTMU) used throughout this paper."
    440     },
    441     {
    442       "title": "Self-Refine: Iterative Refinement with Self-Feedback",
    443       "authors": ["Aman Madaan", "Niket Tandon", "Prakhar Gupta"],
    444       "year": 2023,
    445       "relevance": "Key baseline for self-refinement approaches; demonstrates LLMs can improve outputs through iterative feedback loops."
    446     },
    447     {
    448       "title": "Teaching large language models to self-debug",
    449       "authors": ["Xinyun Chen", "Maxwell Lin", "Nathanael Schärli", "Denny Zhou"],
    450       "year": 2023,
    451       "arxiv_id": "2304.05128",
    452       "relevance": "Explores using program execution feedback to help LLMs debug code, related approach to EFFI-LEARNER's profile-guided optimization."
    453     },
    454     {
    455       "title": "Reflexion: Language agents with verbal reinforcement learning",
    456       "authors": ["Noah Shinn", "Federico Cassano", "Ashwin Gopinath"],
    457       "year": 2023,
    458       "relevance": "Demonstrates verbal self-reflection as feedback for improving LLM agent performance, conceptually related to EFFI-LEARNER's iterative self-optimization."
    459     },
    460     {
    461       "title": "Is your code generated by chatgpt really correct? Rigorous evaluation of large language models for code generation",
    462       "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"],
    463       "year": 2024,
    464       "relevance": "EvalPlus benchmark providing extended test cases for HumanEval and MBPP, used as private test sets in this paper's evaluation."
    465     },
    466     {
    467       "title": "Code Llama: Open foundation models for code",
    468       "authors": ["Baptiste Rozière", "Jonas Gehring", "Fabian Gloeckle"],
    469       "year": 2023,
    470       "arxiv_id": "2308.12950",
    471       "relevance": "Open-source code LLM family (7B-70B) extensively evaluated in this paper as both open-source baseline and optimization target."
    472     },
    473     {
    474       "title": "StarCoder: May the source be with you!",
    475       "authors": ["Raymond Li", "Loubna Ben Allal", "Yangtian Zi"],
    476       "year": 2023,
    477       "arxiv_id": "2305.06161",
    478       "relevance": "Open-source code LLM trained on The Stack; its successor, StarCoder2-15B, produces EFFI-LEARNER's headline result (87.1% ET reduction on EffiBench)."
    479     },
    480     {
    481       "title": "Learning Performance-Improving Code Edits",
    482       "authors": ["Alexander Shypula", "Aman Madaan", "Yimeng Zeng"],
    483       "year": 2024,
    484       "relevance": "PIE approach for learning code optimizations from performance-improving edits; used as a baseline in Table 7."
    485     },
    486     {
    487       "title": "ECCO: Can we improve model-generated code efficiency without sacrificing functional correctness?",
    488       "authors": ["Siddhant Waghjale", "Vishruth Veerendranath", "Zora Zhiruo Wang", "Daniel Fried"],
    489       "year": 2024,
    490       "arxiv_id": "2407.14044",
    491       "relevance": "Directly related work on improving LLM code efficiency while maintaining correctness."
    492     },
    493     {
    494       "title": "Evaluating language models for efficient code generation",
    495       "authors": ["Jiawei Liu", "Songrun Xie", "Junhao Wang"],
    496       "year": 2024,
    497       "arxiv_id": "2408.06450",
    498       "relevance": "Evaluates LLM code efficiency, finding generated code is less efficient than human solutions."
    499     }
    500   ]
    501 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs