scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28046B)
      1 {
      2   "paper": {
      3     "title": "Evaluating Language Models for Efficient Code Generation",
      4     "authors": [
      5       "Jiawei Liu",
      6       "Songrun Xie",
      7       "Junhao Wang",
      8       "Yuxiang Wei",
      9       "Yifeng Ding",
     10       "Lingming Zhang"
     11     ],
     12     "year": 2024,
     13     "venue": "COLM 2024",
     14     "arxiv_id": "2408.06450",
     15     "doi": "10.48550/arXiv.2408.06450"
     16   },
     17   "scan_version": 2,
     18   "active_modules": ["experimental_rigor", "data_leakage"],
     19   "methodology_tags": ["benchmark-eval"],
     20   "key_findings": "The paper introduces DPE (Differential Performance Evaluation) and creates EVALPERF, a benchmark of 121 performance-exercising coding tasks for evaluating LLM code efficiency. Key findings include: instruction tuning improves both code correctness and efficiency (e.g., 19% DPS improvement for DeepSeekCoder-6.7B), the scaling law does not reliably hold for code efficiency (performance degradation in 4/12 size comparisons), performance-encouraging prompts do not consistently improve efficiency, and EVALPERF achieves highly consistent cross-platform evaluation (CV ≤ 0.4%) via hardware performance counters.",
     21   "checklist": {
     22     "artifacts": {
     23       "code_released": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "The paper states in §1 contribution 3: 'we also fully open-source and maintain the data curation pipeline and evaluator at github.com/evalplus/evalplus as part of EvalPlus.'"
     27       },
     28       "data_released": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "EVALPERF with 121 tasks, test inputs, and reference solutions is released as part of the EvalPlus open-source project at github.com/evalplus/evalplus."
     32       },
     33       "environment_specified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "While hardware configurations are listed in Table 2 (CPUs, RAM), no software environment specifications (requirements.txt, Python version, library versions, Dockerfile) are provided in the paper."
     37       },
     38       "reproduction_instructions": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No step-by-step reproduction instructions are provided in the paper. The open-source repository is mentioned but no specific commands or reproduction guide is included in the paper text."
     42       }
     43     },
     44     "statistical_methodology": {
     45       "confidence_intervals_or_error_bars": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "Results in Table 3 report Avg/Max/Min across samples, and Table 2 reports CV across platforms, but no confidence intervals or error bars are provided for main results."
     49       },
     50       "significance_tests": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "The paper makes numerous comparative claims (e.g., 'instruction-tuned models also tends to be more efficient') based solely on comparing DPS numbers without any statistical significance tests."
     54       },
     55       "effect_sizes_reported": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "The paper reports magnitude of effects with context, e.g., 'instruction-tuned DeepSeekCoder-6.7B improves the base model by 19% regarding DPS' (§4.1), 'relative improvement of 4.8×' for SAS vs EvalPlus (§4.2)."
     59       },
     60       "sample_size_justified": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No justification for the choice of 121 tasks, 50 samples per model, or 10 samples for GPT-4. The 10-sample limit for GPT-4 is attributed to 'cost mitigation' without justifying adequacy."
     64       },
     65       "variance_reported": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Table 3 reports Avg, Max, and Min DPS/DPSnorm across the first 10 correct samples per task, providing a range measure. Table 2 reports Coefficient of Variation across platforms."
     69       }
     70     },
     71     "evaluation_design": {
     72       "baselines_included": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Multiple model families are compared (CodeLlama, DeepSeekCoder, StarCoder, StarCoder2, GPT-4 Turbo). SAS is compared against EvalPlus as a baseline in Table 1."
     76       },
     77       "baselines_contemporary": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Models evaluated include GPT-4 Turbo, StarCoder2, DeepSeekCoder, CodeQwen1.5, and CodeLlama, all contemporary models at the time of writing."
     81       },
     82       "ablation_study": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "The evaluation systematically studies the impact of individual factors: instruction tuning (base vs instruct), prompting strategies (instruct vs perf-instruct vs perf-CoT), and model sizes (within families), each controlling other variables."
     86       },
     87       "multiple_metrics": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Three metrics are used: DPS (Differential Performance Score), DPSnorm (normalized DPS), and pass@1 for correctness. Results are reported for all three in Table 3."
     91       },
     92       "human_evaluation": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "All evaluation is fully automated via test execution and hardware performance counters. No human evaluation of code quality or efficiency is included."
     96       },
     97       "held_out_test_set": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "EVALPERF tasks and generated test inputs are independent of the models being evaluated. Pairwise efficiency comparison is done over the common set of passing solutions to control for correctness differences."
    101       },
    102       "per_category_breakdown": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Results are broken down per model, per variant (base/instruct/perf-instruct/perf-CoT), per model size in Figures 4-5 and Table 3. Pairwise comparison matrices are provided."
    106       },
    107       "failure_cases_discussed": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The paper discusses cases where the scaling law fails for efficiency (§4.1, Figure 5, e.g., 6% degradation from StarCoder2 3B to 7B), and where performance-encouraging prompts degrade correctness (Table 3)."
    111       },
    112       "negative_results_reported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Several negative findings are reported: 'performance-encouraging prompts neither consistently nor noticeably improve the code efficiency' (§4.1), scaling law doesn't hold for efficiency (4 of 12 pairs show degradation), and perf-prompts 'commonly lead to correctness degradation' (§4.1/Table 3)."
    116       }
    117     },
    118     "claims_and_evidence": {
    119       "abstract_claims_supported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "Abstract claims about DPE framework, instruction tuning benefiting efficiency, scaling law not holding for efficiency, and 4.8× improvement over prior art inputs are all supported by results in §4.1 and §4.2."
    123       },
    124       "causal_claims_justified": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "Claims like 'instruction tuning benefits efficiency' and 'model size does not reliably improve efficiency' are supported by controlled comparisons within model families where only the variable of interest differs (e.g., same model, base vs instruct; same family, different sizes)."
    128       },
    129       "generalization_bounded": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The title claims 'Evaluating Language Models for Efficient Code Generation' broadly, but experiments are limited to Python tasks from HumanEval+ and MBPP+. The abstract and conclusion make general claims about 'code generation' without bounding to Python or to algorithmic-style programming problems."
    133       },
    134       "alternative_explanations_discussed": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "No alternative explanations are discussed for the key findings. For example, the instruction tuning efficiency benefit could stem from training data quality rather than the tuning process itself, and the scaling law failure could have multiple explanations, none of which are explored."
    138       },
    139       "proxy_outcome_distinction": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "The paper explicitly distinguishes between their proxy metric (hardware instruction count / DPS) and actual runtime efficiency. §2 and Appendix A.2 discuss limitations of runtime measurement and why DPS provides a more meaningful compound metric than raw speedup, carefully defining what DPS measures."
    143       }
    144     },
    145     "setup_transparency": {
    146       "model_versions_specified": {
    147         "applies": true,
    148         "answer": true,
    149         "justification": "Specific model versions are given: 'gpt-4-0125-preview' for GPT-4 Turbo, 'AWQ-quantized DeepSeekCoder-instruct-33B' for input generation, and specific model family sizes (CodeLlama-7B/13B/34B/70B, DeepSeekCoder-1.3B/6.7B/33B, etc.)."
    150       },
    151       "prompts_provided": {
    152         "applies": true,
    153         "answer": true,
    154         "justification": "The SAS prompt is shown in Figure 2 with actual text and few-shot examples. Evaluation prompts are described with actual text: perf-instruct uses 'solve the programming task efficiently by writing a fast implementation', perf-CoT adds 'Think step by step'. Base prompts come from public HumanEval/MBPP."
    155       },
    156       "hyperparameters_reported": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Temperature (0.8 for sampling candidate solutions during benchmark curation, 0.2 for model evaluation), number of samples (50, 10 for GPT-4), scale factor (starting at 2^1, exponential), time wall (20s), memory wall (16GB), clustering parameters (bias=20%, w=10k, K=4), and tthresh=10k instructions are all reported in §3."
    160       },
    161       "scaffolding_described": {
    162         "applies": false,
    163         "answer": false,
    164         "justification": "No agentic scaffolding is used. The evaluation involves direct code generation from LLMs with prompts, with no multi-step agents, tools, or feedback loops."
    165       },
    166       "data_preprocessing_documented": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "The pipeline is documented in detail: solution curation from 21 LLMs (§2.1), input generator synthesis via SAS (§2.2), filtering criteria with counts (Table 1: 563 → 342 → 271 → 121), and clustering methodology (§2.4, Figure 3)."
    170       }
    171     },
    172     "limitations_and_scope": {
    173       "limitations_section_present": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "No dedicated limitations or threats-to-validity section exists. The paper has sections for Introduction, DPE, EVALPERF, Evaluation, Related Work, Conclusion, and Appendix, with no explicit limitations discussion."
    177       },
    178       "threats_to_validity_specific": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "No specific threats to validity are discussed anywhere in the paper."
    182       },
    183       "scope_boundaries_stated": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No explicit scope boundaries are stated. The paper does not discuss what the results do NOT show, what settings were excluded, or what claims the authors are not making."
    187       }
    188     },
    189     "data_integrity": {
    190       "raw_data_available": {
    191         "applies": true,
    192         "answer": true,
    193         "justification": "The benchmark data (tasks, test inputs, reference solutions, performance clusters) is released as open source at github.com/evalplus/evalplus, enabling independent verification."
    194       },
    195       "data_collection_described": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Solution collection is described in §2.1 and §3: 'we sample and test code solutions from 21 open LLMs that achieve over a pass@1 score of 50 on the EvalPlus leaderboard, where we sample 50 solutions for each model at a temperature of 0.8.'"
    199       },
    200       "recruitment_methods_described": {
    201         "applies": false,
    202         "answer": false,
    203         "justification": "No human participants. Data sources are LLM-generated code on standard public benchmarks (HumanEval+ and MBPP+)."
    204       },
    205       "data_pipeline_documented": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "The full pipeline is documented across §§2.1-2.5 and §3: valid solution curation → SAS input generation → exponential input sampling → task selection with filtering criteria → adaptive clustering. Table 1 shows counts at each filtering stage."
    209       }
    210     },
    211     "conflicts_of_interest": {
    212       "funding_disclosed": {
    213         "applies": true,
    214         "answer": true,
    215         "justification": "Acknowledgment section states: 'This work was partially supported by NSF grant CCF-2131943 and Kwai Inc, as well as API credits from the OpenAI Researcher Access Program.'"
    216       },
    217       "affiliations_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "Author affiliations are listed: University of Illinois Urbana-Champaign and Tongji University. No undisclosed industry affiliations are apparent."
    221       },
    222       "funder_independent_of_outcome": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "OpenAI provided API credits through the Researcher Access Program, and the paper evaluates GPT-4 Turbo (an OpenAI product), which achieves the best DPS in the evaluation. OpenAI has a financial interest in GPT-4 performing well."
    226       },
    227       "financial_interests_declared": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No competing interests or financial disclosure statement is present in the paper."
    231       }
    232     },
    233     "contamination": {
    234       "training_cutoff_stated": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "No training data cutoff dates are stated for any of the evaluated models. The models are used without specifying when their training data was collected."
    238       },
    239       "train_test_overlap_discussed": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "HumanEval (2021) and MBPP (2021) are public benchmarks that likely appear in training data of models released in 2023-2024. No analysis of potential overlap is provided."
    243       },
    244       "benchmark_contamination_addressed": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "HumanEval and MBPP were published in 2021 and are widely known public benchmarks. All evaluated models were trained after 2021 and may have seen these tasks. The paper cites Jain et al. (2024) on the importance of contamination but does not address it for its own evaluation."
    248       }
    249     },
    250     "human_studies": {
    251       "pre_registered": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants in this study. The paper evaluates LLM code generation via automated benchmarks."
    255       },
    256       "irb_or_ethics_approval": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants. All evaluation is automated using LLM-generated code and hardware profiling."
    260       },
    261       "demographics_reported": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants in this study."
    265       },
    266       "inclusion_exclusion_criteria": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants in this study."
    270       },
    271       "randomization_described": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants in this study."
    275       },
    276       "blinding_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants in this study."
    280       },
    281       "attrition_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants in this study."
    285       }
    286     },
    287     "cost_and_practicality": {
    288       "inference_cost_reported": {
    289         "applies": true,
    290         "answer": false,
    291         "justification": "The paper mentions evaluation takes 'approximately no more than 15 minutes' for profiling, but does not report the API cost or compute cost of generating 50 solutions per model across 21 LLMs, nor the cost of running the SAS pipeline."
    292       },
    293       "compute_budget_stated": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No total computational budget is stated. The paper does not report GPU hours, total API spend, or total compute used for solution sampling from 21 models, input generation, or profiling."
    297       }
    298     },
    299     "experimental_rigor": {
    300       "seed_sensitivity_reported": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "No seed sensitivity analysis is reported. Solutions are sampled at fixed temperatures (0.2 or 0.8) but results are not reported across different random seeds."
    304       },
    305       "number_of_runs_stated": {
    306         "applies": true,
    307         "answer": true,
    308         "justification": "The paper states: 50 samples at temperature 0.2 (10 for GPT-4), first 10 correct solutions evaluated for efficiency. Appendix A.1 states '5 times' for runtime profiling. 16 input generator samples per task."
    309       },
    310       "hyperparameter_search_budget": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "Clustering hyperparameters (bias=20%, w=10k, K=4, tthresh=10k) are stated, but no justification or search budget is given for how these values were selected. No alternatives were reported or discussed."
    314       },
    315       "best_config_selection_justified": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "The threshold values for task filtering and clustering (bias=20%, w=10k) are stated without justification for why these specific values were chosen over alternatives."
    319       },
    320       "multiple_comparison_correction": {
    321         "applies": false,
    322         "answer": false,
    323         "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable."
    324       },
    325       "self_comparison_bias_addressed": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "The authors create and evaluate on their own benchmark (EVALPERF) without acknowledging potential bias in evaluating their own system. No independent evaluation or discussion of author-evaluation bias."
    329       },
    330       "compute_budget_vs_performance": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "Models ranging from 1.3B to 70B parameters are compared on efficiency without normalizing for compute budget. The paper compares model sizes (Figure 5) but does not report performance as a function of inference compute cost."
    334       },
    335       "benchmark_construct_validity": {
    336         "applies": true,
    337         "answer": false,
    338         "justification": "The paper discusses measurement validity (why instruction count is better than runtime, why DPS is better than speedup) but does not examine whether algorithmic-style HumanEval/MBPP tasks reflect real-world code efficiency challenges."
    339       },
    340       "scaffold_confound_addressed": {
    341         "applies": false,
    342         "answer": false,
    343         "justification": "No scaffolding is involved. All evaluations use direct code generation from LLMs without agentic scaffolding."
    344       }
    345     },
    346     "data_leakage": {
    347       "temporal_leakage_addressed": {
    348         "applies": true,
    349         "answer": false,
    350         "justification": "HumanEval (2021) and MBPP (2021) were published years before the evaluated models were trained. Models trained after 2021 may have seen solutions to these tasks. This temporal leakage is not discussed."
    351       },
    352       "feature_leakage_addressed": {
    353         "applies": true,
    354         "answer": false,
    355         "justification": "No discussion of whether the evaluation setup leaks information. The task descriptions from HumanEval/MBPP include function signatures and docstrings that may appear in training data."
    356       },
    357       "non_independence_addressed": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "No discussion of whether train and test examples share structural similarities. HumanEval/MBPP tasks are well-known and solutions circulate widely online."
    361       },
    362       "leakage_detection_method": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "No leakage detection or prevention method is applied. No canary strings, membership inference tests, or decontamination pipelines are used."
    366       }
    367     }
    368   },
    369   "claims": [
    370     {
    371       "claim": "General instruction tuning benefits both code correctness and efficiency, even though existing instruction tuning methods were not designed to optimize code efficiency.",
    372       "evidence": "Figure 4 shows pairwise DPS comparisons where instruct variants generally outperform base variants. DeepSeekCoder-6.7B-instruct improves base by 19% DPS. Exception: StarCoder2-15B where base slightly outperforms instruct.",
    373       "supported": "moderate"
    374     },
    375     {
    376       "claim": "The scaling law persists for code correctness but does not seem explicit for code efficiency.",
    377       "evidence": "Figure 5 shows mixed results across model sizes: 7/12 pairs favor larger models, but 4 pairs show >1% degradation. StarCoder2 shows 6% degradation from 3B to 7B. CodeLlama shows inconsistent scaling.",
    378       "supported": "moderate"
    379     },
    380     {
    381       "claim": "Performance-encouraging prompts neither consistently nor noticeably improve code efficiency.",
    382       "evidence": "Figure 4 shows small/inconsistent DPS differences between instruct, perf-instruct, and perf-CoT variants. Table 3 shows perf-prompts commonly lead to correctness degradation.",
    383       "supported": "strong"
    384     },
    385     {
    386       "claim": "SAS creates inputs that are more performance-exercising than EvalPlus by 4.8×.",
    387       "evidence": "Table 1: after all filtering, SAS retains 121 tasks vs EvalPlus's 25 tasks. SAS passes 271/342 tasks (79%) on computation criterion vs EvalPlus's 204 (60%).",
    388       "supported": "strong"
    389     },
    390     {
    391       "claim": "EVALPERF leads to consistent performance evaluation across various platforms with maximum CV of 0.4%.",
    392       "evidence": "Table 2 shows DPS and DPSnorm for 3 models across 4 hardware platforms (desktop, workstation, server) with CVs ranging from 0.1% to 0.4%.",
    393       "supported": "strong"
    394     }
    395   ],
    396   "red_flags": [
    397     {
    398       "flag": "No contamination discussion",
    399       "detail": "HumanEval (2021) and MBPP (2021) are well-known public benchmarks that are likely in the training data of all evaluated models (released 2023-2024). The paper cites contamination-related work (Jain et al. 2024) but does not address contamination risk in its own evaluation. This is especially concerning because more heavily contaminated models would appear both more correct and potentially more efficient."
    400     },
    401     {
    402       "flag": "No limitations section",
    403       "detail": "The paper has no limitations, threats-to-validity, or scope-bounding discussion. Key unacknowledged limitations include: Python-only evaluation with broad claims about 'code generation', competition-style tasks vs real-world efficiency challenges, and potential selection bias from using models that score >50 pass@1 on EvalPlus."
    404     },
    405     {
    406       "flag": "No statistical significance tests",
    407       "detail": "All comparative claims ('instruction tuning benefits efficiency', 'scaling law fails for efficiency') are based on comparing raw DPS numbers without any statistical tests, despite comparing across noisy samples with temperature-based generation."
    408     },
    409     {
    410       "flag": "OpenAI funding with GPT-4 evaluation",
    411       "detail": "OpenAI provided API credits via the Researcher Access Program, and GPT-4 Turbo achieves the best DPS score in the evaluation. While the paper's main contribution is the benchmark framework rather than a model ranking, the best-performing model is from a funder."
    412     }
    413   ],
    414   "cited_papers": [
    415     {
    416       "title": "Evaluating large language models trained on code",
    417       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    418       "year": 2021,
    419       "relevance": "Introduces HumanEval, the foundational code generation benchmark used as a source for EVALPERF."
    420     },
    421     {
    422       "title": "Program synthesis with large language models",
    423       "authors": ["Jacob Austin", "Augustus Odena", "Maxwell I. Nye"],
    424       "year": 2021,
    425       "arxiv_id": "2108.07732",
    426       "relevance": "Introduces MBPP benchmark for code generation, used as a source for EVALPERF."
    427     },
    428     {
    429       "title": "Is your code generated by chatGPT really correct? Rigorous evaluation of large language models for code generation",
    430       "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"],
    431       "year": 2023,
    432       "relevance": "Introduces EvalPlus with extended test suites (HumanEval+ and MBPP+), the direct foundation for EVALPERF's task pool."
    433     },
    434     {
    435       "title": "SWE-bench: Can language models resolve real-world GitHub issues?",
    436       "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig"],
    437       "year": 2023,
    438       "relevance": "Major benchmark for evaluating LLMs on real-world software engineering tasks."
    439     },
    440     {
    441       "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code",
    442       "authors": ["Naman Jain", "King Han", "Alex Gu"],
    443       "year": 2024,
    444       "arxiv_id": "2403.07974",
    445       "relevance": "Addresses contamination in code evaluation, directly relevant to benchmark validity concerns."
    446     },
    447     {
    448       "title": "Learning performance-improving code edits",
    449       "authors": ["Alexander Shypula", "Aman Madaan", "Yimeng Zeng"],
    450       "year": 2023,
    451       "relevance": "PIE benchmark for code optimization with CPU simulators, the closest prior work to EVALPERF for efficiency evaluation."
    452     },
    453     {
    454       "title": "Code Llama: Open foundation models for code",
    455       "authors": ["Baptiste Rozière", "Jonas Gehring", "Fabian Gloeckle"],
    456       "year": 2023,
    457       "relevance": "Major open-source code LLM family evaluated in this paper across 4 sizes."
    458     },
    459     {
    460       "title": "DeepSeek Coder: Let the code write itself",
    461       "authors": ["DeepSeek AI"],
    462       "year": 2023,
    463       "relevance": "Open-source code LLM family evaluated in this paper; also used for SAS input generation."
    464     },
    465     {
    466       "title": "StarCoder 2 and The Stack v2: The next generation",
    467       "authors": ["Anton Lozhkov", "Raymond Li", "Loubna Ben Allal"],
    468       "year": 2024,
    469       "arxiv_id": "2402.19173",
    470       "relevance": "Open-source code LLM family evaluated in this paper; evaluation setup follows their protocol."
    471     },
    472     {
    473       "title": "Asleep at the keyboard? Assessing the security of GitHub Copilot's code contributions",
    474       "authors": ["Hammond Pearce", "Baleegh Ahmad", "Benjamin Tan"],
    475       "year": 2022,
    476       "relevance": "Evaluates security aspects of LLM-generated code, relevant to comprehensive code quality evaluation."
    477     },
    478     {
    479       "title": "EffiBench: Benchmarking the efficiency of automatically generated code",
    480       "authors": ["Dong Huang", "Jie M Zhang", "Yuhao Qing", "Heming Cui"],
    481       "year": 2024,
    482       "arxiv_id": "2402.02037",
    483       "relevance": "Concurrent benchmark for code efficiency evaluation, compared with DPE in the related work section."
    484     },
    485     {
    486       "title": "Magicoder: Empowering code generation with OSS-instruct",
    487       "authors": ["Yuxiang Wei", "Zhe Wang", "Jiawei Liu", "Yifeng Ding", "Lingming Zhang"],
    488       "year": 2024,
    489       "relevance": "Code instruction tuning method from overlapping authors, relevant to findings about instruction tuning improving efficiency."
    490     }
    491   ]
    492 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs