scan.json (28616B)
1 { 2 "paper": { 3 "title": "Benchmarking AI Models in Software Engineering: A Review, Search Tool, and Unified Approach for Elevating Benchmark Quality", 4 "authors": [ 5 "Roham Koohestani", 6 "Philippe de Bekker", 7 "Begüm Koç", 8 "Maliheh Izadi" 9 ], 10 "year": 2025, 11 "venue": "arXiv", 12 "arxiv_id": "2503.05860" 13 }, 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The paper provides a GitHub repository (https://github.com/AISE-TUDelft/AI4SE-benchmarks) and the BenchScout tool at https://evalpro.online/. Referenced as a replication package multiple times (Sections I, III-A, IV-C)." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "Section VIII-A states: 'We publicly release the results of our literature review, user study, and 50% of the manually refined benchmark.' The full benchmark is promised upon acceptance. The review data and BenchScout data are available via the GitHub repository." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "Section V-B mentions 'one NVIDIA A100 80GB GPU and 32 CPU cores' for inference, but no software environment details (requirements.txt, Dockerfile, Python version, library versions) are provided." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "While a replication package is referenced via GitHub, the paper itself does not contain step-by-step reproduction instructions. The experimental setup in Section V-B describes the general approach but lacks specific commands or scripts to replicate results." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "The benchmark evaluation results (Tables XXII-XXV) report only point estimates of pass@1 scores with no confidence intervals or error bars. 
The user study reports only average Likert scores without uncertainty measures." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": true, 46 "justification": "Section VI-B reports t-tests and Wilcoxon signed-rank tests for the noninferiority comparison between HumanEvalNext and HumanEvalNext-Agentic, with specific p-values (e.g., p = 3.46 × 10^−20). However, no significance tests are applied to the main benchmark comparison (HumanEval vs. HumanEvalNext) or user study results." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "The paper reports absolute percentage drops in pass@1 scores with baseline context: '31.22% average decrease' from specific baselines (e.g., 87.23% to 51.22%), and for the noninferiority study reports mean ratings (0.16, 0.53, 0.35) with SDs (0.67, 0.51, 0.49)." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "No justification is provided for key sample sizes: why 10 models were selected, why 22 user study participants, why 100 MBPP problems for generalizability, or why 2 reviewers for the noninferiority study. No power analysis is mentioned." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "The benchmark evaluations report single-run results only. No variance, standard deviation, or results across multiple runs are reported for the pass@1 experiments. The noninferiority study reports SD for reviewer ratings, but the main experiments do not report any spread measures." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "The HumanEvalNext benchmark is compared against both the original HumanEval and HumanEvalPlus (Table XXII). The agentic pipeline is compared to the human-improved version. MBPPNext is compared to MBPP baseline." 
69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": false, 73 "justification": "The 10 models evaluated are all open-source models from leaderboards 'at the evaluation's start' (Section V-B), but they are all relatively small (3B-15B) and older models. No frontier models (GPT-4, Claude, Gemini) are included. The paper itself acknowledges in Section VI-D that 'it remains unclear how larger, top-performing models behind paywalls, such as GPT and Gemini, would perform.'" 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": false, 78 "justification": "No ablation study is performed to determine which specific improvements in BenchFrame (corrected solutions, type annotations, edge cases, improved descriptions) contribute most to the performance drops. The modifications are bundled together." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": false, 83 "justification": "The benchmark evaluation uses only pass@1 as the metric. No other metrics (pass@k for k>1, functional correctness breakdowns, etc.) are reported for the code generation experiments. The user study uses multiple Likert scale questions but the code generation evaluation is single-metric." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": true, 88 "justification": "The user study with 22 participants evaluates BenchScout (Section IV-C). The peer review of HumanEvalNext (Section V-A3) and the paired evaluation with two independent reviewers for the agentic pipeline (Section VI-B) both involve human evaluation of the benchmark quality." 89 }, 90 "held_out_test_set": { 91 "applies": false, 92 "answer": false, 93 "justification": "This is a benchmark creation and review paper, not a machine learning training paper. There is no train/test split relevant to the paper's own methodology." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Table XXII/XXIII provides per-model breakdowns. 
The review provides per-category distribution (Figure 3: code generation 34.4%, code understanding 17.7%, etc.). Table XIX provides per-limitation frequency counts. Per-problem difficulty analysis is also discussed." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Section VI-B discusses pitfalls of the agentic pipeline: docstrings revealing solutions, canonical solutions making unintended assumptions, test cases not adhering to format. Section V-A1 discusses specific failure modes in HumanEval (incorrect tests, suboptimal solutions). The CodeQwen1.5 extreme drop (87.2% to 10.98%) is discussed as indicating potential data leakage." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "Section VI-B reports negative observations about the agentic pipeline (docstring leakage, unintended assumptions in solutions). Table XXIV shows the agentic version sometimes performs worse than human-improved version on specific models. The paper honestly discusses limitations of BenchScout's visualization features receiving lower scores (3.7-3.9)." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Abstract claims are supported: '247 studies, identifying 273 AI4SE benchmarks' matches Section III-B; BenchScout scores of 4.5, 4.0, 4.1 match Section IV-C; pass@1 drops of 31.22% and 19.94% match Section V-C (Tables XXII-XXIII). All claims are grounded in reported results." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": false, 120 "justification": "The paper makes causal claims such as CodeQwen's drop 'suggests that certain models may have benefited from data leakage' (Section V-C) and that BenchFrame 'reveals substantial performance gaps' (Section VIII). 
These are correlational observations attributed to causal mechanisms (benchmark flaws, contamination) without controlled experiments isolating these factors. The bundled modifications prevent attributing drops to specific causes." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The title claims to be about 'Benchmarking AI Models in Software Engineering' broadly, but the evaluation is limited to Python code generation with 10 small open-source models. The MBPP generalizability test uses only 100 of 500 problems. The paper's framing as a 'unified approach' for benchmark quality extends well beyond what was tested." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": true, 130 "justification": "Section V-C discusses data leakage as an alternative explanation for model performance patterns. Section VI-E (Threats to Validity) discusses construct validity concerns about subjectivity in inclusion criteria, internal validity concerns about selection bias in the user study, and external validity concerns about generalizability limitations." 131 } 132 }, 133 "setup_transparency": { 134 "model_versions_specified": { 135 "applies": true, 136 "answer": true, 137 "justification": "Section V-B lists exact model identifiers for all 10 models (e.g., 'NTQAI/Nxcode-CQ-7B-orpo', 'deepseek-ai/deepseek-coder-6.7b-instruct'). Section VI-B specifies 'o3-mini-2025-01-31' for the agentic pipeline. These are HuggingFace model IDs that identify specific versions." 138 }, 139 "prompts_provided": { 140 "applies": true, 141 "answer": false, 142 "justification": "Section V-B describes the prompting approach only in natural language: 'the LLM is prompted using an instructional preamble asking the model to finish the implementation of the function.' The actual prompt text is not provided in the paper or appendix." 
143 }, 144 "hyperparameters_reported": { 145 "applies": true, 146 "answer": false, 147 "justification": "No hyperparameters are reported: no temperature, top-p, max tokens, or sampling settings for any of the model inference runs. Section V-B mentions only GPU type (A100 80GB) and timeout (15 seconds) but not LLM generation parameters." 148 }, 149 "scaffolding_described": { 150 "applies": true, 151 "answer": true, 152 "justification": "The agentic pipeline for BenchFrame is described in detail in Section VI-B with a flowchart (Figure 8) showing three phases (text improvement, code improvement, test improvement), validation, retry logic (3 attempts), and minimized passing test cases fallback. The model used (o3-mini-2025-01-31) and the OpenAI API are specified." 153 }, 154 "data_preprocessing_documented": { 155 "applies": true, 156 "answer": true, 157 "justification": "Section III-A describes the literature review pipeline: structured searches on Google Scholar and Semantic Scholar with specific keywords, credibility verification, taxonomy development, duplicate removal, forward/backward snowballing, resulting in 247 papers. Inter-rater agreement (96.4%) is reported for categorization." 158 } 159 }, 160 "limitations_and_scope": { 161 "limitations_section_present": { 162 "applies": true, 163 "answer": true, 164 "justification": "Section VI-E 'Threats to the Validity' provides a dedicated subsection covering construct validity, internal validity, and external validity, with substantive discussion of specific threats." 
165 }, 166 "threats_to_validity_specific": { 167 "applies": true, 168 "answer": true, 169 "justification": "The threats are specific to this study: 'subjectivity in defining and applying the inclusion/exclusion criteria' for the review, 'selection bias' in the user study mitigated by including participants from both industry and academia, the reviewer 'was not informed of specific changes made by the first author' for blinding, and acknowledgment that '10 models... could not be considered sufficient to prove generalizability.'" 170 }, 171 "scope_boundaries_stated": { 172 "applies": true, 173 "answer": false, 174 "justification": "While specific threats are discussed, the paper does not explicitly state what the results do NOT show. It does not bound claims to Python-only or small open-source models. The title and framing suggest broad applicability ('Benchmarking AI Models in Software Engineering') without explicit statements about excluded settings, languages, or model classes." 175 } 176 }, 177 "data_integrity": { 178 "raw_data_available": { 179 "applies": true, 180 "answer": true, 181 "justification": "The replication package at https://github.com/AISE-TUDelft/AI4SE-benchmarks is referenced multiple times. Section VIII-A confirms public release of review data, user study details, and 50% of HumanEvalNext. The benchmark data and metadata are available for verification." 182 }, 183 "data_collection_described": { 184 "applies": true, 185 "answer": true, 186 "justification": "Section III-A details the data collection: searches on Google Scholar and Semantic Scholar using specific keywords, PapersWithCode datasets collection, English papers from 2014-2025, two authors reviewing relevance, forward and backward snowballing, 247 papers total." 
187 }, 188 "recruitment_methods_described": { 189 "applies": true, 190 "answer": false, 191 "justification": "For the user study with 22 participants, Section IV-C describes demographics (9 industry, 13 academia, roles, experience levels) but does not describe how participants were recruited. No information on recruitment channels, compensation, or potential selection bias in participant recruitment." 192 }, 193 "data_pipeline_documented": { 194 "applies": true, 195 "answer": true, 196 "justification": "The review pipeline is documented from search to final analysis: keyword search → duplicate removal → relevance review by two authors → originality/reproducibility/accessibility assessment → snowballing → 247 papers → taxonomy development → metadata extraction. The BenchFrame pipeline is documented in Figure 6 and Section V-A." 197 } 198 }, 199 "conflicts_of_interest": { 200 "funding_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Section IX (Acknowledgments) states: 'This research was supported in part by an Amazon Research Award granted to Dr. Maliheh Izadi.'" 204 }, 205 "affiliations_disclosed": { 206 "applies": true, 207 "answer": true, 208 "justification": "All authors are listed as affiliated with 'EEMCS faculty, Delft University of Technology, The Netherlands.' ORCID identifiers are provided for three authors. No author works at a company whose product is being evaluated." 209 }, 210 "funder_independent_of_outcome": { 211 "applies": true, 212 "answer": true, 213 "justification": "The funder is Amazon, which does not have a direct commercial stake in the specific benchmarks being evaluated (HumanEval, MBPP). 
The paper states: 'The views and conclusions contained in this paper are those of the authors and do not necessarily reflect the position or policies of Amazon.'" 214 }, 215 "financial_interests_declared": { 216 "applies": true, 217 "answer": false, 218 "justification": "No competing interests or financial interests statement is present in the paper. The acknowledgments mention the Amazon grant but there is no explicit declaration that the authors have no other financial interests." 219 } 220 }, 221 "contamination": { 222 "training_cutoff_stated": { 223 "applies": true, 224 "answer": false, 225 "justification": "The paper evaluates 10 pre-trained code models on HumanEval/HumanEvalNext benchmarks but does not state the training data cutoff dates for any of the models. This is important given that the paper itself discusses data leakage as a concern." 226 }, 227 "train_test_overlap_discussed": { 228 "applies": true, 229 "answer": true, 230 "justification": "The paper explicitly discusses potential data contamination: Section I discusses HumanEval contamination with ChatGPT reproducing incorrect answers, Section V-C discusses 'data leakage or other issues in the original HumanEval benchmark', and Section VI-D acknowledges HumanEvalNext would also be affected by data leakage over time." 231 }, 232 "benchmark_contamination_addressed": { 233 "applies": true, 234 "answer": true, 235 "justification": "Benchmark contamination is a central theme of the paper. Section I discusses ChatGPT reproducing HumanEval errors as evidence of contamination. Section V-C interprets the large performance drops as evidence of prior benchmark overfitting. Section VII discusses LiveCodeBench as an approach to avoid overfitting and DyCodeEval for minimizing contamination." 
236 } 237 }, 238 "human_studies": { 239 "pre_registered": { 240 "applies": true, 241 "answer": false, 242 "justification": "The paper includes a user study with 22 participants but does not mention pre-registration of the study design or hypotheses." 243 }, 244 "irb_or_ethics_approval": { 245 "applies": true, 246 "answer": false, 247 "justification": "No mention of IRB or ethics board approval for the user study involving 22 human participants." 248 }, 249 "demographics_reported": { 250 "applies": true, 251 "answer": true, 252 "justification": "Section IV-C reports participant demographics: roles (6 researchers, 5 PhD candidates, 5 students, 4 engineers, 2 lead researchers), experience levels (8 at 1-3 years, 6 at 3-5, 3 at 5+, 5 at <1 year), industry vs academia split (9 vs 13), and AI4SE familiarity ratings." 253 }, 254 "inclusion_exclusion_criteria": { 255 "applies": true, 256 "answer": false, 257 "justification": "Section IV-C describes aiming for 'a diverse group of users with varying degrees of expertise' and includes people from 'both industry and academia' but does not state specific inclusion or exclusion criteria for participant selection." 258 }, 259 "randomization_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "The user study is not an experimental study with treatment/control conditions requiring randomization. All participants used the same tool and filled out the same questionnaire." 263 }, 264 "blinding_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "The user study is not an experimental study with multiple conditions where blinding would be applicable. It is a single-tool usability evaluation." 268 }, 269 "attrition_reported": { 270 "applies": true, 271 "answer": false, 272 "justification": "No information is provided about whether all 22 participants completed the study or whether any dropped out." 
273 } 274 }, 275 "cost_and_practicality": { 276 "inference_cost_reported": { 277 "applies": true, 278 "answer": true, 279 "justification": "Section VI-B reports the API cost for the agentic pipeline: 'the total incurred cost from calling the apis for the models was $5.2823.' However, no cost is reported for the main benchmark inference runs with the 10 models." 280 }, 281 "compute_budget_stated": { 282 "applies": true, 283 "answer": true, 284 "justification": "Section V-B states: 'We run the inference for the models on a cluster with one NVIDIA A100 80GB GPU and 32 CPU cores.' Section V-A3 mentions 'the initial creation of the benchmark took over 100 hours' with '16 hours' for peer review. However, total GPU hours for inference are not quantified." 285 } 286 } 287 }, 288 "claims": [ 289 { 290 "claim": "The AI4SE benchmarking landscape is highly fragmented, with 273 benchmarks identified from 247 studies since 2014.", 291 "evidence": "Section III-B reports the systematic review results with 273 benchmarks, 71 published in 2024 alone. Figure 2 shows temporal growth. Section III-C identifies systemic limitations including language specificity, poor maintenance, and lack of peer review.", 292 "supported": "strong" 293 }, 294 { 295 "claim": "BenchScout achieved usability, effectiveness, and intuitiveness scores of 4.5, 4.0, and 4.1 out of 5 in a user study with 22 participants.", 296 "evidence": "Section IV-C reports these exact scores from 22 participants (9 industry, 13 academia) using a 5-point Likert scale questionnaire. Detailed demographic and per-question breakdowns are provided.", 297 "supported": "moderate" 298 }, 299 { 300 "claim": "HumanEvalNext reveals a 31.22% average drop in pass@1 scores compared to the original HumanEval across 10 state-of-the-art code models.", 301 "evidence": "Table XXII/XXIII provides per-model pass@1 comparisons. Drops range from 18.94% (codegemma) to 76.22% (CodeQwen1.5). 
Section V-C reports 31.22% average and 26.02% median decline.", 302 "supported": "moderate" 303 }, 304 { 305 "claim": "The agentic pipeline for benchmark improvement is noninferior to the human-improved process.", 306 "evidence": "Section VI-B reports noninferiority testing with δ=-0.5 margin: mean ratings 0.16 (SD=0.67) for Reviewer 1, 0.53 (SD=0.51) for Reviewer 2, with t-statistics 12.68 and 25.75 and Wilcoxon tests confirming (p < 10^-20).", 307 "supported": "moderate" 308 }, 309 { 310 "claim": "Some models' performance drops suggest potential data leakage or benchmark overfitting, as model rankings change substantially between HumanEval and HumanEvalNext.", 311 "evidence": "Section V-C: CodeQwen1.5 drops from 87.2% to 10.98%, while deepseek-coder maintains relative performance (80.22% to 58.54%). This ranking change is presented as evidence but is observational without controlled contamination analysis.", 312 "supported": "weak" 313 }, 314 { 315 "claim": "BenchFrame generalizes to MBPP, with an average decrease of 13.4 percentage points on MBPPNext.", 316 "evidence": "Section VI-C and Table XXV show per-model results for 100 MBPP problems processed through BenchFrame. Drops range from 6% to 22%.", 317 "supported": "moderate" 318 } 319 ], 320 "methodology_tags": [ 321 "meta-analysis", 322 "benchmark-eval", 323 "case-study" 324 ], 325 "key_findings": "This paper conducts a systematic review of 273 AI4SE benchmarks from 247 studies, revealing fragmentation and quality issues including language specificity, poor maintenance, and lack of peer review. The authors introduce BenchScout, a semantic search tool for benchmark discovery, and BenchFrame, a methodology for improving benchmark quality. Applying BenchFrame to HumanEval yielded HumanEvalNext, which reduced average pass@1 scores by 31.22% across 10 models, suggesting widespread benchmark saturation and potential contamination. 
An agentic pipeline was shown to be noninferior to the manual improvement process, costing only $5.28 in API calls.", 326 "red_flags": [ 327 { 328 "flag": "No hyperparameters reported for main experiments", 329 "detail": "The paper evaluates 10 code generation models without reporting temperature, top-p, or other sampling parameters. These settings significantly affect pass@1 results and could explain some of the observed variance between models." 330 }, 331 { 332 "flag": "Single-run results without variance", 333 "detail": "All pass@1 results appear to be from single runs. Code generation with LLMs involves stochastic sampling, so results without variance across multiple runs or seeds cannot be reliably compared." 334 }, 335 { 336 "flag": "Bundled modifications prevent attribution", 337 "detail": "BenchFrame modifies multiple aspects simultaneously (corrected solutions, type annotations, edge cases, improved descriptions, increased test count). Without ablation, it is impossible to determine which changes drive the performance drops." 338 }, 339 { 340 "flag": "Overly broad title relative to evaluation scope", 341 "detail": "The title suggests comprehensive 'Benchmarking AI Models in Software Engineering' but evaluation is limited to Python code generation with 10 small open-source models (3B-15B). No frontier models, no multi-language evaluation, and limited task diversity." 342 }, 343 { 344 "flag": "Prompts not provided", 345 "detail": "The actual prompts used for model inference are described only in natural language ('instructional preamble asking the model to finish the implementation'). This prevents faithful reproduction, especially given the paper's own emphasis on benchmark rigor." 346 }, 347 { 348 "flag": "Only 2 reviewers for noninferiority study", 349 "detail": "The agentic pipeline noninferiority comparison uses only 2 reviewers with a 5-point scale. 
Inter-rater reliability is not reported, and with only 2 raters, individual reviewer biases could substantially affect conclusions." 350 } 351 ], 352 "cited_papers": [ 353 { 354 "title": "Evaluating large language models trained on code", 355 "authors": ["M. Chen et al."], 356 "year": 2021, 357 "arxiv_id": "2107.03374", 358 "relevance": "Introduced HumanEval, the most widely-used code generation benchmark and the primary subject of this paper's improvement methodology." 359 }, 360 { 361 "title": "Is your code generated by chatgpt really correct? Rigorous evaluation of large language models for code generation", 362 "authors": ["J. Liu", "C. S. Xia", "Y. Wang", "L. Zhang"], 363 "year": 2023, 364 "arxiv_id": "2305.01210", 365 "relevance": "Introduced HumanEvalPlus/EvalPlus with improved test coverage, one of the baselines compared against HumanEvalNext." 366 }, 367 { 368 "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code", 369 "authors": ["N. Jain et al."], 370 "year": 2024, 371 "arxiv_id": "2403.07974", 372 "relevance": "Dynamic benchmark designed to avoid contamination through continuously gathered new challenges, representing the 'build anew' approach contrasted with BenchFrame." 373 }, 374 { 375 "title": "SWE-bench: Can language models resolve real-world github issues?", 376 "authors": ["C. E. Jimenez et al."], 377 "year": 2023, 378 "arxiv_id": "2310.06770", 379 "relevance": "Major real-world software engineering benchmark for evaluating LLMs on repository-level tasks, catalogued in this review." 380 }, 381 { 382 "title": "AgentBench: Evaluating LLMs as Agents", 383 "authors": ["X. Liu et al."], 384 "year": 2023, 385 "arxiv_id": "2308.03688", 386 "relevance": "Benchmark for evaluating LLMs as agents across multiple tasks, relevant to the agentic AI evaluation landscape." 387 }, 388 { 389 "title": "BigCodeBench: Benchmarking code generation with diverse function calls and complex instructions", 390 "authors": ["T. 
Y. Zhuo et al."], 391 "year": 2024, 392 "arxiv_id": "2406.15877", 393 "relevance": "Function-level code generation benchmark with diverse function calls, representing more realistic evaluation beyond HumanEval-style problems." 394 }, 395 { 396 "title": "Top leaderboard ranking = top coding proficiency, always? EvoEval: Evolving coding benchmarks via LLM", 397 "authors": ["C. S. Xia", "Y. Deng", "L. Zhang"], 398 "year": 2024, 399 "arxiv_id": "2403.19114", 400 "relevance": "Evolving benchmarks using LLMs to address saturation, directly relevant to benchmark quality improvement methodology." 401 }, 402 { 403 "title": "MultiPL-E: A scalable and extensible approach to benchmarking neural code generation", 404 "authors": ["F. Cassano et al."], 405 "year": 2022, 406 "arxiv_id": "2208.08227", 407 "relevance": "Multi-language code generation benchmark framework, used as the language conversion framework for HumanEvalNext." 408 }, 409 { 410 "title": "Program synthesis with large language models", 411 "authors": ["J. Austin et al."], 412 "year": 2021, 413 "arxiv_id": "2108.07732", 414 "relevance": "Introduced MBPP benchmark, the second major benchmark subject to BenchFrame improvement in this paper." 415 }, 416 { 417 "title": "Competition-level code generation with AlphaCode", 418 "authors": ["Y. Li et al."], 419 "year": 2022, 420 "doi": "10.1126/science.abq1158", 421 "relevance": "Competitive programming code generation benchmark (CodeContests), representative of high-difficulty evaluation in AI4SE." 422 } 423 ] 424 }