scan.json (27783B)
1 { 2 "paper": { 3 "title": "CodeBenchGen: Creating Scalable Execution-Based Code Generation Benchmarks", 4 "authors": ["Yiqing Xie", "Alex Xie", "Divyanshu Sheth", "Pengfei Liu", "Daniel Fried", "Carolyn Rosé"], 5 "year": 2024, 6 "venue": "arXiv", 7 "arxiv_id": "2404.00566" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": true, 14 "justification": "The paper provides a GitHub link: https://github.com/yiqingxyq/CodeBenchGen (footnote 1, Section 1: 'Code and dataset available at https://github.com/yiqingxyq/CodeBenchGen')." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The same footnote states 'Code and dataset available at https://github.com/yiqingxyq/CodeBenchGen.' The Exec-CSN dataset is released. The source data (CodeSearchNet) is also publicly available." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "No requirements.txt, Dockerfile, or detailed environment setup section listing specific library versions is provided in the paper. The paper mentions using various models and libraries but does not specify a reproducible environment configuration." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "The paper describes the methodology at a high level but does not include step-by-step reproduction instructions such as specific commands to run. The GitHub repository may contain these, but the paper itself does not provide them." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "All Pass@k results in Table 4 are reported as point estimates without confidence intervals or error bars. The human study results in Figure 7 also lack uncertainty quantification." 
37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": true, 41 "justification": "The paper uses logistic regression analysis (Appendix A.6) with significance tests to assess which factors affect model performance. Pearson-r correlations with p-values are reported in Section 4.2 (e.g., 'Pearson-r correlation coefficient for number of variables is 0.73, with p-value < 0.001'). However, the main claim that humans outperform models after revision (Table 5) lacks a formal significance test." 42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "The paper reports effect sizes in context throughout. For example, Pass@1 scores with baselines (GPT-4 at 37.21% vs. DeepSeek-Coder-33B at 34.00%), Pearson-r correlation coefficients (0.51, 0.58, 0.73), BLEU score of 0.5116 vs. random pair BLEU of 0.0052, and Jaccard similarity of 83%. These provide sufficient context to judge magnitude." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "The human study uses only 64 examples evaluated by graduate student volunteers, but no justification is given for this sample size. No power analysis is discussed. The number of human participants is not even clearly stated." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "Pass@k results are reported as single values without standard deviations across runs. The paper does not report variance or confidence intervals for any experimental results. Table 3 reports average and [min, max] for dataset statistics but no standard deviation." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "The paper compares 12 models (10 open-source, 2 proprietary) against each other in Table 4. It also compares the CodeBenchGen framework against RepoEval and R2E approaches for benchmark creation (Figure 1, Table 2)." 
64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "The baselines include GPT-4, Llama-3 (2024), CodeQwen1.5, and DeepSeek-Coder, which were contemporary at the time of writing. The benchmark construction comparison includes R2E (Jain et al., 2024) and SWE-BENCH (Jimenez et al., 2024), both concurrent/recent." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": true, 73 "justification": "The paper includes ablation-like analyses: (1) iterative debugging impact shown in Table 1 (examples increasing from 1,260 to 2,343 across 3 debugging iterations), (2) test augmentation study in Table 7 comparing original vs. augmented tests with different models, (3) comparison of instruction types (generated vs. docstring) in Figure 7." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "The paper reports Pass@1, Pass@2, Pass@5, and Pass@10 (Table 4). It also uses BLEU score, Jaccard similarity, AST depth correlation, Pearson-r, line coverage rate, and Likert scale ratings for different aspects of the evaluation." 79 }, 80 "human_evaluation": { 81 "applies": true, 82 "answer": true, 83 "justification": "Section 4.4 presents a human study with computer science graduate students evaluating 64 examples for solvability, difficulty, instruction clarity, and test case quality. Section 5.3 also compares human vs. model performance on code generation." 84 }, 85 "held_out_test_set": { 86 "applies": true, 87 "answer": true, 88 "justification": "The Exec-CSN dataset is derived from the test split of CodeSearchNet ('We sample 5,000 Python examples from the test split of CodeSearchNet', Section 3). The benchmark itself serves as the test set and was not used to tune any of the evaluated models." 
89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Figure 8 provides detailed breakdowns by target length, context length, number of function calls, type of libraries, and specific external libraries. The logistic regression in Appendix A.6 further breaks down performance by multiple factors." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "Section 5.2 discusses the CodeLlama-70B failure mode where it refuses to generate code due to alignment training (~8% of cases). Appendix A.5 provides a detailed case study comparing human and GPT-4 revision strategies, showing GPT-4's 'stubborn' behavior of repeating the same wrong answer." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "The paper reports that GPT-4 fails to improve beyond 57.81% even with 4 revisions, while humans reach 68.75% after 4 revisions and ultimately 81.25% (Table 5). It also reports that GPT-4 sometimes repeats the same wrong answer across revisions (Appendix A.5), and that smaller models generate worse tests (Table 7)." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "The abstract claims are supported: 1,931 examples from 367 repos (Table 1, Section 3), 81.3% solvable by humans (Figure 7), 61% rated as 'requires effort' (Figure 7 shows 44% medium + 17% hard = 61%), and GPT-4 achieves 37.21% Pass@1 (Table 4)." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": true, 115 "justification": "The paper makes causal claims via ablation-style analysis (e.g., 'models receive overall lower scores on examples with longer target lengths'), supported by controlled factor analysis in the logistic regression (Appendix A.6) and the performance breakdown in Figure 8. The debugging iteration analysis (Table 1) demonstrates causal impact of the iterative process."
116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": false, 120 "justification": "The title 'Creating Scalable Execution-Based Code Generation Benchmarks' implies broad applicability, but the framework is only demonstrated on Python code from CodeSearchNet. The paper does not explicitly bound generalization to Python or to the specific types of code in CodeSearchNet. Claims about model capabilities on 'diverse real-world tasks' are based solely on Python function completion." 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": true, 125 "justification": "The paper discusses several alternative explanations: LLM self-bias in test generation (Appendix A.3, Table 8), the CodeLlama alignment training causing refusals rather than inability (Section 5.2), and that context length's positive effect on performance may be because 'longer context provides more information' rather than being inherently easier (Section 5.3)." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": true, 132 "justification": "Table 9 (Appendix A.4) provides full checkpoint names for all models: e.g., 'gpt-4-0125-preview', 'gpt-3.5-turbo-0125', 'Meta-Llama-3-8B-Instruct', 'DeepSeek-Coder-7B-instruct-v1.5'. These include version identifiers sufficient for reproduction." 133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": true, 137 "justification": "Appendix A.8 provides a full example of the code generation prompt used for evaluation, including the boilerplate instruction, context, and function header format. The prompt structure is fully specified." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": true, 142 "justification": "Section 5.1 states: 'we sample outputs with a temperature of 0.3 and top-p of 0.95.' For test augmentation (Appendix A.2): 'temperature of 0.3 and top-p of 0.7.' 
Number of samples per example is also specified (20 for open-source, 10 for closed-source)." 143 }, 144 "scaffolding_described": { 145 "applies": true, 146 "answer": true, 147 "justification": "The paper describes the multi-step LLM scaffolding in detail across Section 2: sandboxing (Step 1), test generation (Step 2), iterative execution and debugging (Step 3), and post-processing (Step 4). Figure 4 provides a visual workflow. Retry logic and filtering criteria are specified." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": true, 152 "justification": "Section 3 documents the data pipeline: starting from 5,000 Python examples from CodeSearchNet test split, filtering invalid file links and I/O operations (4,079 remaining), then the framework's multi-step processing (Table 1 shows counts at each step), post-filtering for safety (Table 6, Appendix A.2), resulting in 1,931 final examples." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": false, 159 "justification": "There is no dedicated 'Limitations' or 'Threats to Validity' section in the paper. Section 7 is 'Conclusions and Future Works' which briefly mentions future directions but does not substantively discuss limitations." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": false, 164 "justification": "No specific threats to validity are discussed. The paper does not address issues such as the human study's small sample size (64 examples), potential LLM bias in the benchmark construction, or the representativeness of CodeSearchNet as a source." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": false, 169 "justification": "The paper does not explicitly state what the results do NOT show. It does not bound claims to Python, to the specific types of code in CodeSearchNet, or acknowledge that the framework has only been demonstrated with GPT-4 as the sandbox LLM." 
170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": true, 176 "justification": "The dataset (Exec-CSN) is released at the GitHub repository. The source data (CodeSearchNet) is publicly available. Multiple examples are shown in Appendix A.7 and A.9." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "Section 3 describes the data collection: 5,000 Python examples sampled from CodeSearchNet test split, corresponding GitHub code files downloaded, filtering for valid file links and I/O operations. The framework's conversion process is described in Section 2." 182 }, 183 "recruitment_methods_described": { 184 "applies": true, 185 "answer": false, 186 "justification": "The human study mentions 'computer science graduate student volunteers' (Section 4.4) but does not describe how they were recruited, how many participants there were, from which institution, or whether recruitment could introduce bias. The number of participants is not explicitly stated." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": true, 191 "justification": "Table 1 documents the pipeline with counts at each stage: 4,079 input examples -> 1,260 after sandboxing & test gen -> 1,973/2,155/2,343 after debugging iterations -> 1,931 after post-processing. Appendix A.2 provides additional filtering details." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": true, 198 "justification": "Section 8 (Acknowledgement) states: 'This work was supported in part by NSF grant DSES 2222762.'" 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Author affiliations are clearly listed: Carnegie Mellon University (5 authors) and Shanghai Jiao Tong University (1 author). No author is affiliated with the companies whose models are evaluated." 
204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": true, 208 "justification": "The funder is NSF (National Science Foundation), a government agency with no financial interest in the performance of any particular code generation model. NSF is independent of the outcome." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests or financial interests statement is present in the paper." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": true, 219 "answer": false, 220 "justification": "The paper evaluates multiple pre-trained models on the Exec-CSN benchmark but does not state training data cutoff dates for any of the models. This is relevant because CodeSearchNet data is publicly available and could be in training data." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": true, 224 "answer": false, 225 "justification": "The paper does not discuss whether the CodeSearchNet data (which is public and widely used) could have been in the training data of the evaluated models. This is a significant concern since CodeSearchNet is a well-known dataset." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": true, 229 "answer": false, 230 "justification": "CodeSearchNet was published in 2020, and all evaluated models were trained after 2020. The adapted code maintains high similarity to the original (83% Jaccard similarity for variables), meaning models trained on CodeSearchNet could have memorized solutions. This contamination risk is not discussed." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": true, 236 "answer": false, 237 "justification": "No pre-registration is mentioned for the human study." 
238 }, 239 "irb_or_ethics_approval": { 240 "applies": true, 241 "answer": false, 242 "justification": "No IRB or ethics board approval is mentioned for the human study involving graduate student participants." 243 }, 244 "demographics_reported": { 245 "applies": true, 246 "answer": false, 247 "justification": "Participants are described only as 'computer science graduate student volunteers' (Section 4.4). No further demographics (experience level, years of programming, gender, etc.) are reported." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": true, 251 "answer": false, 252 "justification": "No inclusion or exclusion criteria are stated for participants. It is unclear how 'computer science graduate student volunteers' were screened or selected." 253 }, 254 "randomization_described": { 255 "applies": true, 256 "answer": false, 257 "justification": "The paper does not describe how the 64 examples were assigned to participants. It is unclear whether examples were randomly assigned or if participants chose examples." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "Blinding is not clearly applicable to this study design where participants solve coding problems. There is no comparison between conditions that would require blinding." 263 }, 264 "attrition_reported": { 265 "applies": true, 266 "answer": false, 267 "justification": "The paper reports 'obtained results on 64 examples in total' but does not report how many participants were initially recruited, whether any dropped out, or whether any examples were excluded from analysis." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": false, 274 "justification": "The paper does not report inference costs, API costs, or total tokens consumed for either the benchmark construction (which uses GPT-4 extensively) or the model evaluation experiments." 
275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "No total computational budget is stated. The paper uses GPT-4 for benchmark construction (4,079 examples with multiple retries and debugging iterations) and runs 12 models for evaluation, but does not quantify the compute required." 280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "CodeBenchGen successfully converts 47% of source code fragments into evaluation examples with executable test cases, covering 90% of input repositories.", 286 "evidence": "Table 1 shows 1,931 examples from 4,079 inputs (47%). Section 3 states coverage of 367 out of 408 repositories (90%).", 287 "supported": "strong" 288 }, 289 { 290 "claim": "81.3% of Exec-CSN examples are solvable by human computer science graduate students.", 291 "evidence": "Figure 7 reports 81.3% solved rate from a human study on 64 examples. However, the study uses an unspecified number of participants on only 64/1,931 examples (3.3%).", 292 "supported": "moderate" 293 }, 294 { 295 "claim": "GPT-4 achieves the best Pass@1 score of 37.21% on Exec-CSN, indicating substantial room for improvement.", 296 "evidence": "Table 4 shows GPT-4 at 37.21% Pass@1, the highest among all 12 evaluated models.", 297 "supported": "strong" 298 }, 299 { 300 "claim": "Models perform worse on examples with longer target lengths, more function calls, or external libraries.", 301 "evidence": "Figure 8 shows consistent performance degradation across these factors for both GPT-4 and DeepSeek-Coder-33B. Logistic regression in Appendix A.6 confirms TargetLength and NumberImports are significant predictors.", 302 "supported": "strong" 303 }, 304 { 305 "claim": "Humans achieve significantly better scores than GPT-4 after several rounds of revision, despite GPT-4 having higher initial Pass@1.", 306 "evidence": "Table 5 shows GPT-4 Pass@1 of 40.63% vs. human 34.38%, but after 4 revisions humans reach 68.75% vs. GPT-4's 57.81%. 
Humans ultimately reach 81.25%. However, this is based on only 64 examples with no statistical test.", 307 "supported": "moderate" 308 }, 309 { 310 "claim": "Exec-CSN covers more diverse libraries, repository topics, and contributor distributions than existing benchmarks.", 311 "evidence": "Table 2 and Figure 5 compare Exec-CSN (367 repos, 668 topics, 293 libraries) to RepoEval (8 repos, 23 topics, 75 libraries) and SWE-BENCH (12 repos, 60 topics, 214 libraries). Figure 6 shows contributor distribution is closer to natural distribution.", 312 "supported": "strong" 313 }, 314 { 315 "claim": "The adapted code preserves high similarity to the input code, with average Jaccard similarity of variables at 83%.", 316 "evidence": "Section 4.2 reports BLEU score of 0.5116, Jaccard similarity of 83% for variables, and Pearson-r correlations of 0.51-0.73 for code tokens, AST depth, and variables. Figure 10 visualizes the Jaccard similarity.", 317 "supported": "strong" 318 } 319 ], 320 "methodology_tags": ["benchmark-eval"], 321 "key_findings": "CodeBenchGen is a framework that uses LLMs to convert arbitrary code fragments into execution-based evaluation examples, achieving 47% conversion rate and 90% repository coverage from CodeSearchNet. The resulting Exec-CSN benchmark (1,931 examples, 367 repositories) is substantially more diverse than existing benchmarks like RepoEval and SWE-BENCH in terms of libraries, topics, and contributor diversity. GPT-4 achieves only 37.21% Pass@1, with performance degrading for longer targets and external library usage. Humans outperform GPT-4 after iterative revision (68.75% vs. 57.81% after four revisions, with humans ultimately reaching 81.25%), demonstrating better error-message utilization.", 322 "red_flags": [ 323 { 324 "flag": "Benchmark contamination risk unaddressed", 325 "detail": "CodeSearchNet is a widely-used public dataset (published 2020). All evaluated models were trained after 2020 and may have seen this data. 
The adapted code retains high similarity to the original (83% average Jaccard similarity of variables), meaning memorized solutions could still apply. The paper does not discuss this contamination risk at all." 326 }, 327 { 328 "flag": "Small and undescribed human study sample", 329 "detail": "The human study evaluates only 64 out of 1,931 examples (3.3%) with an unspecified number of 'graduate student volunteers.' No demographics, recruitment methods, inclusion criteria, randomization, or IRB approval are reported. The claim of 81.3% solvability is generalized from this small, poorly-characterized sample." 330 }, 331 { 332 "flag": "No limitations section", 333 "detail": "The paper lacks a dedicated limitations or threats-to-validity section. Key limitations such as Python-only evaluation, dependence on GPT-4 quality for benchmark construction, potential LLM bias in generated tests, and CodeSearchNet representativeness are not discussed." 334 }, 335 { 336 "flag": "No uncertainty quantification on main results", 337 "detail": "All Pass@k results in Table 4 are reported as point estimates without confidence intervals, error bars, or variance across runs. The human-vs-model comparison (Table 5) makes performance claims without any statistical test." 338 }, 339 { 340 "flag": "Circular LLM dependence in benchmark", 341 "detail": "GPT-4 is used to construct the benchmark (sandboxing, test generation, debugging) and is also one of the evaluated models. Although GPT-3.5 is used for test augmentation to reduce self-bias, the fundamental benchmark structure is shaped by GPT-4's capabilities, which could systematically favor GPT-4 in evaluation." 342 } 343 ], 344 "cited_papers": [ 345 { 346 "title": "Evaluating large language models trained on code", 347 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 348 "year": 2021, 349 "arxiv_id": "2107.03374", 350 "relevance": "Introduces HumanEval, a foundational execution-based code generation benchmark and the Pass@k evaluation metric used throughout this paper." 
351 }, 352 { 353 "title": "SWE-bench: Can language models resolve real-world github issues?", 354 "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig"], 355 "year": 2024, 356 "relevance": "A major repository-level code generation benchmark that CodeBenchGen explicitly compares against for diversity and scalability." 357 }, 358 { 359 "title": "R2E: Turning any github repository into a programming agent environment", 360 "authors": ["Naman Jain", "Manish Shetty", "Tianjun Zhang"], 361 "year": 2024, 362 "relevance": "A concurrent approach to repository-based benchmark creation that CodeBenchGen compares against for scalability." 363 }, 364 { 365 "title": "Is your code generated by chatgpt really correct? Rigorous evaluation of large language models for code generation", 366 "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"], 367 "year": 2023, 368 "relevance": "Proposes EvalPlus with improved test coverage for code generation evaluation, directly relevant to benchmark quality methodology." 369 }, 370 { 371 "title": "DeepSeek-Coder: When the large language model meets programming -- the rise of code intelligence", 372 "authors": ["Daya Guo", "Qihao Zhu", "Dejian Yang"], 373 "year": 2024, 374 "relevance": "One of the primary code generation models evaluated in this paper, representing state-of-the-art open-source code models." 375 }, 376 { 377 "title": "DS-1000: A natural and reliable benchmark for data science code generation", 378 "authors": ["Yuhang Lai", "Chengxi Li", "Yiming Wang", "Tianyi Zhang"], 379 "year": 2023, 380 "relevance": "A manually-created data science code generation benchmark, representing the category of execution-based benchmarks with limited scalability." 
381 }, 382 { 383 "title": "RepoCoder: Repository-level code completion through iterative retrieval and generation", 384 "authors": ["Fengji Zhang", "Bei Chen", "Yue Zhang"], 385 "year": 2023, 386 "relevance": "Introduces RepoEval benchmark for repository-level code completion, a key comparison point for dataset diversity." 387 }, 388 { 389 "title": "Measuring coding challenge competence with APPS", 390 "authors": ["Dan Hendrycks", "Steven Basart", "Saurav Kadavath"], 391 "year": 2021, 392 "relevance": "Large-scale execution-based benchmark sourced from coding contests, representing hand-curated benchmark approaches." 393 }, 394 { 395 "title": "Code Llama: Open foundation models for code", 396 "authors": ["Baptiste Rozière", "Jonas Gehring", "Fabian Gloeckle"], 397 "year": 2024, 398 "relevance": "Describes the Code Llama family of models evaluated in this paper, including the alignment-related refusal behavior observed." 399 }, 400 { 401 "title": "Grounding data science code generation with input-output specifications", 402 "authors": ["Yeming Wen", "Pengcheng Yin", "Kensen Shi"], 403 "year": 2024, 404 "relevance": "Demonstrates that I/O specifications improve model performance on code generation, directly motivating the instruction design in CodeBenchGen." 405 }, 406 { 407 "title": "ChatUniTest: A ChatGPT-based automated unit test generation tool", 408 "authors": ["Zhuokui Xie", "Yinghao Chen", "Chen Zhi"], 409 "year": 2023, 410 "relevance": "LLM-based test generation approach that inspires the test generation strategy used in CodeBenchGen." 411 }, 412 { 413 "title": "MultiPL-E: A scalable and extensible approach to benchmarking neural code generation", 414 "authors": ["Federico Cassano", "John Gouwar", "Daniel Nguyen"], 415 "year": 2022, 416 "relevance": "Extends execution-based code generation evaluation to multiple programming languages." 417 } 418 ] 419 }