scan.json (27291B)
1 { 2 "paper": { 3 "title": "On Evaluating the Efficiency of Source Code Generated by LLMs", 4 "authors": [ 5 "Changan Niu", 6 "Ting Zhang", 7 "Chuanyi Li", 8 "Bin Luo", 9 "Vincent Ng" 10 ], 11 "year": 2024, 12 "venue": "FORGE '24 (AI Foundation Models and Software Engineering)", 13 "arxiv_id": "2404.06041", 14 "doi": "10.1145/3650105.3652295" 15 }, 16 "scan_version": 2, 17 "active_modules": ["experimental_rigor", "data_leakage"], 18 "methodology_tags": ["benchmark-eval"], 19 "key_findings": "The ability to generate correct code is not positively correlated with the ability to generate efficient code: GPT-4 has higher Pass@10 than GPT-3.5 but produces slower code on HumanEval and MBPP. Larger model size within a family (Code Llama, WizardCoder) does not significantly improve runtime efficiency. Chain-of-thought prompting (generating code first, then optimizing) yields larger speedups on more complex LeetCode problems (up to 1.18x) than simple efficiency-requesting prompts, though all prompting effects are modest on entry-level benchmarks.", 20 "checklist": { 21 "artifacts": { 22 "code_released": { 23 "applies": true, 24 "answer": true, 25 "justification": "The paper states 'We also make code, data and other artifacts available online [1]' with reference [1] pointing to https://github.com/NougatCA/EfficiencyEval." 26 }, 27 "data_released": { 28 "applies": true, 29 "answer": true, 30 "justification": "The paper claims to release code, data, and artifacts online. HumanEval and MBPP are publicly available standard benchmarks, and the newly constructed LeetCodeEval is included in the artifact release." 31 }, 32 "environment_specified": { 33 "applies": true, 34 "answer": false, 35 "justification": "No environment specification (requirements.txt, Dockerfile, library versions) is provided in the paper. They mention using the gem5 CPU simulator but do not specify software environment details." 
36 }, 37 "reproduction_instructions": { 38 "applies": true, 39 "answer": false, 40 "justification": "No step-by-step reproduction instructions are provided in the paper. A GitHub link is given but the paper itself lacks instructions for replicating the experiments." 41 } 42 }, 43 "statistical_methodology": { 44 "confidence_intervals_or_error_bars": { 45 "applies": true, 46 "answer": false, 47 "justification": "Tables 2, 3, and 4 report only point estimates (normalized runtime, Pass@10, speedup) with no confidence intervals or error bars despite repeated measurements." 48 }, 49 "significance_tests": { 50 "applies": true, 51 "answer": false, 52 "justification": "The paper makes comparative claims ('code generated by the former is not as efficient as the latter') based solely on comparing numbers without any statistical significance tests." 53 }, 54 "effect_sizes_reported": { 55 "applies": true, 56 "answer": false, 57 "justification": "The main RQ1 results (Table 2, Table 3) report normalized runtimes and Pass@10 without contextualizing the magnitude of differences. The speedup ratios in Table 4 (RQ2) provide relative magnitudes but the primary comparative claims lack formal effect sizes." 58 }, 59 "sample_size_justified": { 60 "applies": true, 61 "answer": false, 62 "justification": "No justification is given for the number of problems (164 HumanEval, 399 MBPP, 166 LeetCodeEval), the number of models tested, or k=10 generations. No power analysis is discussed." 63 }, 64 "variance_reported": { 65 "applies": true, 66 "answer": false, 67 "justification": "The paper states 'we repeat the execution of each piece of code for 10 times and take the average runtime' and 'repeat the submission of each piece of code for 3 times and record the average results' but never reports standard deviation or any spread measure for these repeated runs." 
68 } 69 }, 70 "evaluation_design": { 71 "baselines_included": { 72 "applies": true, 73 "answer": true, 74 "justification": "Multiple LLMs are compared against each other: GPT-4, GPT-3.5, Phi-2, Code Llama (7B/13B/34B), WizardCoder (7B/13B/34B), and DeepSeek Coder (base/instruct). Each serves as a baseline for the others." 75 }, 76 "baselines_contemporary": { 77 "applies": true, 78 "answer": true, 79 "justification": "The models evaluated (GPT-4, GPT-3.5, Code Llama, WizardCoder, DeepSeek Coder, Phi-2) were all state-of-the-art or near-SOTA at the time of writing (late 2023/early 2024)." 80 }, 81 "ablation_study": { 82 "applies": false, 83 "answer": false, 84 "justification": "This is a benchmark evaluation paper comparing existing models, not proposing a system with components to ablate. The RQ2 prompting experiments compare different strategies but are not ablations of a single system." 85 }, 86 "multiple_metrics": { 87 "applies": true, 88 "answer": true, 89 "justification": "The paper reports Pass@10 (correctness), average normalized runtime (efficiency), and percentage beats (LeetCode-specific efficiency). Multiple complementary metrics are used." 90 }, 91 "human_evaluation": { 92 "applies": false, 93 "answer": false, 94 "justification": "Human evaluation is not relevant to the paper's claims about code execution efficiency, which is measured automatically via runtime." 95 }, 96 "held_out_test_set": { 97 "applies": true, 98 "answer": true, 99 "justification": "The paper uses established benchmarks (HumanEval, MBPP) and a newly constructed LeetCodeEval as evaluation sets. No tuning or selection is performed against these benchmarks — models are evaluated as-is." 100 }, 101 "per_category_breakdown": { 102 "applies": true, 103 "answer": true, 104 "justification": "Results are broken down by benchmark (HumanEval vs MBPP vs LeetCodeEval) and by difficulty level within LeetCodeEval (easy, medium, hard). Table 2 and Table 3 show separate results per dataset." 
105 }, 106 "failure_cases_discussed": { 107 "applies": true, 108 "answer": false, 109 "justification": "Failed problems are simply excluded from runtime analysis. The paper notes overlap counts (70 problems all pass on HumanEval, 0 on hard LeetCodeEval) but does not discuss what kinds of problems models fail on or why." 110 }, 111 "negative_results_reported": { 112 "applies": true, 113 "answer": true, 114 "justification": "The paper reports that prompting has minimal effect on simple benchmarks (HumanEval/MBPP speedups near 1.0), that DeepSeek Coder shows almost no improvement from prompting (1.00-1.01x on some benchmarks), and that no model could solve enough hard LeetCode problems for comparison." 115 } 116 }, 117 "claims_and_evidence": { 118 "abstract_claims_supported": { 119 "applies": true, 120 "answer": true, 121 "justification": "The abstract claims that efficiency is evaluated, that simple prompts help basic problems, and that chain-of-thought helps complex problems. These are supported by Tables 2-4 in the paper." 122 }, 123 "causal_claims_justified": { 124 "applies": true, 125 "answer": false, 126 "justification": "The paper claims 'training strategy and data have an impact on the efficiency of the generated code' based on comparing DeepSeek Coder base vs instruct, but this is a single observational comparison with many confounding differences between the two versions. No controlled experiment isolates training strategy." 127 }, 128 "generalization_bounded": { 129 "applies": true, 130 "answer": false, 131 "justification": "The title claims to evaluate 'Source Code Generated by LLMs' broadly, but HumanEval/MBPP are Python-only and LeetCodeEval is C++-only. Claims about 'LLMs' are based on a limited set of models. These scope limitations are not explicitly bounded in the claims." 
132 }, 133 "alternative_explanations_discussed": { 134 "applies": true, 135 "answer": false, 136 "justification": "The paper does not discuss alternative explanations for key findings. For example, why GPT-3.5 generates more efficient code than GPT-4 could relate to RLHF training, code verbosity preferences, or other factors — none of which are discussed." 137 }, 138 "proxy_outcome_distinction": { 139 "applies": true, 140 "answer": false, 141 "justification": "The paper equates 'efficiency' with runtime but does not acknowledge that efficiency could encompass memory usage, energy consumption, or code maintainability. The proxy gap between 'runtime' and the broader 'efficiency' framing is not discussed." 142 } 143 }, 144 "setup_transparency": { 145 "model_versions_specified": { 146 "applies": true, 147 "answer": true, 148 "justification": "The paper specifies exact API model IDs: 'gpt-3.5-turbo-1106 and gpt-4-1106-preview'. Open-source models are identified with specific sizes (Phi-2 2.7B, Code Llama 7B/13B/34B, WizardCoder 7B/13B/34B, DeepSeek Coder 33B base/instruct)." 149 }, 150 "prompts_provided": { 151 "applies": true, 152 "answer": true, 153 "justification": "Figure 2 provides the LeetCodeEval prompt template, and Figure 3 provides the three prompting strategies for RQ2. For HumanEval/MBPP, they reference Liu et al.'s published source code for prompt generation. The fill values (problem descriptions, constraints) come from publicly available benchmarks." 154 }, 155 "hyperparameters_reported": { 156 "applies": true, 157 "answer": false, 158 "justification": "No temperature, top-p, max tokens, or other generation hyperparameters are reported for any model. Only k (number of generations: 10 for HumanEval/MBPP, 3 for LeetCodeEval) is stated." 159 }, 160 "scaffolding_described": { 161 "applies": false, 162 "answer": false, 163 "justification": "No agentic scaffolding is used. The paper evaluates direct LLM code generation from prompts." 
164 }, 165 "data_preprocessing_documented": { 166 "applies": true, 167 "answer": true, 168 "justification": "For LeetCodeEval, the paper documents the filtering pipeline: select problems from May 2023+, filter out problems with images, filter out problems with more downvotes than upvotes, divide by difficulty. Table 1 provides dataset statistics." 169 } 170 }, 171 "limitations_and_scope": { 172 "limitations_section_present": { 173 "applies": true, 174 "answer": true, 175 "justification": "Section 3 'THREATS TO VALIDITY' discusses potential data leakage and unstable runtime as threats, with specific mitigation strategies described." 176 }, 177 "threats_to_validity_specific": { 178 "applies": true, 179 "answer": true, 180 "justification": "The threats section discusses study-specific issues: (1) data leakage because training data contents are unknown, mitigated by temporal filtering for LeetCodeEval, and (2) unstable runtime, mitigated by gem5 simulator and repeated runs." 181 }, 182 "scope_boundaries_stated": { 183 "applies": true, 184 "answer": false, 185 "justification": "The paper does not explicitly state what the results do NOT show. There is no discussion of scope limitations such as generalizability to other programming languages, problem types, or non-tested models." 186 } 187 }, 188 "data_integrity": { 189 "raw_data_available": { 190 "applies": true, 191 "answer": true, 192 "justification": "The paper states 'We also make code, data and other artifacts available online [1]' at a GitHub repository (https://github.com/NougatCA/EfficiencyEval), suggesting raw data is available for verification." 193 }, 194 "data_collection_described": { 195 "applies": true, 196 "answer": true, 197 "justification": "For LeetCodeEval, the paper describes collection of URLs, titles, descriptions, examples, constraints, and code templates from LeetCode, with specific filtering criteria (post-May 2023, no images, positive vote ratio). 
HumanEval and MBPP are standard benchmarks with known provenance." 198 }, 199 "recruitment_methods_described": { 200 "applies": false, 201 "answer": false, 202 "justification": "No human participants. Data sources are standard benchmarks (HumanEval, MBPP) and a publicly available programming platform (LeetCode)." 203 }, 204 "data_pipeline_documented": { 205 "applies": true, 206 "answer": true, 207 "justification": "The pipeline is documented in Section 2.1.2: generate k responses → test correctness → select first passing code → measure runtime via gem5 (10 repetitions) for HumanEval/MBPP, and generate 3 codes → submit to LeetCode → record correctness and runtime (3 repetitions) for LeetCodeEval." 208 } 209 }, 210 "conflicts_of_interest": { 211 "funding_disclosed": { 212 "applies": true, 213 "answer": true, 214 "justification": "The Acknowledgments section lists 'Cooperation Fund of Huawei-NJU Creative Laboratory for the Next Programming, CCF-Huawei Populus Grove Fund, NSF award 2034508.'" 215 }, 216 "affiliations_disclosed": { 217 "applies": true, 218 "answer": true, 219 "justification": "Author affiliations are listed: Nanjing University, Singapore Management University, and University of Texas at Dallas. None of the authors are affiliated with the companies whose models are evaluated." 220 }, 221 "funder_independent_of_outcome": { 222 "applies": true, 223 "answer": true, 224 "justification": "Huawei funds the research but none of Huawei's own LLM products are evaluated. The paper evaluates OpenAI, Meta, Microsoft, and DeepSeek models. NSF is a government funder with no stake in results." 225 }, 226 "financial_interests_declared": { 227 "applies": true, 228 "answer": false, 229 "justification": "No competing interests or financial interests statement is included in the paper." 
230 } 231 }, 232 "contamination": { 233 "training_cutoff_stated": { 234 "applies": true, 235 "answer": false, 236 "justification": "The paper mentions GPT-4's knowledge cutoff as May 2023 for LeetCodeEval construction but acknowledges 'we are unable to get the data cut-offs for the other model.' Training cutoffs are not systematically stated for all evaluated models." 237 }, 238 "train_test_overlap_discussed": { 239 "applies": true, 240 "answer": true, 241 "justification": "Section 3 states: 'Potential data leakage is a threat to construct validity because we can not know if the data used for evaluation is present in the training data of models.' The paper acknowledges this risk explicitly." 242 }, 243 "benchmark_contamination_addressed": { 244 "applies": true, 245 "answer": true, 246 "justification": "The paper takes a concrete mitigation step: constructing LeetCodeEval with problems from May 2023+ (after GPT-4's training cutoff) specifically to reduce contamination risk. However, contamination for HumanEval and MBPP (published 2021) is only acknowledged, not mitigated." 247 } 248 }, 249 "human_studies": { 250 "pre_registered": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants in the study." 254 }, 255 "irb_or_ethics_approval": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants in the study." 259 }, 260 "demographics_reported": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants in the study." 264 }, 265 "inclusion_exclusion_criteria": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants in the study." 269 }, 270 "randomization_described": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants in the study." 274 }, 275 "blinding_described": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants in the study." 
279 }, 280 "attrition_reported": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants in the study." 284 } 285 }, 286 "cost_and_practicality": { 287 "inference_cost_reported": { 288 "applies": true, 289 "answer": false, 290 "justification": "No API costs, token counts, or inference latency are reported despite using commercial APIs (GPT-3.5, GPT-4) and generating thousands of code samples." 291 }, 292 "compute_budget_stated": { 293 "applies": true, 294 "answer": false, 295 "justification": "No total computational budget is stated. The paper uses the gem5 CPU simulator and OpenAI APIs but does not quantify the total compute resources used." 296 } 297 }, 298 "experimental_rigor": { 299 "seed_sensitivity_reported": { 300 "applies": true, 301 "answer": false, 302 "justification": "No random seed sensitivity analysis is reported. The paper generates k responses per problem but does not discuss seed variation or its effect on results." 303 }, 304 "number_of_runs_stated": { 305 "applies": true, 306 "answer": true, 307 "justification": "The paper states 'we repeat the execution of each piece of code for 10 times' on gem5 and 'repeat the submission of each piece of code for 3 times' on LeetCode." 308 }, 309 "hyperparameter_search_budget": { 310 "applies": true, 311 "answer": false, 312 "justification": "No hyperparameter search budget is reported. Generation parameters (temperature, top-p) are not even stated, let alone any search over them." 313 }, 314 "best_config_selection_justified": { 315 "applies": true, 316 "answer": false, 317 "justification": "The choice of k=10 for HumanEval/MBPP and k=3 for LeetCodeEval is not justified. The selection of 'first passing code' for efficiency evaluation is described but the rationale for this specific selection strategy is not discussed." 
318 }, 319 "multiple_comparison_correction": { 320 "applies": false, 321 "answer": false, 322 "justification": "No statistical significance tests are performed at all, so the question of correction for multiple comparisons does not arise." 323 }, 324 "self_comparison_bias_addressed": { 325 "applies": true, 326 "answer": false, 327 "justification": "The paper does not discuss potential evaluation bias. While they evaluate third-party models rather than their own system, they do not acknowledge potential biases in their evaluation methodology (e.g., prompt template design favoring certain models)." 328 }, 329 "compute_budget_vs_performance": { 330 "applies": true, 331 "answer": false, 332 "justification": "The paper compares models of vastly different compute requirements (2.7B Phi-2 vs GPT-4) without discussing or normalizing for computational cost. No performance-vs-compute analysis is provided." 333 }, 334 "benchmark_construct_validity": { 335 "applies": true, 336 "answer": true, 337 "justification": "The paper argues that HumanEval/MBPP have limited test cases that don't adequately reveal efficiency differences, motivating LeetCodeEval: 'Comprehensive test cases on LeetCode can make the runtime benefits of code with real less complexity more significant, and thus more accurately reflect the efficiency.' This is an explicit construct validity argument." 338 }, 339 "scaffold_confound_addressed": { 340 "applies": false, 341 "answer": false, 342 "justification": "No scaffolding or agentic tools are involved. Models are evaluated via direct prompting." 343 } 344 }, 345 "data_leakage": { 346 "temporal_leakage_addressed": { 347 "applies": true, 348 "answer": true, 349 "justification": "The paper explicitly addresses temporal leakage by constructing LeetCodeEval with problems from 'May 2023 and later (this is the latest GPT-4 knowledge cut-off)' to avoid training data contamination." 
350 }, 351 "feature_leakage_addressed": { 352 "applies": true, 353 "answer": false, 354 "justification": "No discussion of whether the evaluation setup leaks answer information through prompt context, code templates, or test case design." 355 }, 356 "non_independence_addressed": { 357 "applies": true, 358 "answer": false, 359 "justification": "No discussion of whether HumanEval, MBPP, or LeetCode problems share structural similarities with training data beyond temporal overlap." 360 }, 361 "leakage_detection_method": { 362 "applies": true, 363 "answer": true, 364 "justification": "Temporal splitting is used as a concrete leakage prevention method for LeetCodeEval: only problems from May 2023 onward (the GPT-4 knowledge cutoff cited by the paper) are included." 365 } 366 } 367 }, 368 "claims": [ 369 { 370 "claim": "The ability to generate correct code is not positively correlated with the ability to generate efficient code.", 371 "evidence": "Table 2: GPT-4 has higher Pass@10 (98.2 HumanEval, 94.2 MBPP) but higher normalized runtime (8.61, 9.14) than GPT-3.5 (87.2, 88.7 Pass@10; 8.35, 8.86 runtime). Phi-2 has the lowest Pass@10 but competitive runtime.", 372 "supported": "moderate" 373 }, 374 { 375 "claim": "Larger number of parameters does not promise higher code efficiency performance.", 376 "evidence": "Table 2: Code Llama 7B/13B/34B have similar runtimes (9.95/9.87/9.93 HumanEval; 9.58/9.61/9.54 MBPP). WizardCoder shows slight improvement with size but the pattern is inconsistent.", 377 "supported": "moderate" 378 }, 379 { 380 "claim": "Training strategy and data have an impact on the efficiency of generated code.", 381 "evidence": "Table 2: DeepSeek Coder 33B Instruct (7.54 runtime) substantially outperforms the Base version (9.40 runtime) on HumanEval. 
Section 2.1.5 attributes this to instruct-tuning.", 382 "supported": "weak" 383 }, 384 { 385 "claim": "Chain-of-thought prompting yields larger speedups on complex problems than simple efficiency-requesting prompts.", 386 "evidence": "Table 4: On LeetCodeEval Medium, Prompt 3 achieves 1.18x speedup for GPT-4 vs 1.07x for Prompt 1. On HumanEval, all prompts yield similar modest speedups (~1.04-1.06x).", 387 "supported": "moderate" 388 }, 389 { 390 "claim": "GPT-4 generates the most efficient code on more complex problems (LeetCodeEval).", 391 "evidence": "Table 3: GPT-4 has the lowest runtime (30.89 easy, 50.92 medium) and highest percentage beats (65.51% easy, 73.09% medium) on LeetCodeEval.", 392 "supported": "moderate" 393 } 394 ], 395 "red_flags": [ 396 { 397 "flag": "No error bars or statistical tests", 398 "detail": "All comparisons across models and prompting strategies are based on point estimates only. The paper repeats runs (10x for gem5, 3x for LeetCode) but never reports variance, and makes comparative claims without significance tests." 399 }, 400 { 401 "flag": "Severely limited comparison set on harder problems", 402 "detail": "On LeetCodeEval, only 24/44 easy problems have all models passing, only 3/85 medium problems have all three models passing, and 0/33 hard problems. The medium and hard comparisons are based on tiny or empty sample sizes." 403 }, 404 { 405 "flag": "First-passing-code selection introduces bias", 406 "detail": "Among k=10 (or k=3) generated solutions, the paper uses 'the first passing code' for efficiency evaluation. This selection strategy could systematically favor certain solution patterns (e.g., simpler implementations that appear first) and conflates generation order with efficiency." 407 }, 408 { 409 "flag": "Missing generation hyperparameters", 410 "detail": "Temperature, top-p, and other generation parameters are not reported for any model. 
These significantly affect output diversity and quality, making replication impossible." 411 } 412 ], 413 "cited_papers": [ 414 { 415 "title": "Evaluating Large Language Models Trained on Code", 416 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 417 "year": 2021, 418 "arxiv_id": "2107.03374", 419 "relevance": "Introduces HumanEval, the primary benchmark used in this paper and a key benchmark for evaluating LLM code generation." 420 }, 421 { 422 "title": "Program Synthesis with Large Language Models", 423 "authors": ["Jacob Austin", "Augustus Odena", "Maxwell Nye"], 424 "year": 2021, 425 "arxiv_id": "2108.07732", 426 "relevance": "Introduces MBPP, the second primary benchmark used in this evaluation of LLM code generation." 427 }, 428 { 429 "title": "GPT-4 Technical Report", 430 "authors": ["OpenAI"], 431 "year": 2023, 432 "arxiv_id": "2303.08774", 433 "relevance": "Technical report for GPT-4, one of the primary models evaluated in this code efficiency study." 434 }, 435 { 436 "title": "Code Llama: Open Foundation Models for Code", 437 "authors": ["Baptiste Roziere", "Jonas Gehring", "Fabian Gloeckle"], 438 "year": 2023, 439 "arxiv_id": "2308.12950", 440 "relevance": "Introduces Code Llama, one of the open-source code LLM families evaluated for code efficiency." 441 }, 442 { 443 "title": "WizardCoder: Empowering Code Large Language Models with Evol-Instruct", 444 "authors": ["Ziyang Luo", "Can Xu", "Pu Zhao"], 445 "year": 2023, 446 "arxiv_id": "2306.08568", 447 "relevance": "Introduces WizardCoder, evaluated in this paper for code generation efficiency." 448 }, 449 { 450 "title": "Learning Performance-Improving Code Edits", 451 "authors": ["Aman Madaan", "Alexander Shypula", "Uri Alon"], 452 "year": 2024, 453 "relevance": "Proposes the PIE dataset for code optimization and evaluation approaches adapted by this paper for prompting strategies (Prompts 2 and 3)." 454 }, 455 { 456 "title": "Is Your Code Generated by ChatGPT Really Correct? 
Rigorous Evaluation of Large Language Models for Code Generation", 457 "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"], 458 "year": 2023, 459 "relevance": "Provides the evaluation framework and source code used for generating LLM responses on HumanEval and MBPP." 460 }, 461 { 462 "title": "Self-Refine: Iterative Refinement with Self-Feedback", 463 "authors": ["Aman Madaan", "Niket Tandon", "Prakhar Gupta"], 464 "year": 2023, 465 "arxiv_id": "2303.17651", 466 "relevance": "Self-refinement approach for LLMs applied to code optimization, related to the iterative prompting strategies evaluated." 467 }, 468 { 469 "title": "Evaluating the Code Quality of AI-Assisted Code Generation Tools: An Empirical Study on GitHub Copilot, Amazon CodeWhisperer, and ChatGPT", 470 "authors": ["Burak Yetiştiren", "Işık Özsoy", "Miray Ayerdem", "Eray Tüzün"], 471 "year": 2023, 472 "relevance": "Evaluates code quality of LLM-based tools (Copilot, CodeWhisperer, ChatGPT), complementary to this paper's efficiency focus." 473 }, 474 { 475 "title": "Llama 2: Open Foundation and Fine-Tuned Chat Models", 476 "authors": ["Hugo Touvron", "Louis Martin", "Kevin Stone"], 477 "year": 2023, 478 "relevance": "Foundation model for Code Llama, one of the model families evaluated for code generation efficiency." 479 } 480 ] 481 }