scan.json (29667B)
1 { 2 "paper": { 3 "title": "A Performance Study of LLM-Generated Code on Leetcode", 4 "authors": [ 5 "Tristan Coignion", 6 "Clément Quinton", 7 "Romain Rouvoy" 8 ], 9 "year": 2024, 10 "venue": "International Conference on Evaluation and Assessment in Software Engineering (EASE 2024)", 11 "arxiv_id": "2407.21579", 12 "doi": "10.1145/3661167.3661221" 13 }, 14 "scan_version": 2, 15 "active_modules": ["experimental_rigor", "data_leakage"], 16 "methodology_tags": ["benchmark-eval"], 17 "key_findings": "Across 18 code-generation LLMs evaluated on 204 Leetcode problems, runtime performance of generated code is largely similar regardless of model (Cohen's d = 0.024). LLM-generated code is on average faster than 73% of human submissions on Leetcode. Temperature increases performance variance but not mean performance (correlation 0.41 with variance, 0.05 with mean). The study also reveals serious data contamination issues in Leetcode evaluations, with a tenfold decrease in pass@k when using post-training-cutoff problems.", 18 "checklist": { 19 "artifacts": { 20 "code_released": { 21 "applies": true, 22 "answer": true, 23 "justification": "Section 3.4 states: 'All the artifacts of this study, including our results, code, and datasets, are available in the following public repository: https://zenodo.org/doi/10.5281/zenodo.7898304.'" 24 }, 25 "data_released": { 26 "applies": true, 27 "answer": true, 28 "justification": "The Zenodo replication package includes results and datasets (Section 3.4). Leetcode problems are publicly available, and the canonical solutions are from public sources (WalkCC repository, Leetcode community)." 
29 }, 30 "environment_specified": { 31 "applies": true, 32 "answer": false, 33 "justification": "Hardware is described (AMD EPYC 7301, 16 cores, 128GB memory, Grid5000 chiclet cluster in Section 3.3) and tools are mentioned (Deepspeed, pytest-benchmark), but no library versions, requirements.txt, or detailed software environment specification is provided in the paper." 34 }, 35 "reproduction_instructions": { 36 "applies": true, 37 "answer": true, 38 "justification": "Section 3.4 provides a Zenodo replication package with code and datasets. A companion notebook is also mentioned for results exploration. The methodology sections (2-4) describe the full procedure in reproducible detail." 39 } 40 }, 41 "statistical_methodology": { 42 "confidence_intervals_or_error_bars": { 43 "applies": true, 44 "answer": true, 45 "justification": "Figure 2 shows '95% confidence interval' for pass@1 results across difficulty levels and datasets." 46 }, 47 "significance_tests": { 48 "applies": true, 49 "answer": true, 50 "justification": "Section 4.2 describes pairwise Student t-tests on mean run times to determine significant differences between LLMs." 51 }, 52 "effect_sizes_reported": { 53 "applies": true, 54 "answer": true, 55 "justification": "Section 5.2 reports Cohen's d: 'the mean Cohen's d effect size measures a mere 0.024.' This contextualizes the statistical significance of performance differences." 56 }, 57 "sample_size_justified": { 58 "applies": true, 59 "answer": false, 60 "justification": "No power analysis or justification for the choice of 204 problems, 10 solutions per problem per temperature, or the minimum of 5 valid solutions for pairwise comparison. The sample sizes are stated but not justified." 61 }, 62 "variance_reported": { 63 "applies": true, 64 "answer": true, 65 "justification": "Standard deviation of canonical solution run times is reported across runs (Section 3.3: 'over 96% had a standard deviation lower than 1/10th of their average run time'). 
Coefficients of variation are reported in Figure 3 (0.089 Leetcode vs 0.035 local)." 66 } 67 }, 68 "evaluation_design": { 69 "baselines_included": { 70 "applies": true, 71 "answer": true, 72 "justification": "Human-written canonical solutions serve as baselines for performance comparison. The 18 LLMs are also compared against each other via pairwise analysis." 73 }, 74 "baselines_contemporary": { 75 "applies": true, 76 "answer": true, 77 "justification": "The 18 LLMs evaluated (CodeLlama, StarCoder, WizardCoder, CodeGen2.5, etc.) were all contemporary state-of-the-art code models at the time of the study (March-September 2023). Table 1 lists them." 78 }, 79 "ablation_study": { 80 "applies": false, 81 "answer": false, 82 "justification": "The paper evaluates existing LLMs on code generation tasks rather than proposing a system with components to ablate." 83 }, 84 "multiple_metrics": { 85 "applies": true, 86 "answer": true, 87 "justification": "Multiple metrics are reported: pass@1, pass@10, local run time (via pytest-benchmark), Leetcode time rank, memory usage, coefficient of variation, Cohen's d, and correlation coefficients." 88 }, 89 "human_evaluation": { 90 "applies": true, 91 "answer": false, 92 "justification": "All evaluation is automated: Leetcode judge for correctness, pytest-benchmark for runtime. No human evaluation of code quality, readability, or other qualitative aspects." 93 }, 94 "held_out_test_set": { 95 "applies": true, 96 "answer": true, 97 "justification": "The 'new' dataset uses only problems published after January 1, 2023, explicitly created as a temporal split to avoid contamination from training data. This serves as a held-out test set by design." 98 }, 99 "per_category_breakdown": { 100 "applies": true, 101 "answer": true, 102 "justification": "Results are broken down by difficulty level (easy, medium, hard) in Figure 2, by model in Tables 1-2 and Figure 6, and by temperature in the RQ3 analysis." 
103 }, 104 "failure_cases_discussed": { 105 "applies": true, 106 "answer": true, 107 "justification": "Section 3.2 details failure modes: 4,863 timeouts, 20 recursion errors, 47 other errors. Section 6.1 discusses InCoder performing worse than expected. Section 5.1.2 identifies Leetcode measurement unreliability. Only 3.6% of generated solutions were valid." 108 }, 109 "negative_results_reported": { 110 "applies": true, 111 "answer": true, 112 "justification": "Several negative findings: Leetcode measurements are unreliable (RQ1), memory usage measures decrease over time, rankings are affected by submissions, the low 3.6% solution validity rate, and performance differences between LLMs are negligibly small despite statistical significance." 113 } 114 }, 115 "claims_and_evidence": { 116 "abstract_claims_supported": { 117 "applies": true, 118 "answer": true, 119 "justification": "Abstract claims are supported: comparable performance across LLMs (Cohen's d = 0.024, Section 5.2), code more efficient than human average (73% rank, Section 5.4), data contamination discussion (RQ1, Section 5.1.1). The abstract's hedging ('on average') matches the evidence." 120 }, 121 "causal_claims_justified": { 122 "applies": true, 123 "answer": true, 124 "justification": "The main quasi-causal claim (temperature affects performance variance) is supported by controlled experimental variation of temperature as an independent variable across the same models and problems. Language is appropriately correlational ('moderately correlated (0.41)')." 125 }, 126 "generalization_bounded": { 127 "applies": true, 128 "answer": false, 129 "justification": "The abstract claims 'LLMs produce code with comparable performance, irrespective of the adopted LLM' but only 18 models were tested, all code-specific open-source models (0.35B-15.5B parameters) plus Copilot. No GPT-4, Claude, or larger models. Only Python, only algorithmic problems. 
Section 6.2 discusses some bounds but the abstract/conclusion claims are broader than the tested scope." 130 }, 131 "alternative_explanations_discussed": { 132 "applies": true, 133 "answer": true, 134 "justification": "Section 5.1.1 considers that 'The observed shift in functional validity between the two datasets could also arise from a genuine difference in the difficulty of the questions.' Section 6.1 discusses shared training datasets as an explanation for similar performance and notes InCoder may perform worse due to using it for left-to-right generation instead of infilling." 135 }, 136 "proxy_outcome_distinction": { 137 "applies": true, 138 "answer": true, 139 "justification": "The paper clearly defines what it measures (runtime via pytest-benchmark) and its claims are specifically about runtime performance on algorithmic problems. It does not conflate runtime with broader 'code quality' — Section 6.2 explicitly acknowledges they do not measure memory usage and results are limited to algorithmic problems." 140 } 141 }, 142 "setup_transparency": { 143 "model_versions_specified": { 144 "applies": true, 145 "answer": true, 146 "justification": "Table 1 lists specific model names with sizes (e.g., 'CodeGen-Mono 6B', 'CodeLlama-7B-instruct', 'StarCoder 15.5B') that map to specific Hugging Face model identifiers. GitHub Copilot is acknowledged as closed-source without version control." 147 }, 148 "prompts_provided": { 149 "applies": true, 150 "answer": true, 151 "justification": "Figure 1 shows a complete example prompt with the actual Leetcode problem description and function signature. Section 2.2 ('Input prompts') describes exactly what is included (problem description only, no examples or constraints) and why." 152 }, 153 "hyperparameters_reported": { 154 "applies": true, 155 "answer": true, 156 "justification": "Section 3.1 reports: temperatures (0.1, 0.2, 0.4, 0.6, 0.8, 1.0), nucleus sampling top_p = 0.95, max tokens = 600. 
Copilot's default temperature is noted as non-configurable." 157 }, 158 "scaffolding_described": { 159 "applies": false, 160 "answer": false, 161 "justification": "No agentic scaffolding is used. The study performs direct code generation from prompts to LLMs with no tool use, retry logic, or feedback mechanisms." 162 }, 163 "data_preprocessing_documented": { 164 "applies": true, 165 "answer": true, 166 "justification": "Section 3.2 documents the 3-step validation pipeline with exact counts: 210,120 generated → local validation → Leetcode validation → timeout/error exclusion → 7,481 valid solutions (3.6%). Error breakdown of the 4,930 excluded solutions: 4,863 timeouts (98.6%), 20 recursion errors (0.4%), 47 other errors (1.0%)." 167 } 168 }, 169 "limitations_and_scope": { 170 "limitations_section_present": { 171 "applies": true, 172 "answer": true, 173 "justification": "Section 6.2 'Limits and Threats to Validity' is a dedicated subsection with substantive discussion of multiple specific threats." 174 }, 175 "threats_to_validity_specific": { 176 "applies": true, 177 "answer": true, 178 "justification": "Section 6.2 identifies specific threats: Copilot retraining risk, reliance on Leetcode's test suite potentially favoring some implementations, 10-second timeout cut-off, missing memory analysis, recursion limit differences between local and Leetcode environments, and that algorithmic problems don't generalize to all programming domains." 179 }, 180 "scope_boundaries_stated": { 181 "applies": true, 182 "answer": true, 183 "justification": "Section 6.2 explicitly states what was NOT tested: memory usage ('we do not consider memory usage at all'), non-algorithmic code ('only evaluate the LLMs on algorithmic problems, the performances of the LLMs are hard to generalize across all programming fields'), and notes the ranking comparison limitations ('the ranking evolves and we have no information about the population')." 
184 } 185 }, 186 "data_integrity": { 187 "raw_data_available": { 188 "applies": true, 189 "answer": true, 190 "justification": "Section 3.4: 'All the artifacts of this study, including our results, code, and datasets, are available in the following public repository: https://zenodo.org/doi/10.5281/zenodo.7898304.'" 191 }, 192 "data_collection_described": { 193 "applies": true, 194 "answer": true, 195 "justification": "Section 2.2 describes problem selection (204 problems published after Jan 1, 2023), how test cases were extracted (crawling Leetcode instructions, submitting modified solutions to trigger timeout failures), and how canonical solutions were sourced (WalkCC repository, Leetcode community upvoted solutions)." 196 }, 197 "recruitment_methods_described": { 198 "applies": false, 199 "answer": false, 200 "justification": "No human participants. Data sources are standard public platforms (Leetcode problems, Hugging Face models) and their selection is documented." 201 }, 202 "data_pipeline_documented": { 203 "applies": true, 204 "answer": true, 205 "justification": "The full pipeline is documented with counts: 210,120 generated solutions → local validation → Leetcode validation → timeout exclusion → 7,481 valid solutions (Section 3.2). Error categories and counts at the final stage are provided (4,863 timeouts, 20 recursion errors, 47 other errors)." 206 } 207 }, 208 "conflicts_of_interest": { 209 "funding_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "Acknowledgments section discloses funding from ANR (France 2030 program) through CARECloud, DISTILLER, and KOALA projects, and use of Grid5000 testbed." 213 }, 214 "affiliations_disclosed": { 215 "applies": true, 216 "answer": true, 217 "justification": "All three authors are from Univ. Lille/CNRS/Inria, France. Affiliations are clearly stated. They evaluate third-party open-source models and a commercial tool (Copilot) — no conflict with their institution." 
218 }, 219 "funder_independent_of_outcome": { 220 "applies": true, 221 "answer": true, 222 "justification": "Funding is from ANR (French national research agency) and Grid5000 (academic infrastructure). Neither has a financial stake in LLM performance outcomes." 223 }, 224 "financial_interests_declared": { 225 "applies": true, 226 "answer": false, 227 "justification": "No competing interests or financial interests statement is present in the paper. Absence of a disclosure statement does not equal absence of conflict." 228 } 229 }, 230 "contamination": { 231 "training_cutoff_stated": { 232 "applies": true, 233 "answer": true, 234 "justification": "Section 2.2 states: 'the majority of their training datasets have a cut-off date between 2021 and 2022.' They also acknowledge that GitHub Copilot's training data cutoff is unknown as a closed-source tool." 235 }, 236 "train_test_overlap_discussed": { 237 "applies": true, 238 "answer": true, 239 "justification": "Extensively discussed in RQ1 (Section 5.1.1). They found a tenfold decrease in pass@k between old and new datasets, which they attribute to data contamination. They note that '3sum' has ~4,000 matches on GitHub, likely appearing in training data." 240 }, 241 "benchmark_contamination_addressed": { 242 "applies": true, 243 "answer": true, 244 "justification": "Central contribution of the paper. They mitigate contamination by using only problems published after January 1, 2023 (post-training-cutoff). They compare old vs new datasets to demonstrate contamination effects and discuss reproducibility implications for future studies." 245 } 246 }, 247 "human_studies": { 248 "pre_registered": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants in this study. It is a benchmark evaluation of LLMs on coding problems." 252 }, 253 "irb_or_ethics_approval": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants. 
The study evaluates LLM-generated code on algorithmic problems." 257 }, 258 "demographics_reported": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants." 262 }, 263 "inclusion_exclusion_criteria": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants." 267 }, 268 "randomization_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants." 272 }, 273 "blinding_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants." 277 }, 278 "attrition_reported": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants." 282 } 283 }, 284 "cost_and_practicality": { 285 "inference_cost_reported": { 286 "applies": true, 287 "answer": false, 288 "justification": "No API costs, per-problem generation time, or inference cost is reported. The total number of solutions generated (210,120) is stated but not the associated cost." 289 }, 290 "compute_budget_stated": { 291 "applies": true, 292 "answer": false, 293 "justification": "Hardware is described (Grid5000 chiclet cluster, AMD EPYC 7301, GPUs with Deepspeed) but total GPU hours, wall-clock time, or compute costs for the full experiment are not quantified." 294 } 295 }, 296 "experimental_rigor": { 297 "seed_sensitivity_reported": { 298 "applies": true, 299 "answer": false, 300 "justification": "No mention of random seeds or seed sensitivity analysis. The 10 generations per problem at different temperatures provide some diversity, but seed sensitivity is not explicitly reported." 301 }, 302 "number_of_runs_stated": { 303 "applies": true, 304 "answer": true, 305 "justification": "Section 3.1 states 10 solutions per problem per LLM. Section 3.3 states benchmarks run 'at least 10 times and for at least 1 second in total.' Six temperatures and 18 models are specified." 
306 }, 307 "hyperparameter_search_budget": { 308 "applies": true, 309 "answer": false, 310 "justification": "Settings like top_p=0.95 and max_tokens=600 are adopted from Chen et al. without reporting alternatives considered. Temperature is varied systematically but no search budget is reported for other hyperparameters." 311 }, 312 "best_config_selection_justified": { 313 "applies": true, 314 "answer": true, 315 "justification": "Section 4.1: 'we calculated the pass@k for each temperature when evaluating an LLM's functional correctness and considered the best one as the pass@k for that LLM.' The selection criterion is transparent." 316 }, 317 "multiple_comparison_correction": { 318 "applies": true, 319 "answer": false, 320 "justification": "Pairwise t-tests are performed across all pairs of 18 LLMs (153 pairs) without any mention of Bonferroni, Holm, or other corrections for multiple comparisons." 321 }, 322 "self_comparison_bias_addressed": { 323 "applies": false, 324 "answer": false, 325 "justification": "The authors evaluate third-party LLMs, not their own system. There is no self-comparison bias to address." 326 }, 327 "compute_budget_vs_performance": { 328 "applies": true, 329 "answer": false, 330 "justification": "Models range from 350M to 15.5B parameters but performance is not plotted or discussed as a function of model size or compute budget. The finding that model differences are negligible implicitly addresses this but no explicit analysis is provided." 331 }, 332 "benchmark_construct_validity": { 333 "applies": true, 334 "answer": true, 335 "justification": "RQ1 is entirely dedicated to evaluating Leetcode's validity as a benchmarking platform. They analyze measurement reliability (coefficient of variation), rank stability, memory measure drift, and data contamination effects. They compare Leetcode measures to local benchmarks." 
336 }, 337 "scaffold_confound_addressed": { 338 "applies": false, 339 "answer": false, 340 "justification": "No scaffolding is used. All models receive direct prompts and generate code completions without agent loops or tool use." 341 } 342 }, 343 "data_leakage": { 344 "temporal_leakage_addressed": { 345 "applies": true, 346 "answer": true, 347 "justification": "Major focus of the paper. Section 2.2: problems published after January 1, 2023 are used as the 'new' dataset, post-dating the training data cutoff of 2021-2022 for most models. The tenfold pass@k decrease between old and new datasets demonstrates the contamination effect." 348 }, 349 "feature_leakage_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "No discussion of whether the Leetcode prompt format or structure could leak information to the models. The prompts are presented exactly as Leetcode formats them, and the paper does not discuss whether this familiar format provides an unfair advantage." 353 }, 354 "non_independence_addressed": { 355 "applies": true, 356 "answer": false, 357 "justification": "No discussion of whether Leetcode problems share structural similarities that could introduce non-independence (e.g., similar problem types, data structures, or solution patterns appearing in both training and test sets)." 358 }, 359 "leakage_detection_method": { 360 "applies": true, 361 "answer": true, 362 "justification": "Temporal splitting is used as a concrete prevention method (problems after Jan 1, 2023). Additionally, the comparison between old and new dataset pass@k rates (tenfold decrease) serves as an empirical detection method for contamination. Section 5.1.1 also analyzes GitHub search results for specific prompts (~4,000 matches for '3sum')." 
363 } 364 } 365 }, 366 "claims": [ 367 { 368 "claim": "LLMs produce code with comparable runtime performance regardless of the model used (Cohen's d = 0.024).", 369 "evidence": "Section 5.2: Pairwise t-test comparisons across 18 LLMs on common problems. Mean Cohen's d effect size of 0.024. Figure 6 shows the pairwise comparison matrix.", 370 "supported": "strong" 371 }, 372 { 373 "claim": "LLM-generated code is on average faster than 73% of human-submitted Leetcode solutions.", 374 "evidence": "Section 5.4: Leetcode ranking for CodeGen-6B-mono shows mean rank of 73%. Figure 8 shows the distribution. However, rankings evolve with submissions (Section 5.1.2).", 375 "supported": "moderate" 376 }, 377 { 378 "claim": "Temperature increases performance variance but does not significantly affect mean performance.", 379 "evidence": "Section 5.3: Correlation of 0.05 between temperature and performance (negligible), but 0.41 between temperature and performance variance (moderate). Figure 7 shows correlation distribution.", 380 "supported": "strong" 381 }, 382 { 383 "claim": "Data contamination causes approximately tenfold inflation in LLM pass@k scores on Leetcode.", 384 "evidence": "Section 5.1.1 and Figure 2: Comparing old dataset (pre-2023 problems, likely in training data) to new dataset (post-2023) shows roughly tenfold decrease in pass@k. However, difficulty differences between datasets cannot be fully ruled out.", 385 "supported": "moderate" 386 }, 387 { 388 "claim": "Leetcode measurements are less reliable than local benchmarking (coefficient of variation 0.089 vs 0.035).", 389 "evidence": "Section 5.1.2 and Figure 3: Coefficient of variation comparison. Figure 4 shows scatter plot where Leetcode cannot distinguish clusters visible in local measurements. 
Correlation between local and Leetcode times is only 0.28.", 390 "supported": "strong" 391 }, 392 { 393 "claim": "Success rate of LLMs (functional correctness) does not substantially impact code performance.", 394 "evidence": "Section 5.3: Only slight negative correlation (-0.08) between success rate and run time, and close to no correlation (-0.11) between success rate and performance variation.", 395 "supported": "moderate" 396 } 397 ], 398 "red_flags": [ 399 { 400 "flag": "Very low valid solution rate limits comparison power", 401 "detail": "Only 7,481 out of 210,120 generated solutions (3.6%) were valid. Only 24 problems had sufficient valid solutions from 10+ models for pairwise comparison. This severely limits statistical power for RQ2." 402 }, 403 { 404 "flag": "No multiple comparison correction", 405 "detail": "Pairwise t-tests across 18 LLMs (153 pairs) are performed without Bonferroni or other corrections, inflating the risk of spurious significant differences." 406 }, 407 { 408 "flag": "Human comparison methodology is weak", 409 "detail": "The Leetcode ranking used for RQ4 (LLMs vs humans) evolves with submissions. The authors' own submissions changed the ranking: Copilot dropped from rank 77 to 54 after testing all models. The 'population' of human submitters is uncharacterized." 410 }, 411 { 412 "flag": "Narrow scope of tested models", 413 "detail": "All 18 models are code-specific open-source models (350M-15.5B parameters) plus Copilot. No GPT-4, Claude, or other frontier models are tested, but conclusions are framed broadly ('irrespective of the adopted LLM')." 414 }, 415 { 416 "flag": "Contamination alternative explanation not fully ruled out", 417 "detail": "The tenfold pass@k decrease between old and new datasets is attributed to data contamination, but the paper acknowledges 'quantifying this last hypothesis [difficulty difference] proves to be challenging.' The two datasets differ in composition (95/105/100 vs 56/104/44 by difficulty)." 
418 } 419 ], 420 "cited_papers": [ 421 { 422 "title": "Evaluating Large Language Models Trained on Code", 423 "authors": ["Mark Chen"], 424 "year": 2021, 425 "arxiv_id": "2107.03374", 426 "relevance": "Introduces HumanEval benchmark and pass@k metric used throughout this study for evaluating LLM code generation capability." 427 }, 428 { 429 "title": "StarCoder: May the Source Be with You!", 430 "authors": ["Raymond Li"], 431 "year": 2023, 432 "arxiv_id": "2305.06161", 433 "relevance": "Open-source code LLM evaluated in this study; achieved highest pass@1 (0.095) on Leetcode problems." 434 }, 435 { 436 "title": "Do Users Write More Insecure Code with AI Assistants?", 437 "authors": ["Neil Perry"], 438 "year": 2023, 439 "doi": "10.1145/3576915.3623157", 440 "relevance": "Studies security implications of AI code assistants; relevant to understanding real-world impact of LLM-generated code quality." 441 }, 442 { 443 "title": "Asleep at the Keyboard? Assessing the Security of GitHub Copilot's Code Contributions", 444 "authors": ["Hammond Pearce"], 445 "year": 2022, 446 "doi": "10.1109/SP46214.2022.9833571", 447 "relevance": "Evaluates security quality of Copilot code suggestions; complements this study's performance-focused evaluation." 448 }, 449 { 450 "title": "Program Synthesis with Large Language Models", 451 "authors": ["Jacob Austin"], 452 "year": 2021, 453 "arxiv_id": "2108.07732", 454 "relevance": "Introduces MBPP benchmark and studies temperature effects on code generation, which this paper builds upon." 455 }, 456 { 457 "title": "Piloting Copilot and Codex: Hot Temperature, Cold Prompts, or Black Magic?", 458 "authors": ["Jean-Baptiste Döderlein"], 459 "year": 2023, 460 "doi": "10.2139/ssrn.4496380", 461 "relevance": "Closely related prior work using Leetcode to evaluate Copilot/Codex; this paper extends the approach with performance measurement." 462 }, 463 { 464 "title": "Is your code generated by chatgpt really correct? 
Rigorous evaluation of large language models for code generation", 465 "authors": ["Jiawei Liu"], 466 "year": 2024, 467 "relevance": "Investigates correctness of LLM-generated code with rigorous evaluation methodology." 468 }, 469 { 470 "title": "Assessing the Quality of GitHub Copilot's Code Generation", 471 "authors": ["Burak Yetistiren"], 472 "year": 2022, 473 "doi": "10.1145/3558489.3559072", 474 "relevance": "Evaluates Copilot code quality including success rate measurement, a related evaluation dimension." 475 }, 476 { 477 "title": "Stop Uploading Test Data in Plain Text: Practical Strategies for Mitigating Data Contamination by Evaluation Benchmarks", 478 "authors": ["Alon Jacovi"], 479 "year": 2023, 480 "arxiv_id": "2305.10160", 481 "relevance": "Addresses data contamination in LLM evaluation, directly relevant to this paper's RQ1 findings on benchmark contamination." 482 }, 483 { 484 "title": "Learning Performance-Improving Code Edits", 485 "authors": ["Aman Madaan"], 486 "year": 2023, 487 "relevance": "Fine-tunes LLMs for code performance improvement; complements this study's finding that LLMs already generate reasonably performant code." 488 }, 489 { 490 "title": "Large Language Models and Simple, Stupid Bugs", 491 "authors": ["Kevin Jesse"], 492 "year": 2023, 493 "doi": "10.1109/MSR59073.2023.00082", 494 "relevance": "Studies prevalence of bugs in LLM-generated code, complementing this paper's focus on code performance." 495 }, 496 { 497 "title": "Code Llama: Open Foundation Models for Code", 498 "authors": ["Baptiste Rozière"], 499 "year": 2023, 500 "arxiv_id": "2308.12950", 501 "relevance": "Introduces CodeLlama family of models, five variants of which are evaluated in this study." 502 } 503 ] 504 }