scan.json (27903B)
1 { 2 "paper": { 3 "title": "Learn to Code Sustainably: An Empirical Study on LLM-based Green Code Generation", 4 "authors": [ 5 "Tina Vartziotis", 6 "Ippolyti Dellatolas", 7 "George Dasoulas", 8 "Maximilian Schmidt", 9 "Florian Schneider", 10 "Tim Hoffmann", 11 "Sotirios Kotsopoulos", 12 "Michael Keckeisen" 13 ], 14 "year": 2024, 15 "venue": "arXiv", 16 "arxiv_id": "2403.03344", 17 "doi": "10.48550/arXiv.2403.03344" 18 }, 19 "scan_version": 2, 20 "active_modules": ["experimental_rigor", "data_leakage"], 21 "methodology_tags": ["benchmark-eval"], 22 "key_findings": "This paper evaluates the 'green capacity' of ChatGPT-3.5, GitHub Copilot, and Amazon CodeWhisperer on 6 LeetCode problems, measuring runtime, memory, FLOPs, and energy consumption. AI tools show partial ability to improve code sustainability when explicitly asked to optimize for specific metrics, but generally produce less efficient code than top human LeetCode submissions. Copilot demonstrates the strongest optimization capability, while ChatGPT and CodeWhisperer more frequently fail to produce valid optimized code for certain problems.", 23 "checklist": { 24 "artifacts": { 25 "code_released": { 26 "applies": true, 27 "answer": false, 28 "justification": "The paper states 'Our evaluation code is available here' (Section I.B) but no URL is visible in the text. No repository link or archive is provided." 29 }, 30 "data_released": { 31 "applies": true, 32 "answer": false, 33 "justification": "No dataset of generated code samples, measurement results, or test cases is released. The LeetCode problems are public but the paper's experimental data is not shared." 34 }, 35 "environment_specified": { 36 "applies": true, 37 "answer": false, 38 "justification": "Hardware is specified (Intel Core i7-6700k, 32GB RAM, Linux Mint 21.2, kernel 5.17.0-79-generic, perf 5.15.111, Python 3.12.0rc1) but no requirements.txt, Dockerfile, or full dependency specification is provided." 
39 }, 40 "reproduction_instructions": { 41 "applies": true, 42 "answer": false, 43 "justification": "No step-by-step reproduction instructions are provided. The methodology section describes the process conceptually but does not provide runnable commands or scripts." 44 } 45 }, 46 "statistical_methodology": { 47 "confidence_intervals_or_error_bars": { 48 "applies": true, 49 "answer": false, 50 "justification": "Results are reported as point estimates and heatmap values. Despite averaging 10 runtime measurements, no confidence intervals, error bars, or uncertainty ranges are reported." 51 }, 52 "significance_tests": { 53 "applies": true, 54 "answer": false, 55 "justification": "No statistical significance tests are used. Comparisons between tools and human submissions are based solely on raw metric values and Performance Delta calculations without any tests." 56 }, 57 "effect_sizes_reported": { 58 "applies": true, 59 "answer": true, 60 "justification": "The Performance Delta (PD) metric provides relative improvements with baseline context: e.g., 'Copilot achieves a PD of 0.61 on the NDTO coding problem. This implies that Copilot's optimized code consumes 0.61 times less energy than the initially generated code' (Section IV)." 61 }, 62 "sample_size_justified": { 63 "applies": true, 64 "answer": false, 65 "justification": "Only 6 LeetCode problems are used with no justification for why this number is sufficient. No power analysis or discussion of sample size adequacy is provided." 66 }, 67 "variance_reported": { 68 "applies": true, 69 "answer": false, 70 "justification": "The paper states '10 runtime measurements' are averaged (Section III.D) but reports no standard deviation, IQR, or any spread measure. Only averages are presented." 
71 } 72 }, 73 "evaluation_design": { 74 "baselines_included": { 75 "applies": true, 76 "answer": true, 77 "justification": "Three AI code generation tools are compared against each other and against human submissions (top 0.05% of LeetCode submissions by runtime)." 78 }, 79 "baselines_contemporary": { 80 "applies": true, 81 "answer": true, 82 "justification": "ChatGPT 3.5, GitHub Copilot Individual, and Amazon CodeWhisperer Individual Tier were current commercial tools at the time of writing (2024)." 83 }, 84 "ablation_study": { 85 "applies": false, 86 "answer": false, 87 "justification": "The paper evaluates third-party code generation tools as black boxes rather than proposing a system with components to ablate." 88 }, 89 "multiple_metrics": { 90 "applies": true, 91 "answer": true, 92 "justification": "Five sustainability metrics are used: code correctness, runtime, memory usage, FLOPs, and energy consumption (Section II.B)." 93 }, 94 "human_evaluation": { 95 "applies": true, 96 "answer": true, 97 "justification": "Code correctness is evaluated both via LeetCode's automated tests and by human analysis: 'The correctness is evaluated by LeetCode and by human evaluation' (Section III.D)." 98 }, 99 "held_out_test_set": { 100 "applies": true, 101 "answer": true, 102 "justification": "Code is submitted to LeetCode's comprehensive test suite which includes 'tests encompassing critical inputs and boundary scenarios' (Section III.D). No tuning is performed on these test cases." 103 }, 104 "per_category_breakdown": { 105 "applies": true, 106 "answer": true, 107 "justification": "Results are broken down per coding problem (6 problems) and per sustainability metric. Figures 2-5 show per-problem, per-tool breakdowns for GC, energy, runtime, and memory." 
108 }, 109 "failure_cases_discussed": { 110 "applies": true, 111 "answer": true, 112 "justification": "Failures are discussed: 'ChatGPT fails to produce a valid code output for 3 out of 6 coding problems' for runtime, and 'CodeWhisperer and ChatGPT fail to produce or optimize codes for 3 and 4 coding problems, respectively' for energy (Section IV)." 113 }, 114 "negative_results_reported": { 115 "applies": true, 116 "answer": true, 117 "justification": "Negative PD values are reported, showing that optimization requests sometimes make code worse. 'Copilot outputs code whose runtime is worse than the initial implementation' for some problems (Section IV). The overall finding that AI tools are less efficient than human submissions is also a negative result." 118 } 119 }, 120 "claims_and_evidence": { 121 "abstract_claims_supported": { 122 "applies": true, 123 "answer": true, 124 "justification": "The abstract claims 'findings shed light on the current capacity of AI models to contribute to sustainable software development,' which is appropriately hedged. Results support that AI tools show partial optimization capability but lag behind human submissions." 125 }, 126 "causal_claims_justified": { 127 "applies": true, 128 "answer": true, 129 "justification": "The main causal claim — that requesting optimization improves sustainability metrics — is based on a controlled manipulation (initial prompt vs. optimization prompt to the same tool). This is an adequate design for this specific claim." 130 }, 131 "generalization_bounded": { 132 "applies": true, 133 "answer": false, 134 "justification": "The title claims 'LLM-based Green Code Generation' broadly, but the study tests only Python, only 6 LeetCode algorithmic problems, and only 3 specific tools. No explicit bounds are placed on generalization despite the narrow scope." 135 }, 136 "alternative_explanations_discussed": { 137 "applies": true, 138 "answer": false, 139 "justification": "No alternative explanations are discussed.
The paper does not consider prompt sensitivity, problem selection bias, the effect of using the tools' default interfaces, or whether the LeetCode problems are representative of real-world coding tasks." 140 }, 141 "proxy_outcome_distinction": { 142 "applies": true, 143 "answer": true, 144 "justification": "The paper explicitly discusses the relationship between its metrics and actual sustainability, noting that 'FLOPs are the metric most representative of a code's sustainability' but 'the link between FLOPs and energy consumption or runtime is not always straightforward' (Section II.B). The authors also scope their study to code-level efficiency, excluding grid carbon intensity, PUE, and hardware variation." 145 } 146 }, 147 "setup_transparency": { 148 "model_versions_specified": { 149 "applies": true, 150 "answer": false, 151 "justification": "Only product-level versions are given: 'ChatGPT version 3.5, GitHub Copilot Individual Version, and Amazon CodeWhisperer Individual Tier' (Section III.C). No API version, snapshot date, or model checkpoint identifiers are provided." 152 }, 153 "prompts_provided": { 154 "applies": true, 155 "answer": false, 156 "justification": "Only the optimization instruction pattern is provided ('Q: Give me a runtime-optimized solution for this problem'). The full task descriptions from LeetCode and function headers are referenced but not reproduced in the paper or a linked repository." 157 }, 158 "hyperparameters_reported": { 159 "applies": true, 160 "answer": false, 161 "justification": "No API hyperparameters (temperature, top-p, max tokens) are reported for any of the three tools. The paper does not mention any sampling or generation settings." 162 }, 163 "scaffolding_described": { 164 "applies": false, 165 "answer": false, 166 "justification": "The paper evaluates ChatGPT, Copilot, and CodeWhisperer as third-party tools through their standard interfaces. The authors cannot describe internal scaffolding they have no access to."
167 }, 168 "data_preprocessing_documented": { 169 "applies": true, 170 "answer": true, 171 "justification": "Test case creation is documented: the paper describes random input generation for problems like Search, iteration counts chosen per problem complexity (1,000 or 100,000), and the use of perf and tracemalloc for measurements (Section III.C-D)." 172 } 173 }, 174 "limitations_and_scope": { 175 "limitations_section_present": { 176 "applies": true, 177 "answer": false, 178 "justification": "No dedicated limitations or threats-to-validity section exists. The conclusion mentions future work directions (broader problems, better prompts, more metrics, hardware variability) but these are aspirations, not a discussion of limitations." 179 }, 180 "threats_to_validity_specific": { 181 "applies": true, 182 "answer": false, 183 "justification": "No specific threats to validity are discussed. Issues such as the small problem set, contamination risk, prompt sensitivity, and platform-specific measurement artifacts are not acknowledged." 184 }, 185 "scope_boundaries_stated": { 186 "applies": true, 187 "answer": false, 188 "justification": "No explicit scope boundaries are stated. The paper does not clearly state what the results do NOT show or what populations/settings are excluded from the claims." 189 } 190 }, 191 "data_integrity": { 192 "raw_data_available": { 193 "applies": true, 194 "answer": false, 195 "justification": "No raw measurement data, generated code samples, or detailed per-run results are made available for independent verification." 196 }, 197 "data_collection_described": { 198 "applies": true, 199 "answer": true, 200 "justification": "The code generation process (Section III.B), measurement procedures using perf and tracemalloc (Section III.D), and hardware setup (Section III.C) are described in sufficient detail to understand data collection."
201 }, 202 "recruitment_methods_described": { 203 "applies": false, 204 "answer": false, 205 "justification": "No human participants were recruited. Human baseline code comes from LeetCode's public submission pool, which is a standard platform source." 206 }, 207 "data_pipeline_documented": { 208 "applies": true, 209 "answer": true, 210 "justification": "The pipeline from problem selection → code generation (initial and optimized) → correctness validation via LeetCode → sustainability measurement via perf/tracemalloc → GC calculation is documented across Sections III.A-D and Figure 1." 211 } 212 }, 213 "conflicts_of_interest": { 214 "funding_disclosed": { 215 "applies": true, 216 "answer": false, 217 "justification": "No explicit funding source is disclosed. The acknowledgments mention 'This work inspired a development project between Mercedes-Benz and TWT' and gratitude to IBM, Microsoft, and GitHub for discussions, but no formal funding disclosure." 218 }, 219 "affiliations_disclosed": { 220 "applies": true, 221 "answer": true, 222 "justification": "All author affiliations are listed: TWT GmbH, NTUA, MIT, Harvard, and Mercedes-Benz. None of the authors are affiliated with OpenAI, GitHub/Microsoft, or Amazon (the companies whose tools are evaluated)." 223 }, 224 "funder_independent_of_outcome": { 225 "applies": true, 226 "answer": false, 227 "justification": "Funding is not disclosed, so independence cannot be assessed. The acknowledged connections to Microsoft and GitHub (whose product Copilot is evaluated) raise questions about potential conflicts." 228 }, 229 "financial_interests_declared": { 230 "applies": true, 231 "answer": false, 232 "justification": "No competing interests or financial interests statement is present in the paper." 233 } 234 }, 235 "contamination": { 236 "training_cutoff_stated": { 237 "applies": true, 238 "answer": false, 239 "justification": "No training data cutoff dates are stated for ChatGPT-3.5, Copilot, or CodeWhisperer. 
This is critical since LeetCode solutions are widely available online and almost certainly in training data." 240 }, 241 "train_test_overlap_discussed": { 242 "applies": true, 243 "answer": false, 244 "justification": "LeetCode problems and their solutions are among the most common programming benchmarks and very likely in the training data of all three models. This overlap is never discussed." 245 }, 246 "benchmark_contamination_addressed": { 247 "applies": true, 248 "answer": false, 249 "justification": "LeetCode problems have been publicly available for years before any of these models were trained. The paper does not address whether the models may have memorized solutions rather than generating them fresh." 250 } 251 }, 252 "human_studies": { 253 "pre_registered": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in this study. The 'human submissions' are existing LeetCode solutions from the platform's public pool." 257 }, 258 "irb_or_ethics_approval": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants; the study evaluates AI tools and uses publicly available LeetCode submissions." 262 }, 263 "demographics_reported": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in the study." 267 }, 268 "inclusion_exclusion_criteria": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in the study." 272 }, 273 "randomization_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants or experimental conditions requiring randomization." 277 }, 278 "blinding_described": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants or blinding-relevant experimental design." 282 }, 283 "attrition_reported": { 284 "applies": false, 285 "answer": false, 286 "justification": "No human participants in the study." 
287 } 288 }, 289 "cost_and_practicality": { 290 "inference_cost_reported": { 291 "applies": true, 292 "answer": false, 293 "justification": "No API costs, tokens consumed, or cost per code generation are reported for any of the three tools. The paper measures energy of executing generated code but not the cost of generating it." 294 }, 295 "compute_budget_stated": { 296 "applies": true, 297 "answer": false, 298 "justification": "Hardware is described (Intel i7-6700k, 32GB RAM) but no total computational budget, GPU hours, or API spend is reported." 299 } 300 }, 301 "experimental_rigor": { 302 "seed_sensitivity_reported": { 303 "applies": true, 304 "answer": false, 305 "justification": "No seed sensitivity analysis is reported. The non-deterministic nature of LLM outputs is not addressed — each tool was apparently queried once per problem per prompt variant." 306 }, 307 "number_of_runs_stated": { 308 "applies": true, 309 "answer": true, 310 "justification": "The number of measurement runs is stated: 'we chose to sample 10 runtime measurements' (Section III.D) and iterations per problem are specified (1,000 or 100,000)." 311 }, 312 "hyperparameter_search_budget": { 313 "applies": true, 314 "answer": false, 315 "justification": "No hyperparameter search is reported. Prompt wording, iteration counts, and tool configurations appear chosen without documented exploration of alternatives." 316 }, 317 "best_config_selection_justified": { 318 "applies": true, 319 "answer": false, 320 "justification": "The human baseline is selected as 'the top 0.05% of LeetCode submissions based on runtime,' explicitly cherry-picking the best human code. No justification for why single prompt variants represent each tool's best capability." 321 }, 322 "multiple_comparison_correction": { 323 "applies": false, 324 "answer": false, 325 "justification": "No statistical tests are performed, so multiple comparison correction is not applicable." 
326 }, 327 "self_comparison_bias_addressed": { 328 "applies": false, 329 "answer": false, 330 "justification": "The paper evaluates third-party tools (ChatGPT, Copilot, CodeWhisperer) rather than a system built by the authors, so self-comparison bias in the Lucic et al. sense does not apply." 331 }, 332 "compute_budget_vs_performance": { 333 "applies": true, 334 "answer": false, 335 "justification": "The cost of generating code via each tool is not reported or compared. The tools may use vastly different amounts of compute for generation, but this is not discussed." 336 }, 337 "benchmark_construct_validity": { 338 "applies": true, 339 "answer": false, 340 "justification": "No discussion of whether 6 algorithmic LeetCode problems actually measure 'green code generation' capability. LeetCode problems are designed for interview practice, not sustainability assessment, and this gap is not addressed." 341 }, 342 "scaffold_confound_addressed": { 343 "applies": false, 344 "answer": false, 345 "justification": "The three tools are evaluated as bundled products through their standard interfaces (ChatGPT web, Copilot IDE, CodeWhisperer IDE). The paper does not claim to isolate the model from the tool." 346 } 347 }, 348 "data_leakage": { 349 "temporal_leakage_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "LeetCode problems have been publicly available for years before these models were trained. The temporal overlap between training data and test problems is not discussed." 353 }, 354 "feature_leakage_addressed": { 355 "applies": true, 356 "answer": false, 357 "justification": "No discussion of whether providing the full LeetCode problem description and function signature gives the models enough context to recall memorized solutions rather than generate new ones." 
358 }, 359 "non_independence_addressed": { 360 "applies": true, 361 "answer": false, 362 "justification": "No discussion of whether the 6 LeetCode problems (or their solutions) appear in the models' training data. LeetCode is one of the most common sources in programming training corpora." 363 }, 364 "leakage_detection_method": { 365 "applies": true, 366 "answer": false, 367 "justification": "No leakage detection or prevention method is used. No canary strings, membership inference, or temporal analysis applied." 368 } 369 } 370 }, 371 "claims": [ 372 { 373 "claim": "AI models demonstrate partial understanding of optimization criteria and can achieve sustainability improvements when explicitly asked to optimize for specific metrics.", 374 "evidence": "Green Capacity values in Figure 2 show positive GC scores for ChatGPT and Copilot across most problems when optimization is requested (Section IV).", 375 "supported": "moderate" 376 }, 377 { 378 "claim": "Human submissions (top 0.05% LeetCode) generally surpass AI-generated code in sustainability metrics.", 379 "evidence": "Figure 2 shows GC_human generally exceeds GC_AI: 'selecting the runtime-optimized human submissions leads to generally surpassing the AI models in performance' (Section IV).", 380 "supported": "moderate" 381 }, 382 { 383 "claim": "Runtime-optimized human code achieves improvements across multiple sustainability metrics simultaneously, suggesting correlation among metrics.", 384 "evidence": "The paper states 'the same human submission can achieve improvements over multiple sustainability metrics (without explicitly optimizing them)' indicating 'a correlation among the sustainability metrics' (Section IV).", 385 "supported": "weak" 386 }, 387 { 388 "claim": "Copilot demonstrates the best optimization behavior across coding problems, particularly for energy consumption.", 389 "evidence": "Figure 3 shows Copilot achieves positive PD values across more problems than ChatGPT or CodeWhisperer for energy. 
'Copilot showcases the best behavior across the coding problems' (Section IV).", 390 "supported": "weak" 391 }, 392 { 393 "claim": "Memory was not a significant contributor to Green Capacity values, with most PD values being zero or non-positive.", 394 "evidence": "Figure 5 and Section IV: 'We observe a high number of zero PD values, highlighting that memory was not a significant contributor to the Green capacity values.'", 395 "supported": "moderate" 396 } 397 ], 398 "red_flags": [ 399 { 400 "flag": "Extremely small sample size", 401 "detail": "Only 6 LeetCode algorithmic problems are used. This is far too few to draw general conclusions about AI tools' green coding capabilities. No justification for this sample size." 402 }, 403 { 404 "flag": "No statistical tests or uncertainty quantification", 405 "detail": "Despite averaging 10 runtime measurements, no standard deviations, confidence intervals, or significance tests are reported. All comparisons are based on raw point estimates." 406 }, 407 { 408 "flag": "Contamination completely ignored", 409 "detail": "LeetCode problems are among the most common programming benchmarks and are almost certainly in the training data of all three models. The paper does not acknowledge or address this, making it impossible to know whether models are generating or recalling solutions." 410 }, 411 { 412 "flag": "Cherry-picked human baseline", 413 "detail": "The human baseline is the top 0.05% of LeetCode submissions — the absolute best human solutions on the platform. Comparing average AI tool output against elite human code is an unfair comparison presented without adequate framing." 414 }, 415 { 416 "flag": "No limitations section", 417 "detail": "The paper has no limitations or threats-to-validity section despite significant methodological gaps including tiny sample, no contamination controls, and no statistical analysis." 
418 }, 419 { 420 "flag": "Different tool interfaces confound comparison", 421 "detail": "ChatGPT is used via web browser (conversational), while Copilot and CodeWhisperer are used via IDE (comment-based code generation). These fundamentally different interaction modes are not controlled for or acknowledged as a confound." 422 } 423 ], 424 "cited_papers": [ 425 { 426 "title": "Evaluating the code quality of AI-assisted code generation tools: An empirical study on GitHub Copilot, Amazon CodeWhisperer, and ChatGPT", 427 "authors": ["B. Yetistiren", "I. Özsoy", "M. Ayerdem", "E. Tüzün"], 428 "year": 2023, 429 "doi": "10.48550/arXiv.2304.10778", 430 "relevance": "Directly evaluates code quality of the same three AI code generation tools studied in this paper." 431 }, 432 { 433 "title": "An empirical evaluation of GitHub Copilot's code suggestions", 434 "authors": ["N. Nguyen", "S. Nadi"], 435 "year": 2022, 436 "doi": "10.1145/3524842.3528470", 437 "relevance": "Empirical evaluation of GitHub Copilot code suggestions quality and correctness." 438 }, 439 { 440 "title": "A systematic evaluation of large language models of code", 441 "authors": ["F. F. Xu", "U. Alon", "G. Neubig", "V. J. Hellendoorn"], 442 "year": 2022, 443 "doi": "10.1145/3520312.3534862", 444 "relevance": "Systematic evaluation of LLMs for code generation, directly relevant to understanding code LLM capabilities." 445 }, 446 { 447 "title": "Green AI: Do deep learning frameworks have different costs?", 448 "authors": ["S. Georgiou", "M. Kechagia", "T. Sharma", "F. Sarro", "Y. Zou"], 449 "year": 2022, 450 "doi": "10.1145/3510003.3510221", 451 "relevance": "Evaluates environmental costs of deep learning frameworks, directly related to AI sustainability measurement." 452 }, 453 { 454 "title": "Energy and policy considerations for deep learning in NLP", 455 "authors": ["E. Strubell", "A. Ganesh", "A. 
McCallum"], 456 "year": 2019, 457 "doi": "10.18653/v1/P19-1355", 458 "relevance": "Foundational work on energy costs of training NLP models, key reference for AI sustainability research." 459 }, 460 { 461 "title": "Green AI", 462 "authors": ["R. Schwartz", "J. Dodge", "N. A. Smith", "O. Etzioni"], 463 "year": 2020, 464 "doi": "10.1145/3381831", 465 "relevance": "Defines the Green AI concept and argues for efficiency as a primary evaluation criterion alongside accuracy." 466 }, 467 { 468 "title": "Towards the systematic reporting of the energy and carbon footprints of machine learning", 469 "authors": ["P. Henderson", "J. Hu", "J. Romoff", "E. Brunskill", "D. Jurafsky", "J. Pineau"], 470 "year": 2020, 471 "doi": "10.48550/ARXIV.2002.05651", 472 "arxiv_id": "2002.05651", 473 "relevance": "Proposes systematic reporting of ML energy/carbon footprints, foundational for sustainability measurement methodology." 474 }, 475 { 476 "title": "Jigsaw: Large language models meet program synthesis", 477 "authors": ["N. Jain"], 478 "year": 2022, 479 "doi": "10.1145/3510003.3510203", 480 "relevance": "LLM-based program synthesis combining language models with program synthesis techniques." 481 }, 482 { 483 "title": "Energy efficiency across programming languages: How do energy, time, and memory relate?", 484 "authors": ["R. Pereira"], 485 "year": 2017, 486 "relevance": "Foundational study on energy efficiency of programming languages, relevant to green coding metrics." 487 }, 488 { 489 "title": "Assessing the quality of GitHub Copilot's code generation", 490 "authors": ["B. Yetistiren", "E. Tüzün", "I. Özsoy"], 491 "year": 2022, 492 "doi": "10.1145/3558489.3559072", 493 "relevance": "Quality assessment of Copilot-generated code, directly relevant to AI code generation evaluation." 494 } 495 ] 496 }