scan.json (24656B)
1 { 2 "paper": { 3 "title": "Carbon Footprint Evaluation of Code Generation through LLM as a Service", 4 "authors": ["Tina Vartziotis", "Maximilian Schmidt", "George Dasoulas", "Ippolyti Dellatolas", "Stefano Attademo", "Viet Dung Le", "Anke Wiechmann", "Tim Hoffmann", "Michael Keckeisen", "Sotirios Kotsopoulos"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2504.01036", 8 "doi": "10.48550/arXiv.2504.01036" 9 }, 10 "checklist": { 11 "artifacts": { 12 "code_released": { 13 "applies": true, 14 "answer": false, 15 "justification": "No repository URL, GitHub link, or code archive is provided anywhere in the paper. The energy measurement tools (E3, Microsoft Fabric) are mentioned but no analysis code or scripts are released." 16 }, 17 "data_released": { 18 "applies": true, 19 "answer": false, 20 "justification": "No dataset download link or raw data release is provided. The paper reports summary statistics (Tables 1-4) but no underlying data files are made available." 21 }, 22 "environment_specified": { 23 "applies": true, 24 "answer": false, 25 "justification": "The paper mentions using Windows Energy Estimation Engine (E3) on a Windows laptop with battery, and Intel's computational method for server estimates, but no software versions, OS version, hardware specifications (beyond generic mention of CPU/memory), or dependency specifications are provided." 26 }, 27 "reproduction_instructions": { 28 "applies": true, 29 "answer": false, 30 "justification": "No step-by-step reproduction instructions are provided. The methodology is described at a high level but there are no scripts, commands, or detailed procedures that would allow replication." 31 } 32 }, 33 "statistical_methodology": { 34 "confidence_intervals_or_error_bars": { 35 "applies": true, 36 "answer": false, 37 "justification": "All results are reported as single point estimates (e.g., 1.777 kgCO2e total, 9.203 kWh embodied energy in Table 4). 
No confidence intervals, error bars, or uncertainty measures are provided." 38 }, 39 "significance_tests": { 40 "applies": true, 41 "answer": false, 42 "justification": "The paper claims '78 hours without Copilot reduced to just over 5 hours' and that operational emissions are 'eight times less than the equivalent embodied carbon emissions' without any statistical tests. These are comparative claims supported only by point estimates." 43 }, 44 "effect_sizes_reported": { 45 "applies": true, 46 "answer": false, 47 "justification": "While the paper reports absolute numbers (1.582 kgCO2e embodied vs 0.194 kgCO2e operational), there is no formal effect size reporting. The '78 hours vs 5 hours' time savings comparison lacks baseline context for how the 78-hour estimate was derived." 48 }, 49 "sample_size_justified": { 50 "applies": true, 51 "answer": false, 52 "justification": "The study evaluates a single project (149 frontend + 50 backend test files). There is no justification for why this sample is sufficient to draw general conclusions, nor any acknowledgment that a single case study limits generalizability." 53 }, 54 "variance_reported": { 55 "applies": true, 56 "answer": false, 57 "justification": "No variance, standard deviation, or spread measures are reported. All measurements appear to be from single runs. No repeated measurements are mentioned." 58 } 59 }, 60 "evaluation_design": { 61 "baselines_included": { 62 "applies": true, 63 "answer": false, 64 "justification": "The paper briefly compares LLM-generated code time (5 hours) vs manual coding time (78 hours) but does not include a proper baseline comparison for carbon footprint. There is no comparison against human-written test code's carbon footprint or against alternative LLM tools." 65 }, 66 "baselines_contemporary": { 67 "applies": true, 68 "answer": false, 69 "justification": "No formal baselines are included, so the question of whether they are contemporary is moot. 
The paper only uses GitHub Copilot without comparing to any other tools or approaches." 70 }, 71 "ablation_study": { 72 "applies": false, 73 "answer": false, 74 "justification": "The system is a measurement framework applied to a single tool (GitHub Copilot). There are no components to ablate — the study measures carbon footprint of a fixed pipeline." 75 }, 76 "multiple_metrics": { 77 "applies": true, 78 "answer": true, 79 "justification": "The paper reports multiple metrics: embodied energy, operational energy, carbon intensity, embodied carbon emissions, operational carbon emissions, total carbon emissions (Table 4), plus time savings, token counts, and consumption rates (Tables 1-2)." 80 }, 81 "human_evaluation": { 82 "applies": false, 83 "answer": false, 84 "justification": "The paper measures energy consumption and carbon emissions of code execution, which are objective physical measurements. Human evaluation of outputs is not relevant to these claims." 85 }, 86 "held_out_test_set": { 87 "applies": false, 88 "answer": false, 89 "justification": "This is not a benchmark evaluation paper. The study measures carbon footprint of a specific software project, not model performance on a test set." 90 }, 91 "per_category_breakdown": { 92 "applies": true, 93 "answer": true, 94 "justification": "The paper provides breakdowns by frontend vs backend (Tables 1, 3) and by embodied vs operational carbon (Table 4), rather than only reporting aggregate numbers." 95 }, 96 "failure_cases_discussed": { 97 "applies": true, 98 "answer": true, 99 "justification": "The paper notes that 'tests for the backend applications had a 50% accuracy rate, which required a software engineer to correct and rewrite part of the generated code' (Section 4.2). This is an honest discussion of where the approach partially failed." 
100 }, 101 "negative_results_reported": { 102 "applies": true, 103 "answer": true, 104 "justification": "The paper reports that the embodied carbon of Copilot 'can not be accurately estimated due to the lack of information' (Section 3.1), and that the backend test accuracy was only 50%, which are negative findings about the approach's limitations." 105 } 106 }, 107 "claims_and_evidence": { 108 "abstract_claims_supported": { 109 "applies": true, 110 "answer": true, 111 "justification": "The abstract claims are modest: presenting 'an overview of green coding and metrics,' introducing 'LLM as a service,' and defining 'embodied and operational carbon.' These descriptive claims are supported by the paper's content in Sections 2-4." 112 }, 113 "causal_claims_justified": { 114 "applies": true, 115 "answer": false, 116 "justification": "The paper claims GitHub Copilot 'reduced' development time from 78 hours to 5 hours (Section 3.1), which is a causal claim. However, the 78-hour baseline is described as an estimate without explanation of how it was derived. No controlled comparison was conducted to support this causal claim." 117 }, 118 "generalization_bounded": { 119 "applies": true, 120 "answer": false, 121 "justification": "The conclusion discusses the 'environmental implications of AI-driven code generation' broadly and mentions implications for 'industries like automotive,' but the study is based on a single project at one company using one tool. The title ('Carbon Footprint Evaluation of Code Generation through LLM as a Service') is broader than what a single case study can support." 122 }, 123 "alternative_explanations_discussed": { 124 "applies": true, 125 "answer": false, 126 "justification": "The paper does not discuss alternative explanations for its findings. For example, the time savings could be partly due to task simplicity rather than Copilot effectiveness. 
The carbon estimates rely on Intel's simplified method without discussing how different estimation approaches might yield different results." 127 } 128 }, 129 "setup_transparency": { 130 "model_versions_specified": { 131 "applies": true, 132 "answer": false, 133 "justification": "The paper refers to 'GitHub Copilot' without specifying which version or underlying model was used. No model version, snapshot date, or API version is provided." 134 }, 135 "prompts_provided": { 136 "applies": true, 137 "answer": false, 138 "justification": "The paper describes that Copilot was given 'source already written' as input to generate test modules (Table 1), but no actual prompts, prompt templates, or examples of inputs to Copilot are provided." 139 }, 140 "hyperparameters_reported": { 141 "applies": true, 142 "answer": false, 143 "justification": "No hyperparameters for GitHub Copilot are reported (temperature, sampling settings, etc.). The paper also does not report any configuration settings for the energy measurement tools." 144 }, 145 "scaffolding_described": { 146 "applies": false, 147 "answer": false, 148 "justification": "The paper uses GitHub Copilot as a third-party black-box tool. The authors cannot be expected to describe its internal scaffolding." 149 }, 150 "data_preprocessing_documented": { 151 "applies": true, 152 "answer": false, 153 "justification": "The paper does not describe how the input files were selected or prepared for Copilot, how E3 energy data was filtered beyond a brief mention of filtering by process name, or what preprocessing was done on the raw measurements." 154 } 155 }, 156 "limitations_and_scope": { 157 "limitations_section_present": { 158 "applies": true, 159 "answer": false, 160 "justification": "There is no dedicated limitations or threats-to-validity section. 
The paper mentions individual limitations in passing (e.g., E3 requiring battery-powered devices, inability to accurately estimate Copilot's embodied carbon) but has no structured discussion." 161 }, 162 "threats_to_validity_specific": { 163 "applies": true, 164 "answer": false, 165 "justification": "No specific threats to validity are discussed. The paper does not address measurement uncertainty, the representativeness of a single project, or the limitations of using Intel's simplified estimation method." 166 }, 167 "scope_boundaries_stated": { 168 "applies": true, 169 "answer": false, 170 "justification": "While Section 1.3 briefly states 'This study excludes variability in grid carbon intensity, PUE, or hardware efficiency,' the paper does not explicitly state what populations, settings, or use cases the results do NOT generalize to. The conclusion draws broad implications without acknowledging scope boundaries." 171 } 172 }, 173 "data_integrity": { 174 "raw_data_available": { 175 "applies": true, 176 "answer": false, 177 "justification": "No raw data is available. Only summary tables (Tables 1-4) are provided. The raw E3 energy measurements, token logs, and timing data are not released." 178 }, 179 "data_collection_described": { 180 "applies": true, 181 "answer": true, 182 "justification": "The data collection procedure is described: E3 monitors energy consumption in one-minute intervals (Section 4.2), Microsoft Fabric measures capacity unit seconds (Section 3.1), and the token estimation approach using Intel's method is detailed with the formula Energyemb = P * T * N (Section 3.1)." 183 }, 184 "recruitment_methods_described": { 185 "applies": false, 186 "answer": false, 187 "justification": "No human participants were recruited. This is a measurement study of code generation tools applied to an existing software project." 
188 }, 189 "data_pipeline_documented": { 190 "applies": true, 191 "answer": false, 192 "justification": "The pipeline from raw E3 recordings to final carbon footprint numbers is only partially documented. The paper mentions filtering E3 records by process name and summing TotalEnergyConsumption values, but the full pipeline from measurement to the numbers in Table 4 is not fully traceable." 193 } 194 }, 195 "conflicts_of_interest": { 196 "funding_disclosed": { 197 "applies": true, 198 "answer": false, 199 "justification": "No funding source is disclosed. The Acknowledgments section mentions the work is 'inspired by a development project between Mercedes-Benz and TWT' and thanks IBM, Microsoft, and GitHub, but no explicit funding disclosure is made." 200 }, 201 "affiliations_disclosed": { 202 "applies": true, 203 "answer": true, 204 "justification": "Author affiliations are clearly listed: TWT GmbH, Harvard, MIT, Mercedes-Benz, and NTUA. The connection between the authors' employers (TWT and Mercedes-Benz) and the case study subject is visible." 205 }, 206 "funder_independent_of_outcome": { 207 "applies": true, 208 "answer": false, 209 "justification": "The work involves Mercedes-Benz (whose employees are co-authors) and TWT (employer of multiple authors), and the case study evaluates a tool used in their joint project. The Acknowledgments thank Microsoft and GitHub (makers of the evaluated tool). These relationships create potential conflicts that are not addressed." 210 }, 211 "financial_interests_declared": { 212 "applies": true, 213 "answer": false, 214 "justification": "No competing interests or financial interests statement is present in the paper. Authors from TWT and Mercedes-Benz evaluate tools used in their commercial project without a declaration of interests." 
215 } 216 }, 217 "contamination": { 218 "training_cutoff_stated": { 219 "applies": false, 220 "answer": false, 221 "justification": "The paper does not evaluate a pre-trained model's capability on a benchmark. It measures carbon footprint of code generation in a real project setting." 222 }, 223 "train_test_overlap_discussed": { 224 "applies": false, 225 "answer": false, 226 "justification": "No benchmark evaluation is performed. The study measures energy consumption and carbon emissions of a specific software project." 227 }, 228 "benchmark_contamination_addressed": { 229 "applies": false, 230 "answer": false, 231 "justification": "No benchmark is used. The study is a case study of carbon footprint measurement, not a model capability evaluation." 232 } 233 }, 234 "human_studies": { 235 "pre_registered": { 236 "applies": false, 237 "answer": false, 238 "justification": "No human participants are involved. This is a measurement study of software tools." 239 }, 240 "irb_or_ethics_approval": { 241 "applies": false, 242 "answer": false, 243 "justification": "No human participants are involved." 244 }, 245 "demographics_reported": { 246 "applies": false, 247 "answer": false, 248 "justification": "No human participants are involved." 249 }, 250 "inclusion_exclusion_criteria": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants are involved." 254 }, 255 "randomization_described": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants are involved." 259 }, 260 "blinding_described": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants are involved." 264 }, 265 "attrition_reported": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants are involved." 
269 } 270 }, 271 "cost_and_practicality": { 272 "inference_cost_reported": { 273 "applies": true, 274 "answer": true, 275 "justification": "The paper's central contribution is reporting inference cost. Table 4 reports embodied energy (9.203 kWh), operational energy (1.131 kWh), and total carbon emissions (1.777 kgCO2e). Token counts and consumption rates are also provided." 276 }, 277 "compute_budget_stated": { 278 "applies": true, 279 "answer": true, 280 "justification": "The total computational budget is stated: server power consumption P = 356W (350W CPU + 6W memory), token latency T = 0.47s, N = 203,717 tokens processed (Section 3.1). Total embodied energy of 9.203 kWh and operational energy of 1.131 kWh are reported." 281 } 282 } 283 }, 284 "claims": [ 285 { 286 "claim": "Using GitHub Copilot reduced development time from approximately 78 hours to just over 5 hours for the software testing case study.", 287 "evidence": "Section 3.1 states this time comparison but provides no methodology for how the 78-hour manual estimate was derived.", 288 "supported": "weak" 289 }, 290 { 291 "claim": "The total carbon footprint of the LLMaaS cycle for this project was 1.777 kgCO2e, with embodied carbon (1.582 kgCO2e) dominating operational carbon (0.194 kgCO2e).", 292 "evidence": "Table 4 provides the detailed breakdown. The embodied energy calculation uses Intel's simplified method with explicit parameters (Section 3.1).", 293 "supported": "moderate" 294 }, 295 { 296 "claim": "Operational carbon emissions are approximately eight times less than embodied carbon emissions.", 297 "evidence": "Table 4 shows 0.194 kgCO2e operational vs 1.582 kgCO2e embodied. 
The ratio is roughly 8:1 (Section 4.2).", 298 "supported": "moderate" 299 }, 300 { 301 "claim": "Frontend tests were all successfully generated by Copilot while backend tests had only a 50% accuracy rate.", 302 "evidence": "Section 4.2 states this directly, though no detailed error analysis or definition of 'accuracy' is provided.", 303 "supported": "moderate" 304 }, 305 { 306 "claim": "The methods and tools available to assess the environmental footprint of computing activities are inadequate.", 307 "evidence": "The conclusion states this (Section 5) but it is presented as an opinion rather than a finding supported by systematic evidence.", 308 "supported": "weak" 309 } 310 ], 311 "methodology_tags": ["case-study"], 312 "key_findings": "This paper introduces a taxonomy distinguishing embodied and operational carbon footprints in the 'LLM as a Service' (LLMaaS) cycle for code generation. Applied to a single case study using GitHub Copilot for software testing at Mercedes-Benz/TWT, the total carbon footprint was estimated at 1.777 kgCO2e, with embodied carbon (inference) dominating at ~89% of total emissions. Frontend test generation was fully successful while backend tests achieved only 50% accuracy, requiring manual correction.", 313 "red_flags": [ 314 { 315 "flag": "Single case study presented as general evaluation", 316 "detail": "The paper evaluates one project at one company using one tool but draws broad conclusions about 'Carbon Footprint Evaluation of Code Generation through LLM as a Service' generally. The title and conclusion significantly outrun what a single case study can support." 317 }, 318 { 319 "flag": "Unvalidated baseline estimate", 320 "detail": "The claim that manual development would take 78 hours (vs 5 hours with Copilot) is stated without any explanation of how the 78-hour estimate was derived. This is a critical comparison point that lacks methodological support." 
321 }, 322 { 323 "flag": "Simplified estimation method for core results", 324 "detail": "The embodied energy calculation uses 'Intel's simplified computational method' referenced only to 'a blog post by Intel on Medium' (footnote 5). The actual blog post URL is not provided, and the method's accuracy is not validated against actual measurements." 325 }, 326 { 327 "flag": "No uncertainty quantification", 328 "detail": "All carbon footprint numbers are reported as precise point estimates (e.g., 1.777 kgCO2e) despite relying on estimated parameters (server power, token latency) and a simplified calculation method. No error bounds or sensitivity analysis is provided." 329 }, 330 { 331 "flag": "Potential conflict of interest", 332 "detail": "Authors from TWT and Mercedes-Benz evaluate GitHub Copilot in a joint project, while acknowledging Microsoft and GitHub (Copilot's developer) 'for the discussions on automated code generation.' This relationship is not declared as a potential conflict." 333 }, 334 { 335 "flag": "No reproducibility artifacts", 336 "detail": "No code, data, or detailed measurement logs are released. The E3 energy measurements, raw token counts, and calculation scripts are not available for independent verification." 337 } 338 ], 339 "cited_papers": [ 340 { 341 "title": "Evaluating large language models trained on code", 342 "authors": ["M. Chen", "J. Tworek", "H. Jun"], 343 "year": 2021, 344 "arxiv_id": "2107.03374", 345 "relevance": "Foundational work on code generation with LLMs (Codex/HumanEval), directly relevant to evaluating LLM coding capabilities." 346 }, 347 { 348 "title": "Energy and policy considerations for deep learning in NLP", 349 "authors": ["E. Strubell", "A. Ganesh", "A. McCallum"], 350 "year": 2019, 351 "relevance": "Seminal work on the energy and carbon costs of training large NLP models, foundational to the green AI movement." 352 }, 353 { 354 "title": "Carbon emissions and large neural network training", 355 "authors": ["D. A. 
Patterson", "J. Gonzalez", "Q. V. Le"], 356 "year": 2021, 357 "arxiv_id": "2104.10350", 358 "relevance": "Quantifies carbon emissions of training large neural networks, providing baseline data for the environmental impact discussion." 359 }, 360 { 361 "title": "LLMCarbon: Modeling the end-to-end carbon footprint of large language models", 362 "authors": ["A. Faiz", "S. Kaneda", "R. Wang"], 363 "year": 2024, 364 "relevance": "Provides a model for estimating total carbon footprint of LLMs including embodied and operational costs, directly used in this paper's framework." 365 }, 366 { 367 "title": "Green AI", 368 "authors": ["R. Schwartz", "J. Dodge", "N. A. Smith", "O. Etzioni"], 369 "year": 2020, 370 "doi": "10.1145/3381831", 371 "relevance": "Advocates for considering computational efficiency alongside accuracy in AI research, foundational to the sustainability evaluation approach." 372 }, 373 { 374 "title": "Learn to code sustainably: An empirical study on LLM-based green code generation", 375 "authors": ["T. Vartziotis", "I. Dellatolas", "G. Dasoulas"], 376 "year": 2024, 377 "arxiv_id": "2403.03344", 378 "relevance": "Predecessor work by the same authors defining sustainability metrics for LLM-generated code, directly relevant to evaluating code quality and environmental impact." 379 }, 380 { 381 "title": "Measuring the carbon intensity of AI in cloud instances", 382 "authors": ["J. Dodge", "T. Prewitt", "R. Tachet des Combes"], 383 "year": 2022, 384 "doi": "10.1145/3531146.3533234", 385 "relevance": "Addresses measurement of AI carbon intensity in cloud computing, relevant to understanding environmental costs of AI-as-a-service." 386 }, 387 { 388 "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation", 389 "authors": ["J. Liu", "C. S. Xia", "Y. Wang", "L. 
Zhang"], 390 "year": 2023, 391 "relevance": "Evaluates the correctness of LLM-generated code (published at NeurIPS), relevant to understanding the quality of AI-generated code." 392 }, 393 { 394 "title": "Sustainable AI: environmental implications, challenges and opportunities", 395 "authors": ["C. Wu", "R. Raghavendra", "U. Gupta"], 396 "year": 2021, 397 "arxiv_id": "2111.00364", 398 "relevance": "Comprehensive analysis of environmental implications of AI systems including embodied carbon from hardware manufacturing." 399 }, 400 { 401 "title": "FrugalGPT: How to use large language models while reducing cost and improving performance", 402 "authors": ["L. Chen", "M. Zaharia", "J. Zou"], 403 "year": 2023, 404 "arxiv_id": "2305.05176", 405 "relevance": "Addresses cost reduction strategies for LLM usage, relevant to practical and sustainable deployment of AI coding assistants." 406 }, 407 { 408 "title": "Refactoring programs using large language models with few-shot examples", 409 "authors": ["A. Shirafuji", "Y. Oda", "J. Suzuki", "M. Morishita", "Y. Watanobe"], 410 "year": 2023, 411 "arxiv_id": "2311.11690", 412 "relevance": "Examines LLM-based code refactoring, relevant to understanding LLM contributions to code quality and sustainability." 413 } 414 ] 415 }