scan.json (20671B)
1 { 2 "paper": { 3 "title": "AI's Environmental Cost: Comparing Resource Consumption Between SLMs and LLMs Across Queries", 4 "authors": ["Aryaanshi Sundaram", "Sparsh Kamdar", "Shreyas Kumar"], 5 "year": 2025, 6 "venue": "The Proceedings of the 5th International Conference on AI Research (ICAIR 2025)" 7 }, 8 "checklist": { 9 "artifacts": { 10 "code_released": { 11 "applies": true, 12 "answer": false, 13 "justification": "No repository URL or code archive is provided. Visualizations were done in Jupyter Notebook with Matplotlib but no code is shared." 14 }, 15 "data_released": { 16 "applies": true, 17 "answer": false, 18 "justification": "Data was organized in a Google Drive folder and Google Sheet, but no public link or download is provided in the paper." 19 }, 20 "environment_specified": { 21 "applies": true, 22 "answer": false, 23 "justification": "The paper mentions using Jupyter Notebook and Matplotlib but provides no dependency versions, requirements file, or environment specification." 24 }, 25 "reproduction_instructions": { 26 "applies": true, 27 "answer": false, 28 "justification": "The methodology section describes the procedure narratively but provides no step-by-step reproduction instructions, scripts, or README." 29 } 30 }, 31 "statistical_methodology": { 32 "confidence_intervals_or_error_bars": { 33 "applies": true, 34 "answer": false, 35 "justification": "Results report averages and standard deviations for some consumption values but no confidence intervals or error bars are reported on the main results." 36 }, 37 "significance_tests": { 38 "applies": true, 39 "answer": false, 40 "justification": "The paper claims SLMs consume 60-70% less energy and water than LLMs and makes comparative claims throughout, but no statistical significance tests are performed." 41 }, 42 "effect_sizes_reported": { 43 "applies": true, 44 "answer": false, 45 "justification": "Raw differences and 'difference factors' are reported but no formal effect sizes (Cohen's d, etc.) 
are provided. The percentage differences lack baseline context in a standardized form." 46 }, 47 "sample_size_justified": { 48 "applies": true, 49 "answer": false, 50 "justification": "48 questions with 3 repetitions each were used. No justification for why this sample size is adequate, and no power analysis is discussed." 51 }, 52 "variance_reported": { 53 "applies": true, 54 "answer": true, 55 "justification": "Standard deviations are reported for power consumption across questions in several figures (e.g., Claude Puzzles LLM std dev 0.00604 kW, SLM 0.0244 kW)." 56 } 57 }, 58 "evaluation_design": { 59 "baselines_included": { 60 "applies": true, 61 "answer": true, 62 "justification": "SLMs serve as baselines against LLMs, and three commercial AI systems (ChatGPT, Claude, DeepSeek) are compared against each other." 63 }, 64 "baselines_contemporary": { 65 "applies": true, 66 "answer": true, 67 "justification": "The models tested (GPT-4o, Claude Sonnet 4, DeepSeek R1/V3) are contemporary commercial models." 68 }, 69 "ablation_study": { 70 "applies": false, 71 "answer": false, 72 "justification": "The study compares existing commercial models; there is no system with components to ablate." 73 }, 74 "multiple_metrics": { 75 "applies": true, 76 "answer": true, 77 "justification": "The study reports power consumption (kW), water consumption (L), and accuracy across all models and query types." 78 }, 79 "human_evaluation": { 80 "applies": false, 81 "answer": false, 82 "justification": "Human evaluation is not relevant here; accuracy is assessed against known correct answers (SAT questions with answer keys)." 83 }, 84 "held_out_test_set": { 85 "applies": false, 86 "answer": false, 87 "justification": "This is not a machine learning training/evaluation study; questions are used as test stimuli for commercial models, not for model development." 
88 }, 89 "per_category_breakdown": { 90 "applies": true, 91 "answer": true, 92 "justification": "Results are broken down by subject (Math, Reading, Writing, Puzzles), by difficulty level (easy, medium, hard), and by model type (SLM vs LLM)." 93 }, 94 "failure_cases_discussed": { 95 "applies": true, 96 "answer": true, 97 "justification": "Specific question IDs that each model got wrong are listed in Section 4, and accuracy drops for Puzzles are discussed." 98 }, 99 "negative_results_reported": { 100 "applies": true, 101 "answer": true, 102 "justification": "The paper reports that SLM accuracy drops considerably for Puzzles, and that DeepSeek's high resource consumption is likely due to network latency rather than computation, which undermines its environmental comparison." 103 } 104 }, 105 "claims_and_evidence": { 106 "abstract_claims_supported": { 107 "applies": true, 108 "answer": true, 109 "justification": "The abstract claims that SLMs consumed 60-70% less energy/water and matched LLM accuracy for Math/Reading, while SLM accuracy declined for Puzzles. These claims are supported by the results in Section 4 and the Discussion." 110 }, 111 "causal_claims_justified": { 112 "applies": true, 113 "answer": false, 114 "justification": "The paper implies causal relationships (e.g., query complexity 'causes' higher resource consumption) but the study design cannot establish causation — response time is confounded with network latency, server load, and other factors. The DeepSeek latency issue is acknowledged but not controlled for." 115 }, 116 "generalization_bounded": { 117 "applies": true, 118 "answer": false, 119 "justification": "The title and abstract frame findings broadly ('AI's Environmental Cost') but the study tests only 48 text-based questions on three commercial systems measured via browser response time. The limitations section acknowledges some bounds but the title and framing overgeneralize." 
120 }, 121 "alternative_explanations_discussed": { 122 "applies": true, 123 "answer": true, 124 "justification": "The Discussion section acknowledges that DeepSeek's higher resource consumption likely stems from network latency (servers in China, testing in US), and notes that internet traffic and server load could affect response times." 125 } 126 }, 127 "setup_transparency": { 128 "model_versions_specified": { 129 "applies": true, 130 "answer": true, 131 "justification": "Specific model versions are named: GPT-4o, GPT-4o-mini, Claude Sonnet 4, Claude 3.5-Haiku, DeepSeek R1, DeepSeek V3." 132 }, 133 "prompts_provided": { 134 "applies": true, 135 "answer": false, 136 "justification": "The paper describes question categories and sources (SAT question bank, puzzles) but does not provide the actual prompts or questions used." 137 }, 138 "hyperparameters_reported": { 139 "applies": true, 140 "answer": false, 141 "justification": "No hyperparameters (temperature, top-p, max tokens) are reported. The models were used via browser interfaces with default settings, but this is not stated explicitly." 142 }, 143 "scaffolding_described": { 144 "applies": false, 145 "answer": false, 146 "justification": "No agentic scaffolding is used; questions are submitted directly to commercial chatbot interfaces." 147 }, 148 "data_preprocessing_documented": { 149 "applies": true, 150 "answer": true, 151 "justification": "Section 3 describes question selection criteria (SAT question bank with difficulty filters, random number generator for selection, exclusion of visual prompts), data organization in Google Drive/Sheets, and the formulas used to compute energy and water from response times." 
152 } 153 }, 154 "limitations_and_scope": { 155 "limitations_section_present": { 156 "applies": true, 157 "answer": true, 158 "justification": "The Discussion section includes a paragraph on limitations covering network latency, exclusion of image queries, internet traffic variability, and system-specific testing differences." 159 }, 160 "threats_to_validity_specific": { 161 "applies": true, 162 "answer": true, 163 "justification": "Specific threats are discussed: DeepSeek's server location in China causing latency inflation, exclusion of image/OCR queries limiting generalizability, internet traffic variation during testing windows, and system-specific differences (temporary chat vs. new chat)." 164 }, 165 "scope_boundaries_stated": { 166 "applies": true, 167 "answer": false, 168 "justification": "The limitations mention exclusion of image queries but do not explicitly state what the results do NOT show. The paper does not bound its claims to specific populations, settings, or explicitly list what claims it is NOT making." 169 } 170 }, 171 "data_integrity": { 172 "raw_data_available": { 173 "applies": true, 174 "answer": false, 175 "justification": "Raw data is stored in a Google Drive folder and Google Sheet but no public access link is provided." 176 }, 177 "data_collection_described": { 178 "applies": true, 179 "answer": true, 180 "justification": "Section 3 describes in detail the data collection procedure: browser setup, developer console timing, temporary/new chat sessions, three repetitions per question, screenshot capture, and testing window (2-5 PM)." 181 }, 182 "recruitment_methods_described": { 183 "applies": false, 184 "answer": false, 185 "justification": "No human participants; the study tests commercial AI models with standardized questions." 
186 }, 187 "data_pipeline_documented": { 188 "applies": true, 189 "answer": true, 190 "justification": "The pipeline from question selection → execution on models → response time recording → energy/water computation via formulas → visualization in Matplotlib is documented in Section 3." 191 } 192 }, 193 "conflicts_of_interest": { 194 "funding_disclosed": { 195 "applies": true, 196 "answer": false, 197 "justification": "No funding sources are disclosed. The Acknowledgements thank DiscoverSTEM Innovation and Research Lab for hosting but do not mention any grants or funding." 198 }, 199 "affiliations_disclosed": { 200 "applies": true, 201 "answer": true, 202 "justification": "Author affiliations are listed: DiscoverSTEM (Plano, TX) and Texas A&M University. None of the authors are affiliated with the AI companies being evaluated." 203 }, 204 "funder_independent_of_outcome": { 205 "applies": true, 206 "answer": false, 207 "justification": "No funding is disclosed, so independence cannot be assessed. The absence of a funding disclosure is scored as NO per schema guidance." 208 }, 209 "financial_interests_declared": { 210 "applies": true, 211 "answer": false, 212 "justification": "No competing interests or financial interests statement is present in the paper." 213 } 214 }, 215 "contamination": { 216 "training_cutoff_stated": { 217 "applies": false, 218 "answer": false, 219 "justification": "The study measures resource consumption and accuracy on SAT/puzzle questions; it does not evaluate model capability on a benchmark where contamination would matter. The questions test known-answer accuracy, not whether the model memorized them." 220 }, 221 "train_test_overlap_discussed": { 222 "applies": false, 223 "answer": false, 224 "justification": "As with training_cutoff_stated, contamination is not relevant to the environmental cost measurement, which is the primary contribution." 
225 }, 226 "benchmark_contamination_addressed": { 227 "applies": false, 228 "answer": false, 229 "justification": "As with training_cutoff_stated, contamination is not applicable: the study's primary contribution is resource consumption measurement, not benchmark performance evaluation." 230 } 231 }, 232 "human_studies": { 233 "pre_registered": { 234 "applies": false, 235 "answer": false, 236 "justification": "No human participants in this study." 237 }, 238 "irb_or_ethics_approval": { 239 "applies": false, 240 "answer": false, 241 "justification": "No human participants. The paper states 'This paper did not require ethical clearance.'" 242 }, 243 "demographics_reported": { 244 "applies": false, 245 "answer": false, 246 "justification": "No human participants." 247 }, 248 "inclusion_exclusion_criteria": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants." 252 }, 253 "randomization_described": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants." 257 }, 258 "blinding_described": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants." 262 }, 263 "attrition_reported": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants." 267 } 268 }, 269 "cost_and_practicality": { 270 "inference_cost_reported": { 271 "applies": true, 272 "answer": true, 273 "justification": "Inference cost IS the paper's primary contribution — power consumption (kW) and water consumption (L) per query are reported for all models." 274 }, 275 "compute_budget_stated": { 276 "applies": true, 277 "answer": false, 278 "justification": "The total computational budget for running the experiments is not stated. The paper reports per-query costs but not the total resources consumed for the full experimental campaign." 
279 } 280 } 281 }, 282 "claims": [ 283 { 284 "claim": "SLMs consumed 60-70% less energy and water than their LLM counterparts on average.", 285 "evidence": "Section 4 and Discussion present per-query power and water consumption comparisons. E.g., ChatGPT SLM Math avg 0.0202 kW vs LLM 0.0271 kW.", 286 "supported": "moderate" 287 }, 288 { 289 "claim": "SLMs had the same level of accuracy as LLMs in Math and Reading subjects.", 290 "evidence": "Section 4 reports accuracy figures: all SLMs were 91.67% accurate in Reading (same as each other); Math accuracy was 100% for most models.", 291 "supported": "moderate" 292 }, 293 { 294 "claim": "SLM accuracy dropped considerably for abstract reasoning tasks such as Puzzles.", 295 "evidence": "Section 4: Claude SLM accuracy dropped to 50% for Puzzles vs 66% for its LLM. However, ChatGPT's SLM was 100% accurate on Puzzles (vs 58.33% for its LLM), which contradicts the claim.", 296 "supported": "weak" 297 }, 298 { 299 "claim": "DeepSeek consumed at least nine times more power and water than other models across all subjects.", 300 "evidence": "Discussion section states this explicitly; supported by data such as DeepSeek LLM Math avg 0.2839 kW vs ChatGPT 0.0202 kW and Claude 0.0188 kW.", 301 "supported": "moderate" 302 }, 303 { 304 "claim": "Query complexity is positively correlated with both energy and water consumption in inference (H1).", 305 "evidence": "The paper shows some spikes at harder questions but no formal correlation analysis is performed. The relationship appears inconsistent across models.", 306 "supported": "weak" 307 } 308 ], 309 "methodology_tags": ["benchmark-eval"], 310 "key_findings": "SLMs consumed 60-70% less energy and water than LLMs on average, while maintaining comparable accuracy on structured tasks like Math and Reading. However, SLM accuracy declined for abstract reasoning tasks (Puzzles). 
DeepSeek consumed dramatically more resources than ChatGPT and Claude, likely due to network latency from server location in China rather than computational differences. The authors propose context-aware model switching as a strategy to balance environmental cost and accuracy.", 311 "red_flags": [ 312 { 313 "flag": "Response time as proxy for energy consumption", 314 "detail": "Energy and water consumption are computed entirely from browser-measured response times multiplied by assumed GPU power draw constants. Response time from a browser includes network latency, queuing, and other factors unrelated to actual compute. This is acknowledged for DeepSeek (China servers) but applies to all models — the measurements do not reflect actual energy consumption at the data center." 315 }, 316 { 317 "flag": "Extremely small sample size", 318 "detail": "Only 48 questions with 3 repetitions each, across 4 subjects and 3 difficulty levels. With 4 questions per subject-difficulty cell, the statistical power to detect real differences is very low. No significance tests are performed." 319 }, 320 { 321 "flag": "No statistical tests for comparative claims", 322 "detail": "Claims about differences between SLMs and LLMs, and between AI systems, are made by comparing raw numbers without any statistical testing. Given the small sample and high variance, many observed differences may not be statistically significant." 323 }, 324 { 325 "flag": "Assumed hardware parameters not validated", 326 "detail": "The energy formula uses assumed Pcritical (GPU power per hour) and PUE values from literature, but these may not match the actual hardware and data center configurations used by ChatGPT, Claude, or DeepSeek. The resulting energy figures are estimates based on assumptions, not measurements." 
327 }, 328 { 329 "flag": "Contradictory accuracy results", 330 "detail": "The abstract claims SLM accuracy drops for complex tasks, but ChatGPT's SLM was 100% accurate on Puzzles while its LLM was only 58.33%. This directly contradicts the general narrative." 331 } 332 ], 333 "cited_papers": [ 334 { 335 "title": "How Hungry is AI? Benchmarking Energy, Water, and Carbon Footprint of LLM Inference", 336 "authors": ["N. Jegham", "M. Abdelatti", "L. Elmoubarki", "A. Hendawi"], 337 "year": 2025, 338 "arxiv_id": "2505.09598", 339 "relevance": "Directly relevant benchmark of LLM inference environmental costs, source of energy/water formulas used in this paper." 340 }, 341 { 342 "title": "Energy and Policy Considerations for Deep Learning in NLP", 343 "authors": ["E. Strubell", "A. Ganesh", "A. McCallum"], 344 "year": 2019, 345 "arxiv_id": "1906.02243", 346 "relevance": "Foundational work quantifying the carbon footprint of NLP model training." 347 }, 348 { 349 "title": "The growing energy footprint of artificial intelligence", 350 "authors": ["A. de Vries"], 351 "year": 2023, 352 "doi": "10.1016/j.joule.2023.09.004", 353 "relevance": "Analysis of AI's growing energy consumption and data center requirements." 354 }, 355 { 356 "title": "Towards the systematic reporting of the energy and carbon footprints of machine learning", 357 "authors": ["P. Henderson", "J. Hu", "J. Romoff", "E. Brunskill", "D. Jurafsky", "J. Pineau"], 358 "year": 2020, 359 "relevance": "Proposes systematic reporting frameworks for ML energy and carbon footprints." 360 }, 361 { 362 "title": "Trends in AI inference energy consumption: Beyond the performance-vs-parameter laws of deep learning", 363 "authors": ["R. Desislavov", "F. Martínez-Plumed", "J. Hernández-Orallo"], 364 "year": 2023, 365 "doi": "10.1016/j.suscom.2023.100857", 366 "relevance": "Analysis of inference-time energy consumption trends in AI, relevant to environmental sustainability of LLM deployment." 
367 }, 368 { 369 "title": "Are emergent abilities of large language models a mirage?", 370 "authors": ["R. Schaeffer", "B. Miranda", "S. Koyejo"], 371 "year": 2023, 372 "arxiv_id": "2304.15004", 373 "relevance": "Questions whether scaling laws reliably predict model capabilities, relevant to the SLM vs LLM efficiency debate." 374 }, 375 { 376 "title": "Factuality of large language models: A survey", 377 "authors": ["Y. Wang", "M. Wang", "M.A. Manzoor", "F. Liu", "G. Georgiev", "R.J. Das", "P. Nakov"], 378 "year": 2024, 379 "arxiv_id": "2402.02420", 380 "relevance": "Survey of LLM factuality, relevant to understanding accuracy tradeoffs between model sizes." 381 }, 382 { 383 "title": "Train large, then compress: Rethinking model size for efficient training and inference of transformers", 384 "authors": ["Z. Li", "E. Wallace", "S. Shen", "K. Lin", "K. Keutzer", "D. Klein", "J.E. Gonzalez"], 385 "year": 2020, 386 "arxiv_id": "2002.11794", 387 "relevance": "Demonstrates model compression strategies that reduce energy use while preserving accuracy, directly relevant to SLM efficiency." 388 } 389 ] 390 }