scan.json (34312B)
1 { 2 "paper": { 3 "title": "Intelligence per Watt: Measuring Intelligence Efficiency of Local AI", 4 "authors": [ 5 "Jon Saad-Falcon", 6 "Avanika Narayan", 7 "Hakki Orhun Akengin", 8 "J. Wes Griffin", 9 "Herumb Shandilya", 10 "Adrian Gamarra Lafuente", 11 "Medhya Goel", 12 "Rebecca Joseph", 13 "Shlok Natarajan", 14 "Etash Kumar Guha", 15 "Shang Zhu", 16 "Ben Athiwaratkun", 17 "John Hennessy", 18 "Azalia Mirhoseini", 19 "Christopher Ré" 20 ], 21 "year": 2025, 22 "venue": "arXiv", 23 "arxiv_id": "2511.07885", 24 "doi": "10.48550/arXiv.2511.07885" 25 }, 26 "scan_version": 2, 27 "active_modules": ["experimental_rigor", "data_leakage"], 28 "checklist": { 29 "artifacts": { 30 "code_released": { 31 "applies": true, 32 "answer": true, 33 "justification": "The paper explicitly releases a profiling harness: 'We release our IPW profiling harness to enable intelligence-per-watt benchmarking' with a GitHub URL at https://github.com/HazyResearch/intelligence-per-watt." 34 }, 35 "data_released": { 36 "applies": true, 37 "answer": true, 38 "justification": "All evaluation datasets are publicly available: WildChat, NaturalReasoning, MMLU Pro, and SuperGPQA. The paper references these public sources and describes the sampling procedure." 39 }, 40 "environment_specified": { 41 "applies": true, 42 "answer": false, 43 "justification": "The paper describes hardware platforms in detail (Table 9) and inference hyperparameters (App B.1) but does not provide a requirements.txt, Dockerfile, or detailed software dependency list with library versions. Software tools (NVML, powermetrics, ROCm SMI) are named but not versioned." 44 }, 45 "reproduction_instructions": { 46 "applies": true, 47 "answer": false, 48 "justification": "While the released profiling harness presumably contains instructions, the paper itself does not include step-by-step reproduction instructions. 
The experimental setup is described at a high level but lacks a 'Reproducing Results' section with specific commands or scripts." 49 } 50 }, 51 "statistical_methodology": { 52 "confidence_intervals_or_error_bars": { 53 "applies": true, 54 "answer": true, 55 "justification": "Tables 2, 3, and 4 all report ± values (e.g., '23.2 ± 1.9%', '7.92 ± 0.32 × 10^-4'). Uncertainty is consistently reported for the main metrics." 56 }, 57 "significance_tests": { 58 "applies": true, 59 "answer": false, 60 "justification": "The paper makes many comparative claims (e.g., 'B200 achieves 1.40× higher intelligence per watt than the M4 MAX') based solely on comparing numbers with ± ranges. No formal significance tests (p-values, t-tests, etc.) are performed." 61 }, 62 "effect_sizes_reported": { 63 "applies": true, 64 "answer": true, 65 "justification": "Effect sizes are consistently reported with baseline context throughout: '5.3× overall increase', '3.1× improvement in accuracy', 'from 23.2% to 71.3%', percentage reductions (80.4% energy, 77.3% compute, 73.8% cost). The reader can assess magnitude." 66 }, 67 "sample_size_justified": { 68 "applies": true, 69 "answer": false, 70 "justification": "The study uses 500K queries from WildChat, 500K from NaturalReasoning, 12K from MMLU Pro, and 26.5K from SuperGPQA. These sizes are never justified — no power analysis or explanation for why these specific counts were chosen." 71 }, 72 "variance_reported": { 73 "applies": true, 74 "answer": true, 75 "justification": "App B.1 states 'we execute each query 10 times and aggregate power measurements across runs. For each query, we compute the mean power draw (watts) and mean energy consumption (joules) per query by averaging across these 10 independent executions.' The ± values in tables reflect this variance." 
76 } 77 }, 78 "evaluation_design": { 79 "baselines_included": { 80 "applies": true, 81 "answer": true, 82 "justification": "The paper compares local models against frontier cloud models (Claude Sonnet 4.5, Gemini 2.5 Pro, GPT-5) and includes historical baselines (Mixtral-8x7B, Llama-3.1-8B) for longitudinal comparison. Cloud-only deployment serves as the resource consumption baseline." 83 }, 84 "baselines_contemporary": { 85 "applies": true, 86 "answer": true, 87 "justification": "Cloud baselines are state-of-the-art as of October 2025: Claude Sonnet 4.5, Gemini 2.5 Pro, GPT-5 (2025-08-07). Local model baselines include the latest releases from Qwen3, GPT-OSS, Gemma3, and IBM Granite 4.0." 88 }, 89 "ablation_study": { 90 "applies": true, 91 "answer": true, 92 "justification": "The paper decomposes IPW gains by holding accelerator fixed while varying model generation (3.1× from model advances) and holding model fixed while varying accelerator (1.7× from hardware advances), as shown in Table 2 and Figure 5. App C.4 ablates model precision (FP16/FP8/FP4)." 93 }, 94 "multiple_metrics": { 95 "applies": true, 96 "answer": true, 97 "justification": "The paper defines four complementary efficiency metrics (APW, PPW, APJ, PPJ) and also reports accuracy, latency, throughput, TTFT, energy consumption, memory usage, and more (Table 8 lists the full set)." 98 }, 99 "human_evaluation": { 100 "applies": true, 101 "answer": false, 102 "justification": "All evaluation is automated: LLM-as-a-judge (GPT-4O) for WildChat and NaturalReasoning, exact match for MMLU Pro and SuperGPQA. No human evaluation of model outputs is performed. Given claims about 'intelligence' and real-world query handling, human validation of output quality would strengthen the findings." 103 }, 104 "held_out_test_set": { 105 "applies": true, 106 "answer": true, 107 "justification": "The paper uses standard benchmark test sets (MMLU Pro, SuperGPQA) and samples from WildChat and NaturalReasoning datasets. 
There is no model tuning on these sets — the models are evaluated in their released forms." 108 }, 109 "per_category_breakdown": { 110 "applies": true, 111 "answer": true, 112 "justification": "Extensive per-category breakdowns provided: by 22 economic domains (Tables 6, 7, Figure 7), by task type (chat vs reasoning), by difficulty level (Figures 8, 9), by model family, and by hardware accelerator." 113 }, 114 "failure_cases_discussed": { 115 "applies": true, 116 "answer": true, 117 "justification": "The paper identifies where local models struggle: technical domains like Architecture & Engineering (40.8-68% solvability), reasoning tasks (24pp gap vs chat), Level 5 unsolved problems (only 4.72% solved, App C.1), and specific domain weaknesses in Table 7." 118 }, 119 "negative_results_reported": { 120 "applies": true, 121 "answer": true, 122 "justification": "Several negative results reported: local accelerators are 1.4× less efficient than cloud (Tables 3-4), SambaNova SN40L achieves 6.5-7.4× higher efficiency than Apple M4 Max (Table 4), hardest reasoning problems remain largely unsolved (Level 5, App C.1), and reasoning improvements are slower on hard problems than easy ones." 123 } 124 }, 125 "claims_and_evidence": { 126 "abstract_claims_supported": { 127 "applies": true, 128 "answer": true, 129 "justification": "Abstract claims — 88.7% query coverage (Figure 2), 5.3× IPW improvement (Table 2), 23.2% to 71.3% coverage increase (Table 2), 1.4× local vs cloud gap (Table 3) — are all directly supported by results in the paper." 130 }, 131 "causal_claims_justified": { 132 "applies": true, 133 "answer": true, 134 "justification": "The paper decomposes IPW gains by holding one variable fixed (accelerator or model) while varying the other, providing a controlled single-variable analysis. Table 2 isolates model contributions (3.1×) from hardware contributions (1.7×). This controlled decomposition is adequate for the causal claims made." 
135 }, 136 "generalization_bounded": { 137 "applies": true, 138 "answer": false, 139 "justification": "The study evaluates only single-turn queries, yet the title ('Intelligence per Watt: Measuring Intelligence Efficiency of Local AI') and conclusions ('local inference can meaningfully redistribute demand from centralized infrastructure') suggest general applicability to all LLM inference. Multi-turn conversations, long-context tasks, coding, tool use, and agentic workflows are not tested, yet they are not explicitly excluded from the claims." 140 }, 141 "alternative_explanations_discussed": { 142 "applies": true, 143 "answer": false, 144 "justification": "The paper presents its interpretation of findings without substantively discussing alternatives. For example, the 88.7% coverage claim depends on LLM-as-a-judge evaluation, but judge bias toward certain model families is not discussed. The IPW metric's sensitivity to the choice of accuracy measure is not explored. Inline caveats about power measurement accuracy (10-15%) are methodological notes, not alternative explanations." 145 }, 146 "proxy_outcome_distinction": { 147 "applies": true, 148 "answer": false, 149 "justification": "The paper measures benchmark accuracy and LLM-judge win rates but frames these as 'intelligence' in the metric name (IPW). The gap between 'accuracy on selected benchmarks' and 'intelligence' is never acknowledged. The metric also reduces 'intelligence' to a scalar, ignoring task difficulty, reasoning depth, and other dimensions. The framing significantly exceeds the granularity of the measurement." 150 } 151 }, 152 "setup_transparency": { 153 "model_versions_specified": { 154 "applies": true, 155 "answer": false, 156 "justification": "Local models are named with family and size (Qwen3-4B, GPT-OSS-120B, etc.), which is acceptable for open models with single releases. However, the LLM judge is 'GPT-4O' and the annotator is 'GPT-4O-MINI' — both marketing names without snapshot dates or API versions.
Frontier baselines include 'CLAUDE SONNET 4.5' and 'GEMINI 2.5 PRO' without version identifiers. Per schema: marketing names without snapshot dates do not count." 157 }, 158 "prompts_provided": { 159 "applies": true, 160 "answer": true, 161 "justification": "Full prompt text is provided in App B.1 for both LLM-as-a-judge evaluations (WildChat and NaturalReasoning) and for the economic category annotation. The actual text used is reproduced verbatim." 162 }, 163 "hyperparameters_reported": { 164 "applies": true, 165 "answer": true, 166 "justification": "App B.1 specifies: 'temperature = 0.6, top-p = 0.95, top-k = 20, min-p = 0.0, and a 32768-token output limit.' Additional settings: repetition penalty of 1.1 and length penalty of 1.0 for Qwen models. Deliberative prompting enabled for reasoning tasks." 167 }, 168 "scaffolding_described": { 169 "applies": false, 170 "answer": false, 171 "justification": "No agentic scaffolding is used. The paper performs direct model inference with standard prompt-response evaluation." 172 }, 173 "data_preprocessing_documented": { 174 "applies": true, 175 "answer": true, 176 "justification": "App B.1 documents preprocessing steps: filtering non-English entries from WildChat, removing malformed/nonsensical queries via GPT-4O-MINI, removing duplicates, filtering queries exceeding 32,000 characters, and filtering NaturalReasoning queries without ground truth answers." 177 } 178 }, 179 "limitations_and_scope": { 180 "limitations_section_present": { 181 "applies": true, 182 "answer": false, 183 "justification": "The paper has no dedicated Limitations or Threats to Validity section. Some limitations are mentioned inline (e.g., power measurement accuracy of 10-15% in Section 4.2), but there is no substantive dedicated discussion." 184 }, 185 "threats_to_validity_specific": { 186 "applies": true, 187 "answer": false, 188 "justification": "The inline mention of 10-15% power measurement inaccuracy (Section 4.2) is specific but isolated. 
No systematic discussion of threats like LLM-judge reliability, benchmark representativeness, single-turn limitation generalizability, or whether the 10-run averaging adequately captures variance." 189 }, 190 "scope_boundaries_stated": { 191 "applies": true, 192 "answer": false, 193 "justification": "The paper states it focuses on 'single-turn interactions' and models '≤20B active parameters' but does not explicitly state what the results do NOT show. Multi-turn conversations, coding tasks, long-context queries, tool use, and agentic workflows are all excluded from evaluation but this is never stated as a boundary on the conclusions." 194 } 195 }, 196 "data_integrity": { 197 "raw_data_available": { 198 "applies": true, 199 "answer": false, 200 "justification": "The profiling harness is released for generating new data, but the raw telemetry measurements (per-query power readings, latency traces, temperature logs from the actual study) are not explicitly released. The paper says 'We release our IPW profiling harness' — the tool, not the raw data." 201 }, 202 "data_collection_described": { 203 "applies": true, 204 "answer": true, 205 "justification": "Detailed description of telemetry collection: vendor APIs (NVML, powermetrics, ROCm SMI), 50ms sampling interval, per-GPU power aggregation, nanosecond timestamp synchronization. Query sources and sizes specified. App B.1 provides implementation details for each hardware platform." 206 }, 207 "recruitment_methods_described": { 208 "applies": false, 209 "answer": false, 210 "justification": "No human participants. All data comes from standard public benchmarks (WildChat, NaturalReasoning, MMLU Pro, SuperGPQA). The data sources are well-documented." 
211 }, 212 "data_pipeline_documented": { 213 "applies": true, 214 "answer": true, 215 "justification": "The pipeline is documented: (1) query curation with filtering stages described in App B.1, (2) model inference with specified hyperparameters, (3) telemetry collection via vendor APIs with 10 repetitions per query, (4) evaluation via LLM-as-a-judge or exact match, (5) metric computation. The profiling harness comprises three described components." 216 } 217 }, 218 "conflicts_of_interest": { 219 "funding_disclosed": { 220 "applies": true, 221 "answer": true, 222 "justification": "Extensive acknowledgments section listing NIH, NSF, DEVCOM ARL, ONR, Stanford HAI, and many industry partners: Google DeepMind, Google Research, Google Cloud, IBM, Microsoft, Intel, Qualcomm, Anthropic, Meta, Together, and more." 223 }, 224 "affiliations_disclosed": { 225 "applies": true, 226 "answer": true, 227 "justification": "Author affiliations are clearly listed: Stanford CS and Together AI. Together AI provides model serving infrastructure, and Stanford's lab receives funding from companies whose models are evaluated." 228 }, 229 "funder_independent_of_outcome": { 230 "applies": true, 231 "answer": false, 232 "justification": "Multiple funders have direct stakes in the outcomes: Together AI (co-author affiliation, provides model serving), Google (makes Gemma models and cloud accelerators evaluated), IBM (Granite models evaluated), Anthropic (Claude evaluated as baseline), NVIDIA (accelerators evaluated). These companies benefit from findings showing either local inference viability (chip/device makers) or cloud superiority (cloud providers)." 233 }, 234 "financial_interests_declared": { 235 "applies": true, 236 "answer": false, 237 "justification": "No competing interests or financial disclosure statement appears in the paper. Given the extensive industry funding and co-authors from Together AI, a financial interests declaration would be expected." 
238 } 239 }, 240 "contamination": { 241 "training_cutoff_stated": { 242 "applies": true, 243 "answer": false, 244 "justification": "The paper evaluates pre-trained models (Qwen3, GPT-OSS, Gemma3, etc.) on benchmarks but never states the training data cutoff dates for any of the evaluated models." 245 }, 246 "train_test_overlap_discussed": { 247 "applies": true, 248 "answer": false, 249 "justification": "No discussion of whether benchmark examples (MMLU Pro, SuperGPQA, NaturalReasoning) appeared in the training data of the evaluated models. Given these are public benchmarks and the models are trained on web data, this is a significant omission." 250 }, 251 "benchmark_contamination_addressed": { 252 "applies": true, 253 "answer": false, 254 "justification": "MMLU Pro and its predecessor MMLU have been widely available online since before the training cutoffs of all evaluated models. NaturalReasoning sources from web content. No contamination analysis is performed or discussed." 255 } 256 }, 257 "human_studies": { 258 "pre_registered": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in this study." 262 }, 263 "irb_or_ethics_approval": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in this study." 267 }, 268 "demographics_reported": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in this study." 272 }, 273 "inclusion_exclusion_criteria": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in this study." 277 }, 278 "randomization_described": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants in this study." 282 }, 283 "blinding_described": { 284 "applies": false, 285 "answer": false, 286 "justification": "No human participants in this study." 
287 }, 288 "attrition_reported": { 289 "applies": false, 290 "answer": false, 291 "justification": "No human participants in this study." 292 } 293 }, 294 "cost_and_practicality": { 295 "inference_cost_reported": { 296 "applies": true, 297 "answer": true, 298 "justification": "Detailed cost analysis throughout. Table 12 provides per-token pricing for all models. Figure 6 shows cumulative cost over 24 hours. Energy consumption (joules per query) and power draw (watts) are reported for all model-accelerator pairs." 299 }, 300 "compute_budget_stated": { 301 "applies": true, 302 "answer": false, 303 "justification": "The paper reports per-query compute metrics but never states the total computational budget consumed by this study. Evaluating 20+ models × 8 accelerators × 1M+ queries × 10 repetitions is enormous, but the total GPU hours, energy consumed, or cost of conducting the study is not reported." 304 } 305 }, 306 "experimental_rigor": { 307 "seed_sensitivity_reported": { 308 "applies": true, 309 "answer": false, 310 "justification": "Results are averaged over 10 repetitions for power measurements, but there is no analysis of sensitivity to random seeds for model generation (temperature = 0.6 introduces stochasticity). The ± values reflect measurement variance, not seed sensitivity." 311 }, 312 "number_of_runs_stated": { 313 "applies": true, 314 "answer": true, 315 "justification": "App B.1 explicitly states: 'we execute each query 10 times and aggregate power measurements across runs. For each query, we compute the mean power draw (watts) and mean energy consumption (joules) per query by averaging across these 10 independent executions.'" 316 }, 317 "hyperparameter_search_budget": { 318 "applies": true, 319 "answer": false, 320 "justification": "The paper uses specific inference hyperparameters (temperature=0.6, top-p=0.95, top-k=20) without justifying these choices or reporting whether alternative configurations were evaluated.
Given that temperature and sampling parameters can significantly affect output quality and energy consumption, this is relevant." 321 }, 322 "best_config_selection_justified": { 323 "applies": true, 324 "answer": true, 325 "justification": "The paper evaluates and reports results across all model-accelerator configurations rather than selecting a single best. The 'oracle routing' and 'best-of-local' are clearly defined selection criteria, and all individual model results are also shown (Figure 2, Tables 3-4)." 326 }, 327 "multiple_comparison_correction": { 328 "applies": false, 329 "answer": false, 330 "justification": "No formal statistical tests are performed, so multiple comparison correction is not applicable." 331 }, 332 "self_comparison_bias_addressed": { 333 "applies": true, 334 "answer": false, 335 "justification": "The authors evaluate models from companies that fund their lab (Google/Gemma, IBM/Granite, Together/GPT-OSS) using their own profiling harness and their own LLM-as-a-judge setup. The bias of evaluating systems from funders/collaborators is not acknowledged or discussed." 336 }, 337 "compute_budget_vs_performance": { 338 "applies": true, 339 "answer": true, 340 "justification": "This is the central contribution of the paper. Performance is systematically reported as a function of power (IPW, Tables 2-3), energy (IPJ, Table 4), and compute. Figures 5 and 10 track efficiency trends over time. The decomposition in Table 2 explicitly controls for compute differences." 341 }, 342 "benchmark_construct_validity": { 343 "applies": true, 344 "answer": false, 345 "justification": "The paper uses 'intelligence' in its core metric name (Intelligence Per Watt) but measures benchmark accuracy and LLM-judge win rates. The gap between 'accuracy on WildChat/NaturalReasoning/MMLU Pro/SuperGPQA' and 'intelligence' is never discussed. No analysis of whether these benchmarks measure what is claimed." 
346 }, 347 "scaffold_confound_addressed": { 348 "applies": false, 349 "answer": false, 350 "justification": "No agentic scaffolding is used. Models are evaluated via direct inference without scaffolding." 351 } 352 }, 353 "data_leakage": { 354 "temporal_leakage_addressed": { 355 "applies": true, 356 "answer": false, 357 "justification": "No discussion of whether training data for the 2025 models (Qwen3, GPT-OSS, Gemma3) includes MMLU Pro content (published 2024), NaturalReasoning content (published 2025), or SuperGPQA content (published 2025). Temporal relationships between benchmark creation and model training are not analyzed." 358 }, 359 "feature_leakage_addressed": { 360 "applies": true, 361 "answer": false, 362 "justification": "No discussion of whether the evaluation setup leaks information. For example, the LLM-as-a-judge prompt includes reference answers alongside model outputs, but the impact of judge prompt design on evaluation fairness across model families is not discussed." 363 }, 364 "non_independence_addressed": { 365 "applies": true, 366 "answer": false, 367 "justification": "NaturalReasoning queries are sourced from the web, and the evaluated models are trained on web data. The potential for train-test overlap due to shared web sources is not discussed." 368 }, 369 "leakage_detection_method": { 370 "applies": true, 371 "answer": false, 372 "justification": "No leakage detection or prevention methods are used. No canary strings, membership inference, n-gram overlap analysis, or decontamination pipelines are applied." 373 } 374 } 375 }, 376 "claims": [ 377 { 378 "claim": "Local LMs can successfully answer 88.7% of single-turn chat and reasoning queries when routing to the best local LM per query.", 379 "evidence": "Figure 2 shows best-of-local ensemble coverage across WildChat (97.8%), NaturalReasoning (88.3%), SuperGPQA (77.0%), and MMLU Pro (92.4%). 
Evaluation on 1M+ queries across 20+ models.", 380 "supported": "strong" 381 }, 382 { 383 "claim": "Intelligence per watt improved 5.3× from 2023-2025, driven by compounding model (3.1×) and hardware (1.7×) advances.", 384 "evidence": "Table 2 tracks IPW from Mixtral-8x7B on Quadro RTX 6000 (7.92×10⁻⁴) to GPT-OSS-120B on Apple M4 Max (4.18×10⁻³). Decomposition holds one variable fixed to isolate contributions.", 385 "supported": "strong" 386 }, 387 { 388 "claim": "Local model coverage on single-turn queries increased from 23.2% to 71.3% from 2023 to 2025.", 389 "evidence": "Table 2 shows best local LM success rate progression. Figure 3 confirms with temporal analysis of SOTA local model releases on WildChat and NaturalReasoning.", 390 "supported": "strong" 391 }, 392 { 393 "claim": "Oracle routing between local and cloud models reduces energy consumption by 80.4%, compute by 77.3%, and cost by 73.8%.", 394 "evidence": "Figure 6 shows simulated 24-hour workload of 80.2M queries with oracle routing vs cloud-only baseline. Based on per-query energy/compute/cost measurements from Tables 3-4 and pricing from Table 12.", 395 "supported": "moderate" 396 }, 397 { 398 "claim": "A realistic router with 80% accuracy captures approximately 80% of oracle gains (64.3% energy, 61.8% compute, 59.0% cost reduction).", 399 "evidence": "Figure 6 shows routing simulation results. Misrouted queries default to the cloud frontier model, maintaining task quality.", 400 "supported": "moderate" 401 }, 402 { 403 "claim": "Local accelerators achieve at least 1.4× lower intelligence per watt than cloud accelerators running identical models.", 404 "evidence": "Table 3 shows Apple M4 Max vs NVIDIA B200 across Qwen3 models: B200 achieves 1.40× higher IPW. 
Table 4 extends to intelligence per joule: B200 is 1.6-2.3× better, SambaNova SN40L is 6.5-7.4× better.", 405 "supported": "strong" 406 }, 407 { 408 "claim": "FP4 quantization yields 3-3.5× energy reduction with approximately 2.5 percentage points accuracy degradation per precision step.", 409 "evidence": "Figure 11 and App C.4 show systematic evaluation across 8 models on 3 datasets (N=10,000 each). Example: Qwen3-14B on SuperGPQA drops from 54.5% (FP16) to 49.0% (FP4) with 3.23× energy reduction.", 410 "supported": "strong" 411 }, 412 { 413 "claim": "Chat queries are substantially more amenable to local processing than reasoning queries, with a 24 percentage point gap.", 414 "evidence": "Section 5.1: best local LM achieves 88.9% coverage on WildChat vs 64.9% on NaturalReasoning. Confirmed across domains in Figure 7 and difficulty analysis in App C.1.", 415 "supported": "strong" 416 } 417 ], 418 "methodology_tags": ["benchmark-eval"], 419 "key_findings": "Local LMs (≤20B active parameters) can handle 88.7% of single-turn chat and reasoning queries when routing to the best model per query, with intelligence per watt improving 5.3× from 2023-2025. This improvement decomposes into 3.1× from model architecture advances and 1.7× from hardware improvements. Local accelerators remain 1.4× less power-efficient than cloud accelerators for identical models, but hybrid local-cloud routing can reduce energy, compute, and cost by 60-80% while maintaining answer quality. Coverage varies substantially by domain, exceeding 90% for creative tasks but dropping to 68% for technical fields.", 420 "red_flags": [ 421 { 422 "flag": "Inflated metric framing", 423 "detail": "The core metric 'Intelligence Per Watt' equates benchmark accuracy and LLM-judge win rates with 'intelligence.' This framing significantly overclaims what is actually measured (task-specific accuracy on selected benchmarks). The paper never acknowledges this gap." 
424 }, 425 { 426 "flag": "Undisclosed conflicts of interest", 427 "detail": "The paper is funded by many companies whose products are evaluated (Google/Gemma, IBM/Granite, Together AI/GPT-OSS, NVIDIA/accelerators, Anthropic/Claude). Two co-authors are from Together AI. No competing interests statement is provided, and the conflict is not acknowledged despite directly evaluating funders' products." 428 }, 429 { 430 "flag": "LLM-as-a-judge validity unexamined", 431 "detail": "All WildChat evaluation uses GPT-4O as judge comparing outputs against Qwen3-235B reference answers. Judge bias toward certain model families (e.g., rewarding stylistic similarities to GPT/Qwen outputs) could systematically affect coverage measurements. The reference model (Qwen3-235B) is from the same family as several evaluated models. No judge validation or inter-rater agreement analysis is provided." 432 }, 433 { 434 "flag": "Complete contamination silence", 435 "detail": "The paper evaluates 2025 models on benchmarks (MMLU Pro, SuperGPQA) that were publicly available during model training, yet never mentions contamination. If models have seen benchmark questions during training, the 88.7% coverage claim is inflated." 436 }, 437 { 438 "flag": "Single-turn scope with broad claims", 439 "detail": "The study exclusively evaluates single-turn queries but makes sweeping claims about 'redistributing inference demand from centralized infrastructure.' Real-world LLM usage includes multi-turn conversations, coding, tool use, and agentic workflows — none tested — which may be precisely the queries least suited to local models." 440 }, 441 { 442 "flag": "Oracle routing is unrealizable", 443 "detail": "The headline resource savings (80.4% energy, 77.3% compute, 73.8% cost) assume oracle routing with perfect query-to-model assignment. 
While the paper also shows realistic router results, the oracle numbers are prominently featured and represent a theoretical upper bound that cannot be achieved in practice." 444 } 445 ], 446 "cited_papers": [ 447 { 448 "title": "FrugalGPT: How to use large language models while reducing cost and improving performance", 449 "authors": ["L. Chen", "M. Zaharia", "J. Zou"], 450 "year": 2023, 451 "arxiv_id": "2305.05176", 452 "relevance": "Directly relevant as a cost/efficiency-aware LLM routing approach, one of the foundational works in the query routing space that IPW extends." 453 }, 454 { 455 "title": "RouteLLM: Learning to route LLMs with preference data", 456 "authors": ["I. Ong", "A. Almahairi", "V. Wu", "W.-L. Chiang", "T. Wu", "J. E. Gonzalez", "M. W. Kadous", "I. Stoica"], 457 "year": 2024, 458 "relevance": "Key prior work on LLM routing using preference-trained classifiers, demonstrating 85% cost reduction with quality maintenance." 459 }, 460 { 461 "title": "Minions: Cost-efficient collaboration between on-device and cloud language models", 462 "authors": ["A. Narayan", "D. Biderman", "S. Eyuboglu", "A. May", "S. Linderman", "J. Zou", "C. Ré"], 463 "year": 2025, 464 "arxiv_id": "2502.15964", 465 "relevance": "Local-cloud LM collaboration protocol from the same research group, directly relevant to the hybrid inference paradigm studied." 466 }, 467 { 468 "title": "Small language models are the future of agentic AI", 469 "authors": ["P. Belcak", "G. Heinrich", "S. Diao"], 470 "year": 2025, 471 "arxiv_id": "2506.02153", 472 "relevance": "Argues for small LMs in agentic AI settings, directly complementary to this paper's empirical evidence that small models can handle most queries." 473 }, 474 { 475 "title": "RouterBench: A benchmark for multi-LLM routing system", 476 "authors": ["Q. J. Hu", "J. Bieker", "X. 
Li"], 477 "year": 2024, 478 "arxiv_id": "2403.12031", 479 "relevance": "Benchmark for evaluating LLM routing policies along cost-quality tradeoffs, relevant to the routing evaluation methodology." 480 }, 481 { 482 "title": "Green AI", 483 "authors": ["R. Schwartz", "J. Dodge", "N. A. Smith", "O. Etzioni"], 484 "year": 2020, 485 "doi": "10.1145/3381831", 486 "relevance": "Foundational paper proposing energy as a first-class metric for AI evaluation, directly inspiring the IPW metric." 487 }, 488 { 489 "title": "From words to watts: Benchmarking the energy costs of large language model inference", 490 "authors": ["S. Samsi", "D. Zhao", "J. McDonald"], 491 "year": 2023, 492 "arxiv_id": "2310.03003", 493 "relevance": "Prior work on LLM inference energy benchmarking that this paper extends with more models, hardware, and the accuracy-efficiency unification." 494 }, 495 { 496 "title": "Energy considerations of large language model inference and efficiency optimizations", 497 "authors": ["J. Fernandez", "C. Na", "V. Tiwari"], 498 "year": 2025, 499 "arxiv_id": "2504.17674", 500 "relevance": "Contemporary work on LLM inference energy optimization, complementary to this paper's energy profiling methodology." 501 }, 502 { 503 "title": "Which economic tasks are performed with AI? Evidence from millions of Claude conversations", 504 "authors": ["K. Handa", "A. Tamkin", "M. McCain"], 505 "year": 2025, 506 "arxiv_id": "2503.04761", 507 "relevance": "Provides the Anthropic Economic Index taxonomy used in this paper to categorize queries by occupational domain." 508 }, 509 { 510 "title": "Chatbot arena: An open platform for evaluating LLMs by human preference", 511 "authors": ["W.-L. Chiang", "L. Zheng", "Y. Sheng"], 512 "year": 2024, 513 "relevance": "LM evaluation platform (LMArena) used to determine SOTA open-source model ranking for selecting the WildChat reference model." 
514 }, 515 { 516 "title": "NaturalReasoning: Reasoning in the wild with 2.8M challenging questions", 517 "authors": ["W. Yuan", "J. Yu", "S. Jiang"], 518 "year": 2025, 519 "arxiv_id": "2502.13124", 520 "relevance": "Major reasoning benchmark used in this study, providing 500K reasoning-focused queries for evaluation." 521 }, 522 { 523 "title": "A survey on collaborative mechanisms between large and small language models", 524 "authors": ["Y. Chen", "J. Zhao", "H. Han"], 525 "year": 2025, 526 "arxiv_id": "2505.07460", 527 "relevance": "Surveys collaboration strategies between large and small LMs including pipeline, routing, distillation, and fusion approaches." 528 } 529 ] 530 }