scan.json (19860B)
1 { 2 "paper": { 3 "title": "The Cost of Dynamic Reasoning: Demystifying AI Agents and Test-Time Scaling from an AI Infrastructure Perspective", 4 "authors": ["Jiin Kim", "Byeongjun Shin", "Jinha Chung", "Minsoo Rhu"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2506.04301" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": true, 14 "justification": "The paper states 'Open-sourced at https://github.com/VIA-Research/AgentBench' in Section I, providing a repository URL." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The paper uses publicly available benchmarks: HotpotQA, WebShop, MATH, HumanEval, and ShareGPT, all of which are publicly available datasets." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": true, 24 "justification": "Section III specifies vLLM version 0.6.6, PyTorch 2.6, CUDA 12.8, GCP instance types (a2-highgpu-1g, a2-highgpu-8g), NVIDIA A100 40GB GPUs, and specific CPU/memory configurations." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "While the GitHub repo is linked and hardware is specified, the paper does not include step-by-step reproduction instructions or describe how to replicate the experiments." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "Results are reported as point estimates (e.g., accuracy percentages, latency values) without confidence intervals or error bars." 37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper compares agents and configurations by raw numbers without any statistical significance tests." 
42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "The paper consistently reports relative increases with baselines, e.g., '62.1x-136.5x increase in GPU energy per query' compared to ShareGPT, and percentage improvements like '60.1% reduction in prefill latency'." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "Section V states 'we used a benchmark of 50 sample questions' without justifying why 50 was chosen or discussing whether this is sufficient for the claims made." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "While latency distributions are shown (Figure 7) and 95th percentile latencies are reported, no standard deviations or variance measures across experimental runs are provided. Results appear to be single-run." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "CoT serves as a non-agentic baseline, and ShareGPT serves as the conventional single-turn LLM inference baseline. Multiple agents are compared against each other." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "The agents evaluated (ReAct, Reflexion, LATS, LLMCompiler) are well-established and representative of current agent design patterns." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": true, 73 "justification": "Section V analyzes the impact of individual design parameters: iteration budget (Figure 14), few-shot examples (Figure 15), sequential vs. parallel scaling (Figure 16), and model size (Figure 17)." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "The paper reports accuracy, latency (average and 95th percentile), GPU energy consumption (Wh), token counts, GPU utilization, KV cache memory, and accuracy-per-latency ratio." 
79 }, 80 "human_evaluation": { 81 "applies": false, 82 "answer": false, 83 "justification": "This is a systems characterization paper measuring computational costs; human evaluation of outputs is not relevant to the claims." 84 }, 85 "held_out_test_set": { 86 "applies": true, 87 "answer": true, 88 "justification": "The paper uses established benchmark test sets (HotpotQA, WebShop, MATH, HumanEval) with their official evaluation protocols." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Results are broken down per benchmark (HotpotQA, WebShop, MATH, HumanEval), per agent type, and per component (prefill vs. decode, LLM vs. tool latency)." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "The paper discusses diminishing returns, outlier tasks that consume full iteration budgets, cases where LLMCompiler's DAG-style planning leads to unnecessary tool invocations on WebShop, and accuracy decline from excessive few-shot examples." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "The paper reports that LLMCompiler performs worse than ReAct on WebShop, that excessive few-shot examples can decrease accuracy, and that sequential scaling with smaller models shows limited improvement." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "The abstract claims about diminishing returns, widening latency variance, and unsustainable infrastructure costs are all supported by quantitative results in Sections IV-VI." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": true, 115 "justification": "Causal claims like 'prefix caching reduces prefill latency' are supported by controlled comparisons (with/without prefix caching). Claims about iteration budget effects are supported by systematic parameter variation." 
116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": false, 120 "justification": "The paper uses only Llama-3.1-Instruct (8B and 70B) but makes broad claims about 'AI agents' and 'test-time scaling' generally. The datacenter power projections extrapolate from a single model family to industry-wide estimates without caveating model-specific limitations." 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": false, 125 "justification": "The paper does not discuss alternative explanations for its findings, such as whether different serving frameworks, batching strategies, or model architectures would substantially change the conclusions." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": true, 132 "justification": "Section III specifies 'Llama-3.1-8B-Instruct' and 'Llama-3.1-70B-Instruct' with references [44] and [45]." 133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": false, 137 "justification": "The paper describes prompt components (instruction, few-shot, user tokens) but does not provide the actual prompt text used for any agent configuration." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": false, 142 "justification": "No temperature, top-p, or other LLM sampling parameters are reported. The paper does not state what generation settings were used." 143 }, 144 "scaffolding_described": { 145 "applies": true, 146 "answer": true, 147 "justification": "Section II-B and III describe each agent's workflow in detail: CoT, ReAct, Reflexion, LATS (tree search), and LLMCompiler (DAG planning with async execution). Figure 3 shows execution timelines." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": false, 152 "justification": "Section V mentions '50 sample questions' but does not describe how these were selected from the full benchmarks or whether any filtering was applied." 
153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": true, 159 "justification": "Section VII (Discussion) covers limitations, including the lack of SLA constraints and the focus on fundamental optimizations only, and acknowledges that batching is not accounted for in energy estimates." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": false, 164 "justification": "The discussion mentions general future directions but does not identify specific threats to validity of the study's findings, such as the small sample size (50 questions), single model family, or potential non-representativeness of selected agents." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": true, 169 "justification": "The paper explicitly states that LLM request batching is not accounted for, that only GPU energy is measured (excluding CPU, memory, cooling), and that the analysis uses a single model family. Section VII notes SLA analysis is left as future work." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": false, 176 "justification": "No raw measurement data (latency logs, energy readings, per-query results) is made available for independent verification." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "Section III describes the measurement setup: GCP instances, vLLM serving, NVIDIA DCGM for GPU utilization, Poisson arrival distribution for traffic simulation." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants; data sources are standard benchmarks." 
187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": false, 191 "justification": "The paper does not document how the 50 sample questions were drawn from each benchmark or how measurements were aggregated into the reported statistics." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": true, 198 "justification": "The Acknowledgment section lists IITP grants funded by the Korea government (MSIT) and Samsung Research Funding Center." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "All authors are affiliated with KAIST, clearly stated on the first page." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": true, 208 "justification": "The funders (Korean government IITP grants and Samsung Research) do not have a direct financial stake in showing that AI agents are costly or sustainable." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests or financial interests statement is present in the paper." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": true, 219 "answer": false, 220 "justification": "The paper uses Llama-3.1-Instruct models evaluated on HumanEval and MATH but does not state the training data cutoff date." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": true, 224 "answer": false, 225 "justification": "HumanEval (published 2021) and MATH are used with Llama-3.1 (2024) without discussing potential training data overlap." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": true, 229 "answer": false, 230 "justification": "No discussion of whether HumanEval, MATH, or HotpotQA appeared in Llama-3.1's training data, despite these being well-known public benchmarks predating the model." 
231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants in this study." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants in this study." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": true, 274 "justification": "This is the core contribution of the paper. Table III reports energy per query (Wh), latency per query, and Section VI projects datacenter-wide power demands." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": true, 279 "justification": "The paper specifies hardware used (A100 GPUs on GCP), reports GPU energy consumption per query, and provides detailed compute cost analysis across all configurations." 
280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "AI agents consume 62.1x to 136.5x more GPU energy per query than conventional single-turn LLM inference.", 286 "evidence": "Table III shows Reflexion consumes 130.9x (8B) and 136.5x (70B) more energy, LATS consumes 71.7x (8B) and 62.1x (70B) more energy compared to ShareGPT baseline.", 287 "supported": "strong" 288 }, 289 { 290 "claim": "Prefix caching achieves an average 5.62x throughput increase for ReAct-based agent workloads versus only 1.03x for chatbot workloads.", 291 "evidence": "Figure 11 and surrounding text in Section IV-C report these throughput improvements.", 292 "supported": "strong" 293 }, 294 { 295 "claim": "Test-time scaling yields diminishing returns: achieving the same 4% accuracy gain costs 31x more latency at higher scaling levels.", 296 "evidence": "Section V-B reports that in Reflexion, the same accuracy improvement costs 31x more latency at higher scaling points (an additional 269.5s beyond 56.0s, versus the earlier increase from 16.9s to 25.6s).", 297 "supported": "strong" 298 }, 299 { 300 "claim": "Agentic workloads at current ChatGPT-scale traffic would require gigawatt-scale datacenter power.", 301 "evidence": "Table IV shows 70B Reflexion at 71.4M queries/day requires ~1 GW. The paper notes this aligns with OpenAI's planned Stargate facility.", 302 "supported": "moderate" 303 }, 304 { 305 "claim": "Smaller models (8B) with parallel scaling (LATS) can approach the accuracy of larger models (70B) with lower energy cost.", 306 "evidence": "Figure 17 shows 8B LATS with parallel scaling achieves 80% accuracy on HotpotQA vs 82% for 70B, while consuming less energy per query.", 307 "supported": "moderate" 308 } 309 ], 310 "methodology_tags": ["benchmark-eval"], 311 "key_findings": "AI agents consume 62-137x more GPU energy per query than single-turn LLM inference, with test-time scaling exhibiting sharply diminishing returns in accuracy. 
Prefix caching is particularly effective for agentic workloads, achieving 5.62x throughput improvements versus 1.03x for chatbots. The paper projects that scaling agentic workloads to current ChatGPT traffic levels would require gigawatt-scale datacenter power, comparable to the announced power budget for OpenAI's Stargate facility. Smaller models with parallel scaling strategies can approach larger model accuracy at lower energy cost.", 312 "red_flags": [ 313 { 314 "flag": "Very small evaluation sample", 315 "detail": "All accuracy and cost-efficiency results in Section V are based on only 50 sample questions per benchmark, which is small for drawing the broad conclusions presented." 316 }, 317 { 318 "flag": "Single model family", 319 "detail": "All experiments use only Llama-3.1-Instruct (8B and 70B). Results may not generalize to other architectures (e.g., GPT-4, Claude, Gemini) that may have different inference characteristics." 320 }, 321 { 322 "flag": "No uncertainty quantification", 323 "detail": "No error bars, confidence intervals, or multi-run variance reported for any results. It is unclear whether results are from single runs." 324 }, 325 { 326 "flag": "Datacenter projections extrapolate heavily", 327 "detail": "Power demand projections in Table IV and Section VI extrapolate single-query energy measurements to billions of queries without accounting for batching, hardware improvements, or efficiency optimizations that would apply at scale." 328 } 329 ], 330 "cited_papers": [ 331 { 332 "title": "ReAct: Synergizing Reasoning and Acting in Language Models", 333 "authors": ["S. Yao"], 334 "year": 2022, 335 "relevance": "Foundational agent framework evaluated in this paper; key baseline for agentic reasoning cost analysis." 336 }, 337 { 338 "title": "Reflexion: Language Agents with Verbal Reinforcement Learning", 339 "authors": ["N. Shinn"], 340 "year": 2023, 341 "relevance": "Reflective agent framework evaluated as a sequential test-time scaling strategy." 
342 }, 343 { 344 "title": "Language Agent Tree Search Unifies Reasoning Acting and Planning in Language Models", 345 "authors": ["A. Zhou"], 346 "year": 2023, 347 "relevance": "Tree-search agent framework evaluated for parallel test-time scaling characteristics." 348 }, 349 { 350 "title": "An LLM Compiler for Parallel Function Calling", 351 "authors": ["S. Kim"], 352 "year": 2023, 353 "relevance": "DAG-based planning agent evaluated for structured planning efficiency." 354 }, 355 { 356 "title": "Evaluating Large Language Models Trained on Code", 357 "authors": ["M. Chen"], 358 "year": 2021, 359 "relevance": "HumanEval benchmark used for code generation evaluation in the study." 360 }, 361 { 362 "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models", 363 "authors": ["J. Wei"], 364 "year": 2022, 365 "relevance": "CoT prompting serves as the static reasoning baseline in this infrastructure analysis." 366 }, 367 { 368 "title": "The growing energy footprint of artificial intelligence", 369 "authors": ["A. de Vries"], 370 "year": 2023, 371 "relevance": "Provides context on AI energy consumption that motivates this work's infrastructure analysis." 372 }, 373 { 374 "title": "Small language models are the future of agentic AI", 375 "authors": ["P. Belcak"], 376 "year": 2025, 377 "relevance": "Advocates heterogeneous SLM/LLM multi-agent systems for cost reduction, aligned with this paper's findings." 378 }, 379 { 380 "title": "AutoGen: Enabling next-gen LLM applications via multi-agent conversation", 381 "authors": ["Q. Wu"], 382 "year": 2023, 383 "relevance": "Multi-agent framework relevant to understanding scaling costs of agent coordination." 384 }, 385 { 386 "title": "Scaling LLM Test-Time Compute Optimally can be More Effective than Scaling Model Parameters", 387 "authors": ["C. Snell"], 388 "year": 2024, 389 "relevance": "Foundational work on test-time scaling that this paper extends with infrastructure cost analysis." 390 } 391 ] 392 }