scan.json (23541B)
1 { 2 "paper": { 3 "title": "Beyond Benchmarks: The Economics of AI Inference", 4 "authors": ["Boqin Zhuang", "Jiacheng Qiao", "Mingqian Liu", "Mingxing Yu", "Ping Hong", "Rui Li", "Xiaoxia Song", "Xiangjun Xu", "Xu Chen", "Yaoyao Ma", "Yujie Gao"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2510.26136" 8 }, 9 "scan_version": 2, 10 "active_modules": ["experimental_rigor", "data_leakage"], 11 "methodology_tags": ["benchmark-eval"], 12 "key_findings": "The paper proposes an 'economics of inference' framework evaluating 9 LLMs on WiNEval-3.0 (2,993 medical tasks) across cost, performance, and quality. It finds diminishing marginal cost with increasing concurrency, an optimal concurrency inflection point per model, and identifies WiNGPT-3.5 (the authors' own model) as the best cost-quality option at $0.34/76.2 score. The framework estimates A800 GPU hourly cost at ~$0.79 and maps a cost-quality Pareto frontier.", 13 "checklist": { 14 "artifacts": { 15 "code_released": { 16 "applies": true, 17 "answer": false, 18 "justification": "No repository URL, code archive, or link to source code is provided anywhere in the paper." 19 }, 20 "data_released": { 21 "applies": true, 22 "answer": false, 23 "justification": "WiNEval-3.0 is described as a proprietary medical evaluation set from Winning Health. No download link or public access is provided." 24 }, 25 "environment_specified": { 26 "applies": true, 27 "answer": false, 28 "justification": "The paper states 'A800 80G × 2 cards' but provides no software environment details — no inference framework version, OS, driver version, or dependency specifications." 29 }, 30 "reproduction_instructions": { 31 "applies": true, 32 "answer": false, 33 "justification": "No reproduction instructions, README, or scripts are provided. The benchmark is proprietary and unavailable." 
34 } 35 }, 36 "statistical_methodology": { 37 "confidence_intervals_or_error_bars": { 38 "applies": true, 39 "answer": false, 40 "justification": "All results are point estimates. No confidence intervals or error bars are reported in any table or figure." 41 }, 42 "significance_tests": { 43 "applies": true, 44 "answer": false, 45 "justification": "The paper claims WiNGPT-3.5 is the 'overall leader' and makes comparative claims across models, but no statistical significance tests are performed." 46 }, 47 "effect_sizes_reported": { 48 "applies": true, 49 "answer": true, 50 "justification": "Table 2 reports absolute cost and score values with enough context to compute differences (e.g., WiNGPT-3.5 at $0.34/76.2 vs Seed-OSS-36B at $0.55/72.2). Appendix B shows performance changes across concurrency levels with absolute values." 51 }, 52 "sample_size_justified": { 53 "applies": true, 54 "answer": false, 55 "justification": "The benchmark has 2,993 requests but no justification is given for why this number is sufficient. No power analysis or discussion of statistical adequacy." 56 }, 57 "variance_reported": { 58 "applies": true, 59 "answer": false, 60 "justification": "The paper acknowledges 'slight fluctuations' from model randomness and vLLM scheduling but reports no standard deviation, variance, or multiple-run statistics. Single-run results only." 61 } 62 }, 63 "evaluation_design": { 64 "baselines_included": { 65 "applies": true, 66 "answer": true, 67 "justification": "Nine models are compared including WiNGPT variants, Qwen3-30B, GLM-4-32B, Mistral-Small, medgemma-27b, Seed-OSS-36B, and gpt-oss-20b (Table 2)." 68 }, 69 "baselines_contemporary": { 70 "applies": true, 71 "answer": true, 72 "justification": "Models include Qwen3-30B, GLM-4-32B-0414, medgemma-27b, and Mistral-Small, which are contemporary as of 2025." 73 }, 74 "ablation_study": { 75 "applies": true, 76 "answer": false, 77 "justification": "No ablation study is conducted. 
The framework has multiple components (cost model, performance metrics, quality score) but none are ablated." 78 }, 79 "multiple_metrics": { 80 "applies": true, 81 "answer": true, 82 "justification": "Three performance metrics (Total Completion Time, Avg TTFT, Avg Throughput) plus quality score and cost are reported (Table 1, Table 2)." 83 }, 84 "human_evaluation": { 85 "applies": false, 86 "answer": false, 87 "justification": "The paper benchmarks inference economics — human evaluation of model outputs is not the focus. Quality is measured via WiNEval-3.0 automated scoring." 88 }, 89 "held_out_test_set": { 90 "applies": true, 91 "answer": false, 92 "justification": "WiNEval-3.0 is described only as a test set. No discussion of held-out vs. development splits, or whether any models were tuned on this data." 93 }, 94 "per_category_breakdown": { 95 "applies": true, 96 "answer": false, 97 "justification": "WiNEval-3.0 covers '10 core scenarios' (medical licensing exams, clinical diagnosis, etc.) but no per-category breakdown of scores is provided. Only aggregate scores appear in Table 2." 98 }, 99 "failure_cases_discussed": { 100 "applies": true, 101 "answer": false, 102 "justification": "No failure cases or error analysis are discussed. No examples of model failures on specific tasks." 103 }, 104 "negative_results_reported": { 105 "applies": true, 106 "answer": true, 107 "justification": "The paper reports that increasing concurrency beyond the optimal point causes throughput to drop sharply and TTFT to increase (Section 6.1 point 2). WiNGPT-3.0's extreme cost ($3.47) is discussed honestly as an outlier." 108 } 109 }, 110 "claims_and_evidence": { 111 "abstract_claims_supported": { 112 "applies": true, 113 "answer": true, 114 "justification": "The abstract claims about diminishing marginal cost, diminishing returns to scale, and an optimal cost-effectiveness zone are supported by the concurrency data in Appendix B and the Pareto frontier in Figure 1." 
115 }, 116 "causal_claims_justified": { 117 "applies": true, 118 "answer": false, 119 "justification": "The paper makes causal claims like 'increasing concurrency is the most effective way to amortize fixed overhead' (Section 6.1) without controlling for confounds. The concurrency-cost relationship is presented as causal without experimental isolation." 120 }, 121 "generalization_bounded": { 122 "applies": true, 123 "answer": false, 124 "justification": "The paper claims to provide 'the first quantifiable decision-making tool for selecting the best AI technology within a limited budget' (Section 8) but all results are from a single medical benchmark (WiNEval-3.0) on A800 GPUs. The title 'The Economics of AI Inference' is far broader than what was tested." 125 }, 126 "alternative_explanations_discussed": { 127 "applies": true, 128 "answer": false, 129 "justification": "No alternative explanations are considered. For example, the paper does not discuss whether WiNGPT-3.5's high score could be due to training on similar medical data, or whether tokenizer efficiency differences confound cost comparisons." 130 }, 131 "proxy_outcome_distinction": { 132 "applies": true, 133 "answer": false, 134 "justification": "WiNEval-3.0 score is used as a proxy for 'quality' and 'intelligence' in the analysis without acknowledging the gap there. The paper equates benchmark performance with clinical utility: 'the core quality metric to measure a model's comprehensive abilities in medical knowledge understanding, clinical reasoning, and instruction following' (Section 5.2). The proxy caveat is conceded in Section 7 (Limitation 3), but the analysis itself does not discuss whether benchmark scores translate to real clinical value." 135 } 136 }, 137 "setup_transparency": { 138 "model_versions_specified": { 139 "applies": true, 140 "answer": false, 141 "justification": "Models are listed as 'WiNGPT-3.5', 'Qwen3-30B', 'GLM-4-32B-0414', etc. GLM includes a version suffix but most models lack snapshot dates or exact version identifiers. 
Parameter counts are given but not specific checkpoints." 142 }, 143 "prompts_provided": { 144 "applies": true, 145 "answer": false, 146 "justification": "No prompts or system instructions used in the WiNEval-3.0 evaluation are provided. The evaluation setup is described only at a high level." 147 }, 148 "hyperparameters_reported": { 149 "applies": true, 150 "answer": false, 151 "justification": "No inference hyperparameters (temperature, top-p, max tokens) are reported. Only concurrency levels are varied." 152 }, 153 "scaffolding_described": { 154 "applies": false, 155 "answer": false, 156 "justification": "No agentic scaffolding is used. Models are evaluated via direct inference." 157 }, 158 "data_preprocessing_documented": { 159 "applies": true, 160 "answer": false, 161 "justification": "No description of how WiNEval-3.0 tasks were constructed, selected, or preprocessed. The benchmark's composition is described only at a high level ('10 core scenarios')." 162 } 163 }, 164 "limitations_and_scope": { 165 "limitations_section_present": { 166 "applies": true, 167 "answer": true, 168 "justification": "Section 7 'Limitations' lists five specific limitations including exclusion of training costs, hardware dependency, proxy nature of benchmarks, lack of statistical confidence analysis, and capital expenditure considerations." 169 }, 170 "threats_to_validity_specific": { 171 "applies": true, 172 "answer": true, 173 "justification": "The limitations are specific to this study: 'Changing the GPU, inference engine, or quantization strategy could significantly alter the performance and cost data' (Limitation 2), and 'WiNEval-3.0 serves as a high-quality proxy metric... not entirely equivalent to a model's final performance in specific, specialized clinical business scenarios' (Limitation 3)." 
174 }, 175 "scope_boundaries_stated": { 176 "applies": true, 177 "answer": true, 178 "justification": "Section 7 explicitly states training costs are not included (Limitation 1), results depend on specific hardware (Limitation 2), and upfront capital expenditure is not considered (Limitation 5)." 179 } 180 }, 181 "data_integrity": { 182 "raw_data_available": { 183 "applies": true, 184 "answer": false, 185 "justification": "No raw data is available. WiNEval-3.0 is proprietary and not released. Only aggregated results in tables are provided." 186 }, 187 "data_collection_described": { 188 "applies": true, 189 "answer": false, 190 "justification": "WiNEval-3.0 is described as covering '10 core scenarios' from 'real clinical applications' but the actual data collection procedure is not documented." 191 }, 192 "recruitment_methods_described": { 193 "applies": false, 194 "answer": false, 195 "justification": "No human participants. Data is a benchmark of medical tasks, not a human study." 196 }, 197 "data_pipeline_documented": { 198 "applies": true, 199 "answer": false, 200 "justification": "The pipeline from task execution to cost calculation is outlined (Sections 3-4) but the construction of WiNEval-3.0 itself and any filtering or curation steps are not documented." 201 } 202 }, 203 "conflicts_of_interest": { 204 "funding_disclosed": { 205 "applies": true, 206 "answer": false, 207 "justification": "No funding disclosure or acknowledgments section is present in the paper." 208 }, 209 "affiliations_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "Authors are identified as 'WiNGPT Team' from 'Winning Health AI Research' in the header. The affiliation is clear." 213 }, 214 "funder_independent_of_outcome": { 215 "applies": true, 216 "answer": false, 217 "justification": "Winning Health is the developer of the WiNGPT model family. 
They are directly evaluating their own commercial products using their own proprietary benchmark, and their model (WiNGPT-3.5) is declared the winner. The funder/employer has a direct financial interest in the outcome." 218 }, 219 "financial_interests_declared": { 220 "applies": true, 221 "answer": false, 222 "justification": "No competing interests statement is present. The authors work for the company whose products are being favorably evaluated." 223 } 224 }, 225 "contamination": { 226 "training_cutoff_stated": { 227 "applies": true, 228 "answer": false, 229 "justification": "No training data cutoff dates are stated for any of the models tested." 230 }, 231 "train_test_overlap_discussed": { 232 "applies": true, 233 "answer": false, 234 "justification": "No discussion of whether WiNEval-3.0 data could have appeared in any model's training data. Particularly concerning for WiNGPT models which are developed by the same company." 235 }, 236 "benchmark_contamination_addressed": { 237 "applies": true, 238 "answer": false, 239 "justification": "WiNEval-3.0 is a proprietary benchmark from Winning Health, and WiNGPT models are also from Winning Health. The obvious contamination risk — that WiNGPT may have been trained or validated on WiNEval data — is never addressed." 240 } 241 }, 242 "human_studies": { 243 "pre_registered": { 244 "applies": false, 245 "answer": false, 246 "justification": "No human participants in this study." 247 }, 248 "irb_or_ethics_approval": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants in this study." 252 }, 253 "demographics_reported": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in this study." 257 }, 258 "inclusion_exclusion_criteria": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in this study." 
262 }, 263 "randomization_described": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in this study." 267 }, 268 "blinding_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in this study." 272 }, 273 "attrition_reported": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in this study." 277 } 278 }, 279 "cost_and_practicality": { 280 "inference_cost_reported": { 281 "applies": true, 282 "answer": true, 283 "justification": "Inference cost is the central topic. Table 2 reports cost per model for the full test set. Section 3 derives GPU hourly cost (~$0.79/hour for A800)." 284 }, 285 "compute_budget_stated": { 286 "applies": true, 287 "answer": true, 288 "justification": "Hardware is specified as A800 80G × 2 cards. Total execution times per model and concurrency level are reported in Appendix B." 289 } 290 }, 291 "experimental_rigor": { 292 "seed_sensitivity_reported": { 293 "applies": true, 294 "answer": false, 295 "justification": "No multiple-seed results reported. The paper acknowledges 'model generation has some inherent randomness' (Appendix B) but does not run multiple seeds." 296 }, 297 "number_of_runs_stated": { 298 "applies": true, 299 "answer": false, 300 "justification": "The number of experimental runs is never stated. Results appear to be from single runs." 301 }, 302 "hyperparameter_search_budget": { 303 "applies": false, 304 "answer": false, 305 "justification": "No hyperparameter search is conducted. Only concurrency levels are varied, which is a system configuration rather than model hyperparameter tuning." 
306 }, 307 "best_config_selection_justified": { 308 "applies": true, 309 "answer": true, 310 "justification": "Section 6.1 explicitly states the selection criterion: 'find the concurrency setting with the lowest cost (i.e., shortest total completion time) while meeting the performance baselines (e.g., throughput > 20 tokens/s and latency < 1s).' All configurations are shown in Appendix B." 311 }, 312 "multiple_comparison_correction": { 313 "applies": false, 314 "answer": false, 315 "justification": "No statistical tests are performed, so multiple comparison correction is not applicable." 316 }, 317 "self_comparison_bias_addressed": { 318 "applies": true, 319 "answer": false, 320 "justification": "Winning Health evaluates their own WiNGPT models on their own WiNEval-3.0 benchmark and finds WiNGPT-3.5 is the 'overall leader.' No acknowledgment of self-evaluation bias." 321 }, 322 "compute_budget_vs_performance": { 323 "applies": true, 324 "answer": true, 325 "justification": "Performance as a function of concurrency (a compute variable) is reported for every model in Appendix B. The Pareto frontier (Figure 1) plots quality vs. cost." 326 }, 327 "benchmark_construct_validity": { 328 "applies": true, 329 "answer": false, 330 "justification": "WiNEval-3.0 is introduced as a 'professional evaluation set for the medical field' with representative properties, but no construct validity analysis is provided. Beyond the proxy caveat in Section 7 (Limitation 3), the paper does not examine whether WiNEval-3.0 scores actually measure clinical utility." 331 }, 332 "scaffold_confound_addressed": { 333 "applies": false, 334 "answer": false, 335 "justification": "No scaffolding is involved. Models are evaluated via direct inference." 336 } 337 }, 338 "data_leakage": { 339 "temporal_leakage_addressed": { 340 "applies": true, 341 "answer": false, 342 "justification": "No discussion of when WiNEval-3.0 tasks were created relative to model training periods." 
343 }, 344 "feature_leakage_addressed": { 345 "applies": true, 346 "answer": false, 347 "justification": "No discussion of whether the evaluation setup leaks information. For WiNGPT models tested on their own company's benchmark, this is particularly relevant." 348 }, 349 "non_independence_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "No discussion of whether WiNEval-3.0 tasks overlap with WiNGPT training data or share structural similarities." 353 }, 354 "leakage_detection_method": { 355 "applies": true, 356 "answer": false, 357 "justification": "No leakage detection or prevention methods are applied." 358 } 359 } 360 }, 361 "claims": [ 362 { 363 "claim": "WiNGPT-3.5 is the overall leader in cost-quality tradeoff at $0.34 cost and 76.2 quality score", 364 "evidence": "Table 2 shows WiNGPT-3.5 achieves the highest score (76.2) at a competitive cost ($0.34) among all tested models.", 365 "supported": "weak" 366 }, 367 { 368 "claim": "Increasing concurrency reduces total time until a performance inflection point, after which throughput drops sharply", 369 "evidence": "Section 6.1 and Appendix B show concurrency-performance curves for all models. WiNGPT-3.5's total completion time drops from 2034s at concurrency 8 to 774s at concurrency 48, then throughput degrades beyond that.", 370 "supported": "moderate" 371 }, 372 { 373 "claim": "A800 80G baseline hourly cost is approximately $0.79/hour under common assumptions", 374 "evidence": "Appendix A provides the cost breakdown: depreciation ($0.64) + power ($0.08) + maintenance ($0.06), whose rounded components sum to $0.78/hour (consistent with the ~$0.79 headline figure up to rounding), with parameter assumptions documented.", 375 "supported": "moderate" 376 }, 377 { 378 "claim": "This is the first 'LLM Inference Production Frontier'", 379 "evidence": "The abstract claims this is the 'first' such frontier. 
No evidence is provided that prior work has not done similar cost-quality analysis.", 380 "supported": "unsupported" 381 } 382 ], 383 "red_flags": [ 384 { 385 "flag": "Company evaluating its own product as the winner", 386 "detail": "Winning Health evaluates their own WiNGPT model family on their own proprietary WiNEval-3.0 benchmark and concludes WiNGPT-3.5 is the 'overall leader.' This is an undeclared conflict of interest (the affiliation appears in the header, but there is no competing-interests statement) with no mitigation — the benchmark, the models, and the evaluation are all controlled by the same entity." 387 }, 388 { 389 "flag": "Proprietary, unavailable benchmark", 390 "detail": "WiNEval-3.0 is not publicly available, making independent verification impossible. Since the benchmark creator also makes the winning model, results cannot be validated externally." 391 }, 392 { 393 "flag": "No uncertainty quantification", 394 "detail": "All results are single-run point estimates with no error bars, confidence intervals, or multi-run variance, despite the paper acknowledging randomness in model generation and inference scheduling." 395 }, 396 { 397 "flag": "Potential training data contamination", 398 "detail": "WiNGPT models may have been trained or validated on WiNEval-3.0 data since both come from Winning Health. This critical concern is never addressed." 399 }, 400 { 401 "flag": "Overclaiming in title and conclusion", 402 "detail": "The paper claims to present 'the first quantifiable decision-making tool' for AI technology selection, and its title frames the work as 'The Economics of AI Inference' in general, but all evidence comes from a single medical benchmark on one GPU type." 403 } 404 ], 405 "cited_papers": [ 406 { 407 "title": "Language models are few-shot learners", 408 "authors": ["Tom Brown", "Benjamin Mann", "Nick Ryder"], 409 "year": 2020, 410 "relevance": "Foundational GPT-3 paper establishing LLM capabilities and scaling properties." 
411 }, 412 { 413 "title": "Llama 2: Open foundation and fine-tuned chat models", 414 "authors": ["Hugo Touvron", "Louis Martin", "Kevin Stone"], 415 "year": 2023, 416 "arxiv_id": "2307.09288", 417 "relevance": "Open-weight LLM family relevant to inference cost and deployment studies." 418 }, 419 { 420 "title": "Judging LLM-as-a-judge with MT-Bench and Chatbot Arena", 421 "authors": ["Lianmin Zheng", "Wei-Lin Chiang", "Ying Sheng"], 422 "year": 2023, 423 "relevance": "LLM evaluation methodology and benchmarking approaches." 424 }, 425 { 426 "title": "Carbon emissions and large neural network training", 427 "authors": ["David Patterson", "Joseph Gonzalez", "Quoc Le"], 428 "year": 2021, 429 "arxiv_id": "2104.10350", 430 "relevance": "Cost and environmental impact of large model training, related to inference economics." 431 }, 432 { 433 "title": "Scaling laws for neural language models", 434 "authors": ["Jared Kaplan", "Sam McCandlish", "Tom Henighan"], 435 "year": 2020, 436 "arxiv_id": "2001.08361", 437 "relevance": "Foundational scaling laws relating compute, data, and model performance." 438 }, 439 { 440 "title": "Training compute-optimal large language models", 441 "authors": ["Jordan Hoffmann", "Sebastian Borgeaud", "Arthur Mensch"], 442 "year": 2022, 443 "arxiv_id": "2203.15556", 444 "relevance": "Chinchilla scaling laws for compute-optimal training, directly relevant to inference cost tradeoffs." 445 }, 446 { 447 "title": "Efficient memory management for large language model serving with PagedAttention", 448 "authors": ["Woosuk Kwon", "Zhuohan Li", "Siyuan Zhuang"], 449 "year": 2023, 450 "relevance": "vLLM inference framework used in many deployment setups; relevant to inference cost optimization." 451 } 452 ] 453 }