scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (23541B)
      1 {
      2   "paper": {
      3     "title": "Beyond Benchmarks: The Economics of AI Inference",
      4     "authors": ["Boqin Zhuang", "Jiacheng Qiao", "Mingqian Liu", "Mingxing Yu", "Ping Hong", "Rui Li", "Xiaoxia Song", "Xiangjun Xu", "Xu Chen", "Yaoyao Ma", "Yujie Gao"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2510.26136"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "methodology_tags": ["benchmark-eval"],
     12   "key_findings": "The paper proposes an 'economics of inference' framework evaluating 9 LLMs on WiNEval-3.0 (2,993 medical tasks) across cost, performance, and quality. It finds diminishing marginal cost with increasing concurrency, an optimal concurrency inflection point per model, and identifies WiNGPT-3.5 (the authors' own model) as the best cost-quality option, at a cost of $0.34 for the full test set and a quality score of 76.2. The framework estimates A800 GPU hourly cost at ~$0.79 and maps a cost-quality Pareto frontier.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "No repository URL, code archive, or link to source code is provided anywhere in the paper."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "WiNEval-3.0 is described as a proprietary medical evaluation set from Winning Health. No download link or public access is provided."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "The paper states 'A800 80G × 2 cards' but provides no software environment details — no inference framework version, OS, driver version, or dependency specifications."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No reproduction instructions, README, or scripts are provided. The benchmark is proprietary and unavailable."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "All results are point estimates. No confidence intervals or error bars are reported in any table or figure."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper claims WiNGPT-3.5 is the 'overall leader' and makes comparative claims across models, but no statistical significance tests are performed."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "Table 2 reports absolute cost and score values with enough context to compute differences (e.g., WiNGPT-3.5 at $0.34/76.2 vs Seed-OSS-36B at $0.55/72.2). Appendix B shows performance changes across concurrency levels with absolute values."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The benchmark has 2,993 requests but no justification is given for why this number is sufficient. No power analysis or discussion of statistical adequacy."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The paper acknowledges 'slight fluctuations' from model randomness and vLLM scheduling but reports no standard deviation, variance, or multiple-run statistics. Single-run results only."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Nine models are compared including WiNGPT variants, Qwen3-30B, GLM-4-32B, Mistral-Small, medgemma-27b, Seed-OSS-36B, and gpt-oss-20b (Table 2)."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Models include Qwen3-30B, GLM-4-32B-0414, medgemma-27b, and Mistral-Small, which are contemporary as of 2025."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "No ablation study is conducted. The framework has multiple components (cost model, performance metrics, quality score) but none are ablated."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Three performance metrics (Total Completion Time, Avg TTFT, Avg Throughput) plus quality score and cost are reported (Table 1, Table 2)."
     83       },
     84       "human_evaluation": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "The paper benchmarks inference economics — human evaluation of model outputs is not the focus. Quality is measured via WiNEval-3.0 automated scoring."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "WiNEval-3.0 is described only as a test set. No discussion of held-out vs. development splits, or whether any models were tuned on this data."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "WiNEval-3.0 covers '10 core scenarios' (medical licensing exams, clinical diagnosis, etc.) but no per-category breakdown of scores is provided. Only aggregate scores appear in Table 2."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": false,
    102         "justification": "No failure cases or error analysis are discussed. No examples of model failures on specific tasks."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper reports that increasing concurrency beyond the optimal point causes throughput to drop sharply and TTFT to increase (Section 6.1 point 2). WiNGPT-3.0's extreme cost ($3.47) is discussed honestly as an outlier."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The abstract claims about diminishing marginal cost, diminishing returns to scale, and an optimal cost-effectiveness zone are supported by the concurrency data in Appendix B and the Pareto frontier in Figure 1."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "The paper makes causal claims like 'increasing concurrency is the most effective way to amortize fixed overhead' (Section 6.1) without controlling for confounds. The concurrency-cost relationship is presented as causal without experimental isolation."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The paper claims to provide 'the first quantifiable decision-making tool for selecting the best AI technology within a limited budget' (Section 8) but all results are from a single medical benchmark (WiNEval-3.0) on A800 GPUs. The title 'The Economics of AI Inference' is far broader than what was tested."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "No alternative explanations are considered. For example, the paper does not discuss whether WiNGPT-3.5's high score could be due to training on similar medical data, or whether tokenizer efficiency differences confound cost comparisons."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "WiNEval-3.0 score is used as a proxy for 'quality' and 'intelligence' throughout the analysis; the gap is acknowledged only in the limitations (Limitation 3), not where the score is used to rank models. The paper describes the score as 'the core quality metric to measure a model's comprehensive abilities in medical knowledge understanding, clinical reasoning, and instruction following' (Section 5.2) but does not analyze whether benchmark scores translate to real clinical value."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "Models are listed as 'WiNGPT-3.5', 'Qwen3-30B', 'GLM-4-32B-0414', etc. GLM includes a version suffix but most models lack snapshot dates or exact version identifiers. Parameter counts are given but not specific checkpoints."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "No prompts or system instructions used in the WiNEval-3.0 evaluation are provided. The evaluation setup is described only at a high level."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "No inference hyperparameters (temperature, top-p, max tokens) are reported. Only concurrency levels are varied."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding is used. Models are evaluated via direct inference."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": false,
    161         "justification": "No description of how WiNEval-3.0 tasks were constructed, selected, or preprocessed. The benchmark's composition is described only at a high level ('10 core scenarios')."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section 7 'Limitations' lists five specific limitations including exclusion of training costs, hardware dependency, proxy nature of benchmarks, lack of statistical confidence analysis, and capital expenditure considerations."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "The limitations are specific to this study: 'Changing the GPU, inference engine, or quantization strategy could significantly alter the performance and cost data' (Limitation 2), and 'WiNEval-3.0 serves as a high-quality proxy metric... not entirely equivalent to a model's final performance in specific, specialized clinical business scenarios' (Limitation 3)."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 7 explicitly states training costs are not included (Limitation 1), results depend on specific hardware (Limitation 2), and upfront capital expenditure is not considered (Limitation 5)."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "No raw data is available. WiNEval-3.0 is proprietary and not released. Only aggregated results in tables are provided."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "WiNEval-3.0 is described as covering '10 core scenarios' from 'real clinical applications' but the actual data collection procedure is not documented."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants. Data is a benchmark of medical tasks, not a human study."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": false,
    200         "justification": "The pipeline from task execution to cost calculation is outlined (Sections 3-4) but the construction of WiNEval-3.0 itself and any filtering or curation steps are not documented."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding disclosure or acknowledgments section is present in the paper."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Authors are identified as 'WiNGPT Team' from 'Winning Health AI Research' in the header. The affiliation is clear."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "Winning Health is the developer of the WiNGPT model family. They are directly evaluating their own commercial products using their own proprietary benchmark, and their model (WiNGPT-3.5) is declared the winner. The funder/employer has a direct financial interest in the outcome."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests statement is present. The authors work for the company whose products are being favorably evaluated."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No training data cutoff dates are stated for any of the models tested."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No discussion of whether WiNEval-3.0 data could have appeared in any model's training data. Particularly concerning for WiNGPT models which are developed by the same company."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "WiNEval-3.0 is a proprietary benchmark from Winning Health, and WiNGPT models are also from Winning Health. The obvious contamination risk — that WiNGPT may have been trained or validated on WiNEval data — is never addressed."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this study."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": true,
    283         "justification": "Inference cost is the central topic. Table 2 reports cost per model for the full test set. Section 3 derives GPU hourly cost (~$0.79/hour for A800)."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": true,
    288         "justification": "Hardware is specified as A800 80G × 2 cards. Total execution times per model and concurrency level are reported in Appendix B."
    289       }
    290     },
    291     "experimental_rigor": {
    292       "seed_sensitivity_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "No multiple-seed results reported. The paper acknowledges 'model generation has some inherent randomness' (Appendix B) but does not run multiple seeds."
    296       },
    297       "number_of_runs_stated": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "The number of experimental runs is never stated. Results appear to be from single runs."
    301       },
    302       "hyperparameter_search_budget": {
    303         "applies": false,
    304         "answer": false,
    305         "justification": "No hyperparameter search is conducted. Only concurrency levels are varied, which is a system configuration rather than model hyperparameter tuning."
    306       },
    307       "best_config_selection_justified": {
    308         "applies": true,
    309         "answer": true,
    310         "justification": "Section 6.1 explicitly states the selection criterion: 'find the concurrency setting with the lowest cost (i.e., shortest total completion time) while meeting the performance baselines (e.g., throughput > 20 tokens/s and latency < 1s).' All configurations are shown in Appendix B."
    311       },
    312       "multiple_comparison_correction": {
    313         "applies": false,
    314         "answer": false,
    315         "justification": "No statistical tests are performed, so multiple comparison correction is not applicable."
    316       },
    317       "self_comparison_bias_addressed": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "Winning Health evaluates their own WiNGPT models on their own WiNEval-3.0 benchmark and finds WiNGPT-3.5 is the 'overall leader.' No acknowledgment of self-evaluation bias."
    321       },
    322       "compute_budget_vs_performance": {
    323         "applies": true,
    324         "answer": true,
    325         "justification": "Performance as a function of concurrency (a compute variable) is reported for every model in Appendix B. The Pareto frontier (Figure 1) plots quality vs. cost."
    326       },
    327       "benchmark_construct_validity": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "WiNEval-3.0 is introduced as a 'professional evaluation set for the medical field' with representative properties, but no construct validity analysis is provided. Apart from a caveat in the limitations that it is only a proxy metric, the paper does not examine whether WiNEval-3.0 scores actually measure clinical utility."
    331       },
    332       "scaffold_confound_addressed": {
    333         "applies": false,
    334         "answer": false,
    335         "justification": "No scaffolding is involved. Models are evaluated via direct inference."
    336       }
    337     },
    338     "data_leakage": {
    339       "temporal_leakage_addressed": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "No discussion of when WiNEval-3.0 tasks were created relative to model training periods."
    343       },
    344       "feature_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "No discussion of whether the evaluation setup leaks information. For WiNGPT models tested on their own company's benchmark, this is particularly relevant."
    348       },
    349       "non_independence_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of whether WiNEval-3.0 tasks overlap with WiNGPT training data or share structural similarities."
    353       },
    354       "leakage_detection_method": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No leakage detection or prevention methods are applied."
    358       }
    359     }
    360   },
    361   "claims": [
    362     {
    363       "claim": "WiNGPT-3.5 is the overall leader in cost-quality tradeoff at $0.34 cost and 76.2 quality score",
    364       "evidence": "Table 2 shows WiNGPT-3.5 achieves the highest score (76.2) at a competitive cost ($0.34) among all tested models.",
    365       "supported": "weak"
    366     },
    367     {
    368       "claim": "Increasing concurrency reduces total time until a performance inflection point, after which throughput drops sharply",
    369       "evidence": "Section 6.1 and Appendix B show concurrency-performance curves for all models. WiNGPT-3.5's total completion time drops from 2034s at concurrency 8 to 774s at concurrency 48; beyond that point throughput degrades.",
    370       "supported": "moderate"
    371     },
    372     {
    373       "claim": "A800 80G baseline hourly cost is approximately $0.79/hour under common assumptions",
    374       "evidence": "Appendix A provides the cost breakdown: depreciation ($0.64) + power ($0.08) + maintenance ($0.06) = $0.78/hour as listed (the ~$0.79 headline figure presumably reflects unrounded components), with parameter assumptions documented.",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "This is the first 'LLM Inference Production Frontier'",
    379       "evidence": "The abstract claims this is the 'first' such frontier. No evidence is provided that prior work has not done similar cost-quality analysis.",
    380       "supported": "unsupported"
    381     }
    382   ],
    383   "red_flags": [
    384     {
    385       "flag": "Company evaluating its own product as the winner",
    386       "detail": "Winning Health evaluates their own WiNGPT model family on their own proprietary WiNEval-3.0 benchmark and concludes WiNGPT-3.5 is the 'overall leader.' Although the authors' affiliation is disclosed, the conflict of interest is never declared or mitigated — the benchmark, the models, and the evaluation are all controlled by the same entity."
    387     },
    388     {
    389       "flag": "Proprietary, unavailable benchmark",
    390       "detail": "WiNEval-3.0 is not publicly available, making independent verification impossible. Since the benchmark creator also makes the winning model, results cannot be validated externally."
    391     },
    392     {
    393       "flag": "No uncertainty quantification",
    394       "detail": "All results are single-run point estimates with no error bars, confidence intervals, or multi-run variance, despite the paper acknowledging randomness in model generation and inference scheduling."
    395     },
    396     {
    397       "flag": "Potential training data contamination",
    398       "detail": "WiNGPT models may have been trained or validated on WiNEval-3.0 data since both come from Winning Health. This critical concern is never addressed."
    399     },
    400     {
    401       "flag": "Overclaiming in title and conclusion",
    402       "detail": "The paper claims to present 'the first quantifiable decision-making tool' for AI technology selection and 'The Economics of AI Inference' broadly, but all evidence comes from a single medical benchmark on one GPU type."
    403     }
    404   ],
    405   "cited_papers": [
    406     {
    407       "title": "Language models are few-shot learners",
    408       "authors": ["Tom Brown", "Benjamin Mann", "Nick Ryder"],
    409       "year": 2020,
    410       "relevance": "Foundational GPT-3 paper establishing LLM capabilities and scaling properties."
    411     },
    412     {
    413       "title": "Llama 2: Open foundation and fine-tuned chat models",
    414       "authors": ["Hugo Touvron", "Louis Martin", "Kevin Stone"],
    415       "year": 2023,
    416       "arxiv_id": "2307.09288",
    417       "relevance": "Open-weight LLM family relevant to inference cost and deployment studies."
    418     },
    419     {
    420       "title": "Judging LLM-as-a-judge with MT-Bench and Chatbot Arena",
    421       "authors": ["Lianmin Zheng", "Wei-Lin Chiang", "Ying Sheng"],
    422       "year": 2023,
    423       "relevance": "LLM evaluation methodology and benchmarking approaches."
    424     },
    425     {
    426       "title": "Carbon emissions and large neural network training",
    427       "authors": ["David Patterson", "Joseph Gonzalez", "Quoc Le"],
    428       "year": 2021,
    429       "arxiv_id": "2104.10350",
    430       "relevance": "Cost and environmental impact of large model training, related to inference economics."
    431     },
    432     {
    433       "title": "Scaling laws for neural language models",
    434       "authors": ["Jared Kaplan", "Sam McCandlish", "Tom Henighan"],
    435       "year": 2020,
    436       "arxiv_id": "2001.08361",
    437       "relevance": "Foundational scaling laws relating compute, data, and model performance."
    438     },
    439     {
    440       "title": "Training compute-optimal large language models",
    441       "authors": ["Jordan Hoffmann", "Sebastian Borgeaud", "Arthur Mensch"],
    442       "year": 2022,
    443       "arxiv_id": "2203.15556",
    444       "relevance": "Chinchilla scaling laws for compute-optimal training, directly relevant to inference cost tradeoffs."
    445     },
    446     {
    447       "title": "Efficient memory management for large language model serving with PagedAttention",
    448       "authors": ["Woosuk Kwon", "Zhuohan Li", "Siyuan Zhuang"],
    449       "year": 2023,
    450       "relevance": "vLLM inference framework used in many deployment setups; relevant to inference cost optimization."
    451     }
    452   ]
    453 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs