scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (25135B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Beyond Benchmarks: The Economics of AI Inference",
      6     "authors": [
      7       "Boqin Zhuang",
      8       "Jiacheng Qiao",
      9       "Mingqian Liu",
     10       "Mingxing Yu",
     11       "Ping Hong",
     12       "Rui Li",
     13       "Xiaoxia Song",
     14       "Xiangjun Xu",
     15       "Xu Chen",
     16       "Yaoyao Ma",
     17       "Yujie Gao"
     18     ],
     19     "year": 2025,
     20     "venue": "arXiv",
     21     "arxiv_id": "2510.26136",
     22     "doi": null
     23   },
     24   "checklist": {
     25     "claims_and_evidence": {
     26       "abstract_claims_supported": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "All main abstract claims (framework introduced, cost/quality/performance analyzed, production frontier constructed) are supported by empirical data from WiNEval-3.0 evaluation across 9 models.",
     30         "source": "haiku"
     31       },
     32       "causal_claims_justified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "Paper makes causal claims (e.g., 'increasing concurrency reduces completion time', 'output token volume causes WiNGPT-3.0 cost') without ablations or controls. Single-environment observational design cannot isolate causality.",
     36         "source": "haiku"
     37       },
     38       "generalization_bounded": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Title 'Beyond Benchmarks' and claims of 'high portability' suggest broad applicability, but evaluation limited to one benchmark (WiNEval-3.0), medical domain, and specific hardware (A800). Generalizations not explicitly bounded to tested setting.",
     42         "source": "haiku"
     43       },
     44       "alternative_explanations_discussed": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "Each cost difference explained with single narrative (WiNGPT-3.0 = thinking model, Mistral-Small = poor tokenizer for Chinese) without exploring plausible alternatives.",
     48         "source": "haiku"
     49       },
     50       "proxy_outcome_distinction": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "Paper distinguishes WiNEval-3.0 score (measured) from actual clinical performance (claimed). Limitations explicitly state benchmark is 'not entirely equivalent to model's final performance in specific specialized clinical scenarios.'",
     54         "source": "haiku"
     55       }
     56     },
     57     "limitations_and_scope": {
     58       "limitations_section_present": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Section 7 'Limitations' provides 5 numbered points addressing training costs exclusion, hardware dependency, benchmark proxy nature, statistical confidence gaps, and upfront CAPEX.",
     62         "source": "haiku"
     63       },
     64       "threats_to_validity_specific": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Specific threats identified: dependency on 'specific software/hardware stack,' 'proxy nature of benchmark scores,' and 'lack of statistical confidence analysis' requiring future confidence intervals and sensitivity testing.",
     68         "source": "haiku"
     69       },
     70       "scope_boundaries_stated": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "Limitations section identifies constraints but main narrative (introduction, conclusion) does not explicitly bound results to medical domain, WiNEval-3.0, or A800 hardware. Generic disclaimers insufficient.",
     74         "source": "haiku"
     75       }
     76     },
     77     "conflicts_of_interest": {
     78       "funding_disclosed": {
     79         "applies": true,
     80         "answer": false,
     81         "justification": "No funding acknowledgment or financial support statement. Appears to be internal company research without explicit funding disclosure.",
     82         "source": "haiku"
     83       },
     84       "affiliations_disclosed": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "Authors affiliated with 'Winning Health AI Research' and evaluate WiNGPT-3.5, WiNGPT-3.0, WiNGPT-2.7 (their own models). This conflict is not disclosed or acknowledged.",
     88         "source": "haiku"
     89       },
     90       "funder_independent_of_outcome": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "If company-funded, funder directly benefits from positive evaluation of WiNGPT models. Result that WiNGPT-3.5 is 'overall leader' directly serves company interests.",
     94         "source": "haiku"
     95       },
     96       "financial_interests_declared": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "No competing interests statement. No declaration of patents, equity stakes, or financial interests. Standard disclosure language absent.",
    100         "source": "haiku"
    101       }
    102     },
    103     "scope_and_framing": {
    104       "key_terms_defined": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Key terms defined: 'economics of inference' as production function (Section 2), 'quality' as WiNEval-3.0 score, 'cost' via explicit formula, 'performance' as three metrics (Section 5).",
    108         "source": "haiku"
    109       },
    110       "intended_contribution_clear": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Contribution explicitly stated: 'introduces a quantitative economics of inference framework' (abstract), 'proposes systematic framework for quantifying inference costs' (Section 1), with decision-making tool (Section 8).",
    114         "source": "haiku"
    115       },
    116       "engagement_with_prior_work": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "Introduction cites prior work (accuracy focus [3], carbon [4], scaling [5]) but does not substantively discuss how this work differs from or builds on them. No dedicated related work section.",
    120         "source": "haiku"
    121       }
    122     }
    123   },
    124   "type_checklist": {
    125     "empirical": {
    126       "artifacts": {
    127         "code_released": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "No code repository, GitHub link, or code availability statement. Methodology described but no reproducible implementation provided.",
    131           "source": "haiku"
    132         },
    133         "data_released": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "WiNEval-3.0 benchmark not publicly released. Paper presents aggregated results only; raw benchmark data unavailable for independent verification.",
    137           "source": "haiku"
    138         },
    139         "environment_specified": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "Hardware specified (A800 80G × 2) but no requirements.txt, Dockerfile, or dependency versions. Vague reference to 'inference services' only; insufficient for reproducibility.",
    143           "source": "haiku"
    144         },
    145         "reproduction_instructions": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "No step-by-step reproduction guide. Paper explains framework and methodology but not sufficient instructions for independent replication.",
    149           "source": "haiku"
    150         }
    151       },
    152       "statistical_methodology": {
    153         "confidence_intervals_or_error_bars": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "All results are point estimates. Paper acknowledges 'inherent randomness' and dynamic batching variations but does not report confidence intervals or error bars. Explicitly listed as limitation #4.",
    157           "source": "haiku"
    158         },
    159         "significance_tests": {
    160           "applies": true,
    161           "answer": false,
    162           "justification": "Comparative claims made without statistical significance testing ('WiNGPT-3.5 is overall leader'). No p-values or hypothesis tests.",
    163           "source": "haiku"
    164         },
    165         "effect_sizes_reported": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "Absolute values reported (dollars, scores) but without confidence intervals these cannot be reliably interpreted as effect sizes. Variance not quantified.",
    169           "source": "haiku"
    170         },
    171         "sample_size_justified": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "9 models tested, 2,993 requests in WiNEval-3.0. No justification for adequacy. No power analysis provided.",
    175           "source": "haiku"
    176         },
    177         "variance_reported": {
    178           "applies": true,
    179           "answer": false,
    180           "justification": "Appendix B shows different concurrency levels but for optimal configuration (Table 2), variance/std dev not reported. Randomness acknowledged but not quantified.",
    181           "source": "haiku"
    182         }
    183       },
    184       "evaluation_design": {
    185         "baselines_included": {
    186           "applies": true,
    187           "answer": false,
    188           "justification": "Nine models compared against each other but no external baseline (industry standard, established reference, human expert performance) for medical QA.",
    189           "source": "haiku"
    190         },
    191         "baselines_contemporary": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Models tested (Llama 2 2023, GLM-4, Qwen3, Mistral-Small) are contemporary and reflect current landscape.",
    195           "source": "haiku"
    196         },
    197         "ablation_study": {
    198           "applies": true,
    199           "answer": false,
    200           "justification": "No ablation study. Tests different concurrency levels but does not isolate component contributions (e.g., prove output volume causes cost).",
    201           "source": "haiku"
    202         },
    203         "multiple_metrics": {
    204           "applies": true,
    205           "answer": true,
    206           "justification": "Three evaluation dimensions: performance (time, TTFT, throughput), quality (WiNEval score), cost (dollars). Multiple metrics across all dimensions.",
    207           "source": "haiku"
    208         },
    209         "human_evaluation": {
    210           "applies": false,
    211           "answer": false,
    212           "justification": "WiNEval-3.0 appears automated; no human evaluation of outputs mentioned. Not applicable to this cost-performance study.",
    213           "source": "haiku"
    214         },
    215         "held_out_test_set": {
    216           "applies": true,
    217           "answer": false,
    218           "justification": "WiNEval-3.0 is evaluation set but no explicit statement it is held out from training. For proprietary models (WiNGPT), training data unknown; potential contamination not addressed.",
    219           "source": "haiku"
    220         },
    221         "per_category_breakdown": {
    222           "applies": true,
    223           "answer": false,
    224           "justification": "WiNEval-3.0 covers 10 medical scenarios but results reported as aggregate only. No per-category (exam vs diagnosis vs QC) breakdown.",
    225           "source": "haiku"
    226         },
    227         "failure_cases_discussed": {
    228           "applies": true,
    229           "answer": false,
    230           "justification": "No failure cases shown or discussed. All models presented as acceptable; no qualitative error examples.",
    231           "source": "haiku"
    232         },
    233         "negative_results_reported": {
    234           "applies": true,
    235           "answer": false,
    236           "justification": "No negative results reported. 'Outliers' framed positively ('thinking model', 'cost-effective'). No genuine negative findings.",
    237           "source": "haiku"
    238         }
    239       },
    240       "setup_transparency": {
    241         "model_versions_specified": {
    242           "applies": true,
    243           "answer": false,
    244           "justification": "Model names given (WiNGPT-3.5, Qwen3-30B) but no snapshot dates, API versions, or commit hashes. Only 'GLM-4-32B-0414' includes date code. Insufficient for reproducibility.",
    245           "source": "haiku"
    246         },
    247         "prompts_provided": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "No actual prompts or system instructions provided. WiNEval-3.0 described as medical QA but prompt templates not shared.",
    251           "source": "haiku"
    252         },
    253         "hyperparameters_reported": {
    254           "applies": true,
    255           "answer": false,
    256         "justification": "No temperature, top-p, max_tokens, or other generation hyperparameters reported. Concurrency level (swept from 8 up to 64+) is an infrastructure parameter, not a model hyperparameter.",
    257           "source": "haiku"
    258         },
    259         "scaffolding_described": {
    260           "applies": false,
    261           "answer": false,
    262           "justification": "No agentic scaffolding apparent. Direct model evaluation without agents or complex pipelines.",
    263           "source": "haiku"
    264         },
    265         "data_preprocessing_documented": {
    266           "applies": true,
    267           "answer": false,
    268           "justification": "No documentation of preprocessing, filtering, or normalization steps. How 2,993 requests prepared from 10 medical scenarios unexplained.",
    269           "source": "haiku"
    270         }
    271       },
    272       "data_integrity": {
    273         "raw_data_available": {
    274           "applies": true,
    275           "answer": false,
    276           "justification": "WiNEval-3.0 not released publicly. Raw inference outputs and performance logs unavailable.",
    277           "source": "haiku"
    278         },
    279         "data_collection_described": {
    280           "applies": true,
    281           "answer": false,
    282           "justification": "WiNEval-3.0 described as 'derived from real clinical applications' but data collection procedure not detailed. Source and annotation process unknown.",
    283           "source": "haiku"
    284         },
    285         "recruitment_methods_described": {
    286           "applies": false,
    287           "answer": false,
    288           "justification": "Not applicable; benchmark evaluation, no human participants.",
    289           "source": "haiku"
    290         },
    291         "data_pipeline_documented": {
    292           "applies": true,
    293           "answer": false,
    294           "justification": "Pipeline from raw clinical data to WiNEval-3.0 not documented. Request formatting and processing not explained.",
    295           "source": "haiku"
    296         }
    297       },
    298       "contamination": {
    299         "training_cutoff_stated": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "Training data cutoffs not stated for any model. For proprietary and commercial models, cutoff unknown. Critical for medical benchmark validation.",
    303           "source": "haiku"
    304         },
    305         "train_test_overlap_discussed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "Potential train/test overlap not discussed. WiNEval-3.0 'derived from real clinical applications' may overlap with publicly available medical Q&A in training corpora.",
    309           "source": "haiku"
    310         },
    311         "benchmark_contamination_addressed": {
    312           "applies": true,
    313           "answer": false,
    314           "justification": "No discussion of whether WiNEval-3.0 examples were publicly available before model training cutoffs. Medical benchmarks often present in training data.",
    315           "source": "haiku"
    316         }
    317       },
    318       "human_studies": {
    319         "pre_registered": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "Not applicable; no human participants.",
    323           "source": "haiku"
    324         },
    325         "irb_or_ethics_approval": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "Not applicable; no human participants.",
    329           "source": "haiku"
    330         },
    331         "demographics_reported": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "Not applicable; no human participants.",
    335           "source": "haiku"
    336         },
    337         "inclusion_exclusion_criteria": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "Not applicable; no human participants.",
    341           "source": "haiku"
    342         },
    343         "randomization_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "Not applicable; no human participants.",
    347           "source": "haiku"
    348         },
    349         "blinding_described": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "Not applicable; no human participants.",
    353           "source": "haiku"
    354         },
    355         "attrition_reported": {
    356           "applies": false,
    357           "answer": false,
    358           "justification": "Not applicable; no human participants.",
    359           "source": "haiku"
    360         }
    361       },
    362       "cost_and_practicality": {
    363         "inference_cost_reported": {
    364           "applies": true,
    365           "answer": true,
    366           "justification": "Inference cost is primary focus. Reported in dollars per test set and per-unit cost. Tables 2-4 show cost for each model.",
    367           "source": "haiku"
    368         },
    369         "compute_budget_stated": {
    370           "applies": true,
    371           "answer": false,
    372           "justification": "Individual model costs reported but total computational budget for entire evaluation not provided.",
    373           "source": "haiku"
    374         }
    375       }
    376     }
    377   },
    378   "claims": [
    379     {
    380       "claim": "WiNGPT-3.5 is the overall leader, providing highest quality (76.2% score) at lowest cost ($0.34)",
    381       "evidence": "Table 2 directly compares all models; WiNGPT-3.5 leads on both score and cost metrics",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "Increasing concurrency from 8 to 48 reduces WiNGPT-3.5 completion time from 2034s to 774s",
    386       "evidence": "Appendix B Table 4 shows concurrency 8→2034s, concurrency 48→774.11s for WiNGPT-3.5",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "Each model has optimal concurrency beyond which overhead and marginal cost-benefit decline",
    391       "evidence": "Section 6.1 and Appendix B show performance inflection points; WiNGPT-3.5 optimal at 48, degrades at 64+",
    392       "supported": "moderate"
    393     },
    394     {
    395       "claim": "WiNGPT-3.0's high cost ($3.47) results from massive output token volume (4-8x other models)",
    396       "evidence": "Table 2 shows WiNGPT-3.0 output 3.44M tokens vs others 350-800K; attributed to 'thinking model with chains of thought'",
    397       "supported": "moderate"
    398     },
    399     {
    400       "claim": "Mistral-Small's 2.11M input tokens (vs 1.3M for others) due to less efficient tokenizer for Chinese",
    401       "evidence": "Table 2 data compared; inference made without direct tokenizer testing",
    402       "supported": "weak"
    403     },
    404     {
    405       "claim": "WiNEval-3.0 exhibits long-tail distribution representative of real-world medical application loads",
    406       "evidence": "Section 4 states this property but provides no quantitative evidence (histogram, Zipf analysis, etc.)",
    407       "supported": "weak"
    408     },
    409     {
    410       "claim": "Framework enables shift from gut-feeling to data-driven model selection decisions",
    411       "evidence": "Section 8 concludes framework provides 'quantifiable decision-making tool' for GPU investment and model selection",
    412       "supported": "strong"
    413     },
    414     {
    415       "claim": "Framework is highly portable and can adapt to different hardware platforms by adjusting cost parameters",
    416       "evidence": "Section 8 claims 'high portability: by adjusting core parameters like hourly GPU cost, framework easily adapted'",
    417       "supported": "weak"
    418     }
    419   ],
    420   "methodology_tags": [
    421     "benchmark-eval",
    422     "observational"
    423   ],
    424   "key_findings": "Paper constructs a cost-quality-performance framework for LLM inference on WiNEval-3.0 (medical benchmark, 2,993 requests). Key findings: WiNGPT-3.5 achieves best cost-effectiveness ($0.34 for a 76.2% WiNEval-3.0 score); completion time scales non-linearly with concurrency, with diminishing returns beyond a model-specific optimum (48 concurrent requests for WiNGPT-3.5); output token volume is the primary cost driver (WiNGPT-3.0's reasoning overhead makes it roughly 10x as expensive as the cheapest model, $3.47 vs. $0.34). Framework enables data-driven model selection based on business constraints (cost, latency, throughput requirements).",
    425   "red_flags": [
    426     {
    427       "flag": "Undisclosed conflict of interest",
    428       "detail": "Authors (WiNGPT Team, Winning Health AI Research) evaluate three of their own models (WiNGPT-3.5, 3.0, 2.7) and declare WiNGPT-3.5 the 'overall leader' without disclosing or acknowledging this conflict of interest."
    429     },
    430     {
    431       "flag": "No code or data release",
    432       "detail": "WiNEval-3.0 benchmark not publicly available; no code repository for framework implementation. Evaluation not independently reproducible."
    433     },
    434     {
    435       "flag": "No statistical confidence intervals",
    436       "detail": "All results reported as point estimates. Paper acknowledges 'inherent randomness' and 'dynamic batching variations' but provides no confidence intervals, error bars, or variance quantification. Listed as acknowledged limitation #4."
    437     },
    438     {
    439       "flag": "No contamination analysis",
    440       "detail": "Training data cutoffs unknown for most models. Medical benchmarks may overlap with publicly available medical Q&A in training corpora. No discussion of potential data leakage."
    441     },
    442     {
    443       "flag": "Single benchmark evaluation",
    444       "detail": "Results limited to WiNEval-3.0 (medical domain only). Generalization to other domains, languages, or task types unknown despite title 'Beyond Benchmarks.'"
    445     },
    446     {
    447       "flag": "No ablation studies",
    448       "detail": "Cannot isolate causes of cost differences. Claim that output token volume causes WiNGPT-3.0's cost is inferred from correlation, not proven causally."
    449     },
    450     {
    451       "flag": "Overclaimed novelty",
    452       "detail": "Claims 'first LLM Inference Production Frontier' for WiNEval-3.0 only. Framework (cost analysis, Pareto frontiers) uses standard economics; three principles presented are textbook economics concepts, not novel insights."
    453     },
    454     {
    455       "flag": "Limited hardware/infrastructure scope",
    456       "detail": "Evaluation on single hardware configuration (A800 80G × 2) and presumably vLLM inference engine. Framework claimed 'portable' but not demonstrated on different GPUs, cloud platforms, or inference engines."
    457     }
    458   ],
    459   "cited_papers": [
    460     {
    461       "title": "Language Models are Few-Shot Learners (GPT-3)",
    462       "relevance": "Foundational LLM work establishing baseline capability and scaling relationships"
    463     },
    464     {
    465       "title": "Llama 2: Open Foundation and Fine-tuned Chat Models",
    466       "relevance": "Contemporary baseline LLM for cost-quality comparison; reference model for evaluation"
    467     },
    468     {
    469       "title": "Judging LLM-as-a-Judge with MT-Bench and ChatBot Arena",
    470       "relevance": "LLM evaluation methodology; informs quality metric selection and benchmarking approach"
    471     },
    472     {
    473       "title": "Carbon Emissions and Large Neural Network Training",
    474       "relevance": "Infrastructure cost and energy analysis; directly relevant to inference cost economics"
    475     },
    476     {
    477       "title": "Scaling Laws for Neural Language Models",
    478       "relevance": "Establishes relationship between model size, performance, and compute requirements"
    479     },
    480     {
    481       "title": "Training Compute-Optimal Large Language Models (Chinchilla)",
    482       "relevance": "Compute efficiency and scaling; foundation for cost-performance trade-off analysis"
    483     },
    484     {
    485       "title": "Efficient Memory Management for LLM Serving with PagedAttention (vLLM)",
    486       "relevance": "Inference optimization technology; likely underlying implementation of evaluation infrastructure"
    487     }
    488   ],
    489   "engagement_factors": {
    490     "practical_relevance": {
    491       "score": 3,
    492       "justification": "Directly applicable to production inference decisions. Addresses real business constraints (cost, latency, throughput) that practitioners face when selecting models and hardware."
    493     },
    494     "surprise_contrarian": {
    495       "score": 1,
    496       "justification": "Validates known economic trade-offs; no contrarian findings. 'First frontier' claim weak. Mostly confirms industry expectations rather than challenging assumptions."
    497     },
    498     "fear_safety": {
    499       "score": 0,
    500       "justification": "No safety or alignment concerns raised or addressed. Paper focuses purely on cost-benefit analysis, ignoring robustness or security considerations."
    501     },
    502     "drama_conflict": {
    503       "score": 1,
    504       "justification": "'Impossible trinity' framing presents standard engineering trade-off language as conflict. Minor narrative drama in WiNGPT-3.0 as 'specialized thinking model' but overall low emotional engagement."
    505     },
    506     "demo_ability": {
    507       "score": 2,
    508       "justification": "Results demonstrated in tables but no code or data released. Framework explained but replication requires private WiNEval-3.0 benchmark. Limited hands-on demo potential."
    509     },
    510     "brand_recognition": {
    511       "score": 1,
    512       "justification": "WiNGPT team not affiliated with major academic lab or recognized AI product brand (vs. OpenAI, Anthropic, DeepSeek, Meta). Limited institutional halo effect."
    513     }
    514   },
    515   "hn_data": {
    516     "threads": [
    517       {
    518         "hn_id": "46714925",
    519         "title": "SlimEdge: Lightweight Distributed DNN Deployment on Constrained Hardware",
    520         "points": 1,
    521         "comments": 0,
    522         "url": "https://news.ycombinator.com/item?id=46714925",
    523         "created_at": "2026-01-22T03:27:40Z"
    524       }
    525     ],
    526     "top_points": 1,
    527     "total_points": 1,
    528     "total_comments": 0
    529   }
    530 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs