scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (20921B)
      1 {
      2   "paper": {
      3     "title": "A Cost-Benefit Analysis of On-Premise Large Language Model Deployment: Breaking Even with Commercial LLM Services",
      4     "authors": ["Guanzhong Pan", "Vishal Chodnekar", "Abinas Roy", "Haibo Wang"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2509.18101"
      8   },
      9   "scan_version": 2,
     10   "active_modules": [],
     11   "methodology_tags": ["theoretical"],
     12   "key_findings": "On-premise LLM deployment break-even periods vary dramatically by model size: sub-30B models on consumer GPUs (RTX 5090, ~$2k) break even in 0.3–3 months, medium models (70B–120B) in 2–34 months, and large models (235B+) in 4–69 months. The analysis covers 54 deployment scenarios (9 open-source models × 6 commercial APIs) and finds that pricing variability across providers creates large swings in cost-effectiveness. Small and mid-sized enterprises can achieve rapid payback with small open-source models, while large-scale deployments are economically viable only under sustained high-volume usage or when privacy/sovereignty concerns override cost.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": true,
     18         "justification": "The paper provides a playground at https://v0-ai-cost-calculator.vercel.app/ where users can apply the cost-benefit framework. This is a released tool, though source code for the analysis itself is not separately linked."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "The pricing, benchmark, and hardware data used in the analysis are presented in tables but no downloadable dataset or raw data file is provided."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No environment specifications, dependencies, or setup instructions are provided. The paper is primarily analytical with tables and equations."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No step-by-step reproduction instructions are provided. The mathematical formulas (Equations 1-7) are given but there are no scripts or detailed instructions to reproduce the break-even calculations."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No confidence intervals or error bars are reported. All break-even values are point estimates with no uncertainty quantification despite depending on variable inputs (electricity rates, hardware prices, throughput estimates)."
     41       },
     42       "significance_tests": {
     43         "applies": false,
     44         "answer": false,
     45         "justification": "The paper is a deterministic cost-modeling exercise, not a statistical comparison. No comparative claims requiring significance tests are made."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "Table IV reports percentage performance differences alongside break-even months (e.g., 'EXAONE 4.0 32B vs GPT-5: 2.26 months (-2.65%)'), providing context for the cost-performance tradeoff."
     51       },
     52       "sample_size_justified": {
     53         "applies": false,
     54         "answer": false,
     55         "justification": "This is an analytical/theoretical paper with no experimental sample. The 54 deployment scenarios cover a systematic combination of models and APIs."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No variance or sensitivity analysis is reported for the break-even estimates. Inputs like electricity cost ($0.15/kWh), throughput, and hardware prices are fixed at single values with no sensitivity ranges."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Commercial API pricing for GPT-5, Claude-4 Opus, Claude-4 Sonnet, Grok-4, and Gemini 2.5 Pro serves as the baseline for the cost comparison (Table I, Table IV)."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "The commercial models used as baselines (GPT-5, Claude-4, Grok-4, Gemini 2.5 Pro) and open-source models (Qwen3, EXAONE 4.0, Kimi-K2) are all current as of 2025."
     73       },
     74       "ablation_study": {
     75         "applies": false,
     76         "answer": false,
     77         "justification": "The paper presents a cost model, not a system with components to ablate."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Models are compared on multiple benchmarks (GPQA, MATH-500, LiveCodeBench, MMLU-Pro) in Table I, and the cost analysis uses break-even time and token capacity as metrics."
     83       },
     84       "human_evaluation": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "Human evaluation is not relevant to a cost-modeling paper."
     88       },
     89       "held_out_test_set": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "No ML model is trained or tested; this is a cost analysis paper."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Results are broken down by model size category (small, medium, large) and by enterprise type (SME, medium, large) in Section VI.E and VI.F."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "The paper discusses cases where on-premise deployment is NOT economically viable: large models against aggressively priced providers like Gemini 2.5 Pro can extend break-even to 5-9 years (Section VI.F.3)."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper reports that large-scale deployments face 'steepest economic barriers' with break-even extending beyond 2 years, and that against some providers the economic case is unfavorable (Section VI.E, VII)."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The abstract claims about break-even periods (few months for small, 2 years for medium, 5 years for large) are supported by Table IV results, though the abstract's phrasing slightly oversimplifies the ranges."
    115       },
    116       "causal_claims_justified": {
    117         "applies": false,
    118         "answer": false,
    119         "justification": "The paper makes no causal claims. It presents a deterministic cost model computing break-even points from known inputs."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The analysis uses fixed assumptions ($0.15/kWh electricity, specific GPU prices, 8h/day operation, 2:1 output/input token ratio) but presents results as generally applicable to 'organizations' without bounding to these specific assumptions. The title frames it broadly as 'On-Premise Large Language Model Deployment' rather than specifying the narrow conditions modeled."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The paper does not discuss alternative factors that could affect break-even: GPU depreciation, staffing costs for maintaining on-premise infrastructure, model update frequency requiring new hardware, or the possibility that API prices will continue to drop."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper measures break-even time in months based on token throughput matching, framing this as 'economic viability.' However, it does not acknowledge that cost parity in token generation is a proxy — real deployment economics include staffing, maintenance, downtime, model updates, and opportunity costs that are not modeled."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Specific model names with parameter counts are provided: Qwen3-235B, Llama-3.3-70B, EXAONE 4.0 32B, etc. Commercial models are named (GPT-5, Claude-4 Opus/Sonnet, Grok-4, Gemini 2.5 Pro). Benchmark data is sourced from Artificial Analysis [64]."
    142       },
    143       "prompts_provided": {
    144         "applies": false,
    145         "answer": false,
    146         "justification": "The paper does not use prompting. It is a cost analysis using published benchmark scores and pricing data."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": false,
    150         "answer": false,
    151         "justification": "No model inference or training is performed. The paper uses published benchmark data and pricing information."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding is used."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": false,
    161         "justification": "The paper does not document how benchmark scores were selected or aggregated from Artificial Analysis, how hardware specifications and throughput estimates were derived, or how the 2:1 output/input token ratio was determined."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "There is no dedicated limitations section. The conclusion mentions future work (empirically validate estimates, include staffing/maintenance) but does not substantively discuss limitations of the current analysis."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "No threats to validity are discussed. The paper does not address threats such as rapidly changing pricing, GPU availability constraints, assumed throughput values possibly being optimistic, or the fixed electricity rate assumption."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "The paper does not explicitly state what it does NOT cover. Staffing costs, maintenance, model fine-tuning costs, downtime risks, and multi-tenancy overhead are omitted from the model without explicit acknowledgment in a limitations discussion."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "No raw data is provided. Benchmark scores are cited from Artificial Analysis [64] and hardware specs from manufacturer documentation, but no downloadable dataset is available."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section III describes the performance evaluation framework and model selection criteria. Benchmark data sources (GPQA, MATH-500, LiveCodeBench, MMLU-Pro) and the aggregation source (Artificial Analysis) are identified."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants. Data comes from published benchmarks and pricing information."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": false,
    200         "justification": "The pipeline from raw benchmark scores and pricing data to break-even calculations is partially documented through equations, but intermediate steps (e.g., how throughput estimates were derived for each model-hardware combination) are not fully traced."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding information is disclosed. Two authors are affiliated with Carnegie Mellon University and two are unaffiliated, but no funding statement is provided."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are listed: two at Carnegie Mellon University and two listed as 'Unaffiliated.' No evaluated product affiliations are present."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding is disclosed, so independence cannot be assessed."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests statement is provided."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "The paper does not evaluate any pre-trained model's capability on benchmarks. It uses published benchmark scores from third-party sources for cost modeling."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "The authors perform no model evaluation of their own; benchmark scores are taken from published third-party sources, so train/test overlap is not applicable."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "The authors do not evaluate model capability themselves; the paper uses third-party benchmark results for cost comparison only."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "This is a theoretical cost-modeling paper. It reports costs of the systems it analyzes, not costs of its own method."
    284       },
    285       "compute_budget_stated": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "The paper is a theoretical analysis with no significant computation performed by the authors."
    289       }
    290     }
    291   },
    292   "claims": [
    293     {
    294       "claim": "Small open-source models (sub-30B) can break even with commercial APIs in 0.3–3 months on consumer-grade GPUs.",
    295       "evidence": "Table IV shows EXAONE 4.0 32B breaks even in 0.3–2.26 months, Qwen3-30B in 0.3–2.5 months, and Magistral Small in 0.4–3.0 months depending on the commercial API compared against. Hardware cost is ~$2k for a single RTX 5090 (Table III).",
    296       "supported": "moderate"
    297     },
    298     {
    299       "claim": "Medium-scale models (70B–120B) offer balanced economics with break-even periods of 2.3–34 months.",
    300       "evidence": "Table IV shows Llama-3.3-70B at 2.3–17.8 months, gpt-oss-120B at 3.9–30.9 months, GLM-4.5-Air at 4.3–34.0 months. Hardware costs range from $15k–$30k (Table III).",
    301       "supported": "moderate"
    302     },
    303     {
    304       "claim": "Large open-source models require break-even horizons of 4.3–69.3 months, making them viable only for sustained high-volume workloads.",
    305       "evidence": "Table IV shows Kimi-K2 at 8.7–69.3 months, GLM-4.5 at 6.5–51.5 months, Qwen3-235B at 4.3–34.0 months. Hardware costs range from $60k–$240k (Table III).",
    306       "supported": "moderate"
    307     },
    308     {
    309       "claim": "Open-weight models can deliver competitive performance within 20% of top commercial models.",
    310       "evidence": "Table I shows EXAONE 4.0 32B achieving 73.9% GPQA (vs GPT-5's 85.4%), 97.7% MATH-500 (vs 99.4%), and 74.7% LiveCodeBench (vs 66.8% for GPT-5). Performance gaps noted in Table IV's amortized performance difference column.",
    311       "supported": "moderate"
    312     }
    313   ],
    314   "red_flags": [
    315     {
    316       "flag": "Oversimplified cost model",
    317       "detail": "The TCO model includes only hardware and electricity. It omits staffing costs, maintenance, cooling infrastructure, networking, software licensing, downtime, and model update/replacement costs — factors acknowledged in the future work section as needing inclusion. These omissions systematically bias toward on-premise deployment appearing more favorable."
    318     },
    319     {
    320       "flag": "Fixed assumptions without sensitivity analysis",
    321       "detail": "Key inputs are fixed at single values: electricity at $0.15/kWh, 8 hours/day operation, 20 days/month, 2:1 output/input token ratio. No sensitivity analysis explores how results change under different assumptions (e.g., 24/7 operation, different electricity rates, different usage patterns)."
    322     },
    323     {
    324       "flag": "Throughput estimates not empirically validated",
    325       "detail": "Token throughput values in Table III (e.g., 200 tok/sec for EXAONE 4.0 32B on RTX 5090, 400 tok/sec for Qwen3-235B on 4×A100) are presented without citation or empirical measurement. These values critically affect break-even calculations but their source and accuracy are unclear."
    326     },
    327     {
    328       "flag": "No limitations section",
    329       "detail": "The paper lacks any dedicated discussion of limitations or threats to validity, despite the analysis depending heavily on assumptions that may not hold in practice."
    330     },
    331     {
    332       "flag": "Benchmark scores from third-party aggregator",
    333       "detail": "All benchmark performance data comes from Artificial Analysis [64], a third-party aggregator, rather than from the original papers or the authors' own measurements. The reliability and methodology of this source are not discussed."
    334     }
    335   ],
    336   "cited_papers": [
    337     {
    338       "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance",
    339       "authors": ["L. Chen", "M. Zaharia", "J. Zou"],
    340       "year": 2023,
    341       "arxiv_id": "2305.05176",
    342       "relevance": "Directly relevant to LLM deployment economics — proposes strategies for reducing API costs through model cascades and caching."
    343     },
    344     {
    345       "title": "Efficient Memory Management for Large Language Model Serving with PagedAttention",
    346       "authors": ["W. Kwon"],
    347       "year": 2023,
    348       "arxiv_id": "2309.06180",
    349       "relevance": "vLLM's PagedAttention is a key inference optimization that makes on-premise LLM deployment more feasible."
    350     },
    351     {
    352       "title": "Inference Economics of Language Models",
    353       "authors": ["E. Erdil"],
    354       "year": 2025,
    355       "arxiv_id": "2506.04645",
    356       "relevance": "Directly models inference economics for LLMs including hardware amortization and energy efficiency."
    357     },
    358     {
    359       "title": "An Inquiry into Datacenter TCO for LLM Inference with FP8",
    360       "authors": ["J. Kim"],
    361       "year": 2025,
    362       "arxiv_id": "2502.01070",
    363       "relevance": "Analyzes datacenter total cost of ownership for LLM inference, considering precision formats."
    364     },
    365     {
    366       "title": "Energy Considerations of Large Language Model Inference and Efficiency Optimizations",
    367       "authors": ["J. Fernandez", "C. Na", "V. Tiwari", "Y. Bisk", "S. Luccioni", "E. Strubell"],
    368       "year": 2025,
    369       "arxiv_id": "2504.17674",
    370       "relevance": "Analyzes energy costs and efficiency optimizations for LLM inference, relevant to deployment economics."
    371     },
    372     {
    373       "title": "Large Language Models: A Survey",
    374       "authors": ["S. Minaee", "T. Mikolov"],
    375       "year": 2024,
    376       "arxiv_id": "2402.06196",
    377       "relevance": "Comprehensive LLM survey covering capabilities and deployment considerations."
    378     },
    379     {
    380       "title": "Position: On-Premises LLM Deployment Demands a Middle Path: Preserving Privacy Without Sacrificing Model Confidentiality",
    381       "authors": ["H. Huang"],
    382       "year": 2024,
    383       "arxiv_id": "2410.11182",
    384       "relevance": "Directly addresses the privacy vs. capability tradeoff in on-premise LLM deployment."
    385     },
    386     {
    387       "title": "Qwen3 Technical Report",
    388       "authors": ["A. Yang"],
    389       "year": 2025,
    390       "arxiv_id": "2505.09388",
    391       "relevance": "Technical report for one of the key open-source models evaluated in this cost analysis."
    392     }
    393   ]
    394 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs