ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (24661B)


      1 {
      2   "paper": {
      3     "title": "The Price of Progress: Algorithmic Efficiency and the Falling Cost of AI Inference",
      4     "authors": ["Hans Gundlach", "Jayson Lynch", "Matthias Mertens", "Neil Thompson"],
      5     "year": 2025,
      6     "venue": "NeurIPS 2025 Workshop on Evaluating the Evolving LLM Lifecycle",
      7     "arxiv_id": "2511.23455"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "Appendix A states: 'Our dataset of benchmark prices, along with the code used for analysis, is available here: The Price of Progress Project: https://github.com/hansgundlach/Algorithmic Progress Inference'. A GitHub URL is provided."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "Appendix A states that the dataset of benchmark prices is available at the linked GitHub repository. The dataset is described as 'the largest dataset of current and historical prices to run benchmarks to date.'"
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No mention of environment specifications, requirements.txt, library versions, or any dependency information in the paper."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper links a GitHub repository but does not include step-by-step reproduction instructions in the paper itself. There is no README description, no command examples, and no 'Reproducing Results' section."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "Tables 1 and 2 report 90% confidence intervals for all regression estimates. For example, GPQA-D Pareto Restricted: 5.315 [2.449, 11.534]. Figures also show confidence bands."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper makes comparative claims (e.g., open-weight vs. closed-weight trends differ, higher-performance bins decline faster) but does not report formal significance tests such as p-values or hypothesis tests for these comparisons. Only confidence intervals on individual regression coefficients are provided."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Effect sizes are reported as annual reduction factors with baseline context throughout. For example, '5× to 10× per year' for frontier models, '3× per year' for algorithmic progress after hardware adjustment. Tables 1 and 2 provide precise factor estimates with confidence intervals."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "Sample sizes are reported (e.g., n=53 for GPQA-D Pareto, n=13 for SWE-V Pareto) but there is no discussion of whether these are sufficient for the claims made or any power analysis. The paper acknowledges limited SWE-V data but does not formally justify sample sizes."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The paper reports R-squared values and confidence intervals from regressions, but does not report variance or standard deviation across individual data points or model-specific variation in a systematic way. No spread measures (std dev, IQR) are reported for the price data itself."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The paper explicitly compares its estimates against prior work: Cottier et al. [2025] (10-1000x), Erol et al. [2025] (24.5x and 3.23x), and Saad-Falcon et al. [2025] (3.1x). Section 2.3 discusses differences from these baselines."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "All comparison works are from 2025: Cottier et al. [2025], Erol et al. [2025], and Saad-Falcon et al. [2025]. These represent the most recent estimates available."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The paper systematically varies components of its analysis: all models vs. Pareto-restricted, all licenses vs. open-weight only, pooled regression vs. binned analysis, and hardware-adjusted vs. unadjusted estimates. These variations serve an ablation-like function showing how each analytical choice affects results."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper reports results across three benchmarks (GPQA-Diamond, OTIS Mock AIME, SWE-bench Verified) and reports both annual reduction factors and R-squared values."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "This is an econometric analysis of pricing data. Human evaluation of system outputs is not relevant to the claims about price trends."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "This is not a machine learning model evaluation. The paper fits regression models to observational pricing data; train/test splits are not applicable to this type of econometric analysis."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are broken down by benchmark (GPQA-D, AIME, SWE-V), by model type (all vs. open-weight), by restriction type (Pareto vs. all), and by performance bin (Figs. 2-3). Section 2.4 provides detailed per-bin analysis."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The paper discusses where its approach yields weak or uncertain results: SWE-V has 'less data, and our estimates feature much larger confidence bands' (Section 2.3). It also notes that the confidence interval for the open-weight unrestricted GPQA-D estimate (1.214x) includes 1.0, which is consistent with no progress."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 3 reports that despite falling inference prices, 'benchmarking costs have stayed constant or increased' — a finding that runs counter to the main positive trend. The paper also reports that SWE-V results are too uncertain to draw strong conclusions."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims 5-10x annual price reduction for frontier models (supported by Table 1), ~3x for algorithmic progress (supported by Table 2 hardware-adjusted estimates), and increasing benchmarking costs (supported by Section 3 and Figs. 4-5). All abstract claims are consistent with the results."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The paper makes causal claims about decomposing price changes into 'economic forces, hardware efficiency improvements, and algorithmic efficiency improvements' (Abstract). The strategy of isolating open-weight models and dividing by hardware trends is described as yielding 'algorithmic progress' — a causal interpretation from observational data. However, this decomposition relies on assumptions (e.g., open-weight pricing reflects only technical factors) that are stated but not formally tested. Confounds such as changes in inference optimization software, quantization adoption, or batch scheduling are not controlled for."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The title ('The Falling Cost of AI Inference') and abstract generalize broadly, but results are limited to April 2024 - November 2025, three specific benchmarks, and pricing from Artificial Analysis. The paper notes limitations for SWE-V but the title and abstract frame findings as general AI inference trends without explicitly bounding to the tested benchmarks and time period."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper considers several alternative explanations: market competition vs. technical progress (Section 2.2-2.3, motivating the open-weight analysis), differences from prior estimates attributed to token-level vs. benchmark-level measurement and time period differences (Section 2.3), and the 40-60% bin price drop attributed to 'increased market competition' rather than technical progress (Section 2.4)."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper discusses models generically (e.g., 'Claude 3.7 with different reasoning levels') but does not provide a full list of exact model versions used in the analysis. Individual model names and versions are not enumerated in the paper."
    133       },
    134       "prompts_provided": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "This paper does not use prompting. It is an econometric analysis of pricing data, not a study that prompts language models."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "Regression specifications are provided (Eq. 1) but key details like standard error computation method, exact software/packages used, and optimization settings are not reported."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. This is an econometric analysis paper."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Appendix B documents preprocessing decisions: excluding $0 cost models, handling cached tokens differently for GPQA-D vs. SWE-V, excluding models with ambiguous version matching, normalizing multi-run benchmarks by dividing tokens by number of runs, and excluding price increases from platforms dropping legacy models. Section 2.1 also describes the Pareto frontier filtering."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "There is no dedicated Limitations or Threats to Validity section. Some limitations are scattered in the text (e.g., limited SWE-V data, time period constraints) but they are not consolidated into a substantive section."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "No threats-to-validity section exists. While the paper notes some specific issues (limited SWE-V data, time period differences from prior work), these are not framed as threats to validity and many potential threats are not discussed (e.g., selection bias in Artificial Analysis coverage, Internet Archive data gaps, potential survivorship bias in pricing data)."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The paper does not explicitly state what the results do NOT show. It notes SWE-V data is limited and the time period is April 2024 - November 2025, but does not systematically bound the claims. The broad framing ('The Falling Cost of AI Inference') extends beyond what was tested."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "The paper links to a GitHub repository (Appendix A) containing the dataset and analysis code, enabling independent verification of the underlying data."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 2.1 describes the data collection: 'We collect data on input and output token prices over time by gathering Internet Archive data from Artificial Analysis' covering April 2024 to October 2025, supplemented with Epoch AI benchmark data. The matching process between pricing and benchmark data is described."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. Data comes from public sources (Artificial Analysis, Epoch AI benchmark hub, Internet Archive). This is not a human subjects study."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Appendix B documents the pipeline: collecting prices from Internet Archive snapshots of Artificial Analysis, matching with Epoch AI benchmark data, handling cached tokens, removing $0 prices, normalizing multi-run benchmarks, and filtering to Pareto frontier. Sample sizes are reported at each stage (e.g., 138 price data points with 93 unique models for GPQA-D)."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "The paper lists author affiliations (MIT CSAIL, MIT Sloan, MIT FutureTech) and acknowledges Zachary Brown for comments, but does not disclose any funding sources or grants."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are clearly listed: the four authors' affiliations are MIT CSAIL, MIT Sloan, and MIT FutureTech. The paper does not evaluate any MIT product, so there is no obvious vendor-product conflict."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding source is disclosed, so independence cannot be assessed. The absence of funding disclosure means this criterion is not satisfied."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "There is no competing interests or financial interests statement in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "This paper does not evaluate a pre-trained model's capability on any benchmark. It analyzes pricing data for models that others have benchmarked. Contamination is not relevant."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "This paper does not evaluate a pre-trained model on any benchmark. It is an econometric study of pricing trends."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "This paper does not evaluate a pre-trained model on any benchmark. Benchmark contamination is not relevant to this pricing analysis."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants. This is an econometric analysis of publicly available pricing and benchmark data."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants and not an experimental study with treatment assignment."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "This paper does not propose a method that performs inference. It is an econometric study about inference costs. The costs it reports are the subject of study, not the cost of its own method."
    275       },
    276       "compute_budget_stated": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "The computational requirements of the regression analysis itself are negligible and not the subject of the paper. This is an econometric study, not a compute-intensive method."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "The price for a given level of benchmark performance has decreased by approximately 5x to 10x per year for frontier models on knowledge, reasoning, math, and software engineering benchmarks.",
    286       "evidence": "Table 1 shows annual reduction factors: GPQA-D Pareto 5.3x [2.4, 11.5], AIME Pareto 11.7x [5.3, 25.9], SWE-V Pareto 4.7x [0.7, 32.2]. Figure 1 visualizes these results.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "Algorithmic efficiency progress accounts for approximately 3x annual improvement after controlling for hardware price declines.",
    291       "evidence": "Table 2 shows hardware-adjusted reduction factors for open-weight Pareto models: GPQA-D 3.2x [1.5, 6.8], AIME 3.3x [1.4, 7.9]. This uses Rahman [2024]'s 30% annual hardware price improvement as the divisor.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "Despite falling per-unit inference prices, benchmarking costs have remained flat or increased.",
    296       "evidence": "Figures 4 and 5 show benchmark evaluation prices over time for GPQA-D and SWE-V. Section 3 attributes this to 'demand for much higher quality models, which are much larger and use much longer reasoning traces.'",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "Higher-performance models show faster price declines than lower-performance models.",
    301       "evidence": "Section 2.4 and Figure 2 show that the highest GPQA-D bin declined by 31x/year vs. 1.7x/year for the lowest bin.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "Overall price-performance progress is closer to 10x than 1,000x per year, lower than Cottier et al. [2025] estimates.",
    306       "evidence": "Section 2.3 explains three factors for lower estimates: benchmark-level rather than token-level measurement, different time period (April 2024 vs. 2022/2023 start), and larger model sample. Table 1 confirms most estimates are in the 5-12x range.",
    307       "supported": "strong"
    308     }
    309   ],
    310   "methodology_tags": ["observational"],
    311   "key_findings": "AI inference prices for a given performance level are declining by approximately 5-10x per year for frontier models across knowledge, reasoning, math, and coding benchmarks. After isolating open-weight models and adjusting for hardware price improvements, algorithmic progress contributes roughly 3x annual improvement. Higher-capability models show faster price declines (up to 31x/year) than lower-capability models (1.7x/year). Despite these per-unit price declines, total benchmarking costs have stayed flat or increased because frontier performance demands larger models with longer reasoning traces.",
    312   "red_flags": [
    313     {
    314       "flag": "Very short time window",
    315       "detail": "The analysis covers only April 2024 to November 2025 (~19 months), which is a narrow window for estimating annual trends. Extrapolating exponential trends from such a short period carries substantial uncertainty, particularly for SWE-V, which has only 13 Pareto-restricted data points."
    316     },
    317     {
    318       "flag": "No formal limitations section",
    319       "detail": "The paper lacks a dedicated limitations or threats-to-validity section. Issues like Internet Archive data gaps, selection bias in Artificial Analysis coverage, survivorship bias (dropped models/providers), and the strong assumptions in the hardware decomposition are not systematically addressed."
    320     },
    321     {
    322       "flag": "Causal decomposition from observational data",
    323       "detail": "The claim that algorithmic progress accounts for ~3x improvement relies on dividing open-weight model price trends by hardware price trends. This assumes these two factors are the only drivers and are multiplicatively separable. Confounds such as inference software optimization (vLLM, TensorRT), quantization adoption, batch scheduling improvements, and provider-level efficiency gains are not controlled for."
    324     },
    325     {
    326       "flag": "Broad title relative to evidence",
    327       "detail": "The title 'The Price of Progress: Algorithmic Efficiency and the Falling Cost of AI Inference' suggests general findings, but results are limited to three benchmarks from a single pricing source over 19 months. SWE-bench Verified results have confidence intervals consistent with no progress at all."
    328     }
    329   ],
    330   "cited_papers": [
    331     {
    332       "title": "LLM inference prices have fallen rapidly but unequally across tasks",
    333       "authors": ["Ben Cottier", "Ben Snodin", "David Owen", "Tom Adamczewski"],
    334       "year": 2025,
    335       "relevance": "Directly comparable study of LLM inference price trends, finding 10-1000x annual declines depending on performance level."
    336     },
    337     {
    338       "title": "Cost-of-pass: An economic framework for evaluating language models",
    339       "authors": ["Mehmet Hamza Erol", "Batu El", "Mirac Suzgun", "Mert Yuksekgonul", "James Zou"],
    340       "year": 2025,
    341       "arxiv_id": "2504.13359",
    342       "relevance": "Proposes the cost-of-pass metric for benchmarks and finds 3-25x annual declines on math benchmarks, which this paper compares directly against its own estimates."
    343     },
    344     {
    345       "title": "Algorithmic progress in language models",
    346       "authors": ["Anson Ho", "Tamay Besiroglu", "Ege Erdil", "David Owen", "Robi Rahman", "Zifan C Guo", "David Atkinson", "Neil Thompson", "Jaime Sevilla"],
    347       "year": 2024,
    348       "relevance": "Foundational work on measuring algorithmic efficiency gains in language models, defining the concept this paper extends to inference pricing."
    349     },
    350     {
    351       "title": "Intelligence per watt: Measuring intelligence efficiency of local AI",
    352       "authors": ["Jon Saad-Falcon", "Avanika Narayan", "Hakki Orhun Akengin"],
    353       "year": 2025,
    354       "arxiv_id": "2511.07885",
    355       "relevance": "Measures energy efficiency gains in AI models (3.1x from 2023-2025), used as a comparison point for algorithmic progress estimates."
    356     },
    357     {
    358       "title": "Inference economics of language models",
    359       "authors": ["Ege Erdil"],
    360       "year": 2025,
    361       "relevance": "Analyzes the price/latency tradeoff in LLM inference, providing an economic framework relevant to understanding inference cost dynamics."
    362     },
    363     {
    364       "title": "How predictable is language model benchmark performance?",
    365       "authors": ["David Owen"],
    366       "year": 2024,
    367       "arxiv_id": "2401.04757",
    368       "relevance": "Studies the predictability of benchmark performance as a function of training compute, providing a theoretical basis for the logistic performance model used in this paper."
    369     },
    370     {
    371       "title": "Observational scaling laws and the predictability of language model performance",
    372       "authors": ["Yangjun Ruan", "Chris J Maddison", "Tatsunori B Hashimoto"],
    373       "year": 2024,
    374       "relevance": "Establishes that benchmark performance increases logistically with training compute, supporting the logit transformation used in this paper's regression."
    375     },
    376     {
    377       "title": "Demand for LLMs: Descriptive Evidence on Substitution, Market Expansion, and Multihoming",
    378       "authors": ["Andrey Fradkin"],
    379       "year": 2025,
    380       "arxiv_id": "2504.15440",
    381       "relevance": "Provides economic analysis of LLM market dynamics and pricing behavior, corroborating that model prices rarely change over time."
    382     },
    383     {
    384       "title": "Infinity Bench: Extending long context evaluation beyond 100k tokens",
    385       "authors": ["Xinrong Zhang", "Yingfa Chen", "Shengding Hu"],
    386       "year": 2024,
    387       "arxiv_id": "2402.13718",
    388       "relevance": "Example of expensive benchmarking ($5,000 for GPT-4 evaluation), illustrating the rising cost of AI evaluation discussed in this paper."
    389     }
    390   ]
    391 }

Impressum · Datenschutz