scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (21021B)
      1 {
      2   "paper": {
      3     "title": "GATE: An Integrated Assessment Model for AI Automation",
      4     "authors": ["Ege Erdil", "Andrei V. Potlogea", "Tamay Besiroglu", "Edu Roldan", "Anson Ho"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2503.04941"
      8   },
      9   "scan_version": 2,
     10   "active_modules": [],
     11   "methodology_tags": ["theoretical"],
     12   "key_findings": "GATE is an integrated assessment model combining a compute-based AI development module, an AI automation module mapping effective compute to task automation, and a semi-endogenous growth macroeconomic module. The model features endogenous investment, adjustment costs, R&D externalities, and uncertainty add-ons. It is implemented as an interactive sandbox allowing users to simulate economic effects of AI automation under different parameter assumptions. The paper's primary contribution is bridging AI capability modeling with macroeconomic growth theory in a unified dynamic framework.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": true,
     18         "justification": "The paper provides an interactive sandbox at www.epoch.ai/GATE (mentioned in abstract, Section 1, Section 2, and Section 7). While not traditional source code, the model is implemented and publicly accessible."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Appendix D provides extensive parameter tables with ranges, defaults, and empirical justifications. The model sandbox allows users to explore all parameter settings. All calibration data is documented in the appendices."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No environment specifications, dependency lists, or technical requirements for running the model are provided. The sandbox is web-based but no technical details about its implementation are given."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "While the mathematical model is fully specified, there are no step-by-step instructions for reproducing the numerical solver. Appendix C describes the solution approach (gradient descent) but lacks implementation details sufficient for reproduction."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": false,
     39         "answer": false,
     40         "justification": "This is a theoretical modeling paper presenting a simulation framework, not an empirical study with statistical results requiring confidence intervals."
     41       },
     42       "significance_tests": {
     43         "applies": false,
     44         "answer": false,
     45         "justification": "Theoretical modeling paper; no comparative empirical claims requiring significance tests."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": false,
     49         "answer": false,
     50         "justification": "Theoretical modeling paper; no empirical effect sizes to report."
     51       },
     52       "sample_size_justified": {
     53         "applies": false,
     54         "answer": false,
     55         "justification": "Theoretical paper with no samples."
     56       },
     57       "variance_reported": {
     58         "applies": false,
     59         "answer": false,
     60         "justification": "Theoretical modeling paper; no experimental runs with variance to report."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper does not compare GATE against other integrated assessment models or prior economic models of AI automation (e.g., Davidson 2023, Korinek and Suh 2024) in terms of outputs or predictions. It only discusses them conceptually."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "No quantitative comparison against contemporary models is provided."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "While the model has optional add-ons (R&D externalities, uncertainty), no systematic ablation is presented showing how each module affects outcomes. The paper describes functionality but does not present ablation results."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Section 7 describes multiple output variables: GWP growth, consumption, capital investment, automation fraction, training run size, compute allocation, hardware/software efficiency trajectories (Figure 5 shows 9 different output panels)."
     83       },
     84       "human_evaluation": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "Human evaluation is not relevant for a theoretical economic model."
     88       },
     89       "held_out_test_set": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "Theoretical modeling paper; no test sets involved."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Figure 5 provides breakdowns across multiple output categories. The model tracks per-module outputs (AI development, automation, macro) separately."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Section 8 (Limitations) extensively discusses failure modes and limitations: lack of non-AI TFP growth, stylized effective compute abstraction, failure to incorporate data production, stylized labor reallocation, omission of market structure, static task space, and other bottlenecks."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": false,
    107         "justification": "The paper does not report scenarios where the model fails to converge, produces unrealistic outputs, or otherwise breaks down. Appendix D mentions convergence issues but does not present them as results."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The abstract claims GATE combines three ingredients (compute-based AI development, automation framework, semi-endogenous growth model) in an interactive sandbox. The paper fully describes all three modules and references the sandbox URL."
    115       },
    116       "causal_claims_justified": {
    117         "applies": false,
    118         "answer": false,
    119         "justification": "The paper presents a theoretical model framework. It does not make empirical causal claims; it describes structural relationships within the model."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Section 8 extensively bounds the model's scope, noting it omits non-AI TFP growth, data constraints, realistic labor market frictions, market structure, task heterogeneity, and other bottlenecks. The paper is clear about what GATE does and does not model."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Section 8 discusses alternative modeling approaches and mechanisms that could yield different outcomes: endogenous TFP, data constraints, market structures, directed technical change, and additional bottlenecks. Section 9 outlines four key directions for future work addressing these alternatives."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": false,
    133         "answer": false,
    134         "justification": "Theoretical paper with no empirical measurements to distinguish from proxy outcomes."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": false,
    140         "answer": false,
    141         "justification": "The paper does not use any AI models for evaluation. It is a theoretical economic model."
    142       },
    143       "prompts_provided": {
    144         "applies": false,
    145         "answer": false,
    146         "justification": "No prompting is used in this paper."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Appendix D provides extensive parameter tables (Tables 3-10) with ranges, default values, units, and empirical justifications for all model parameters across general economics, hardware R&D, software R&D, compute investment, compute stock, runtime compute, and automation categories."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding is used."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "The paper documents how each parameter was derived from empirical sources (e.g., GPU pricing, World Bank data, meta-analyses) with specific citations and calculations in Appendix D."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section 8 ('Limitations') is a dedicated 5-page section discussing structural simplifications and parametric uncertainty."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section 8 provides highly specific threats: the effective compute abstraction neglects serial vs parallel constraints (citing Erdil and Schneider-Joseph 2024), data scarcity may act as a binding constraint (citing Villalobos et al. 2022), the labor reallocation extremes miss retraining dynamics, and the social planner framework mischaracterizes competitive investment dynamics."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 8 explicitly states what the model does NOT capture: non-AI TFP growth, data production, intermediate labor reallocation, market structures, task heterogeneity (cognitive vs manual), new task creation, and various bottlenecks. This is thorough boundary-setting."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "While parameter calibrations reference external data sources (World Bank, NVIDIA specs, meta-analyses), the raw data used for parameter estimation is not provided or linked as a dataset."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Appendix D describes in detail how each parameter was derived, citing specific data sources (World Bank GDP, labor force stats, NVIDIA H100 specs, meta-analyses of risk aversion, depreciation rates, etc.)."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants; data comes from public economic statistics and hardware specifications."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Appendix D documents the derivation pipeline for each parameter, showing intermediate calculations (e.g., hardware efficiency: H100 price → FLOP/s → FLOP/year/$ calculation)."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding information is disclosed. The paper is from Epoch AI but no grants or funding sources are mentioned."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "The paper clearly states it is from Epoch AI, with detailed author contributions in Appendix A."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "Epoch AI is an AI forecasting organization with a stake in AI development timelines being taken seriously. Because no funding sources are explicitly disclosed, the funder's independence is impossible to assess."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests statement is provided."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "This paper does not evaluate a pre-trained model's capability on any benchmark. It is a theoretical economic model."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "No pre-trained model evaluation on benchmarks."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "No pre-trained model evaluation on benchmarks."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "Theoretical paper; no inference costs to report."
    284       },
    285       "compute_budget_stated": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "Theoretical paper; no computational experiments with budgets to report."
    289       }
    290     }
    291   },
    292   "claims": [
    293     {
    294       "claim": "GATE is the first integrated assessment model combining a compute-based AI development module, an AI automation framework, and a semi-endogenous growth model.",
    295       "evidence": "Section 1 and Section 2 describe the three-module architecture. The paper claims novelty in bringing these together: 'GATE combines three key ingredients that have not been brought together in previous work' (Abstract).",
    296       "supported": "moderate"
    297     },
    298     {
    299       "claim": "The AI development module is the most novel and significant contribution of GATE.",
    300       "evidence": "Sections 3.1-3.7 provide detailed modeling of effective compute accumulation, training-inference tradeoffs, and R&D dynamics. The paper states this explicitly in Sections 2 and 3.",
    301       "supported": "moderate"
    302     },
    303     {
    304       "claim": "Effective compute is a sufficient statistic for tracking AI capabilities.",
    305       "evidence": "Section 3.1 and Appendix B cite scaling laws (Kaplan et al. 2020, Hoffmann et al. 2022), algorithmic progress estimates (Hernandez and Brown 2020, Ho et al. 2024), and Sutton's bitter lesson. However, Section 8 acknowledges this is a significant simplification.",
    306       "supported": "moderate"
    307     },
    308     {
    309       "claim": "Hardware and software R&D display a key asymmetry: software improvements enhance all compute while hardware improvements only affect new compute.",
    310       "evidence": "Section 3.4 and equation (12) in Section 3.7 formalize this asymmetry in the law of motion for effective compute.",
    311       "supported": "strong"
    312     },
    313     {
    314       "claim": "The model can be solved numerically via gradient descent on the social planner's value function.",
    315       "evidence": "Appendix C describes the solution approach with discretization (20 task grid points, 1-year time steps, 80-year planning horizon) and gradient descent over 5 degrees of freedom per timestep.",
    316       "supported": "moderate"
    317     }
    318   ],
    319   "red_flags": [
    320     {
    321       "flag": "No empirical validation",
    322       "detail": "The paper presents a theoretical model without any empirical validation against historical data. No backtesting against observed compute investment, GDP growth, or automation levels is shown. Figure 5 shows illustrative outputs but no comparison to actual economic trajectories."
    323     },
    324     {
    325       "flag": "Incomplete solver documentation",
    326       "detail": "Appendix C contains placeholder comments in brackets ('[If there is anything we do explicitly to avoid the problem of getting stuck at a local optimum...]', '[In a conversation with Ege he mentioned...]'), suggesting the paper was published with incomplete documentation of the solution method."
    327     },
    328     {
    329       "flag": "Self-evaluating organization",
    330       "detail": "Epoch AI is an AI forecasting organization presenting its own model. The model's default parameters and framing choices (e.g., full automation as a central scenario) align with the organization's public positioning on AI timelines."
    331     },
    332     {
    333       "flag": "Single effective compute dimension is acknowledged as inadequate",
    334       "detail": "Section 8 acknowledges that reducing algorithmic progress to a single dimension fails to capture serial vs parallel constraints, scale-dependent improvements, and the distinction between cost-lowering and capability-expanding innovations. This is a fundamental limitation of the core modeling approach."
    335     }
    336   ],
    337   "cited_papers": [
    338     {
    339       "title": "Scaling laws for neural language models",
    340       "authors": ["Jared Kaplan", "Sam McCandlish", "Tom Henighan", "Tom B Brown"],
    341       "year": 2020,
    342       "arxiv_id": "2001.08361",
    343       "relevance": "Foundational scaling laws work showing AI performance predictable from compute, central to GATE's compute-based approach."
    344     },
    345     {
    346       "title": "An empirical analysis of compute-optimal large language model training",
    347       "authors": ["Jordan Hoffmann", "Sebastian Borgeaud", "Arthur Mensch"],
    348       "year": 2022,
    349       "relevance": "Chinchilla scaling laws establishing compute-optimal training, used to justify GATE's effective compute framework."
    350     },
    351     {
    352       "title": "Algorithmic progress in language models",
    353       "authors": ["Anson Ho", "Tamay Besiroglu", "Ege Erdil"],
    354       "year": 2024,
    355       "arxiv_id": "2403.05812",
    356       "relevance": "Estimates algorithmic efficiency doubling times for LLMs, directly calibrates GATE's software efficiency parameters."
    357     },
    358     {
    359       "title": "Scenarios for the transition to AGI",
    360       "authors": ["Anton Korinek", "Donghyun Suh"],
    361       "year": 2024,
    362       "relevance": "Similar compute-to-automation framework that GATE builds upon, key related work for economic modeling of AI automation."
    363     },
    364     {
    365       "title": "What a compute-centric framework says about AI takeoff speeds",
    366       "authors": ["Tom Davidson"],
    367       "year": 2023,
    368       "relevance": "Open Philanthropy report introducing the FLOP gap concept and compute-centric AI forecasting, directly inspires GATE's automation module."
    369     },
    370     {
    371       "title": "Explosive growth from AI automation: A review of the arguments",
    372       "authors": ["Tamay Besiroglu", "Ege Erdil"],
    373       "year": 2023,
    374       "arxiv_id": "2309.11690",
    375       "relevance": "Reviews arguments for AI-driven explosive economic growth, provides theoretical foundations for GATE's growth dynamics."
    376     },
    377     {
    378       "title": "Data movement limits to frontier model training",
    379       "authors": ["Ege Erdil", "David Schneider-Joseph"],
    380       "year": 2024,
    381       "arxiv_id": "2411.01137",
    382       "relevance": "Shows fundamental data-movement constraints limit training beyond 1e28-1e31 FLOP, cited as limitation of GATE's single effective compute metric."
    383     },
    384     {
    385       "title": "The rising costs of training frontier AI models",
    386       "authors": ["Ben Cottier", "Robi Rahman", "Loredana Fattorini"],
    387       "year": 2024,
    388       "arxiv_id": "2405.21015",
    389       "relevance": "Empirical analysis of AI training costs used to calibrate GATE's compute investment parameters."
    390     },
    391     {
    392       "title": "GPTs are GPTs: An early look at the labor market impact potential of large language models",
    393       "authors": ["Tyna Eloundou", "Sam Manning", "Pamela Mishkin", "Daniel Rock"],
    394       "year": 2023,
    395       "arxiv_id": "2303.10130",
    396       "relevance": "Assesses LLM exposure across occupations, relevant to GATE's task automation framework."
    397     },
    398     {
    399       "title": "Trading off compute in training and inference",
    400       "authors": ["Pablo Villalobos", "David Atkinson"],
    401       "year": 2023,
    402       "relevance": "Empirical estimates of training-inference compute tradeoff used to calibrate GATE's inference multiplier parameter."
    403     }
    404   ]
    405 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs