scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (22899B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "theoretical",
      4   "paper": {
      5     "title": "GATE: An Integrated Assessment Model for AI Automation",
      6     "authors": [
      7       "Ege Erdil",
      8       "Andrei V. Potlogea",
      9       "Tamay Besiroglu",
     10       "Edu Roldan",
     11       "Anson Ho"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv",
     15     "arxiv_id": "2503.04941",
     16     "doi": null
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The abstract claims GATE combines three key ingredients (compute-based AI development, AI automation framework, semi-endogenous growth model) and the paper thoroughly details all three modules across Sections 3–5. The novelty claim (no prior work combined all three) is asserted but not systematically surveyed.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper makes causal claims such as 'increases in AI capabilities permit gradual automation of labor tasks' and 'software improvements apply to the entire stock of existing compute.' These causal relationships are model assumptions rather than empirically validated mechanisms — the paper does not distinguish between modeling choices and empirical evidence for the causal structure.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "GATE is presented as applicable to the global economy and AI development trajectory, but core assumptions (unit measure of homogeneous tasks, single effective compute dimension, social planner optimization, no TFP growth) are significant simplifications whose implications for real-world applicability are not adequately bounded when scenario projections are presented.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper explores alternative parameter ranges and two polar labor reallocation cases but does not engage with alternative theoretical frameworks (directed technical change as a primary driver, stagnation theories, supply-side bottleneck-dominant models) that predict qualitatively different outcomes under AI automation.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper is explicit that 'effective compute' is a proxy for AI capability and acknowledges its limitations — serial vs. parallel constraints, scale-dependent algorithmic progress, and data requirements are all noted as omitted (Section 3.1 and Appendix B).",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 8 is a dedicated multi-page 'Limitations' section covering six structural limitations in detail.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Specific limitations are named and explained: omission of non-AI TFP growth, reduction of algorithmic progress to single effective compute dimension (citing Erdil & Schneider-Joseph 2024 on serial compute limits at 10^28–10^31 FLOP), absence of data production constraints, only two extreme labor reallocation scenarios, social planner abstraction missing competitive dynamics, and static unit-measure task space.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Section 8 explicitly states what GATE does not model: non-AI R&D effects on TFP, market prices and wages, data constraints, heterogeneous task dimensions (manual vs. cognitive, routine vs. non-routine), directed technical change, and intermediate labor reallocation frictions.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "The paper contains no funding acknowledgment. The acknowledgments section thanks individuals for feedback but discloses no funding sources.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "The paper is attributed to 'Epoch AI' and Appendix A provides a detailed author contribution list with all contributors' roles within the organization.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "No external funder is disclosed; the work appears to be internal to Epoch AI, which has institutional interest in AI forecasting credibility. Independence cannot be assessed.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "The paper contains no competing interests statement or financial disclosure of any kind.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Key terms are explicitly defined: 'effective compute' (Section 3.1), 'digital workers' (Section 4.2), 'extensive margin' vs. 'intensive margin' of automation (Sections 4.1–4.2), 'FLOP gap' (Section 4.1), 'integrated assessment model' (Introduction), with all mathematical notation introduced before use.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper explicitly states its contribution: GATE is the first integrated assessment model combining a compute-based AI development module, an AI automation module, and a semi-endogenous growth macroeconomics module into a unified interactive simulation platform.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The paper engages substantively with Davidson (2023) and Korinek & Suh (2024) for the automation module, Jones (1995) and Bloom et al. (2020) for the R&D growth framework, and Kaplan et al. (2020)/Hoffmann et al. (2022) for scaling laws, explaining what GATE extends or differs from in each case.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "theoretical": {
    120       "formal_quality": {
    121         "assumptions_stated_explicitly": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "Core assumptions are stated explicitly: social planner optimization, unit measure of tasks, continuous training approximation (justified in footnote 7), CES aggregator with ρ < 0 enforced, exogenous TFP, and all parameter ranges with justifications in Appendix D.",
    125           "source": "haiku"
    126         },
    127         "proofs_complete_or_sketched": {
    128           "applies": false,
    129           "answer": false,
    130           "justification": "This is a simulation/IAM paper with no formal theorems to prove. Model equations and laws of motion are fully specified, and the numerical solution algorithm is described in Appendix C. The formal work is model specification, not theorem proving.",
    131           "source": "haiku"
    132         },
    133         "bounds_tight_or_discussed": {
    134           "applies": true,
    135           "answer": true,
    136           "justification": "Physical bounds on hardware efficiency (Hmax) and software efficiency (Smax) are explicitly incorporated into the model via bounding functions (Equations 5–6). Appendix D provides parameter ranges spanning multiple orders of magnitude with uncertainty acknowledgment.",
    137           "source": "haiku"
    138         },
    139         "counterexamples_explored": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "The paper notes in Appendix D that extreme parameter values cause numerical instability but does not systematically explore cases where the model's qualitative predictions would break down or where the effective compute framework fails (e.g., capability jumps not explained by compute scaling).",
    143           "source": "haiku"
    144         },
    145         "notation_consistent": {
    146           "applies": true,
    147           "answer": true,
    148           "justification": "Notation is consistent throughout: C(t) for effective compute, H(t) for hardware efficiency, S(t) for software efficiency, f(t) for automation fraction, CT(t) for largest training run — all introduced once and used consistently across all sections.",
    149           "source": "haiku"
    150         },
    151         "constructive_vs_existence_noted": {
    152           "applies": true,
    153           "answer": true,
    154           "justification": "The model is entirely constructive: all state variables have explicit update rules (Table 2), the social planner's optimization is solved numerically via gradient descent on discretized time and task space (Appendix C), and results are computable in the interactive sandbox.",
    155           "source": "haiku"
    156         }
    157       },
    158       "connections": {
    159         "connection_to_practice_discussed": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Section 7 extensively describes practical uses for policymakers, economists, and AI researchers. The model is implemented as an interactive sandbox at epoch.ai/GATE with real-time visualization and scenario comparison functionality.",
    163           "source": "haiku"
    164         },
    165         "relationship_to_prior_work_clear": {
    166           "applies": true,
    167           "answer": true,
    168           "justification": "The paper clearly situates GATE relative to Davidson (2023) and Korinek & Suh (2024) for the automation module, Jones (1995) for the R&D framework, and Nordhaus (2021) and Trammell & Korinek (2020) for related IAM approaches, explaining what each contributed and what GATE adds.",
    169           "source": "haiku"
    170         },
    171         "computational_complexity_discussed": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "Appendix C describes the gradient descent approach with discretized 5τ degrees of freedom but does not discuss convergence guarantees, computational scaling, or local optima — notably leaving a bracketed placeholder '[If there is anything we do explicitly to avoid the problem of getting stuck at a local optimum...]' unresolved in the published paper.",
    175           "source": "haiku"
    176         },
    177         "limitations_of_formal_model_stated": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "Section 8 provides thorough discussion of formal model limitations including the effective compute reduction, static task space, social planner abstraction, omission of non-AI TFP, stylized labor reallocation, and excluded bottlenecks such as regulations and Baumol effects.",
    181           "source": "haiku"
    182         }
    183       }
    184     }
    185   },
    186   "claims": [
    187     {
    188       "claim": "Effective compute is a sufficient statistic for AI capability development, enabling AI progress to be forecast primarily through compute scaling",
    189       "evidence": "Supported by scaling law research (Kaplan et al. 2020, Hoffmann et al. 2022) and algorithmic progress empirics (Hernandez & Brown 2020, Ho et al. 2024); limitations (serial/parallel constraints, scale-dependent improvements) acknowledged in Appendix B",
    190       "supported": "moderate"
    191     },
    192     {
    193       "claim": "GATE is the first model combining a compute-based AI development module, an AI automation module, and a semi-endogenous growth macroeconomics module",
    194       "evidence": "Asserted by comparison to Davidson (2023) and Korinek & Suh (2024) which combine only subsets; no systematic prior literature review to confirm novelty of the full combination",
    195       "supported": "weak"
    196     },
    197     {
    198       "claim": "Software efficiency improvements apply to the entire existing compute stock while hardware improvements apply only to new compute flows, creating a key economic asymmetry",
    199       "evidence": "Stated as a modeling assumption formalized in Equations (4) and (12); physically motivated but not empirically validated against historical compute stock data",
    200       "supported": "moderate"
    201     },
    202     {
    203       "claim": "The training-inference tradeoff is bounded at approximately 1-2 OOMs, so inference compute cannot substitute for insufficient training beyond this range",
    204       "evidence": "Supported by Villalobos and Atkinson (2023) empirical analysis of individual techniques; the total span 'rarely exceeds a few OOMs overall' per that source",
    205       "supported": "moderate"
    206     },
    207     {
    208       "claim": "Physical heating constraints place a hard limit on usable compute of approximately 2×10^38 FLOP/year under the CMOS paradigm",
    209       "evidence": "Derived from Ho, Erdil & Besiroglu (2023) CMOS efficiency estimate and Stefan-Boltzmann calculation assuming 1% of Earth's solar energy budget; the 1% allocation assumption is arbitrary and acknowledged as one of several uncertain parameters",
    210       "supported": "weak"
    211     },
    212     {
    213       "claim": "R&D externalities cause systematic underinvestment in AI relative to the social optimum, justifiable with a wedge parameter ξ",
    214       "evidence": "Standard economic theory about knowledge spillovers; the R&D wedge range (2–20) is presented without empirical calibration, and no empirical evidence for the magnitude of AI-specific externalities is cited",
    215       "supported": "weak"
    216     }
    217   ],
    218   "methodology_tags": [
    219     "theoretical"
    220   ],
    221   "key_findings": "GATE is an integrated assessment model for simulating AI-driven economic transitions, combining three modules: an AI development module mapping investment to effective compute via R&D-driven hardware and software efficiency growth, an AI automation module translating compute to labor task automation along both extensive margins (which tasks) and intensive margins (how many digital workers per task), and a Ramsey-Cass-Koopmans macroeconomic module linking automation to aggregate output through a CES task aggregator. The model features optional add-ons for R&D externalities (reducing private R&D returns via a wedge parameter) and automation uncertainty (Bayesian updating over alternative automation functions). Solved numerically via gradient descent on discretized time and task space, the model is implemented as an interactive sandbox. Key theoretical insights include the training-inference compute tradeoff bounded at ~1–2 OOMs, the software-hardware accumulation asymmetry, and physical limits (heating bottlenecks, CMOS efficiency ceilings) as eventual growth constraints.",
    222   "red_flags": [
    223     {
    224       "flag": "Unresolved local optima in solver",
    225       "detail": "Appendix C contains a bracketed placeholder in the published paper: '[If there is anything we do explicitly to avoid the problem of getting stuck at a local optimum it might be helpful to detail that here]' — the paper was published with an acknowledged but unaddressed limitation in its core numerical solver."
    226     },
    227     {
    228       "flag": "No empirical validation",
    229       "detail": "GATE presents a model of the global economy's AI transition with no backtesting against historical data. Key parameters such as the AGI training requirement (10^36.5 eFLOP default, with an 8-order-of-magnitude uncertainty range) and the FLOP gap are so uncertain that the model's real-world calibration is essentially unconstrained."
    230     },
    231     {
    232       "flag": "Social planner assumption",
    233       "detail": "The model assumes coordinated, welfare-maximizing investment across the global economy, which misrepresents competitive dynamics among AI labs. The paper acknowledges this limitation but defers market equilibrium versions to future work, substantially limiting policy applicability."
    234     },
    235     {
    236       "flag": "Effective compute oversimplification",
    237       "detail": "Reducing all AI progress to a single effective compute dimension ignores serial vs. parallel computation constraints (Erdil & Schneider-Joseph 2024 estimate that these limits bind at 10^28–10^31 FLOP) and scale-dependent algorithmic improvements; Section 8 acknowledges that this may distort timeline estimates."
    238     },
    239     {
    240       "flag": "No funding or competing interests disclosure",
    241       "detail": "No funding source or competing interests statement is provided. Epoch AI, the producing organization, has an institutional stake in establishing AI economic forecasting as a credible field and in GATE's adoption."
    242     }
    243   ],
    244   "cited_papers": [
    245     {
    246       "title": "What a compute-centric framework says about AI takeoff speeds",
    247       "relevance": "Central framework for GATE's automation module; introduced the FLOP gap concept directly adopted by GATE"
    248     },
    249     {
    250       "title": "Scenarios for the transition to AGI",
    251       "relevance": "Related integrated assessment approach; GATE explicitly situates itself against this work"
    252     },
    253     {
    254       "title": "Scaling laws for neural language models",
    255       "relevance": "Empirical foundation for compute-based AI capability prediction underlying the entire model"
    256     },
    257     {
    258       "title": "R&D-based models of economic growth",
    259       "relevance": "Mathematical basis for GATE's hardware and software R&D laws of motion"
    260     },
    261     {
    262       "title": "Are ideas getting harder to find?",
    263       "relevance": "Empirical support for 'fishing out' R&D diminishing returns parameters in the model"
    264     },
    265     {
    266       "title": "Algorithmic progress in language models",
    267       "relevance": "Key empirical input for software efficiency doubling times used in model calibration (efficiency doubles, i.e. required compute halves, roughly every 8 months)"
    268     },
    269     {
    270       "title": "Economic growth under transformative AI",
    271       "relevance": "Prior IAM-style work on AI economic transitions that GATE extends"
    272     },
    273     {
    274       "title": "Explosive growth from AI automation: A review of the arguments",
    275       "relevance": "Review of theoretical arguments about AI-driven explosive growth that GATE aims to model formally"
    276     },
    277     {
    278       "title": "Trading off compute in training and inference",
    279       "relevance": "Empirical basis for the training-inference tradeoff parameter m (~1–2 OOMs) in GATE's automation module"
    280     },
    281     {
    282       "title": "Data movement limits to frontier model training",
    283       "relevance": "Cited in limitations section as evidence that single effective compute dimension misses serial compute constraints binding at 10^28–10^31 FLOP"
    284     }
    285   ],
    286   "engagement_factors": {
    287     "practical_relevance": {
    288       "score": 3,
    289       "justification": "Directly targets policymakers and economists with an interactive sandbox at epoch.ai/GATE allowing real-time parameter exploration; most accessible AI economic IAM published to date"
    290     },
    291     "surprise_contrarian": {
    292       "score": 1,
    293       "justification": "Synthesizes existing frameworks (Davidson 2023, Jones 1995, Ramsey-Cass-Koopmans) rather than challenging conventional wisdom; core thesis (compute drives AI, AI drives growth) is mainstream Epoch AI positioning"
    294     },
    295     "fear_safety": {
    296       "score": 2,
    297       "justification": "Models explosive growth scenarios, full automation trajectories, and physical heating limits; uncertainty add-on explicitly models scenarios where AI capabilities disappoint, providing some balance"
    298     },
    299     "drama_conflict": {
    300       "score": 1,
    301       "justification": "Academic modeling paper with explicit parameter uncertainty and limitations; no dramatic predictions or controversial conclusions"
    302     },
    303     "demo_ability": {
    304       "score": 3,
    305       "justification": "Interactive sandbox available immediately at epoch.ai/GATE with real-time visualization and scenario comparison — directly tryable by anyone"
    306     },
    307     "brand_recognition": {
    308       "score": 2,
    309       "justification": "Epoch AI is recognized in AI forecasting (compute trends database, algorithmic progress papers); Tamay Besiroglu and Ege Erdil have prior visibility in the AI research community"
    310     }
    311   },
    312   "hn_data": {
    313     "threads": [
    314       {
    315         "hn_id": "22915584",
    316         "title": "Beyond the Code: Mining Self-Admitted Technical Debt in Issue Tracker Systems",
    317         "points": 97,
    318         "comments": 57,
    319         "url": "https://news.ycombinator.com/item?id=22915584",
    320         "created_at": "2020-04-19T12:55:42Z"
    321       },
    322       {
    323         "hn_id": "35118999",
    324         "title": "Baldur: Whole-Proof Generation and Repair with Large Language Models",
    325         "points": 4,
    326         "comments": 0,
    327         "url": "https://news.ycombinator.com/item?id=35118999",
    328         "created_at": "2023-03-12T11:42:38Z"
    329       },
    330       {
    331         "hn_id": "44439235",
    332         "title": "Wider or Deeper? Scaling LLM Inference-Time Compute with Adaptive Tree Search",
    333         "points": 3,
    334         "comments": 0,
    335         "url": "https://news.ycombinator.com/item?id=44439235",
    336         "created_at": "2025-07-02T00:30:12Z"
    337       },
    338       {
    339         "hn_id": "44312317",
    340         "title": "Self-Supervised Contrastive Learning Approximates Supervised CL",
    341         "points": 3,
    342         "comments": 0,
    343         "url": "https://news.ycombinator.com/item?id=44312317",
    344         "created_at": "2025-06-18T18:45:30Z"
    345       },
    346       {
    347         "hn_id": "38902550",
    348         "title": "Baldur: Whole-Proof Generation and Repair with Large Language Models",
    349         "points": 3,
    350         "comments": 0,
    351         "url": "https://news.ycombinator.com/item?id=38902550",
    352         "created_at": "2024-01-07T16:32:16Z"
    353       },
    354       {
    355         "hn_id": "45179163",
    356         "title": "Outcome-Based Exploration for LLM Reasoning",
    357         "points": 2,
    358         "comments": 0,
    359         "url": "https://news.ycombinator.com/item?id=45179163",
    360         "created_at": "2025-09-09T08:30:12Z"
    361       },
    362       {
    363         "hn_id": "35437352",
    364         "title": "Baldur: Whole-Proof Generation and Repair with Large Language Models",
    365         "points": 1,
    366         "comments": 0,
    367         "url": "https://news.ycombinator.com/item?id=35437352",
    368         "created_at": "2023-04-04T10:07:28Z"
    369       },
    370       {
    371         "hn_id": "35181128",
    372         "title": "Quantum Microscopy of Cancer Cells at the Heisenberg Limit",
    373         "points": 1,
    374         "comments": 0,
    375         "url": "https://news.ycombinator.com/item?id=35181128",
    376         "created_at": "2023-03-16T12:45:37Z"
    377       },
    378       {
    379         "hn_id": "30711465",
    380         "title": "Bam: A Case for Enabling Fine-Grain High Throughput GPU-Access to Storage",
    381         "points": 1,
    382         "comments": 0,
    383         "url": "https://news.ycombinator.com/item?id=30711465",
    384         "created_at": "2022-03-17T14:20:24Z"
    385       }
    386     ],
    387     "top_points": 97,
    388     "total_points": 115,
    389     "total_comments": 57
    390   }
    391 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs