scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (22481B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "theoretical",
      4   "paper": {
      5     "title": "GATE: An Integrated Assessment Model for AI Automation",
      6     "authors": [
      7       "Ege Erdil",
      8       "Andrei V. Potlogea",
      9       "Tamay Besiroglu",
     10       "Edu Roldan",
     11       "Anson Ho"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv",
     15     "arxiv_id": "2503.04941",
     16     "doi": null
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The abstract claims GATE combines three ingredients (compute-based AI development, automation framework, semi-endogenous growth model) in an interactive sandbox. The paper fully describes all three modules and references the sandbox URL.",
     24         "source": "opus"
     25       },
     26       "causal_claims_justified": {
     27         "applies": false,
     28         "answer": false,
     29         "justification": "The paper presents a theoretical model framework. It does not make empirical causal claims; it describes structural relationships within the model.",
     30         "source": "opus"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "Section 8 extensively bounds the model's scope, noting it omits non-AI TFP growth, data constraints, realistic labor market frictions, market structure, task heterogeneity, and other bottlenecks. The paper is clear about what GATE does and does not model.",
     36         "source": "opus"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Section 8 discusses alternative modeling approaches and mechanisms that could yield different outcomes: endogenous TFP, data constraints, market structures, directed technical change, and additional bottlenecks. Section 9 outlines four key directions for future work addressing these alternatives.",
     42         "source": "opus"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": false,
     46         "answer": false,
     47         "justification": "Theoretical paper with no empirical measurements to distinguish from proxy outcomes.",
     48         "source": "opus"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 8 ('Limitations') is a dedicated 5-page section discussing structural simplifications and parametric uncertainty.",
     56         "source": "opus"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Section 8 provides highly specific threats: the effective compute abstraction neglects serial vs parallel constraints (citing Erdil and Schneider-Joseph 2024), data scarcity as a binding constraint (citing Villalobos et al. 2022), labor reallocation extremes miss retraining dynamics, and social planner framework mischaracterizes competitive investment dynamics.",
     62         "source": "opus"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Section 8 explicitly states what the model does NOT capture: non-AI TFP growth, data production, intermediate labor reallocation, market structures, task heterogeneity (cognitive vs manual), new task creation, and various bottlenecks. This is thorough boundary-setting.",
     68         "source": "opus"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding information is disclosed. The paper is from Epoch AI but no grants or funding sources are mentioned.",
     76         "source": "opus"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "The paper clearly states it is from Epoch AI, with detailed author contributions in Section A (Appendix).",
     82         "source": "opus"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "Epoch AI is an AI forecasting organization with a stake in AI development timelines being taken seriously. No explicit funding disclosure makes independence impossible to assess.",
     88         "source": "opus"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement is provided.",
     94         "source": "opus"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Key terms receive explicit formal definitions: 'effective compute' (Section 3.1), 'extensive vs. intensive margin of automation' (Sections 4.1–4.2), 'digital workers' (Section 4.2), and 'FLOP gap' (Section 4.1) are all precisely defined before use.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The introduction explicitly states GATE's three-part contribution — compute-based AI development module, AI automation module, semi-endogenous macroeconomic growth module — and explains the interdisciplinary bridging goal for both economics and AI communities.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The paper substantively engages with Davidson (2023) for the FLOP gap framework, Korinek and Suh (2024) as the closest prior integrated model, Jones (1995) for R&D growth theory, and Acemoglu and Restrepo (2019) for task-based automation, explaining in each case how GATE builds on or differs from these.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "theoretical": {
    120       "formal_quality": {
    121         "assumptions_stated_explicitly": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "Core assumptions are explicitly stated: continuous training approximation justified in footnote 7, social planner optimization in Section 5.1, CES constant returns to scale in Equation 26, specific functional forms for automation (Equation 13) and adjustment costs (Equations 7, 29); Appendix D provides explicit parameter ranges with justifications for every parameter.",
    125           "source": "haiku"
    126         },
    127         "proofs_complete_or_sketched": {
    128           "applies": false,
    129           "answer": false,
    130           "justification": "GATE is a modeling/simulation paper presenting dynamic equations and a numerical solution algorithm, not a theorem-proof paper; no formal mathematical proofs are required or expected for this contribution type.",
    131           "source": "haiku"
    132         },
    133         "bounds_tight_or_discussed": {
    134           "applies": true,
    135           "answer": true,
    136           "justification": "Physical bounds are derived and justified: compute limit CL = 2e38 FLOP/year is derived from CMOS efficiency (5e15 FLOP/J) × 1% of Earth's heat budget, and ranges spanning 7 orders of magnitude are provided; software efficiency bounds discuss multiple estimation heuristics with acknowledged high uncertainty.",
    137           "source": "haiku"
    138         },
    139         "counterexamples_explored": {
    140           "applies": true,
    141           "answer": true,
    142           "justification": "The paper explores polar boundary cases explicitly: perfect vs. zero labor reallocation, near-Leontief ρ values causing numerical instability (Appendix D), and the uncertainty add-on explicitly models scenarios where full automation is never reached, testing limits of the core automation assumption.",
    143           "source": "haiku"
    144         },
    145         "notation_consistent": {
    146           "applies": true,
    147           "answer": true,
    148           "justification": "Notation is consistently maintained throughout the 85-page paper: state variables H(t), S(t), C_T(t), C(t), investment flows I_Q(t), IRD_H, IRD_S, and automation function f are introduced once and used consistently; Table 2 provides a centralized summary of all state update rules.",
    149           "source": "haiku"
    150         },
    151         "constructive_vs_existence_noted": {
    152           "applies": false,
    153           "answer": false,
    154           "justification": "The paper presents a simulation model solved numerically via gradient descent rather than proving existence theorems; the constructive vs. existence distinction is not relevant to this contribution type.",
    155           "source": "haiku"
    156         }
    157       },
    158       "connections": {
    159         "connection_to_practice_discussed": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Section 7 extensively discusses practical use cases for policymakers and researchers including R&D subsidy modeling, training run caps mimicking regulation, and labor market frictions; an interactive sandbox at epoch.ai/GATE makes the model directly usable by practitioners in real time.",
    163           "source": "haiku"
    164         },
    165         "relationship_to_prior_work_clear": {
    166           "applies": true,
    167           "answer": true,
    168           "justification": "Each module is explicitly positioned: the AI development module extends Davidson (2023) and is described as 'the most novel contribution'; the macroeconomic module adapts the Jones (1995) and Ramsey-Cass-Koopmans growth models; the automation module builds on Korinek and Suh (2024); the integration of all three is identified as the gap GATE fills.",
    169           "source": "haiku"
    170         },
    171         "computational_complexity_discussed": {
    172           "applies": true,
    173           "answer": true,
    174           "justification": "Appendix C describes the numerical solution via gradient descent over discretized time/task space with 5 degrees of freedom per timestep; the paper also notes that TFP endogenization was excluded for tractability reasons, since it would 'introduce hyperbolic growth dynamics... challenging to analyze'.",
    175           "source": "haiku"
    176         },
    177         "limitations_of_formal_model_stated": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "Section 8 provides extensive limitations: single-index effective compute ignores serial/parallel compute distinctions (citing Erdil and Schneider-Joseph 2024 showing limits bind at 1e28–1e31 FLOP), static task space excludes new task creation and directed technical change, social planner cannot generate price/wage predictions.",
    181           "source": "haiku"
    182         }
    183       }
    184     }
    185   },
    186   "claims": [
    187     {
    188       "claim": "GATE is the first model to integrate compute-based AI development, task automation, and semi-endogenous economic growth in a unified framework.",
    189       "evidence": "Introduction states these three ingredients 'have not been brought together in previous work'; paper positions itself against Davidson (2023) and Korinek & Suh (2024) which address subsets.",
    190       "supported": "moderate"
    191     },
    192     {
    193       "claim": "The compute required to achieve a given AI performance level halves approximately every 9–16 months due to algorithmic improvements.",
    194       "evidence": "Cited from Hernandez and Brown (2020) estimating 16-month halving, Erdil and Besiroglu (2022) estimating ~9 months, and Ho et al. (2024) finding ~8-month halving for LLMs — multiple independent estimates.",
    195       "supported": "moderate"
    196     },
    197     {
    198       "claim": "AI compute hardware depreciates at approximately 30% per year.",
    199       "evidence": "Cited solely from Ostrouchov et al. (2020) analyzing GPU failure rates in the Titan supercomputer (2012–2019); limited generalizability from a single HPC installation to the broader AI hardware ecosystem.",
    200       "supported": "weak"
    201     },
    202     {
    203       "claim": "The maximum physical compute limit due to heat dissipation is approximately 2e38 FLOP/year.",
    204       "evidence": "Derived from Ho, Erdil and Besiroglu (2023) CMOS efficiency limit (5e15 FLOP/J) multiplied by 1% of Earth's annual heat dissipation; uncertainty range spans 7 orders of magnitude (8e34 to 5e41), making the central estimate weakly constrained.",
    205       "supported": "weak"
    206     },
    207     {
    208       "claim": "Training AGI (full task automation) requires approximately 1e36.5 effective FLOP of training compute.",
    209       "evidence": "Taken from Cotra (2020) biological anchors analysis; the paper acknowledges extreme uncertainty with parameter range 1e33 to 1e41 FLOP (8 orders of magnitude), undermining any specific point estimate.",
    210       "supported": "weak"
    211     },
    212     {
    213       "claim": "Training and inference compute can be traded off at a slope of approximately m=1–2 (orders of magnitude of inference per OOM of training compute).",
    214       "evidence": "Cited from Villalobos and Atkinson (2023); empirical support exists for specific techniques but generalizability across all task types is uncertain.",
    215       "supported": "moderate"
    216     }
    217   ],
    218   "methodology_tags": [
    219     "theoretical"
    220   ],
    221   "key_findings": "GATE presents a dynamic integrated assessment model tracing the full causal chain from AI compute investment through task automation to macroeconomic growth, with feedback loops from economic expansion back to AI investment. The model's most novel contribution is the AI development module, which formalizes how hardware efficiency, software efficiency, and compute adjustment costs jointly determine effective compute over time. The automation module introduces 'digital workers' as a measurable intensive margin of automation and a FLOP-gap parametrization for the extensive margin. The model is designed for scenario exploration rather than point prediction, with an interactive sandbox at epoch.ai/GATE enabling direct use by researchers and policymakers; key parameters remain highly uncertain, spanning multiple orders of magnitude.",
    222   "red_flags": [
    223     {
    224       "flag": "Unfinished manuscript submitted to arXiv",
    225       "detail": "Appendix C contains visible placeholder text: '[If there is anything we do explicitly to avoid the problem of getting stuck at a local optimum it might be helpful to detail that here]' and '[In a conversation with Ege he mentioned...]' — indicating internal draft notes were published, making the solution algorithm description incomplete."
    226     },
    227     {
    228       "flag": "No empirical validation",
    229       "detail": "The model is not validated against any historical economic or AI development data. No backtesting is presented; the paper is entirely descriptive and forward-looking with no out-of-sample or in-sample fit assessment."
    230     },
    231     {
    232       "flag": "Key parameters span 8+ orders of magnitude uncertainty",
    233       "detail": "The AGI training requirements parameter T ranges from 1e33 to 1e41 FLOP (8 OOM), and maximum software efficiency spans 50 to 1e8 (6 OOM). Model predictions are highly sensitive to these choices, severely limiting predictive utility."
    234     },
    235     {
    236       "flag": "Social planner assumption disconnected from market reality",
    237       "detail": "Section 8 acknowledges the social planner framework cannot capture competitive dynamics, strategic capability deployment timing, market concentration effects, or generate observable predictions like prices and wages — yet these are central to how AI automation will actually unfold."
    238     },
    239     {
    240       "flag": "No funding or competing interest disclosure",
    241       "detail": "Epoch AI's institutional mission centers on compute-based AI forecasting — the same paradigm GATE validates — yet no funding sources are disclosed and no competing interests are declared anywhere in the paper."
    242     }
    243   ],
    244   "cited_papers": [
    245     {
    246       "title": "What a compute-centric framework says about AI takeoff speeds",
    247       "relevance": "Davidson 2023 provides the FLOP gap concept and automation function structure that GATE's automation module directly builds upon; central reference for extensive margin automation."
    248     },
    249     {
    250       "title": "Scenarios for the transition to AGI",
    251       "relevance": "Korinek and Suh 2024 provides the closest prior integrated model; GATE explicitly positions itself as extending this by adding endogenous investment and compute-based AI development."
    252     },
    253     {
    254       "title": "R&D-based models of economic growth",
    255       "relevance": "Jones 1995 provides the semi-endogenous growth framework whose knowledge production function is directly adapted for GATE's hardware and software R&D modules."
    256     },
    257     {
    258       "title": "Are ideas getting harder to find?",
    259       "relevance": "Bloom et al. 2020 provides empirical evidence for the 'fishing out' effect in R&D that motivates GATE's ϕ parameters governing diminishing returns to efficiency improvements."
    260     },
    261     {
    262       "title": "Algorithmic progress in language models",
    263       "relevance": "Ho et al. 2024 provides empirical estimates of ~8-month halving in compute requirements for LLMs, used to calibrate GATE's software R&D parameters."
    264     },
    265     {
    266       "title": "Draft report on AI timelines",
    267       "relevance": "Cotra 2020 provides the biological anchors estimate for AGI training requirements (median 1e36.5 FLOP) used as GATE's default T parameter."
    268     },
    269     {
    270       "title": "Automation and new tasks: How technology displaces and reinstates labor",
    271       "relevance": "Acemoglu and Restrepo 2019 provides the task-based automation framework and empirical estimates of initial automation rates (~10% since 1987) used to calibrate GATE's f_init parameter."
    272     },
    273     {
    274       "title": "Trading off compute in training and inference",
    275       "relevance": "Villalobos and Atkinson 2023 provides empirical estimates of the training-inference tradeoff parameter m and ιmax bounds in GATE's compute allocation model."
    276     },
    277     {
    278       "title": "Algorithmic progress in computer vision",
    279       "relevance": "Erdil and Besiroglu 2022 provides empirical estimates of ~9-month software efficiency doubling time used for calibrating the software R&D productivity parameter θS."
    280     },
    281     {
    282       "title": "Explosive growth from AI automation: A review of the arguments",
    283       "relevance": "Besiroglu and Erdil 2023 surveys potential bottlenecks to AI-driven economic growth that motivate GATE's inclusion of adjustment costs and non-accumulable factors."
    284     }
    285   ],
    286   "engagement_factors": {
    287     "practical_relevance": {
    288       "score": 2,
    289       "justification": "Live interactive sandbox at epoch.ai/GATE allows direct use by policymakers and researchers, but extreme parameter uncertainty limits reliable practical application for forecasting."
    290     },
    291     "surprise_contrarian": {
    292       "score": 1,
    293       "justification": "Validates the compute-scaling paradigm and integrates standard economic growth theory; does not challenge conventional wisdom but synthesizes existing frameworks in a novel combination."
    294     },
    295     "fear_safety": {
    296       "score": 2,
    297       "justification": "Explicitly models scenarios of rapid full labor automation and potential explosive economic growth, engaging AI transition concerns, though the framing is economic rather than existential risk-focused."
    298     },
    299     "drama_conflict": {
    300       "score": 1,
    301       "justification": "Technical modeling paper presenting a framework for debate rather than a controversial position; acknowledges uncertainty throughout without staking strongly contested claims."
    302     },
    303     "demo_ability": {
    304       "score": 3,
    305       "justification": "Interactive sandbox is live at epoch.ai/GATE with real-time visualization and scenario comparison, making the model immediately usable without any setup."
    306     },
    307     "brand_recognition": {
    308       "score": 2,
    309       "justification": "Epoch AI is well-known in AI forecasting and scaling research communities; Tamay Besiroglu and Ege Erdil are recognized researchers in this space, but not mainstream-famous outside technical AI circles."
    310     }
    311   },
    312   "hn_data": {
    313     "threads": [
    314       {
    315         "hn_id": "22915584",
    316         "title": "Beyond the Code: Mining Self-Admitted Technical Debt in Issue Tracker Systems",
    317         "points": 97,
    318         "comments": 57,
    319         "url": "https://news.ycombinator.com/item?id=22915584",
    320         "created_at": "2020-04-19T12:55:42Z"
    321       },
    322       {
    323         "hn_id": "35118999",
    324         "title": "Baldur: Whole-Proof Generation and Repair with Large Language Models",
    325         "points": 4,
    326         "comments": 0,
    327         "url": "https://news.ycombinator.com/item?id=35118999",
    328         "created_at": "2023-03-12T11:42:38Z"
    329       },
    330       {
    331         "hn_id": "44439235",
    332         "title": "Wider or Deeper? Scaling LLM Inference-Time Compute with Adaptive Tree Search",
    333         "points": 3,
    334         "comments": 0,
    335         "url": "https://news.ycombinator.com/item?id=44439235",
    336         "created_at": "2025-07-02T00:30:12Z"
    337       },
    338       {
    339         "hn_id": "44312317",
    340         "title": "Self-Supervised Contrastive Learning Approximates Supervised CL",
    341         "points": 3,
    342         "comments": 0,
    343         "url": "https://news.ycombinator.com/item?id=44312317",
    344         "created_at": "2025-06-18T18:45:30Z"
    345       },
    346       {
    347         "hn_id": "38902550",
    348         "title": "Baldur: Whole-Proof Generation and Repair with Large Language Models",
    349         "points": 3,
    350         "comments": 0,
    351         "url": "https://news.ycombinator.com/item?id=38902550",
    352         "created_at": "2024-01-07T16:32:16Z"
    353       },
    354       {
    355         "hn_id": "45179163",
    356         "title": "Outcome-Based Exploration for LLM Reasoning",
    357         "points": 2,
    358         "comments": 0,
    359         "url": "https://news.ycombinator.com/item?id=45179163",
    360         "created_at": "2025-09-09T08:30:12Z"
    361       },
    362       {
    363         "hn_id": "35437352",
    364         "title": "Baldur: Whole-Proof Generation and Repair with Large Language Models",
    365         "points": 1,
    366         "comments": 0,
    367         "url": "https://news.ycombinator.com/item?id=35437352",
    368         "created_at": "2023-04-04T10:07:28Z"
    369       },
    370       {
    371         "hn_id": "35181128",
    372         "title": "Quantum Microscopy of Cancer Cells at the Heisenberg Limit",
    373         "points": 1,
    374         "comments": 0,
    375         "url": "https://news.ycombinator.com/item?id=35181128",
    376         "created_at": "2023-03-16T12:45:37Z"
    377       },
    378       {
    379         "hn_id": "30711465",
    380         "title": "Bam: A Case for Enabling Fine-Grain High Throughput GPU-Access to Storage",
    381         "points": 1,
    382         "comments": 0,
    383         "url": "https://news.ycombinator.com/item?id=30711465",
    384         "created_at": "2022-03-17T14:20:24Z"
    385       }
    386     ],
    387     "top_points": 97,
    388     "total_points": 115,
    389     "total_comments": 57
    390   }
    391 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs