scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27243B)
      1 {
      2   "scan_version": 2,
      3   "active_modules": [],
      4   "paper": {
      5     "title": "Measuring the Impact of Early-2025 AI on Experienced Open-Source Developer Productivity",
      6     "authors": ["Joel Becker", "Nate Rush", "Beth Barnes", "David Rein"],
      7     "year": 2025,
      8     "venue": "arXiv",
      9     "arxiv_id": "2507.09089"
     10   },
     11   "methodology_tags": ["rct", "qualitative"],
     12   "key_findings": "In a randomized controlled trial with 16 experienced open-source developers completing 246 tasks, allowing AI tools (primarily Cursor Pro with Claude 3.5/3.7 Sonnet) increased task completion time by 19%, contradicting developer forecasts of 24% speedup and expert forecasts of ~39% speedup. The slowdown was robust across alternative estimators, outcome measures, and subsets. Contributing factors include developer over-optimism about AI usefulness, high developer familiarity with repositories, large/complex codebases, low AI reliability (<44% generation acceptance rate), and AI's lack of implicit repository context.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "No repository URL, code archive, or analysis scripts are provided in the paper. The paper describes detailed data collection and regression analyses but does not release code."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "No dataset download link is provided. The paper describes collecting screen recordings, developer forecasts, and implementation times, but does not release the underlying data."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No environment specifications, requirements files, or dependency information is provided for reproducing the analyses."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No step-by-step reproduction instructions are provided. The regression specification is described in Appendix D, but there are no scripts or instructions to replicate the analysis."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "95% confidence intervals are reported throughout, using HC3 standard errors (Section D.2, Figure 15). The paper also reports CIs from alternative uncertainty estimation procedures including clustered standard errors and hierarchical bootstrap."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "The paper uses regression-based inference with HC3 standard errors and reports p-values (e.g., Table 4 balance checks with Welch t-tests, Table 5 chi-square test). Figure 15 shows alternative uncertainty procedures."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "The primary result is reported as a 19% increase in completion time (from the log-linear regression coefficient transformed via exp(β)-1). Context is provided with baseline completion times (Figure 4) and the effect is reported with confidence intervals."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No power analysis or sample size justification is provided. The study has 16 developers and 246 tasks. The paper acknowledges being 'not powered for statistically significant multiple comparisons when subsetting our data' (Section 3.3) but does not justify why 16 developers was the chosen sample size."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Standard deviations are reported in Table 4 for forecasted times. Multiple uncertainty estimation procedures are compared in Figure 15. The paper reports variance across developers (Figure 17) and across subsets."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The study design inherently includes a baseline: the AI-disallowed condition serves as the control/baseline against which AI-allowed performance is compared. The paper also compares results against prior literature (Table 3)."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "The comparison is between AI-allowed and AI-disallowed conditions within the same study, using contemporary AI tools (Claude 3.5/3.7 Sonnet, GPT-4o, Gemini 2.5 Pro). Prior work comparisons in Table 3 include contemporary studies."
     73       },
     74       "ablation_study": {
     75         "applies": false,
     76         "answer": false,
     77         "justification": "This is an RCT measuring a single treatment (AI allowed vs. not allowed), not a multi-component system. There is no system to ablate."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "The paper reports multiple outcome measures: self-reported implementation time, screen recording time, pre-review and post-review time, lines of code per hour, and activity time breakdowns (Figure 6, Figure 22). Alternative estimators are also compared (Figure 13)."
     83       },
     84       "human_evaluation": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "The study includes extensive human evaluation: 143 hours of screen recordings were manually labeled with fine-grained activity labels (Section 2.4), exit interviews and surveys were conducted (Section G.5), and PR quality was assessed through the repositories' standard review processes."
     88       },
     89       "held_out_test_set": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "This is an RCT, not a prediction task. There is no train/test split concept applicable here."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Extensive breakdowns are provided: per-developer speedup (Figure 17), by prior task exposure and external resource needs (Figure 7), by AI experience (Figure 10), by scope creep (Figure 9), by month (Figure 23), and by activity type (Figures 6, 19-21)."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "The entire paper is essentially a discussion of a failure case (AI slowing developers down). Section C.1.4 discusses low AI reliability and developers' experiences with failed AI generations. Qualitative examples of AI failures are provided throughout."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The core finding is a negative result: AI tools slow down experienced developers by 19%, contradicting expectations. The paper also reports which hypothesized contributing factors had evidence against them (Table 1, Section C.3)."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The abstract claims (19% slowdown, developer forecast of 24% speedup, post-hoc estimate of 20% speedup, expert predictions of 38-39% speedup) are all supported by the results in the paper (Figure 1, Section 3.1, Table 6)."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The paper makes causal claims ('AI tooling slowed developers down') and uses an RCT design with randomized treatment assignment, which is the gold standard for causal inference. Balance checks confirm successful randomization (Table 4). Issues are defined before randomization to prevent confounding."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper explicitly bounds generalization in Section 4.1 ('Key Caveats'), Table 2 ('What the evidence does not show'), and throughout the discussion. It states results do not imply AI is unhelpful in other settings, with future models, or with better elicitation strategies."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "The paper systematically investigates 21 alternative explanations for the slowdown in Section 3.3 and Appendix C, categorized into direct productivity loss, experimental artifacts, factors raising developer performance, and factors limiting AI performance."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "The paper explicitly discusses the proxy-outcome distinction. Section 1 critiques prior work for using non-fixed outcome measures (lines of code, PRs) that may not correspond to productivity. It uses task completion time as its measure, notes this is 'a fixed outcome measure,' and discusses scope creep (Section C.2.3) as a potential gap between time and productivity."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "The paper specifies models used: Claude 3.7 Sonnet (thinking mode), Claude 3.7 Sonnet, Claude 3.5 Sonnet, GPT-4o, Gemini 2.5 Pro, and o1, with usage percentages (Section C.3.6). These are marketing names without snapshot dates, but for an RCT studying tool usage in the wild, this represents what developers actually used."
    142       },
    143       "prompts_provided": {
    144         "applies": false,
    145         "answer": false,
    146         "justification": "This is an RCT where developers use AI tools naturally. There are no researcher-designed prompts — developers prompt AI tools as they see fit. The study measures the effect of allowing AI usage, not of specific prompts."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": false,
    150         "answer": false,
    151         "justification": "This is an RCT where developers use commercial AI tools (Cursor Pro) with default settings. The researchers do not control hyperparameters — they are studying the tools as used in practice."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "The study evaluates Cursor Pro as a third-party tool used as-is by developers. The authors cannot describe Cursor's internal scaffolding. Section F.2.2 provides a primer on Cursor's features (chat, agent mode, autocomplete) as understood from the user perspective."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "The paper documents data preprocessing: imputation of post-review times for unreviewed issues (Section C.3.4), filtering criteria for screen recordings (>10% broken time, >20% discrepancy with self-reported time, cheating removal), resulting in 74 of 128 recordings (Section 2.4). Issue dropout is documented (Section C.3.3)."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section 4.1 'Key Caveats' serves as a dedicated limitations section, discussing setting-specific factors, AI-specific factors, and agent capabilities. Table 2 explicitly lists what the evidence does not show."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "The paper discusses 21 specific threats grouped into four categories (Table 1, Appendix C). These are highly specific: the small sample ('Our sample of 16 developers') is acknowledged, as are experimentally driven overuse, sampling bias in recruitment, an unfamiliar development environment, and more."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Table 2 is a gold-standard scope boundary statement, listing six specific things the evidence does NOT show (e.g., 'AI systems do not currently speed up many or most software developers'). Section 4.1 adds specific caveats about setting-specific factors and AI-specific factors."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "Raw data (screen recordings, implementation times, forecasts, survey responses) is not publicly available. Only aggregated results and regression outputs are presented."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Data collection is described in detail: Section 2.2.2 covers screen recordings, Cursor analytics, implementation notes. Section 2.4 covers developer forecasts, expert forecasts, exit interviews, and fine-grained activity labels. Appendix G provides full developer instructions."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Section G describes recruitment: outreach via professional networks, Reddit communities, and GitHub profiles; filtering for 500+ star repos, 5+ recent commits; 51 initial respondents filtered to ~20 then 16. Selection criteria are explicit."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "The pipeline is documented: 51 interested developers → ~20 meeting criteria → 16 final participants (3 dropped, Section G.6). 246 tasks completed (136 AI-allowed, 110 AI-disallowed). Screen recording filtering: 128 recordings → 74 valid after quality filters (Section 2.4). Imputation methods for missing post-review times are described (Section C.3.4)."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding source is disclosed. METR (Model Evaluation & Threat Research) is the authors' organization, but no grants, sponsors, or funding agencies are mentioned."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Authors are identified as being from METR (Model Evaluation & Threat Research). METR is not a developer of the AI tools being evaluated (Cursor, Claude, GPT-4o)."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "Funding source is not disclosed, so independence cannot be assessed. METR's organizational mission involves AI safety and capability evaluation, which could create incentives in either direction regarding AI productivity results."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests statement or financial interest disclosure is present in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "This is an RCT measuring developer productivity with AI tools, not an evaluation of a pre-trained model's capability on a benchmark."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "This is an RCT measuring developer productivity, not a benchmark evaluation where train/test overlap is relevant."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "This is an RCT, not a benchmark evaluation."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": true,
    245         "answer": false,
    246         "justification": "No mention of pre-registration (OSF, AsPredicted, AEA registry, or similar) anywhere in the paper."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": true,
    250         "answer": false,
    251         "justification": "No mention of IRB or ethics board approval. The study involves human participants (developers) completing tasks and being recorded, but no ethics review is mentioned."
    252       },
    253       "demographics_reported": {
    254         "applies": true,
    255         "answer": true,
    256         "justification": "Developer demographics are reported: typically over a decade of software experience, 5 years average on their repository, 1,500 commits on average, 59% of repository lifetime. AI experience levels: 93% used LLMs, 44% used Cursor (Section 2.1). Table 7 provides per-developer statistics."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": true,
    260         "answer": true,
    261         "justification": "Section G.2.1 lists explicit eligibility criteria: 1+ year professional experience, 6+ months as active maintainer, repository must be open source with 500+ stars and 3000+ LOC, must have a list of issues. Section G describes further filtering steps."
    262       },
    263       "randomization_described": {
    264         "applies": true,
    265         "answer": true,
    266         "justification": "Randomization is described: issues randomized via 'simulated fair coin flip' (Section 2.2). 25 early issues used block randomization before switching (Section E.3). Balance checks confirm successful randomization (Table 4, Table 5)."
    267       },
    268       "blinding_described": {
    269         "applies": true,
    270         "answer": true,
    271         "justification": "The paper explicitly states 'this study is not blinded' (Section C.2.1) — developers know which condition they are in. This is acknowledged as a limitation and investigated as a potential confound (experimentally driven overuse, Section C.2.1)."
    272       },
    273       "attrition_reported": {
    274         "applies": true,
    275         "answer": true,
    276         "justification": "Section G.6 reports 3 developers were dropped (2 due to repository pausing development, 1 due to cheating). Section C.3.3 reports issue dropout: 7 AI-disallowed and 6 AI-allowed issues not completed, plus 24 intentionally dropped issues with reasons. Robustness checks on dropout subsets are provided."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "No AI inference costs or token consumption is reported. The paper mentions providing Cursor Pro subscriptions and paying developers $150/hour but does not report API costs or token usage."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "Total study cost is partially disclosed ($150/hour for developers, with ~40 hour cap per developer) but total computational budget, API costs, and infrastructure costs are not stated."
    289       }
    290     }
    291   },
    292   "claims": [
    293     {
    294       "claim": "Allowing AI tools increases task completion time by 19% for experienced open-source developers.",
    295       "evidence": "Log-linear regression on 246 tasks (136 AI-allowed, 110 AI-disallowed) with HC3 standard errors, controlling for developer forecasts as difficulty proxy. Robust across alternative estimators (Figure 13), outcome measures (Figure 22), and developer subsets (Section C.3).",
    296       "supported": "strong"
    297     },
    298     {
    299       "claim": "Developers forecast 24% speedup from AI before tasks and estimate 20% speedup after completing the study, despite being slowed down by 19%.",
    300       "evidence": "Pre-task forecasts collected before randomization (Section 2.4). Post-hoc estimates collected via exit survey (Section G.5.1). Observed speedup from regression (Section D.1). Figure 1 displays all three together.",
    301       "supported": "strong"
    302     },
    303     {
    304       "claim": "Economics and ML experts predict 39% and 38% speedup respectively, overestimating even more than developers.",
    305       "evidence": "34 economics experts and 54 ML experts provided incentivized forecasts with detailed study information (Section 2.4, Table 6, Section G.9).",
    306       "supported": "strong"
    307     },
    308     {
    309       "claim": "Five factors contribute to slowdown: over-optimism about AI, high developer familiarity, large/complex repositories, low AI reliability, and implicit repository context.",
    310       "evidence": "Systematic investigation of 21 factors using subset analyses, surveys, interviews, and screen recordings (Table 1, Appendix C). Evidence includes: developer forecasts vs. actuals, subset analysis by familiarity (Figure 7), <44% generation acceptance rate, and qualitative developer reports.",
    311       "supported": "moderate"
    312     },
    313     {
    314       "claim": "Developers accept less than 44% of AI generations.",
    315       "evidence": "Cursor analytics data from 13 developers with METR-provided subscriptions (Section C.1.4).",
    316       "supported": "moderate"
    317     },
    318     {
    319       "claim": "75% of developers experience slowdown from AI.",
    320       "evidence": "Per-developer speedup estimates shown in Figure 17, using heterogeneous treatment effect estimation (Section D.4).",
    321       "supported": "moderate"
    322     }
    323   ],
    324   "red_flags": [
    325     {
    326       "flag": "Small sample size (16 developers)",
    327       "detail": "The study has only 16 developers, which limits statistical power for subgroup analyses. The paper acknowledges being 'not powered for statistically significant multiple comparisons when subsetting' but still presents many subset analyses (Figures 7-12) that readers might over-interpret."
    328     },
    329     {
    330       "flag": "No pre-registration",
    331       "detail": "For an RCT, the absence of pre-registration is notable. Without pre-registration, it is harder to verify that the analysis plan was not adjusted after seeing results. The 21-factor analysis in particular could reflect post-hoc hypothesis generation."
    332     },
    333     {
    334       "flag": "No IRB/ethics approval mentioned",
    335       "detail": "The study involves human participants completing recorded work tasks for pay, but no ethics board review is mentioned. This is unusual for a study with human subjects."
    336     },
    337     {
    338       "flag": "Non-blinded design with potential demand effects",
    339       "detail": "Developers knew their condition assignment. Section C.2.1 acknowledges this could lead to experimentally driven overuse of AI. Some developers reported overusing AI due to the experiment, though Figure 8 suggests similar slowdown regardless of reported AI usage patterns."
    340     },
    341     {
    342       "flag": "Funding source not disclosed",
    343       "detail": "METR's funding sources are not disclosed. While METR does not develop the tools being tested, its organizational focus on AI safety and capability evaluation means its funders may have an interest in whether AI is shown to improve productivity."
    344     }
    345   ],
    346   "cited_papers": [
    347     {
    348       "title": "The impact of ai on developer productivity: Evidence from github copilot",
    349       "authors": ["Sida Peng", "Eirini Kalliamvakou", "Peter Cihon", "Mert Demirer"],
    350       "year": 2023,
    351       "arxiv_id": "2302.06590",
    352       "relevance": "RCT finding 56% speedup with Copilot on synthetic JavaScript tasks — key prior work this study contrasts with."
    353     },
    354     {
    355       "title": "How much does ai impact development speed? an enterprise-based randomized controlled trial",
    356       "authors": ["Elise Paradis", "Kate Grey", "Quinn Madison"],
    357       "year": 2024,
    358       "arxiv_id": "2410.12944",
    359       "relevance": "Enterprise RCT finding 21% speedup on coding tasks — another RCT this study positions against."
    360     },
    361     {
    362       "title": "The effects of generative ai on high-skilled work: Evidence from three field experiments with software developers",
    363       "authors": ["Zheyuan Cui", "Mert Demirer", "Sonia Jaffe", "Leon Musolff", "Sida Peng", "Tobias Salz"],
    364       "year": 2025,
    365       "relevance": "Three field experiments finding 26% output increase — uses non-fixed outcome measures that this study critiques."
    366     },
    367     {
    368       "title": "The impact of large language models on open-source innovation: Evidence from github copilot",
    369       "authors": ["Doron Yeverechyahu", "Raveesh Mayya", "Gal Oestreicher-Singer"],
    370       "year": 2025,
    371       "relevance": "Natural experiment finding 37% output increase using lines of code metric — critiqued by this study for non-fixed outcome measures."
    372     },
    373     {
    374       "title": "Significant productivity gains through programming with large language models",
    375       "authors": ["Thomas Weber", "Maximilian Brandmaier", "Albrecht Schmidt", "Sven Mayer"],
    376       "year": 2024,
    377       "doi": "10.1145/3661145",
    378       "relevance": "Found 65% increase in task requirements satisfied with AI tools on synthetic tasks."
    379     },
    380     {
    381       "title": "Generative AI and labour productivity: a field experiment on coding",
    382       "authors": ["Leonardo Gambacorta", "Han Qiu", "Shuo Shan", "Daniel M Rees"],
    383       "year": 2024,
    384       "relevance": "Field experiment at BIS finding 55% output increase — uses non-fixed outcome measure."
    385     },
    386     {
    387       "title": "Generative ai at work",
    388       "authors": ["Erik Brynjolfsson", "Danielle Li", "Lindsey Raymond"],
    389       "year": 2025,
    390       "doi": "10.1093/qje/qjae044",
    391       "relevance": "Landmark study finding AI benefits less experienced workers more, compressing performance distributions."
    392     },
    393     {
    394       "title": "Experimental evidence on the productivity effects of generative artificial intelligence",
    395       "authors": ["Shakked Noy", "Whitney Zhang"],
    396       "year": 2023,
    397       "doi": "10.1126/science.adh2586",
    398       "relevance": "Experimental evidence on generative AI productivity effects with heterogeneous effects by experience."
    399     },
    400     {
    401       "title": "Measuring ai ability to complete long tasks",
    402       "authors": ["Thomas Kwa", "Ben West", "Joel Becker"],
    403       "year": 2025,
    404       "arxiv_id": "2503.14499",
    405       "relevance": "METR's benchmark for measuring AI capability on long software tasks — contextualizes the gap between benchmarks and real-world impact."
    406     },
    407     {
    408       "title": "Re-bench: Evaluating frontier ai r&d capabilities of language model agents against human experts",
    409       "authors": ["Hjalmar Wijk", "Tao Lin", "Joel Becker"],
    410       "year": 2025,
    411       "arxiv_id": "2411.15114",
    412       "relevance": "Benchmark evaluating AI R&D capabilities against human experts on complex tasks."
    413     },
    414     {
    415       "title": "SWE-bench: Can language models resolve real-world github issues?",
    416       "authors": ["Carlos E Jimenez", "John Yang", "Alexander Wettig"],
    417       "year": 2024,
    418       "relevance": "Key software engineering benchmark referenced in context of environment complexity effects on AI performance."
    419     },
    420     {
    421       "title": "Scaling llm test-time compute optimally can be more effective than scaling model parameters",
    422       "authors": ["Charlie Snell", "Jaehoon Lee", "Kelvin Xu", "Aviral Kumar"],
    423       "year": 2024,
    424       "arxiv_id": "2408.03314",
    425       "relevance": "Test-time compute scaling — relevant to whether more token spend could improve AI reliability in this setting."
    426     },
    427     {
    428       "title": "The simple macroeconomics of ai",
    429       "authors": ["Daron Acemoglu"],
    430       "year": 2024,
    431       "doi": "10.1093/epolic/eiae042",
    432       "relevance": "Skeptical economic perspective on AI's productivity impact — contextualizes the broader economic debate."
    433     }
    434   ]
    435 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs