ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (24192B)


      1 {
      2   "scan_version": 3,
      3   "active_modules": [],
      4   "paper": {
      5     "title": "AI IDEs or Autonomous Agents? Measuring the Impact of Coding Agents on Software Development",
      6     "authors": [
      7       "Shyam Agarwal",
      8       "Hao He",
      9       "Bogdan Vasilescu"
     10     ],
     11     "year": 2026,
     12     "venue": "MSR '26 (23rd International Conference on Mining Software Repositories)",
     13     "arxiv_id": "2601.13597",
     14     "doi": "10.1145/3793302.3793589"
     15   },
     16   "methodology_tags": [
     17     "observational"
     18   ],
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "Replication package publicly available at https://github.com/shyamagarwal13/agentic-coding-impact, stated in abstract and footer."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Builds on the publicly available AIDev dataset (v3) [28]. GHArchive data is also public. Replication package is provided."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No mention of environment specifications, dependency files, or library versions in the paper. SonarQube is named but no version or configuration details are given."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "The paper provides a replication package URL and the methodology section describes the full pipeline in sufficient detail (data source, filtering criteria, matching procedure, estimator) for replication."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Standard errors reported in Table 2 for all treatment effects. Figure 2 shows confidence bands around dynamic treatment effect estimates."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "P-values reported via star notation in Table 2 (*, **, ***) with thresholds at 0.05, 0.01, 0.001. Significance indicated for each outcome."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Percentage change effects reported throughout: +36.25% commits, +76.59% lines added for AF; +17.73% static analysis warnings. Log-transformed estimates with % change interpretation in Table 2."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No power analysis or justification for sample sizes. The sample sizes (401 AF + 606 controls, 117 IF + 73 controls) result from the filtering criteria, but there is no discussion of whether they are adequate for the effect sizes detected."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Standard errors clustered at the repository level are reported in Table 2. Confidence bands shown in Figure 2 event-study plots. Appropriate for observational DiD designs."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Matched control repositories serve as baselines. Additionally, results are compared against prior work on Cursor AI IDE adoption [25] using the same causal inference methods."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The primary comparison is with He et al. (2026) [25] on Cursor adoption, which is concurrent work by overlapping authors using the same methods. Control repos are matched from the same time period."
     79       },
     80       "ablation_study": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "This is an observational causal inference study, not a system with components to ablate. The AF/IF split is a moderation analysis, not an ablation."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Six outcome metrics: commits, lines added, static-analysis warnings, cognitive complexity, duplicated-line density, and comment-line density (Table 2, Figure 2)."
     89       },
     90       "human_evaluation": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "This is a repository mining study measuring automated metrics. Human evaluation of outputs is not applicable."
     94       },
     95       "held_out_test_set": {
     96         "applies": false,
     97         "answer": false,
     98         "justification": "Not a prediction task. This is a causal inference study estimating treatment effects."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Results broken down by AF (agent-first, n=401) vs IF (IDE-first, n=117) repositories, and dynamic monthly effects shown in Figure 2 event-study plots from t=-6 to t=+6."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper discusses concerning pre-treatment coefficients in quality metrics: 'We also observe isolated significant pre-treatment coefficients in static-analysis warnings and code complexity... these are concerning and highlight a limitation of our quasi-experimental design.'"
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "IF repositories show minimal or no velocity gains (+3.1% commits, -6.3% lines added, both insignificant). Lines added for IF turn negative by t=6 (~-61%). This null/negative finding is prominently reported."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Abstract claims of 'roughly 18% and 39%' for warnings and complexity match Table 2 values (17.73%/19.00% and 34.85%/42.87%). Velocity claims of 'large, front-loaded velocity gains only when agents are the first AI tool' match AF vs IF results."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Causal claims use staggered DiD with propensity score matching (AUC 0.92-0.99), the Borusyak et al. imputation estimator, and clustered standard errors. Appropriate quasi-experimental design for causal inference. Authors note intent-to-treat interpretation."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "Title 'Measuring the Impact of Coding Agents on Software Development' generalizes broadly. Sample is limited to open-source GitHub repos with ≥10 stars and ≥10 agentic PRs — a specific subset. The paper does not explicitly bound claims to this population in the abstract or title."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "Discusses maturity differences between AF/IF repos (Table 1), coordination costs limiting IF throughput, pre-treatment coefficient concerns suggesting unmodeled confounders, and intent-to-treat limitations (cannot measure usage intensity)."
    136       },
    137       "proxy_outcome_distinction": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper measures commits, lines added, static-analysis warnings, cognitive complexity, duplicated-line density, and comment density, then frames these as 'development velocity' and 'software quality.' The gap between these repository-level proxies and actual developer productivity or software quality is not acknowledged. Lines added is a known poor proxy for productivity, and static-analysis warnings are a limited proxy for quality. The paper uses broad terms ('velocity,' 'quality') without discussing the measurement-construct gap."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "This study does not use LLMs. It studies repositories that adopt coding agents but does not invoke any models itself."
    148       },
    149       "prompts_provided": {
    150         "applies": false,
    151         "answer": false,
    152         "justification": "No prompting is used. This is an observational mining study."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Matching parameters reported: max 3 controls per treated repo, ≥10 stars threshold, ≥10 agentic PRs, propensity score AUC 0.92-0.99, max 10,000 candidates subsampled per month, six monthly covariate lags. Borusyak et al. estimator specified."
    158       },
    159       "scaffolding_described": {
    160         "applies": false,
    161         "answer": false,
    162         "justification": "No agentic scaffolding used. This is an observational study."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Detailed pipeline: AIDev dataset → cascading attribution strategy (5 signal types) → filtering (≥10 stars, ≥10 agentic PRs) → propensity score matching → AF/IF partitioning. Final sample sizes stated (401 AF + 606 controls, 117 IF + 73 controls)."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No dedicated 'Limitations' or 'Threats to Validity' section. Limitations are discussed inline within Results (pre-treatment coefficients) and Methods (intent-to-treat caveat), but there is no substantive dedicated section."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Specific threats discussed: pre-treatment coefficients in quality metrics suggesting unmodeled confounders, attribution errors attenuating effects toward zero, inability to measure usage intensity or developer-level interactions, left-censoring of adoption dates."
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "No explicit statements about what the results do NOT show. The paper notes 'intent-to-treat effects' and 'cannot directly measure usage intensity' but does not systematically state what is out of scope (e.g., private repos, enterprise settings, individual developer impact)."
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Replication package at GitHub URL provided. AIDev dataset and GHArchive are both publicly available. The paper also notes corrections to the original dataset in the replication package."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Section 3.1 details data sources (AIDev v3, GHArchive), time period (January 2024–November 2025), attribution cascade strategy, and filtering criteria."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "No human participants. Data sources are public datasets (AIDev, GHArchive)."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "Full pipeline documented: AIDev dataset → extended attribution taxonomy → retrospective PR parsing → filtering criteria → propensity score matching → AF/IF partitioning. Final sample counts provided at each stage."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No funding source or acknowledgments section found in the paper."
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "All three authors listed as Carnegie Mellon University. No apparent affiliation with companies whose tools are studied (Anthropic, OpenAI, Cursor, etc.)."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No funding is disclosed, so independence cannot be assessed. Absence of disclosure is not the same as absence of conflict."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests or financial disclosure statement found in the paper."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "This is a repository mining study. No pre-trained model is evaluated on a benchmark."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "Not applicable. No model is evaluated on benchmark data."
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "Not applicable. This is an observational study of repository-level outcomes, not a benchmark evaluation."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants. This is a repository mining study."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "demographics_reported": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants. Repository inclusion criteria are documented under data_preprocessing_documented."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants. This is an observational study."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants."
    278       },
    279       "attrition_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "Observational study with no system to report inference costs for."
    290       },
    291       "compute_budget_stated": {
    292         "applies": false,
    293         "answer": false,
    294         "justification": "Observational study. The computational cost of running SonarQube and DiD estimation is not the focus."
    295       }
    296     }
    297   },
    298   "claims": [
    299     {
    300       "claim": "Agent-first repositories experience large velocity gains: +36.3% commits and +76.6% lines added on average post-adoption.",
    301       "evidence": "Table 2: AF commits β=0.309*** (SE=0.051), lines added β=0.569*** (SE=0.103). Figure 2 shows dynamic effects peaking at t=0 (+111% commits, +216% lines added).",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "IDE-first repositories show minimal velocity gains from agent adoption: +3.1% commits (insignificant) and -6.3% lines added (insignificant).",
    306       "evidence": "Table 2: IF commits β=0.030 (SE=0.092), lines added β=-0.066 (SE=0.189). Both estimates are statistically insignificant.",
    307       "supported": "strong"
    308     },
    309     {
    310       "claim": "Static-analysis warnings increase by roughly 18% and cognitive complexity by roughly 39% across both AF and IF repositories post-adoption.",
    311       "evidence": "Table 2: Static analysis warnings +17.73% (AF, p<0.001) and +19.00% (IF, insignificant). Cognitive complexity +34.85% (AF, p<0.001) and +42.87% (IF, p<0.01).",
    312       "supported": "strong"
    313     },
    314     {
    315       "claim": "Quality risks are persistent across settings even when velocity advantages fade, indicating sustained agent-induced technical debt.",
    316       "evidence": "Figure 2 dynamic effects: complexity remains elevated through t=6 for both AF and IF. Section 4 discusses 'agent-induced complexity debt' even when velocity gains are absent (IF).",
    317       "supported": "moderate"
    318     },
    319     {
    320       "claim": "Prior AI IDE exposure moderates velocity benefits but not quality risks from agent adoption.",
    321       "evidence": "AF vs IF comparison in Table 2 and Figure 2. Velocity differs substantially (AF large gains, IF minimal) while quality effects (warnings, complexity) are comparable across both groups.",
    322       "supported": "strong"
    323     }
    324   ],
    325   "key_findings": "Autonomous coding agents produce large velocity gains (+36% commits, +77% lines added) only when they are the first AI tool in a repository; projects with prior IDE-based AI assistance see minimal throughput increases. However, quality risks are persistent across both settings: static-analysis warnings rise ~18% and cognitive complexity ~39% regardless of prior AI exposure, indicating sustained agent-induced technical debt. These heterogeneous effects suggest diminishing returns to AI assistance and highlight a speed-maintainability trade-off.",
    326   "red_flags": [
    327     {
    328       "flag": "Pre-treatment coefficient concerns",
    329       "detail": "Authors acknowledge 'isolated significant pre-treatment coefficients in static-analysis warnings and code complexity,' suggesting that the parallel-trends assumption may not fully hold for quality metrics. This weakens causal claims for quality outcomes."
    330     },
    331     {
    332       "flag": "Small IF control group",
    333       "detail": "The IDE-first analysis has only 117 treated and 73 control repositories (compared to 401+606 for AF), limiting statistical power and generalizability of IF-specific findings."
    334     },
    335     {
    336       "flag": "No dedicated limitations section",
    337       "detail": "For a study making causal claims from observational data, the absence of a dedicated limitations or threats-to-validity section is a notable omission, though some limitations are discussed inline."
    338     },
    339     {
    340       "flag": "Broad title relative to scope",
    341       "detail": "Title claims to measure 'the Impact of Coding Agents on Software Development' but the sample is restricted to open-source GitHub repos with ≥10 stars and ≥10 agentic PRs — a specific and potentially non-representative subset."
    342     }
    343   ],
    344   "cited_papers": [
    345     {
    346       "title": "Speed at the Cost of Quality: How Cursor AI Increases Short-Term Velocity and Long-Term Complexity in Open-Source Projects",
    347       "authors": [
    348         "Hao He",
    349         "Courtney Miller",
    350         "Shyam Agarwal",
    351         "Christian Kästner",
    352         "Bogdan Vasilescu"
    353       ],
    354       "year": 2026,
    355       "relevance": "Directly comparable predecessor study using same DiD methods to assess Cursor AI IDE impact on velocity and quality."
    356     },
    357     {
    358       "title": "Measuring the Impact of Early-2025 AI on Experienced Open-Source Developer Productivity",
    359       "authors": [
    360         "Joel Becker",
    361         "Nate Rush",
    362         "Elizabeth Barnes",
    363         "David Rein"
    364       ],
    365       "year": 2025,
    366       "arxiv_id": "2507.09089",
    367       "relevance": "Controlled experiment finding limited productivity benefits of agentic tools for experienced developers."
    368     },
    369     {
    370       "title": "On the use of agentic coding: An empirical study of pull requests on GitHub",
    371       "authors": [
    372         "Miku Watanabe",
    373         "Hao Li",
    374         "Yutaro Kashiwa",
    375         "Brittany Reid",
    376         "Hajimu Iida",
    377         "Ahmed E Hassan"
    378       ],
    379       "year": 2025,
    380       "relevance": "Empirical study of agentic coding PR acceptance rates on GitHub (83.8% merge rate for Claude Code)."
    381     },
    382     {
    383       "title": "The Rise of AI Teammates in Software Engineering (SE) 3.0: How Autonomous Coding Agents Are Reshaping Software Engineering",
    384       "authors": [
    385         "Hao Li",
    386         "Haoxiang Zhang",
    387         "Ahmed E. Hassan"
    388       ],
    389       "year": 2025,
    390       "arxiv_id": "2507.15003",
    391       "relevance": "Source of the AIDev dataset used in this study; documents rise of autonomous coding agents."
    392     },
    393     {
    394       "title": "Code with Me or for Me? How Increasing AI Automation Transforms Developer Workflows",
    395       "authors": [
    396         "Valerie Chen",
    397         "Ameet Talwalkar",
    398         "Robert Brennan",
    399         "Graham Neubig"
    400       ],
    401       "year": 2025,
    402       "arxiv_id": "2507.08149",
    403       "relevance": "Studies how increasing AI automation transforms developer workflows and impacts user experience."
    404     },
    405     {
    406       "title": "Self-Admitted GenAI Usage in Open-Source Software",
    407       "authors": [
    408         "Tao Xiao",
    409         "Youmei Fan",
    410         "Fabio Calefato",
    411         "Christoph Treude",
    412         "Raula Gaikovina Kula",
    413         "Hideaki Hata",
    414         "Sebastian Baltes"
    415       ],
    416       "year": 2025,
    417       "arxiv_id": "2507.10422",
    418       "relevance": "Documents self-admitted GenAI usage in open-source, measuring code churn and revision requirements."
    419     },
    420     {
    421       "title": "How Much Does AI Impact Development Speed? An Enterprise-Based Randomized Controlled Trial",
    422       "authors": [
    423         "Elise Paradis",
    424         "Kate Grey",
    425         "Quinn Madison"
    426       ],
    427       "year": 2024,
    428       "relevance": "Enterprise RCT measuring AI impact on development speed — one of few randomized studies in this space."
    429     },
    430     {
    431       "title": "AI-assisted Programming May Decrease the Productivity of Experienced Developers by Increasing Maintenance Burden",
    432       "authors": [
    433         "Feiyang Xu",
    434         "Poonacha K. Medappa"
    435       ],
    436       "year": 2025,
    437       "arxiv_id": "2510.10165",
    438       "relevance": "Reports that AI assistance may decrease productivity for experienced developers via maintenance burden."
    439     },
    440     {
    441       "title": "Asleep at the Keyboard? Assessing the Security of GitHub Copilot's Code Contributions",
    442       "authors": [
    443         "Hammond A. Pearce",
    444         "Baleegh Ahmad",
    445         "Benjamin Tan",
    446         "Brendan Dolan-Gavitt",
    447         "Ramesh Karri"
    448       ],
    449       "year": 2021,
    450       "relevance": "Early study of security vulnerabilities in AI-generated code, relevant to quality assessment of AI contributions."
    451     },
    452     {
    453       "title": "Revisiting event study designs: robust and efficient estimation",
    454       "authors": [
    455         "Kirill Borusyak",
    456         "Xavier Jaravel",
    457         "Jann Spiess"
    458       ],
    459       "year": 2021,
    460       "relevance": "Methodological foundation — the imputation-based DiD estimator used as the primary causal inference method."
    461     }
    462   ],
    463   "engagement_factors": {
    464     "practical_relevance": {
    465       "score": 2,
    466       "justification": "Directly actionable finding that teams already using AI IDEs should deploy agents selectively rather than expecting additive productivity gains."
    467     },
    468     "surprise_contrarian": {
    469       "score": 2,
    470       "justification": "The main finding that prior AI IDE usage eliminates velocity gains from agents — suggesting diminishing returns rather than compounding benefits — is counterintuitive to the 'more AI = more productivity' narrative."
    471     },
    472     "fear_safety": {
    473       "score": 1,
    474       "justification": "Raises concerns about persistent technical debt and complexity accumulation from agents, but frames it as maintainability risk rather than safety or security."
    475     },
    476     "drama_conflict": {
    477       "score": 2,
    478       "justification": "Directly challenges the implicit claims of agent tool vendors (Devin, Codex, Claude Code) that autonomous agents deliver sustained productivity gains, showing quality degrades regardless."
    479     },
    480     "demo_ability": {
    481       "score": 1,
    482       "justification": "Replication package is publicly available on GitHub but requires significant setup with SonarQube, GHArchive data, and statistical estimation to reproduce."
    483     },
    484     "brand_recognition": {
    485       "score": 2,
    486       "justification": "Study names and evaluates products from OpenAI (Codex), Anthropic (Claude Code), Cursor, Devin, and GitHub Copilot — all high-profile tools in the current AI coding discourse."
    487     }
    488   }
    489 }

Impressum · Datenschutz