scan.json (24192B)
1 { 2 "scan_version": 3, 3 "active_modules": [], 4 "paper": { 5 "title": "AI IDEs or Autonomous Agents? Measuring the Impact of Coding Agents on Software Development", 6 "authors": [ 7 "Shyam Agarwal", 8 "Hao He", 9 "Bogdan Vasilescu" 10 ], 11 "year": 2026, 12 "venue": "MSR '26 (23rd International Conference on Mining Software Repositories)", 13 "arxiv_id": "2601.13597", 14 "doi": "10.1145/3793302.3793589" 15 }, 16 "methodology_tags": [ 17 "observational" 18 ], 19 "checklist": { 20 "artifacts": { 21 "code_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "Replication package publicly available at https://github.com/shyamagarwal13/agentic-coding-impact, stated in abstract and footer." 25 }, 26 "data_released": { 27 "applies": true, 28 "answer": true, 29 "justification": "Builds on the publicly available AIDev dataset (v3) [28]. GHArchive data is also public. Replication package is provided." 30 }, 31 "environment_specified": { 32 "applies": true, 33 "answer": false, 34 "justification": "No mention of environment specifications, dependency files, or library versions in the paper. SonarQube is named but no version or configuration details are given." 35 }, 36 "reproduction_instructions": { 37 "applies": true, 38 "answer": true, 39 "justification": "The paper provides a replication package URL and the methodology section describes the full pipeline in sufficient detail (data source, filtering criteria, matching procedure, estimator) for replication." 40 } 41 }, 42 "statistical_methodology": { 43 "confidence_intervals_or_error_bars": { 44 "applies": true, 45 "answer": true, 46 "justification": "Standard errors reported in Table 2 for all treatment effects. Figure 2 shows confidence bands around dynamic treatment effect estimates." 47 }, 48 "significance_tests": { 49 "applies": true, 50 "answer": true, 51 "justification": "P-values reported via star notation in Table 2 (*, **, ***) with thresholds at 0.05, 0.01, 0.001. Significance indicated for each outcome." 52 }, 53 "effect_sizes_reported": { 54 "applies": true, 55 "answer": true, 56 "justification": "Percentage change effects reported throughout: +36.25% commits, +76.59% lines added for AF; +17.73% static analysis warnings. Log-transformed estimates with % change interpretation in Table 2." 57 }, 58 "sample_size_justified": { 59 "applies": true, 60 "answer": false, 61 "justification": "No power analysis or justification for sample sizes. The sample sizes (401 AF + 606 controls, 117 IF + 73 controls) result from filtering criteria but no discussion of whether these are adequate for the effect sizes detected." 62 }, 63 "variance_reported": { 64 "applies": true, 65 "answer": true, 66 "justification": "Standard errors clustered at the repository level are reported in Table 2. Confidence bands shown in Figure 2 event-study plots. Appropriate for observational DiD designs." 67 } 68 }, 69 "evaluation_design": { 70 "baselines_included": { 71 "applies": true, 72 "answer": true, 73 "justification": "Matched control repositories serve as baselines. Additionally, results are compared against prior work on Cursor AI IDE adoption [25] using the same causal inference methods." 74 }, 75 "baselines_contemporary": { 76 "applies": true, 77 "answer": true, 78 "justification": "The primary comparison is with He et al. (2026) [25] on Cursor adoption, which is concurrent work by overlapping authors using the same methods. Control repos are matched from the same time period." 79 }, 80 "ablation_study": { 81 "applies": false, 82 "answer": false, 83 "justification": "This is an observational causal inference study, not a system with components to ablate. The AF/IF split is a moderation analysis, not an ablation." 84 }, 85 "multiple_metrics": { 86 "applies": true, 87 "answer": true, 88 "justification": "Six outcome metrics: commits, lines added, static-analysis warnings, cognitive complexity, duplicated-line density, and comment-line density (Table 2, Figure 2)." 89 }, 90 "human_evaluation": { 91 "applies": false, 92 "answer": false, 93 "justification": "This is a repository mining study measuring automated metrics. Human evaluation of outputs is not applicable." 94 }, 95 "held_out_test_set": { 96 "applies": false, 97 "answer": false, 98 "justification": "Not a prediction task. This is a causal inference study estimating treatment effects." 99 }, 100 "per_category_breakdown": { 101 "applies": true, 102 "answer": true, 103 "justification": "Results broken down by AF (agent-first, n=401) vs IF (IDE-first, n=117) repositories, and dynamic monthly effects shown in Figure 2 event-study plots from t=-6 to t=+6." 104 }, 105 "failure_cases_discussed": { 106 "applies": true, 107 "answer": true, 108 "justification": "The paper discusses concerning pre-treatment coefficients in quality metrics: 'We also observe isolated significant pre-treatment coefficients in static-analysis warnings and code complexity... these are concerning and highlight a limitation of our quasi-experimental design.'" 109 }, 110 "negative_results_reported": { 111 "applies": true, 112 "answer": true, 113 "justification": "IF repositories show minimal or no velocity gains (+3.1% commits, -6.3% lines added, both insignificant). Lines added for IF turn negative by t=6 (~-61%). This null/negative finding is prominently reported." 114 } 115 }, 116 "claims_and_evidence": { 117 "abstract_claims_supported": { 118 "applies": true, 119 "answer": true, 120 "justification": "Abstract claims of 'roughly 18% and 39%' for warnings and complexity match Table 2 values (17.73%/19.00% and 34.85%/42.87%). Velocity claims of 'large, front-loaded velocity gains only when agents are the first AI tool' match AF vs IF results." 121 }, 122 "causal_claims_justified": { 123 "applies": true, 124 "answer": true, 125 "justification": "Causal claims use staggered DiD with propensity score matching (AUC 0.92-0.99), the Borusyak et al. imputation estimator, and clustered standard errors. Appropriate quasi-experimental design for causal inference. Authors note intent-to-treat interpretation." 126 }, 127 "generalization_bounded": { 128 "applies": true, 129 "answer": false, 130 "justification": "Title 'Measuring the Impact of Coding Agents on Software Development' generalizes broadly. Sample is limited to open-source GitHub repos with ≥10 stars and ≥10 agentic PRs — a specific subset. The paper does not explicitly bound claims to this population in the abstract or title." 131 }, 132 "alternative_explanations_discussed": { 133 "applies": true, 134 "answer": true, 135 "justification": "Discusses maturity differences between AF/IF repos (Table 1), coordination costs limiting IF throughput, pre-treatment coefficient concerns suggesting unmodeled confounders, and intent-to-treat limitations (cannot measure usage intensity)." 136 }, 137 "proxy_outcome_distinction": { 138 "applies": true, 139 "answer": false, 140 "justification": "The paper measures commits, lines added, static-analysis warnings, cognitive complexity, duplicated-line density, and comment density, then frames these as 'development velocity' and 'software quality.' The gap between these repository-level proxies and actual developer productivity or software quality is not acknowledged. Lines added is a known poor proxy for productivity, and static-analysis warnings are a limited proxy for quality. The paper uses broad terms ('velocity,' 'quality') without discussing the measurement-construct gap." 141 } 142 }, 143 "setup_transparency": { 144 "model_versions_specified": { 145 "applies": false, 146 "answer": false, 147 "justification": "This study does not use LLMs. It studies repositories that adopt coding agents but does not invoke any models itself." 148 }, 149 "prompts_provided": { 150 "applies": false, 151 "answer": false, 152 "justification": "No prompting is used. This is an observational mining study." 153 }, 154 "hyperparameters_reported": { 155 "applies": true, 156 "answer": true, 157 "justification": "Matching parameters reported: max 3 controls per treated repo, ≥10 stars threshold, ≥10 agentic PRs, propensity score AUC 0.92-0.99, max 10,000 candidates subsampled per month, six monthly covariate lags. Borusyak et al. estimator specified." 158 }, 159 "scaffolding_described": { 160 "applies": false, 161 "answer": false, 162 "justification": "No agentic scaffolding used. This is an observational study." 163 }, 164 "data_preprocessing_documented": { 165 "applies": true, 166 "answer": true, 167 "justification": "Detailed pipeline: AIDev dataset → cascading attribution strategy (5 signal types) → filtering (≥10 stars, ≥10 agentic PRs) → propensity score matching → AF/IF partitioning. Final sample sizes stated (401 AF + 606 controls, 117 IF + 73 controls)." 168 } 169 }, 170 "limitations_and_scope": { 171 "limitations_section_present": { 172 "applies": true, 173 "answer": false, 174 "justification": "No dedicated 'Limitations' or 'Threats to Validity' section. Limitations are discussed inline within Results (pre-treatment coefficients) and Methods (intent-to-treat caveat), but there is no substantive dedicated section." 175 }, 176 "threats_to_validity_specific": { 177 "applies": true, 178 "answer": true, 179 "justification": "Specific threats discussed: pre-treatment coefficients in quality metrics suggesting unmodeled confounders, attribution errors attenuating effects toward zero, inability to measure usage intensity or developer-level interactions, left-censoring of adoption dates." 180 }, 181 "scope_boundaries_stated": { 182 "applies": true, 183 "answer": false, 184 "justification": "No explicit statements about what the results do NOT show. The paper notes 'intent-to-treat effects' and 'cannot directly measure usage intensity' but does not systematically state what is out of scope (e.g., private repos, enterprise settings, individual developer impact)." 185 } 186 }, 187 "data_integrity": { 188 "raw_data_available": { 189 "applies": true, 190 "answer": true, 191 "justification": "Replication package at GitHub URL provided. AIDev dataset and GHArchive are both publicly available. The paper also notes corrections to the original dataset in the replication package." 192 }, 193 "data_collection_described": { 194 "applies": true, 195 "answer": true, 196 "justification": "Section 3.1 details data sources (AIDev v3, GHArchive), time period (January 2024–November 2025), attribution cascade strategy, and filtering criteria." 197 }, 198 "recruitment_methods_described": { 199 "applies": false, 200 "answer": false, 201 "justification": "No human participants. Data sources are public datasets (AIDev, GHArchive)." 202 }, 203 "data_pipeline_documented": { 204 "applies": true, 205 "answer": true, 206 "justification": "Full pipeline documented: AIDev dataset → extended attribution taxonomy → retrospective PR parsing → filtering criteria → propensity score matching → AF/IF partitioning. Final sample counts provided at each stage." 207 } 208 }, 209 "conflicts_of_interest": { 210 "funding_disclosed": { 211 "applies": true, 212 "answer": false, 213 "justification": "No funding source or acknowledgments section found in the paper." 214 }, 215 "affiliations_disclosed": { 216 "applies": true, 217 "answer": true, 218 "justification": "All three authors listed as Carnegie Mellon University. No apparent affiliation with companies whose tools are studied (Anthropic, OpenAI, Cursor, etc.)." 219 }, 220 "funder_independent_of_outcome": { 221 "applies": true, 222 "answer": false, 223 "justification": "No funding is disclosed, so independence cannot be assessed. Absence of disclosure is not the same as absence of conflict." 224 }, 225 "financial_interests_declared": { 226 "applies": true, 227 "answer": false, 228 "justification": "No competing interests or financial disclosure statement found in the paper." 229 } 230 }, 231 "contamination": { 232 "training_cutoff_stated": { 233 "applies": false, 234 "answer": false, 235 "justification": "This is a repository mining study. No pre-trained model is evaluated on a benchmark." 236 }, 237 "train_test_overlap_discussed": { 238 "applies": false, 239 "answer": false, 240 "justification": "Not applicable. No model is evaluated on benchmark data." 241 }, 242 "benchmark_contamination_addressed": { 243 "applies": false, 244 "answer": false, 245 "justification": "Not applicable. This is an observational study of repository-level outcomes, not a benchmark evaluation." 246 } 247 }, 248 "human_studies": { 249 "pre_registered": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants. This is a repository mining study." 253 }, 254 "irb_or_ethics_approval": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants." 258 }, 259 "demographics_reported": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants." 263 }, 264 "inclusion_exclusion_criteria": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants. Repository inclusion criteria are documented under data_preprocessing_documented." 268 }, 269 "randomization_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants. This is an observational study." 273 }, 274 "blinding_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants." 278 }, 279 "attrition_reported": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants." 283 } 284 }, 285 "cost_and_practicality": { 286 "inference_cost_reported": { 287 "applies": false, 288 "answer": false, 289 "justification": "Observational study with no system to report inference costs for." 290 }, 291 "compute_budget_stated": { 292 "applies": false, 293 "answer": false, 294 "justification": "Observational study. The computational cost of running SonarQube and DiD estimation is not the focus." 295 } 296 } 297 }, 298 "claims": [ 299 { 300 "claim": "Agent-first repositories experience large velocity gains: +36.3% commits and +76.6% lines added on average post-adoption.", 301 "evidence": "Table 2: AF commits β=0.309*** (SE=0.051), lines added β=0.569*** (SE=0.103). Figure 2 shows dynamic effects peaking at t=0 (+111% commits, +216% lines added).", 302 "supported": "strong" 303 }, 304 { 305 "claim": "IDE-first repositories show minimal velocity gains from agent adoption: +3.1% commits (insignificant) and -6.3% lines added (insignificant).", 306 "evidence": "Table 2: IF commits β=0.030 (SE=0.092), lines added β=-0.066 (SE=0.189). Both estimates are statistically insignificant.", 307 "supported": "strong" 308 }, 309 { 310 "claim": "Static-analysis warnings increase by roughly 18% and cognitive complexity by roughly 39% across both AF and IF repositories post-adoption.", 311 "evidence": "Table 2: Static analysis warnings +17.73% (AF, p<0.001) and +19.00% (IF, insignificant). Cognitive complexity +34.85% (AF, p<0.001) and +42.87% (IF, p<0.01).", 312 "supported": "strong" 313 }, 314 { 315 "claim": "Quality risks are persistent across settings even when velocity advantages fade, indicating sustained agent-induced technical debt.", 316 "evidence": "Figure 2 dynamic effects: complexity remains elevated through t=6 for both AF and IF. Section 4 discusses 'agent-induced complexity debt' even when velocity gains are absent (IF).", 317 "supported": "moderate" 318 }, 319 { 320 "claim": "Prior AI IDE exposure moderates velocity benefits but not quality risks from agent adoption.", 321 "evidence": "AF vs IF comparison in Table 2 and Figure 2. Velocity differs substantially (AF large gains, IF minimal) while quality effects (warnings, complexity) are comparable across both groups.", 322 "supported": "strong" 323 } 324 ], 325 "key_findings": "Autonomous coding agents produce large velocity gains (+36% commits, +77% lines added) only when they are the first AI tool in a repository; projects with prior IDE-based AI assistance see minimal throughput increases. However, quality risks are persistent across both settings: static-analysis warnings rise ~18% and cognitive complexity ~39% regardless of prior AI exposure, indicating sustained agent-induced technical debt. These heterogeneous effects suggest diminishing returns to AI assistance and highlight a speed-maintainability trade-off.", 326 "red_flags": [ 327 { 328 "flag": "Pre-treatment coefficient concerns", 329 "detail": "Authors acknowledge 'isolated significant pre-treatment coefficients in static-analysis warnings and code complexity,' suggesting that parallel trends assumption may not fully hold for quality metrics. This weakens causal claims for quality outcomes." 330 }, 331 { 332 "flag": "Small IF control group", 333 "detail": "The IDE-first analysis has only 117 treated and 73 control repositories (compared to 401+606 for AF), limiting statistical power and generalizability of IF-specific findings." 334 }, 335 { 336 "flag": "No dedicated limitations section", 337 "detail": "For a study making causal claims from observational data, the absence of a dedicated limitations or threats-to-validity section is a notable omission, though some limitations are discussed inline." 338 }, 339 { 340 "flag": "Broad title relative to scope", 341 "detail": "Title claims to measure 'the Impact of Coding Agents on Software Development' but the sample is restricted to open-source GitHub repos with ≥10 stars and ≥10 agentic PRs — a specific and potentially non-representative subset." 342 } 343 ], 344 "cited_papers": [ 345 { 346 "title": "Speed at the Cost of Quality: How Cursor AI Increases Short-Term Velocity and Long-Term Complexity in Open-Source Projects", 347 "authors": [ 348 "Hao He", 349 "Courtney Miller", 350 "Shyam Agarwal", 351 "Christian Kastner", 352 "Bogdan Vasilescu" 353 ], 354 "year": 2026, 355 "relevance": "Directly comparable predecessor study using same DiD methods to assess Cursor AI IDE impact on velocity and quality." 356 }, 357 { 358 "title": "Measuring the Impact of Early-2025 AI on Experienced Open-Source Developer Productivity", 359 "authors": [ 360 "Joel Becker", 361 "Nate Rush", 362 "Elizabeth Barnes", 363 "David Rein" 364 ], 365 "year": 2025, 366 "arxiv_id": "2507.09089", 367 "relevance": "Controlled experiment finding limited productivity benefits of agentic tools for experienced developers." 368 }, 369 { 370 "title": "On the use of agentic coding: An empirical study of pull requests on GitHub", 371 "authors": [ 372 "Miku Watanabe", 373 "Hao Li", 374 "Yutaro Kashiwa", 375 "Brittany Reid", 376 "Hajimu Iida", 377 "Ahmed E Hassan" 378 ], 379 "year": 2025, 380 "relevance": "Empirical study of agentic coding PR acceptance rates on GitHub (83.8% merge rate for Claude Code)." 381 }, 382 { 383 "title": "The Rise of AI Teammates in Software Engineering (SE) 3.0: How Autonomous Coding Agents Are Reshaping Software Engineering", 384 "authors": [ 385 "Hao Li", 386 "Haoxiang Zhang", 387 "Ahmed E. Hassan" 388 ], 389 "year": 2025, 390 "arxiv_id": "2507.15003", 391 "relevance": "Source of the AIDev dataset used in this study; documents rise of autonomous coding agents." 392 }, 393 { 394 "title": "Code with Me or for Me? How Increasing AI Automation Transforms Developer Workflows", 395 "authors": [ 396 "Valerie Chen", 397 "Ameet Talwalkar", 398 "Robert Brennan", 399 "Graham Neubig" 400 ], 401 "year": 2025, 402 "arxiv_id": "2507.08149", 403 "relevance": "Studies how increasing AI automation transforms developer workflows and impacts user experience." 404 }, 405 { 406 "title": "Self-Admitted GenAI Usage in Open-Source Software", 407 "authors": [ 408 "Tao Xiao", 409 "Youmei Fan", 410 "Fabio Calefato", 411 "Christoph Treude", 412 "Raula Gaikovina Kula", 413 "Hideaki Hata", 414 "Sebastian Baltes" 415 ], 416 "year": 2025, 417 "arxiv_id": "2507.10422", 418 "relevance": "Documents self-admitted GenAI usage in open-source, measuring code churn and revision requirements." 419 }, 420 { 421 "title": "How Much Does AI Impact Development Speed? an Enterprise-Based Randomized Controlled Trial", 422 "authors": [ 423 "Elise Paradis", 424 "Kate Grey", 425 "Quinn Madison" 426 ], 427 "year": 2024, 428 "relevance": "Enterprise RCT measuring AI impact on development speed — one of few randomized studies in this space." 429 }, 430 { 431 "title": "AI-assisted Programming May Decrease the Productivity of Experienced Developers by Increasing Maintenance Burden", 432 "authors": [ 433 "Feiyang Xu", 434 "Poonacha K. Medappa" 435 ], 436 "year": 2025, 437 "arxiv_id": "2510.10165", 438 "relevance": "Reports that AI assistance may decrease productivity for experienced developers via maintenance burden." 439 }, 440 { 441 "title": "Asleep at the Keyboard? Assessing the Security of GitHub Copilot's Code Contributions", 442 "authors": [ 443 "Hammond A. Pearce", 444 "Baleegh Ahmad", 445 "Benjamin Tan", 446 "Brendan Dolan-Gavitt", 447 "Ramesh Karri" 448 ], 449 "year": 2021, 450 "relevance": "Early study of security vulnerabilities in AI-generated code, relevant to quality assessment of AI contributions." 451 }, 452 { 453 "title": "Revisiting event study designs: robust and efficient estimation", 454 "authors": [ 455 "Kirill Borusyak", 456 "Xavier Jaravel", 457 "Jann Spiess" 458 ], 459 "year": 2021, 460 "relevance": "Methodological foundation — the imputation-based DiD estimator used as the primary causal inference method." 461 } 462 ], 463 "engagement_factors": { 464 "practical_relevance": { 465 "score": 2, 466 "justification": "Directly actionable finding that teams already using AI IDEs should deploy agents selectively rather than expecting additive productivity gains." 467 }, 468 "surprise_contrarian": { 469 "score": 2, 470 "justification": "The main finding that prior AI IDE usage eliminates velocity gains from agents — suggesting diminishing returns rather than compounding benefits — is counterintuitive to the 'more AI = more productivity' narrative." 471 }, 472 "fear_safety": { 473 "score": 1, 474 "justification": "Raises concerns about persistent technical debt and complexity accumulation from agents, but frames it as maintainability risk rather than safety or security." 475 }, 476 "drama_conflict": { 477 "score": 2, 478 "justification": "Directly challenges the implicit claims of agent tool vendors (Devin, Codex, Claude Code) that autonomous agents deliver sustained productivity gains, showing quality degrades regardless." 479 }, 480 "demo_ability": { 481 "score": 1, 482 "justification": "Replication package is publicly available on GitHub but requires significant setup with SonarQube, GHArchive data, and statistical estimation to reproduce." 483 }, 484 "brand_recognition": { 485 "score": 2, 486 "justification": "Study names and evaluates products from OpenAI (Codex), Anthropic (Claude Code), Cursor, Devin, and GitHub Copilot — all high-profile tools in the current AI coding discourse." 487 } 488 } 489 }