scan-v5.json (27056B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "AI IDEs or Autonomous Agents? Measuring the Impact of Coding Agents on Software Development", 6 "authors": [ 7 "Shyam Agarwal", 8 "Hao He", 9 "Bogdan Vasilescu" 10 ], 11 "year": 2026, 12 "venue": "MSR '26", 13 "arxiv_id": "2601.13597", 14 "doi": "10.1145/3793302.3793589" 15 }, 16 "checklist": { 17 "claims_and_evidence": { 18 "abstract_claims_supported": { 19 "applies": true, 20 "answer": true, 21 "justification": "All abstract claims (velocity gains only for AF repos, quality risks in both, 18-39% complexity/warning increases) are supported by Table 2 and Figure 2 results with appropriate statistical significance.", 22 "source": "haiku" 23 }, 24 "causal_claims_justified": { 25 "applies": true, 26 "answer": true, 27 "justification": "Paper uses staggered difference-in-differences with propensity-score matched controls—appropriate quasi-experimental design for causal inference in observational data. Acknowledges limitations in measuring usage intensity but matching on pre-treatment dynamics strengthens causal interpretation.", 28 "source": "haiku" 29 }, 30 "generalization_bounded": { 31 "applies": true, 32 "answer": true, 33 "justification": "Sample restricted to GitHub repos with ≥10 stars and ≥10 agentic PRs; observations monthly through Nov 2025. Limitations implicitly scoped but title is broad relative to sample. Discussion of 'open-source development' grounds claims appropriately.", 34 "source": "haiku" 35 }, 36 "alternative_explanations_discussed": { 37 "applies": true, 38 "answer": true, 39 "justification": "Paper discusses competing explanations: AF repos harvested 'first AI acceleration'; IF repos face higher coordination/review overhead due to maturity. Pre-treatment imbalance concerns noted: 'isolated significant pre-treatment coefficients... reflecting systematic mean differences.'", 40 "source": "haiku" 41 }, 42 "proxy_outcome_distinction": { 43 "applies": true, 44 "answer": true, 45 "justification": "Paper clearly distinguishes measured metrics (commits, lines added, static-analysis warnings, cognitive complexity) from claims about 'development velocity' and 'software quality.' Terminology is consistent and outcome granularity matches claims.", 46 "source": "haiku" 47 } 48 }, 49 "limitations_and_scope": { 50 "limitations_section_present": { 51 "applies": true, 52 "answer": false, 53 "justification": "No dedicated Limitations or Threats-to-Validity section. Limitations mentioned inline (pre-treatment imbalance, inability to measure usage intensity) but not systematically compiled.", 54 "source": "haiku" 55 }, 56 "threats_to_validity_specific": { 57 "applies": true, 58 "answer": true, 59 "justification": "Multiple specific threats identified: pre-treatment coefficient imbalance in warnings/complexity; left-censoring mitigation (retrospective parsing Jan 2024–Nov 2025); attribution errors 'primarily introduce noise... attenuating effects toward zero'; cannot measure usage intensity.", 60 "source": "haiku" 61 }, 62 "scope_boundaries_stated": { 63 "applies": true, 64 "answer": true, 65 "justification": "Explicit boundaries: ≥10 stars, ≥10 agentic PRs, monthly aggregation, GitHub repos only, Jan 2024–Nov 2025 window, repository-level (not individual developer) analysis. Scope is clear if not exhaustively stated.", 66 "source": "haiku" 67 } 68 }, 69 "conflicts_of_interest": { 70 "funding_disclosed": { 71 "applies": true, 72 "answer": false, 73 "justification": "No funding source disclosed in paper. No Acknowledgments section provided. Funding status unknown.", 74 "source": "haiku" 75 }, 76 "affiliations_disclosed": { 77 "applies": true, 78 "answer": true, 79 "justification": "All three authors from Carnegie Mellon University. No stated financial interest with evaluated tools (OpenAI, Anthropic, Cursor Inc., etc.). Affiliations transparent; no conflicts explicitly disclosed.", 80 "source": "haiku" 81 }, 82 "funder_independent_of_outcome": { 83 "applies": false, 84 "answer": false, 85 "justification": "No funding disclosed; cannot assess funder independence.", 86 "source": "haiku" 87 }, 88 "financial_interests_declared": { 89 "applies": true, 90 "answer": false, 91 "justification": "No competing interests statement or patent/equity disclosures provided.", 92 "source": "haiku" 93 } 94 }, 95 "scope_and_framing": { 96 "key_terms_defined": { 97 "applies": true, 98 "answer": true, 99 "justification": "Key terms precisely defined: 'development velocity' (commits, lines added); 'software quality' (static-analysis warnings, cognitive complexity, duplication, comment density); 'agent-first' vs 'IDE-first'; 'agent adoption' (first agent-generated PR).", 100 "source": "haiku" 101 }, 102 "intended_contribution_clear": { 103 "applies": true, 104 "answer": true, 105 "justification": "Contributions explicitly stated: (1) replicate prior results on broader agent ecosystem; (2) first causal evidence on differential effects of transitioning from IDEs to agents. RQ1–RQ3 clarify research aims.", 106 "source": "haiku" 107 }, 108 "engagement_with_prior_work": { 109 "applies": true, 110 "answer": true, 111 "justification": "Related Work section positions study against prior IDE-based research and early agent studies, showing inconsistencies motivate longitudinal causal evidence. Methodology acknowledged from prior work (Borusyak et al.; He et al. on Cursor).", 112 "source": "haiku" 113 } 114 } 115 }, 116 "type_checklist": { 117 "empirical": { 118 "artifacts": { 119 "code_released": { 120 "applies": true, 121 "answer": true, 122 "justification": "Replication package publicly available at github.com/shyamagarwal13/agentic-coding-impact. Code released.", 123 "source": "haiku" 124 }, 125 "data_released": { 126 "applies": true, 127 "answer": true, 128 "justification": "Built on public AIDev dataset (v3) and GHArchive. Replication package should include processed data or clear access instructions. Raw data is publicly available.", 129 "source": "haiku" 130 }, 131 "environment_specified": { 132 "applies": true, 133 "answer": false, 134 "justification": "No Python version, requirements.txt, Dockerfile, or dependency specifications provided in paper. Environment details presumably in replication package but not in manuscript.", 135 "source": "haiku" 136 }, 137 "reproduction_instructions": { 138 "applies": true, 139 "answer": false, 140 "justification": "No step-by-step reproduction instructions in paper. Replication package referenced but not included. Readers cannot follow codeless instructions from manuscript alone.", 141 "source": "haiku" 142 } 143 }, 144 "statistical_methodology": { 145 "applies": true, 146 "answer_confidence_intervals_or_error_bars": { 147 "applies": true, 148 "answer": true, 149 "justification": "Table 2 reports standard errors for all estimates. Figure 2 displays confidence intervals/error bands around dynamic treatment effects. Variance estimates provided.", 150 "source": "haiku" 151 }, 152 "answer_significance_tests": { 153 "applies": true, 154 "answer": true, 155 "justification": "p-values marked at *, **, *** thresholds (p<0.05, <0.01, <0.001) in Table 2. Significance levels clearly reported for main effects.", 156 "source": "haiku" 157 }, 158 "answer_effect_sizes_reported": { 159 "applies": true, 160 "answer": true, 161 "justification": "Table 2 reports both log-transformed coefficients and percentage change (e.g., 'AF: 76.59% for lines added'). Effect magnitudes are substantive and contextualized.", 162 "source": "haiku" 163 }, 164 "answer_sample_size_justified": { 165 "applies": true, 166 "answer": false, 167 "justification": "Sample sizes given (401 AF + 606 controls; 117 IF + 73 controls) but no power analysis or statistical justification provided. Minimum thresholds (≥10 stars, ≥10 agentic PRs) motivated pragmatically, not statistically.", 168 "source": "haiku" 169 }, 170 "answer_variance_reported": { 171 "applies": true, 172 "answer": true, 173 "justification": "Standard errors in Table 2; confidence intervals in Figure 2. Variance structure visible in all main results.", 174 "source": "haiku" 175 } 176 }, 177 "evaluation_design": { 178 "applies": true, 179 "answer_baselines_included": { 180 "applies": true, 181 "answer": true, 182 "justification": "Control repositories matched on propensity scores; treated vs. control comparison is central. Controls are GitHub repos with ≥10 stars and same primary language.", 183 "source": "haiku" 184 }, 185 "answer_baselines_contemporary": { 186 "applies": true, 187 "answer": true, 188 "justification": "Controls selected from GitHub repos at time of agent adoption (2024–2025). Baselines are contemporary and reflect current development practices.", 189 "source": "haiku" 190 }, 191 "answer_ablation_study": { 192 "applies": true, 193 "answer": false, 194 "justification": "No ablation study. Paper identifies 12 agent types (Claude, Cursor, Devin, etc.) but does not report separate effects per agent or per scaffolding component. Heterogeneous effects by AF/IF are analyzed but not true ablations.", 195 "source": "haiku" 196 }, 197 "answer_multiple_metrics": { 198 "applies": true, 199 "answer": true, 200 "justification": "Six outcomes measured: commits, lines added, static-analysis warnings, cognitive complexity, duplication, comment density. Multiple dimensions of velocity and quality captured.", 201 "source": "haiku" 202 }, 203 "answer_human_evaluation": { 204 "applies": false, 205 "answer": false, 206 "justification": "Observational study of repository-level metrics; no human evaluation of code quality, developer satisfaction, or output properties. Not applicable to this study design.", 207 "source": "haiku" 208 }, 209 "answer_held_out_test_set": { 210 "applies": false, 211 "answer": false, 212 "justification": "Not a prediction task. Causal study using temporal separation (pre/post adoption) as quasi-experimental design. Test set logic does not apply.", 213 "source": "haiku" 214 }, 215 "answer_per_category_breakdown": { 216 "applies": true, 217 "answer": true, 218 "justification": "Results stratified by prior AI exposure (AF vs. IF). Separate analyses for each group. No per-agent or per-language breakdown despite identifying 12 agent types.", 219 "source": "haiku" 220 }, 221 "answer_failure_cases_discussed": { 222 "applies": true, 223 "answer": false, 224 "justification": "Limited discussion of failure modes. Paper notes duplication effects are 'small and inconsistent' and interprets this, but does not show concrete failure cases or negative agent behaviors.", 225 "source": "haiku" 226 }, 227 "answer_negative_results_reported": { 228 "applies": true, 229 "answer": true, 230 "justification": "IF repositories show negative velocity effects by t=6 (lines ~−61%, commits ~−35%). Quality risks universally present regardless of velocity outcome. Negative and null results clearly reported.", 231 "source": "haiku" 232 } 233 }, 234 "setup_transparency": { 235 "applies": true, 236 "answer_model_versions_specified": { 237 "applies": true, 238 "answer": false, 239 "justification": "Agent types identified (Claude, Cursor, Devin, etc.) but exact model versions, snapshot dates, or parameter configurations not specified. Observational study conflates tool versions.", 240 "source": "haiku" 241 }, 242 "answer_prompts_provided": { 243 "applies": false, 244 "answer": false, 245 "justification": "Observational study of real-world tools; no controlled prompts. Not applicable.", 246 "source": "haiku" 247 }, 248 "answer_hyperparameters_reported": { 249 "applies": true, 250 "answer": false, 251 "justification": "Borusyak et al. estimator used but no reported temperature, top-p, sampling strategy for the agents themselves. Matching hyperparameters (AUC 0.92–0.99) noted but not detailed.", 252 "source": "haiku" 253 }, 254 "answer_scaffolding_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "Observational study of real-world agent usage; no control over scaffolding. Paper does not describe agent system instructions, planning strategies, or tool use. Not applicable to this design.", 258 "source": "haiku" 259 }, 260 "answer_data_preprocessing_documented": { 261 "applies": true, 262 "answer": true, 263 "justification": "Preprocessing steps documented: propensity score matching, covariate selection (age, 6-month lags, cumulative history), exclusion criteria (≥10 stars, ≥10 PRs), AF/IF inference, language matching.", 264 "source": "haiku" 265 } 266 }, 267 "data_integrity": { 268 "applies": true, 269 "answer_raw_data_available": { 270 "applies": true, 271 "answer": true, 272 "justification": "AIDev dataset (v3) is public. GHArchive is public. GitHub data is public. Replication package references should enable raw data access.", 273 "source": "haiku" 274 }, 275 "answer_data_collection_described": { 276 "applies": true, 277 "answer": true, 278 "justification": "Data collection clearly described: retrospective parsing of PRs Jan 2024–Nov 2025 from AIDev; agent attribution via cascading signals (branch prefix, author login, bot type); monthly repository activity from GHArchive.", 279 "source": "haiku" 280 }, 281 "answer_recruitment_methods_described": { 282 "applies": false, 283 "answer": true, 284 "justification": "Public GitHub repositories; no recruitment needed. Applicable = false but answer = true (N/A satisfied).", 285 "source": "haiku" 286 }, 287 "answer_data_pipeline_documented": { 288 "applies": true, 289 "answer": true, 290 "justification": "Pipeline: AIDev dataset → cascading agent attribution → propensity score matching → DiD estimation (Borusyak et al.) → monthly outcomes. Steps described; some implementation details in replication package.", 291 "source": "haiku" 292 } 293 }, 294 "contamination": { 295 "applies": false, 296 "answer_training_cutoff_stated": { 297 "applies": false, 298 "answer": false, 299 "justification": "Not evaluating model capabilities on benchmarks. Study measures repository-level effects, not model generalization. N/A.", 300 "source": "haiku" 301 }, 302 "answer_train_test_overlap_discussed": { 303 "applies": false, 304 "answer": false, 305 "justification": "N/A.", 306 "source": "haiku" 307 }, 308 "answer_benchmark_contamination_addressed": { 309 "applies": false, 310 "answer": false, 311 "justification": "N/A.", 312 "source": "haiku" 313 } 314 }, 315 "human_studies": { 316 "applies": false, 317 "answer_pre_registered": { 318 "applies": false, 319 "answer": false, 320 "justification": "No human participants. N/A.", 321 "source": "haiku" 322 }, 323 "answer_irb_or_ethics_approval": { 324 "applies": false, 325 "answer": false, 326 "justification": "No human participants. N/A.", 327 "source": "haiku" 328 }, 329 "answer_demographics_reported": { 330 "applies": false, 331 "answer": false, 332 "justification": "No human participants. N/A.", 333 "source": "haiku" 334 }, 335 "answer_inclusion_exclusion_criteria": { 336 "applies": false, 337 "answer": false, 338 "justification": "No human participants. N/A.", 339 "source": "haiku" 340 }, 341 "answer_randomization_described": { 342 "applies": false, 343 "answer": false, 344 "justification": "No human participants. N/A.", 345 "source": "haiku" 346 }, 347 "answer_blinding_described": { 348 "applies": false, 349 "answer": false, 350 "justification": "No human participants. N/A.", 351 "source": "haiku" 352 }, 353 "answer_attrition_reported": { 354 "applies": false, 355 "answer": false, 356 "justification": "No human participants. N/A.", 357 "source": "haiku" 358 } 359 }, 360 "cost_and_practicality": { 361 "applies": true, 362 "answer_inference_cost_reported": { 363 "applies": true, 364 "answer": false, 365 "justification": "No inference cost, latency, or computational budget reported for agent runs. Study focuses on repository-level outcomes, not cost analysis.", 366 "source": "haiku" 367 }, 368 "answer_compute_budget_stated": { 369 "applies": true, 370 "answer": false, 371 "justification": "No total computational budget for scanning 129K+ repos, running propensity models (AUC 0.92–0.99), or DiD estimation reported.", 372 "source": "haiku" 373 } 374 } 375 } 376 }, 377 "claims": [ 378 { 379 "claim": "Agentic tools substantially accelerate development velocity only when introduced as a repository's first observable AI tool", 380 "evidence": "Table 2: AF repos show +36.3% commits, +76.6% lines added. IF repos show +3.1% commits, −6.3% lines added. Figure 2 shows AF sustained gains through t=6; IF spike then decline.", 381 "supported": "strong" 382 }, 383 { 384 "claim": "Quality risks are persistent across settings, with static-analysis warnings and cognitive complexity rising by roughly 18% and 39%", 385 "evidence": "Table 2: Static Analysis Warnings +17.7% (AF), +19.0% (IF). Code Complexity +34.9% (AF), +42.9% (IF). Figure 2 shows persistent positive trajectory for both outcomes.", 386 "supported": "strong" 387 }, 388 { 389 "claim": "Repositories with prior IDE-based AI assistance experience minimal or short-lived throughput increases from agent adoption", 390 "evidence": "Table 2: IF repos −6.3% lines added on average. Figure 2 shows IF spike at t=0–2 then return to near-zero and negative by t=6.", 391 "supported": "strong" 392 }, 393 { 394 "claim": "Increased complexity and warnings persist even when net velocity gains are weak or negative, indicating agent-induced technical debt", 395 "evidence": "IF repos show negative velocity effects (lines ~−61% by t=6) but sustained complexity increase (~+15–+62%). AF repos maintain both velocity and complexity gains, but quality risks do not reverse.", 396 "supported": "strong" 397 }, 398 { 399 "claim": "Teams already using AI IDEs may rely on agents for documentation as well as code", 400 "evidence": "IF repos show +22% average comment density increase; AF repos show muted (+4.3%) effects. Suggests different tool use patterns.", 401 "supported": "moderate" 402 }, 403 { 404 "claim": "Agentic tools act as high-throughput contributors primarily in new-to-AI workflows but yield diminishing returns in AI-saturated ones", 405 "evidence": "Heterogeneous effects: AF vs IF stratification directly supports this claim. AF harvests 'first AI acceleration'; IF faces higher coordination costs and review overhead.", 406 "supported": "moderate" 407 } 408 ], 409 "methodology_tags": [ 410 "observational", 411 "causal-inference", 412 "longitudinal", 413 "difference-in-differences", 414 "matching" 415 ], 416 "key_findings": "Using a quasi-experimental difference-in-differences design with propensity-score matched controls, the paper finds that autonomous coding agents produce heterogeneous effects contingent on prior AI exposure. Repositories without prior IDE-based AI usage (agent-first) experience large sustained velocity gains (+76.6% lines added) that persist for 6+ months, while repositories with prior IDE adoption (IDE-first) show minimal throughput increases that fade by t=6. Critically, both groups experience persistent increases in technical debt regardless of velocity outcomes: static-analysis warnings rise ~18–19% and cognitive complexity increases ~35–43%. The results suggest autonomous agents function as powerful but risky accelerators whose net value depends on context, with quality safeguards essential to prevent long-term maintainability problems.", 417 "red_flags": [ 418 { 419 "flag": "No funding disclosure", 420 "detail": "Missing funding source and competing interests statement. Unclear if CMU funding or industry sponsorship influenced study design or reporting." 421 }, 422 { 423 "flag": "Environment specifications absent", 424 "detail": "No Python version, requirements.txt, Dockerfile, or dependency list in paper. Replication claims rely on GitHub package but reproducibility from paper alone is impossible." 425 }, 426 { 427 "flag": "Pre-treatment imbalance", 428 "detail": "Authors acknowledge: 'isolated significant pre-treatment coefficients in static-analysis warnings and code complexity... suggesting untreated potential outcomes not fully captured.' Indicates matching did not fully balance groups; potential bias toward finding complexity increases." 429 }, 430 { 431 "flag": "No sample size justification", 432 "detail": "Sample sizes provided (401 AF, 117 IF treated repos) but no power analysis. Minimum thresholds (≥10 stars, ≥10 agentic PRs) motivated pragmatically, not statistically." 433 }, 434 { 435 "flag": "Agent versions not documented", 436 "detail": "Paper identifies 12 agent types but does not specify model versions, release dates, or parameter configurations. Observational study conflates heterogeneous tools without ablation." 437 }, 438 { 439 "flag": "No per-agent breakdown", 440 "detail": "Despite identifying Claude, Cursor, Devin, Copilot, etc., results are not stratified by tool. Aggregated effects may mask tool-specific benefits or harms." 441 }, 442 { 443 "flag": "Observation window short", 444 "detail": "Study covers Jan 2024–Nov 2025; agent adoption cluster (May–July 2025) means post-adoption follow-up is <6 months. Long-term technical debt trajectory unknown." 445 }, 446 { 447 "flag": "Pre-treatment trends in quality metrics", 448 "detail": "Figure 2 shows non-zero coefficients at t=−6 to t=−1 for complexity/warnings in some strata, suggesting parallel trends assumption may be violated." 449 } 450 ], 451 "cited_papers": [ 452 { 453 "title": "Revisiting event study designs: robust and efficient estimation", 454 "authors": "Borusyak, Jaravel, Spiess", 455 "year": 2021, 456 "relevance": "Methodological foundation: imputation-based DiD estimator used for causal inference under staggered adoption." 457 }, 458 { 459 "title": "Speed at the Cost of Quality: How Cursor AI Increases Short-Term Velocity and Long-Term Complexity in Open-Source Projects", 460 "authors": "He, Miller, Agarwal, Kastner, Vasilescu", 461 "year": 2026, 462 "relevance": "Prior work on same research question for Cursor IDE; methodology and findings replicated/extended to broader agent ecosystem." 463 }, 464 { 465 "title": "The Rise of AI Teammates in Software Engineering (SE) 3.0: How Autonomous Coding Agents Are Reshaping Software Engineering", 466 "authors": "Li, Zhang, Hassan", 467 "year": 2025, 468 "relevance": "Related survey/overview of agentic coding adoption and impacts." 469 }, 470 { 471 "title": "On the use of agentic coding: An empirical study of pull requests on GitHub", 472 "authors": "Watanabe et al.", 473 "year": 2025, 474 "relevance": "Parallel empirical work on agent-generated PRs; complementary evidence on agentic contribution patterns." 475 }, 476 { 477 "title": "How Much Does AI Impact Development Speed? an Enterprise-Based Randomized Controlled Trial", 478 "authors": "Paradis et al.", 479 "year": 2024, 480 "relevance": "Prior RCT on Copilot productivity impacts; contrasts with observational design and open-source context here." 481 }, 482 { 483 "title": "Measuring the Impact of Early-2025 AI on Experienced Open-Source Developer Productivity", 484 "authors": "Becker, Rush, Barnes, Rein", 485 "year": 2025, 486 "relevance": "Controlled study of agent impacts on experienced developers; complements large-scale longitudinal findings." 487 } 488 ], 489 "engagement_factors": { 490 "practical_relevance": { 491 "score": 2, 492 "justification": "Findings directly inform adoption decisions (agent-first vs. IDE-first strategies) and quality safeguard requirements. However, study is specific to open-source GitHub; applicability to enterprise, proprietary codebases, and team dynamics unclear." 493 }, 494 "surprise_contrarian": { 495 "score": 2, 496 "justification": "Heterogeneous effects (agent benefits only as first-to-AI) and persistent quality costs despite velocity gains challenge uncritical enthusiasm. Speed-maintainability tradeoff is somewhat expected but data quantifying it is novel." 497 }, 498 "fear_safety": { 499 "score": 1, 500 "justification": "Raises concerns about long-term technical debt and maintainability burdens. Paper notes ethical considerations and need for oversight but does not emphasize AI risk per se; quality-focused rather than safety-focused." 501 }, 502 "demo_ability": { 503 "score": 0, 504 "justification": "Observational study with no interactive demo or hands-on artifact. Findings require building tools and analyzing massive GitHub datasets; not reproducible by individual practitioners without significant infrastructure." 505 }, 506 "drama_conflict": { 507 "score": 2, 508 "justification": "Implicit critique of uncritical agent adoption and hype. Finding that agents may not accelerate already-AI-rich teams and create technical debt challenges narratives but is not sensationalized or controversial by design." 509 }, 510 "brand_recognition": { 511 "score": 2, 512 "justification": "All authors from Carnegie Mellon University (respected institution). Published at MSR '26 (top-tier venue for software engineering empirical work). Rigorous methodology and large-scale dataset provide credibility. Not from FAANG or leading AI lab." 513 } 514 }, 515 "hn_data": { 516 "threads": [], 517 "top_points": 0, 518 "total_points": 0, 519 "total_comments": 0 520 } 521 }