scan.json (25335B)
1 { 2 "scan_version": 3, 3 "active_modules": [], 4 "paper": { 5 "title": "Speed at the Cost of Quality: How Cursor AI Increases Short-Term Velocity and Long-Term Complexity in Open-Source Projects", 6 "authors": [ 7 "Hao He", 8 "Courtney Miller", 9 "Shyam Agarwal", 10 "Christian Kästner", 11 "Bogdan Vasilescu" 12 ], 13 "year": 2026, 14 "venue": "MSR '26", 15 "arxiv_id": "2511.04427", 16 "doi": "10.1145/3793302.3793349" 17 }, 18 "methodology_tags": [ 19 "observational" 20 ], 21 "checklist": { 22 "artifacts": { 23 "code_released": { 24 "applies": true, 25 "answer": true, 26 "justification": "Replication package provided at https://doi.org/10.5281/zenodo.18368661 (Data Availability section)." 27 }, 28 "data_released": { 29 "applies": true, 30 "answer": true, 31 "justification": "Replication package at Zenodo includes the dataset. The paper states 'We provide a replication package for this paper.'" 32 }, 33 "environment_specified": { 34 "applies": true, 35 "answer": false, 36 "justification": "No mention of requirements.txt, Dockerfile, or detailed environment setup. The paper mentions SonarQube Community server and BigQuery but does not specify software versions or dependencies for reproduction." 37 }, 38 "reproduction_instructions": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper provides a replication package link but does not include step-by-step reproduction instructions in the paper itself. Whether the Zenodo archive contains them is not stated." 42 } 43 }, 44 "statistical_methodology": { 45 "confidence_intervals_or_error_bars": { 46 "applies": true, 47 "answer": true, 48 "justification": "Standard errors reported in Tables 2 and 3. Figure 3 shows confidence intervals around treatment effect estimates. Table 2 includes ± percentage change intervals." 49 }, 50 "significance_tests": { 51 "applies": true, 52 "answer": true, 53 "justification": "Statistical significance reported throughout with p-value thresholds (p<0.05, p<0.01, p<0.001). 
Pre-trend Wald tests, Sargan tests, and AR(2) tests all reported with p-values." 54 }, 55 "effect_sizes_reported": { 56 "applies": true, 57 "answer": true, 58 "justification": "Effect sizes reported as percentage changes with baseline context (e.g., '28.58% increase in lines added', '41.64% increase in code complexity'). Table 2 provides log-transformed estimates and their percentage-change interpretations." 59 }, 60 "sample_size_justified": { 61 "applies": true, 62 "answer": false, 63 "justification": "The sample of 806 treated repositories is determined by data availability (all qualifying repos found via GitHub search), not by power analysis. No power analysis or sample size justification is discussed." 64 }, 65 "variance_reported": { 66 "applies": true, 67 "answer": true, 68 "justification": "Standard errors reported in parentheses for all regression estimates in Tables 2 and 3. Confidence bands shown in event study plots (Figures 3 and 4)." 69 } 70 }, 71 "evaluation_design": { 72 "baselines_included": { 73 "applies": true, 74 "answer": true, 75 "justification": "The matched control group of 1,380 never-adopter repositories serves as the baseline comparison, constructed via propensity score matching (Section 3.1.3)." 76 }, 77 "baselines_contemporary": { 78 "applies": true, 79 "answer": true, 80 "justification": "Control repositories are contemporaneous — matched from the same time period and observed over the same January 2024 to August 2025 window." 81 }, 82 "ablation_study": { 83 "applies": true, 84 "answer": true, 85 "justification": "Multiple robustness checks function as ablations: high contributor adoption subset, cursor configuration changes subset, activity-level subsets, and other-AI-tools subsets (Section 4.3, Figure 4)." 86 }, 87 "multiple_metrics": { 88 "applies": true, 89 "answer": true, 90 "justification": "Five outcome metrics: commits, lines added, static analysis warnings, duplicate line density, and code complexity (Section 3.2.1)." 
91 }, 92 "human_evaluation": { 93 "applies": false, 94 "answer": false, 95 "justification": "This is a large-scale observational mining study of repository metrics. Human evaluation of code quality outputs is not applicable." 96 }, 97 "held_out_test_set": { 98 "applies": false, 99 "answer": false, 100 "justification": "This is a causal inference study using observational data, not a prediction task. No train/test split is applicable." 101 }, 102 "per_category_breakdown": { 103 "applies": true, 104 "answer": true, 105 "justification": "Results broken down by outcome variable (5 metrics), by time horizon (Figures 3, 4), and by subgroup (high adoption, active repos, other AI tools). Table 1 provides descriptive statistics." 106 }, 107 "failure_cases_discussed": { 108 "applies": true, 109 "answer": true, 110 "justification": "The paper discusses where effects are weak or absent (e.g., no significant effect on commits, no significant effect on duplicate line density). Section 5.1 discusses the transient nature of velocity gains as a 'failure' of sustained benefit." 111 }, 112 "negative_results_reported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Negative results reported: velocity gains in lines added are not sustained, the main effect on commits is insignificant, the duplicate line density effect is insignificant (Table 2), and the velocity increase in lines added does not significantly increase warnings in the GMM model (Table 3)." 116 } 117 }, 118 "claims_and_evidence": { 119 "abstract_claims_supported": { 120 "applies": true, 121 "answer": true, 122 "justification": "Abstract claims of 'statistically significant, large, but transient increase in velocity' and 'substantial and persistent increase in static analysis warnings and code complexity' are supported by Tables 2-3 and Figures 3-4." 
123 }, 124 "causal_claims_justified": { 125 "applies": true, 126 "answer": true, 127 "justification": "Causal claims justified via difference-in-differences design with staggered adoption (Borusyak et al. estimator), propensity score matching, pre-trend tests, and panel GMM with instrumental variables. Multiple robustness checks address alternative explanations (Section 4.3)." 128 }, 129 "generalization_bounded": { 130 "applies": true, 131 "answer": true, 132 "justification": "Section 3.5.2 explicitly bounds generalization: 'Our results may not generalize to other LLM agent assistants, proprietary software projects, and programming languages beyond the three dominant ones.' Section 5.1.3 discusses open-source-specific contextual factors." 133 }, 134 "alternative_explanations_discussed": { 135 "applies": true, 136 "answer": true, 137 "justification": "Multiple alternative explanations discussed: excitement-frustration-abandonment cycle (Section 5.1.1), inactive repositories driving transient gains (Section 4.3 Row 2), contamination from other AI tools (Section 4.3 Row 3), non-compliance (Section 4.3 Row 1). Each is tested with robustness checks." 138 }, 139 "proxy_outcome_distinction": { 140 "applies": true, 141 "answer": true, 142 "justification": "The paper explicitly frames its measurements as proxies: 'development velocity' is operationalized as commits and lines added, 'software quality' as static-analysis warnings, cognitive complexity, and duplicate density. Figure 1 shows a theory diagram with labeled causal arrows. The limitations section acknowledges these are imperfect proxies, and the title itself ('Speed at the Cost of Quality') uses shorthand that the body unpacks with specific metrics." 143 } 144 }, 145 "setup_transparency": { 146 "model_versions_specified": { 147 "applies": false, 148 "answer": false, 149 "justification": "The study does not use LLMs as part of its methodology. It studies the effect of Cursor adoption by others. 
The paper acknowledges it lacks information on which Cursor version or LLM backend each repository used (Section 3.5.1)." 150 }, 151 "prompts_provided": { 152 "applies": false, 153 "answer": false, 154 "justification": "No prompting is used in the study methodology." 155 }, 156 "hyperparameters_reported": { 157 "applies": true, 158 "answer": true, 159 "justification": "Key modeling parameters reported: 1:3 nearest-neighbor matching ratio, propensity score model specification (Equation 1), 6-month lag structure, 10-star threshold, lags 2-3 as instruments for GMM, AUC values 0.83-0.91 for propensity models." 160 }, 161 "scaffolding_described": { 162 "applies": false, 163 "answer": false, 164 "justification": "No agentic scaffolding used in the study methodology." 165 }, 166 "data_preprocessing_documented": { 167 "applies": true, 168 "answer": true, 169 "justification": "Data pipeline documented: GitHub code search API with adaptive partitioning (Section 3.1.2), 10-star filter yielding 806 repos, propensity score matching yielding 1,380 controls (Section 3.1.3), monthly metric collection from GHArchive and SonarQube (Section 3.2), multicollinearity check removing issue comments." 170 } 171 }, 172 "limitations_and_scope": { 173 "limitations_section_present": { 174 "applies": true, 175 "answer": true, 176 "justification": "Section 3.5 'Limitations and Threats to Validity' provides extensive discussion of internal validity (Section 3.5.1) and external validity (Section 3.5.2)." 177 }, 178 "threats_to_validity_specific": { 179 "applies": true, 180 "answer": true, 181 "justification": "Specific threats discussed: observable adoption proxy (config files don't capture all users), usage intensity uncertainty, model/version heterogeneity, imperfect matching with unobserved confounders, contamination from other AI tools. Each is study-specific, not generic." 
182 }, 183 "scope_boundaries_stated": { 184 "applies": true, 185 "answer": true, 186 "justification": "Explicit scope boundaries: estimates are ITT effects, results reflect Cursor adoption vs. state-of-practice (not vs. no AI), may not generalize to enterprise, other tools, or other languages. Section 3.5.1 states estimates 'should be interpreted as the impact of systematic Cursor adoption compared to the current state-of-the-practice.'" 187 } 188 }, 189 "data_integrity": { 190 "raw_data_available": { 191 "applies": true, 192 "answer": true, 193 "justification": "Replication package at Zenodo (doi.org/10.5281/zenodo.18368661). Data sourced from public GHArchive and GitHub API, which are independently verifiable." 194 }, 195 "data_collection_described": { 196 "applies": true, 197 "answer": true, 198 "justification": "Data collection described in detail: GitHub code search API with adaptive partitioning algorithm for .cursorrules files (Section 3.1.2), GHArchive for time series (Section 3.1.3), SonarQube for code quality metrics (Section 3.2.1). Time period: January 2024 to August 2025." 199 }, 200 "recruitment_methods_described": { 201 "applies": false, 202 "answer": false, 203 "justification": "No human participants. Data comes from mining public GitHub repositories, so participant recruitment is not applicable." 204 }, 205 "data_pipeline_documented": { 206 "applies": true, 207 "answer": true, 208 "justification": "Pipeline documented: 23,308 Cursor files across 3,306 non-fork repos → 10-star filter → 806 repos (Section 3.1.2). Control group: population of all ≥10 star repos → 10,000 candidates per month → propensity score matching → 1,380 controls (Section 3.1.3). Filtering counts provided at each stage." 
209 } 210 }, 211 "conflicts_of_interest": { 212 "funding_disclosed": { 213 "applies": true, 214 "answer": true, 215 "justification": "Acknowledgments section lists NSF awards (2206859, DGE214073, 2317168, 2120323), research awards from Google and the Digital Infrastructure Fund, and Google Cloud research credits." 216 }, 217 "affiliations_disclosed": { 218 "applies": true, 219 "answer": true, 220 "justification": "All authors affiliated with Carnegie Mellon University, clearly stated. No affiliation with Cursor/Anysphere." 221 }, 222 "funder_independent_of_outcome": { 223 "applies": true, 224 "answer": true, 225 "justification": "Funded by NSF and research awards. Google provided credits for BigQuery analysis. Neither NSF nor Google has a direct stake in whether Cursor improves or degrades code quality. The study evaluates a third-party product (Cursor)." 226 }, 227 "financial_interests_declared": { 228 "applies": true, 229 "answer": false, 230 "justification": "No competing interests or financial interests statement found in the paper." 231 } 232 }, 233 "contamination": { 234 "training_cutoff_stated": { 235 "applies": false, 236 "answer": false, 237 "justification": "The study does not evaluate a pre-trained model's capability on any benchmark. It is a mining study measuring the impact of tool adoption on repository-level metrics." 238 }, 239 "train_test_overlap_discussed": { 240 "applies": false, 241 "answer": false, 242 "justification": "Not a benchmark evaluation study. No model is being tested on a benchmark." 243 }, 244 "benchmark_contamination_addressed": { 245 "applies": false, 246 "answer": false, 247 "justification": "Not a benchmark evaluation study." 248 } 249 }, 250 "human_studies": { 251 "pre_registered": { 252 "applies": false, 253 "answer": false, 254 "justification": "No human participants. This is a repository mining study." 
255 }, 256 "irb_or_ethics_approval": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human participants." 260 }, 261 "demographics_reported": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants." 265 }, 266 "inclusion_exclusion_criteria": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants. Repository inclusion criteria are documented in data_preprocessing_documented." 270 }, 271 "randomization_described": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants." 275 }, 276 "blinding_described": { 277 "applies": false, 278 "answer": false, 279 "justification": "No human participants." 280 }, 281 "attrition_reported": { 282 "applies": false, 283 "answer": false, 284 "justification": "No human participants." 285 } 286 }, 287 "cost_and_practicality": { 288 "inference_cost_reported": { 289 "applies": false, 290 "answer": false, 291 "justification": "The study does not propose a method with inference costs. It is an empirical observational study." 292 }, 293 "compute_budget_stated": { 294 "applies": true, 295 "answer": false, 296 "justification": "Google Cloud credits mentioned for BigQuery analysis and SonarQube used locally, but no quantification of total compute budget (e.g., how long SonarQube analysis took across all repos, BigQuery costs)." 297 } 298 } 299 }, 300 "claims": [ 301 { 302 "claim": "Cursor adoption leads to a statistically significant but transient increase in development velocity, with 281.3% increase in lines added in the first month that dissipates after two months.", 303 "evidence": "Table 2 shows 28.58% average increase in lines added (p<0.001). 
Figure 3 event study plots show significant effects only in months 0-1 post-adoption, with non-significant effects from month 2 onward.", 304 "supported": "strong" 305 }, 306 { 307 "claim": "Cursor adoption leads to persistent increases in static analysis warnings (30.3%) and code complexity (41.6%).", 308 "evidence": "Table 2 reports ATT estimates with p<0.001 for both metrics. Figure 3 shows sustained significant effects across months 0-6 post-adoption for both metrics.", 309 "supported": "strong" 310 }, 311 { 312 "claim": "Code complexity increases even after controlling for velocity dynamics, indicating Cursor generates inherently more complex code.", 313 "evidence": "Table 3 GMM model shows Cursor has significant coefficient (0.086, p<0.001) for code complexity after controlling for lines added, lines of code, and temporal dynamics.", 314 "supported": "strong" 315 }, 316 { 317 "claim": "Accumulated technical debt (static analysis warnings and code complexity) decreases future development velocity, creating a self-reinforcing cycle.", 318 "evidence": "Table 3 GMM models show 100% increase in code complexity causes 64.5% decrease in lines added, and 100% increase in warnings causes 50.3% decrease. Sargan and AR(2) tests validate the instrument conditions.", 319 "supported": "strong" 320 }, 321 { 322 "claim": "Effects are amplified in repositories with higher sustained Cursor usage and attenuated in repositories using multiple AI tools.", 323 "evidence": "Figure 4 robustness checks show stronger quality degradation in high-contributor-adoption and cursor-configuration-changes subsets, and slightly weaker effects when other AI tools are present.", 324 "supported": "moderate" 325 } 326 ], 327 "key_findings": "Cursor adoption produces a large but transient velocity increase (281% more lines added in month 1, dissipating by month 2) alongside persistent quality degradation (30% more static analysis warnings, 42% more code complexity). 
Panel GMM analysis shows code complexity increases independently of velocity effects, and accumulated technical debt reduces future velocity, creating a self-reinforcing cycle. Robustness checks across adoption intensity, activity levels, and AI tool contamination confirm the findings.", 328 "red_flags": [ 329 { 330 "flag": "Proxy-based treatment identification", 331 "detail": "Cursor adoption is identified via .cursorrules files in git history, which is an imperfect proxy. Developers can use Cursor without committing config files, and committing config files doesn't guarantee sustained use. The authors acknowledge this is an ITT estimate." 332 }, 333 { 334 "flag": "Quality metrics may not capture AI-specific patterns", 335 "detail": "The authors themselves note (Section 6) that complexity metrics designed for human-written code may not appropriately penalize AI-generated patterns. SonarQube cognitive complexity may flag structurally valid but verbose AI code differently than genuinely problematic complexity." 336 } 337 ], 338 "cited_papers": [ 339 { 340 "title": "Measuring the Impact of Early-2025 AI on Experienced Open-Source Developer Productivity", 341 "authors": [ 342 "Joel Becker", 343 "Nate Rush", 344 "Elizabeth Barnes", 345 "David Rein" 346 ], 347 "year": 2025, 348 "arxiv_id": "2507.09089", 349 "relevance": "RCT finding that AI tools including Cursor do not help experienced OSS developers solve real tasks faster, contradicting self-reported productivity claims." 350 }, 351 { 352 "title": "The Impact of AI on Developer Productivity: Evidence from GitHub Copilot", 353 "authors": [ 354 "Sida Peng", 355 "Eirini Kalliamvakou", 356 "Peter Cihon", 357 "Mert Demirer" 358 ], 359 "year": 2023, 360 "arxiv_id": "2302.06590", 361 "relevance": "Foundational RCT measuring Copilot's productivity impact (56% task completion improvement), a key comparison for this study's findings." 
362 }, 363 { 364 "title": "On the use of agentic coding: An empirical study of pull requests on GitHub", 365 "authors": [ 366 "Miku Watanabe", 367 "Hao Li", 368 "Yutaro Kashiwa", 369 "Brittany Reid", 370 "Hajimu Iida", 371 "Ahmed E Hassan" 372 ], 373 "year": 2025, 374 "arxiv_id": "2509.14745", 375 "relevance": "Empirical study of Claude Code PRs on GitHub finding 83.8% acceptance rate, complementary to this study's project-level analysis." 376 }, 377 { 378 "title": "The effects of generative AI on high skilled work: Evidence from three field experiments with software developers", 379 "authors": [ 380 "Zheyuan Kevin Cui", 381 "Mert Demirer", 382 "Sonia Jaffe", 383 "Leon Musolff", 384 "Sida Peng", 385 "Tobias Salz" 386 ], 387 "year": 2024, 388 "relevance": "Field experiments at Microsoft/Accenture/Cisco on AI coding productivity (22-36% improvements), key enterprise comparison." 389 }, 390 { 391 "title": "How much does AI impact development speed? An enterprise-based randomized controlled trial", 392 "authors": [ 393 "Elise Paradis", 394 "Kate Grey", 395 "Quinn Madison" 396 ], 397 "year": 2025, 398 "relevance": "Enterprise RCT on AI coding tool impact, providing enterprise context that this OSS study lacks." 399 }, 400 { 401 "title": "Intuition to Evidence: Measuring AI's True Impact on Developer Productivity", 402 "authors": [ 403 "Anand Kumar" 404 ], 405 "year": 2025, 406 "arxiv_id": "2509.19708", 407 "relevance": "Enterprise study indicating different adoption patterns than open-source, relevant context for generalization." 408 }, 409 { 410 "title": "Who is using AI to code? Global diffusion and impact of generative AI", 411 "authors": [ 412 "Simone Daniotti", 413 "Johannes Wachs", 414 "Xiangnan Feng", 415 "Frank Neffke" 416 ], 417 "year": 2025, 418 "arxiv_id": "2506.08945", 419 "relevance": "Large-scale study using neural classifier to identify AI code on GitHub, finding 30% AI use raises commits by 2.4%." 
420 }, 421 { 422 "title": "The Impact of Generative AI on Collaborative Open-Source Software Development: Evidence from GitHub Copilot", 423 "authors": [ 424 "Fangchen Song", 425 "Ashish Agarwal", 426 "Wen Wen" 427 ], 428 "year": 2024, 429 "arxiv_id": "2410.02091", 430 "relevance": "DiD study of Copilot finding only 6.5% project-level productivity increase from proprietary backend data." 431 }, 432 { 433 "title": "Asleep at the Keyboard? Assessing the Security of GitHub Copilot's Code Contributions", 434 "authors": [ 435 "Hammond Pearce", 436 "Baleegh Ahmad", 437 "Benjamin Tan", 438 "Brendan Dolan-Gavitt", 439 "Ramesh Karri" 440 ], 441 "year": 2022, 442 "relevance": "Early study on security vulnerabilities in Copilot-generated code, prior generation to the agentic tools studied here." 443 }, 444 { 445 "title": "Code with Me or for Me? How Increasing AI Automation Transforms Developer Workflows", 446 "authors": [ 447 "Valerie Chen", 448 "Ameet Talwalkar", 449 "Robert Brennan", 450 "Graham Neubig" 451 ], 452 "year": 2025, 453 "arxiv_id": "2507.08149", 454 "relevance": "Qualitative research on developer challenges with AI coding, supports the excitement-frustration-abandonment interpretation." 455 }, 456 { 457 "title": "AI agents, productivity, and higher-order thinking: Early evidence from software development", 458 "authors": [ 459 "Suproteem K Sarkar" 460 ], 461 "year": 2025, 462 "relevance": "Early evidence on agentic AI productivity in software development, directly relevant to agentic tool impact." 463 }, 464 { 465 "title": "Generative AI and the Nature of Work", 466 "authors": [ 467 "Manuel Hoffmann", 468 "Sam Boysel", 469 "Frank Nagle", 470 "Sida Peng", 471 "Kevin Xu" 472 ], 473 "year": 2024, 474 "relevance": "Study on how LLM adoption increases work autonomy, a proposed mechanism for productivity increases." 
475 } 476 ], 477 "engagement_factors": { 478 "practical_relevance": { 479 "score": 2, 480 "justification": "Directly informs teams adopting AI coding tools about quality tradeoffs and suggests concrete process adaptations like scaling QA with velocity." 481 }, 482 "surprise_contrarian": { 483 "score": 3, 484 "justification": "Directly contradicts the widely-held '10x productivity' narrative around Cursor/AI coding tools, showing velocity gains vanish after two months while technical debt persists." 485 }, 486 "fear_safety": { 487 "score": 1, 488 "justification": "Raises concerns about code quality degradation and security warnings increasing, but safety/risk is secondary to the productivity narrative." 489 }, 490 "drama_conflict": { 491 "score": 3, 492 "justification": "Directly challenges Cursor's productivity claims and the broader AI coding hype with empirical evidence of a self-reinforcing technical debt cycle — a classic 'emperor has no clothes' paper." 493 }, 494 "demo_ability": { 495 "score": 2, 496 "justification": "Replication package available on Zenodo with data and code, reproducible with moderate effort for researchers familiar with econometric methods." 497 }, 498 "brand_recognition": { 499 "score": 3, 500 "justification": "Cursor is one of the most talked-about AI coding products with millions of users, and the paper is from Carnegie Mellon, a top-tier CS institution." 501 } 502 } 503 }