scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24449B)
      1 {
      2   "paper": {
      3     "title": "Speed at the Cost of Quality: How Cursor AI Increases Short-Term Velocity and Long-Term Complexity in Open-Source Projects",
      4     "authors": ["Hao He", "Courtney Miller", "Shyam Agarwal", "Christian Kästner", "Bogdan Vasilescu"],
      5     "year": 2026,
      6     "venue": "MSR 2026",
      7     "arxiv_id": "2511.04427",
      8     "doi": "10.1145/3793302.3793349"
      9   },
     10   "scan_version": 2,
     11   "active_modules": [],
     12   "methodology_tags": ["observational"],
     13   "key_findings": "Cursor adoption leads to a statistically significant but transient velocity increase (lines added rise 281% in the first month, with gains dissipating after two months) alongside persistent increases in static analysis warnings (+30%) and code complexity (+41%). Panel GMM models reveal that accumulated technical debt subsequently reduces future velocity, creating a self-reinforcing cycle. Robustness checks across adoption intensity, activity levels, other AI tools, programming languages, and alternative DiD estimators generally support these findings, though the Callaway-Sant'Anna estimator yields non-significant quality results.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "A replication package is provided at https://doi.org/10.5281/zenodo.18368661 (Section 'Data Availability')."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The Zenodo replication package is referenced for data availability."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No mention of environment specifications, requirements.txt, or dependency details in the paper. SonarQube Community is mentioned as a tool but no version or setup details."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The paper references a replication package but does not include step-by-step reproduction instructions in the paper itself. Whether the Zenodo package contains them cannot be verified from the paper text."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Standard errors are reported in Tables 2, 3, and 6. Figure 3 shows confidence intervals around treatment effect estimates. Table 2 reports '±' ranges."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Statistical significance is reported throughout with p-value thresholds (*p<0.05, **p<0.01, ***p<0.001) in Tables 2, 3, and 6. Pre-trend tests use heteroscedasticity- and cluster-robust Wald tests."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Effect sizes are reported as percentage changes with context (e.g., '+28.58% (±13.7%)' for lines added, '+41.64% (±7.62%)' for code complexity in Table 2). Log-transformed estimates enable interpretation as percentage changes."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The sample of 806 treated repositories is determined by what was found via GitHub code search, not by power analysis. No power analysis or sample size justification is discussed."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Standard errors are reported for all estimates in Tables 2, 3, and 6. The Borusyak et al. estimator provides robust standard errors."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The matched control group of 1,380 never-adopting repositories serves as the baseline for comparison in the DiD design."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Control repositories are matched from the same time period (2024-2025) and undergo the same temporal trends, ensuring contemporaneity."
     74       },
     75       "ablation_study": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "This is an observational study measuring a single treatment (Cursor adoption), not a system with components to ablate."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Five outcome metrics are used: commits, lines added, static analysis warnings, duplicate line density, and code complexity."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Human evaluation is not relevant to this observational mining study measuring repository-level metrics."
     89       },
     90       "held_out_test_set": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "This is an observational causal inference study, not a predictive modeling study requiring train/test splits."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down by programming language (JS/TS, Python, Go), adoption intensity (high contributor, configuration changes), activity level, and other AI tool usage (Figure 4, Figure 7, Table 7). Appendix D breaks down SonarQube warnings by 20 categories (Table 8)."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper discusses where the Callaway-Sant'Anna estimator yields non-significant results for quality outcomes (Appendix B.3) and discusses the transient nature of velocity gains as a form of 'failure' of sustained benefit."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper reports that duplicate line density shows no consistent effect (a +7.03% ATT in Table 2 but inconsistent temporal patterns in Figure 3), commits show no significant ATT with two of three estimators, and the Callaway-Sant'Anna estimator yields non-significant quality results."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims of 'transient increase in velocity', 'substantial and persistent increase in static analysis warnings and code complexity', and 'velocity slowdown' from technical debt are all supported by Tables 2, 3, and Figure 3."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper uses difference-in-differences with propensity score matching, the Borusyak et al. imputation estimator for staggered adoption, and panel GMM with instrumental variables — all appropriate causal identification strategies for observational data. The parallel trends assumption is tested (Section 3.3.3, Figure 3)."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Section 3.5.2 explicitly bounds external validity: 'Our results may not generalize to other LLM agent assistants, proprietary software projects, and programming languages beyond the three dominant ones.' Section 5.1.3 discusses the open-source context as a specific limitation."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Section 3.5 discusses many alternative explanations: observable adoption bias, usage intensity uncertainty, model heterogeneity, imperfect matching, contamination from other AI tools. Section 4.3 presents robustness checks against these. Section 5.1.1 discusses excitement-frustration-abandonment as an alternative mechanism for transient velocity gains."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper explicitly notes that commits and lines added are 'productivity proxies' with 'moderate-to-strong correlation with perceived productivity' (Section 3.2.1, citing prior work). It also acknowledges that quality metrics take 'the technical debt perspective' rather than claiming to measure overall quality (Section 3.2.1). The conclusion notes complexity metrics 'were designed for human-written code' and may not appropriately measure AI-generated code."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "The paper does not use any AI models itself. It studies the effect of Cursor adoption on repositories. The paper explicitly acknowledges model/version heterogeneity across studied repositories as a feature (Section 3.5.1)."
    143       },
    144       "prompts_provided": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "The paper does not use prompting. It is an observational study analyzing repository metrics."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Key model parameters are reported: propensity score matching uses 1:3 nearest-neighbor matching, logistic regression specification in Equation 1, GMM uses lags 2-3 as instruments, 10-star threshold, 10,000 candidate sample per cohort. Regression specifications are fully detailed."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used by the authors. The paper studies Cursor as a black-box treatment."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Data collection is thoroughly documented: GitHub code search with adaptive partitioning (Section 3.1.2), 10-star filtering threshold with cited precision, propensity score matching pipeline (Section 3.1.3), SonarQube analysis (Section 3.2.1), GHArchive collection (Section 3.2.2), multi-collinearity check removing issue comments."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 3.5 'Limitations and Threats to Validity' provides extensive discussion spanning internal validity (3.5.1) and external validity (3.5.2)."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 3.5.1 discusses specific threats: observable adoption through configuration files creating selection bias, uncertainty about usage intensity/persistence, model/version heterogeneity, imperfect matching (AUC 0.83-0.91 meaning incomplete variance explanation), contamination from other AI tools. These are all specific to this study."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 3.5.1 explicitly states estimates represent 'intent-to-treat' effects, not treatment-on-treated. Section 3.5.2 lists specific exclusions: other AI assistants, proprietary software, underrepresented languages. The paper also states estimates capture Cursor vs. 'state-of-the-practice' (not vs. no AI at all)."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "A replication package is available at Zenodo (https://doi.org/10.5281/zenodo.18368661)."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 3.1 describes data collection in detail: GitHub code search API with adaptive partitioning algorithm, GHArchive for monthly time series, SonarQube for code quality metrics. Time periods, inclusion criteria (10+ stars), and specific API endpoints are mentioned."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. Data comes from public GitHub repositories identified via code search. The repository selection process is documented in Section 3.1.2."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The pipeline is documented: 23,308 Cursor files found → 3,306 non-fork repos → 806 with 10+ stars → propensity score matching → 1,380 controls. GHArchive and SonarQube pipelines described. Table 7 shows observation counts for each subset."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Acknowledgments section lists NSF awards (2206859, 2317168, 2120323), NSF Graduate Research Fellowship (DGE214073), Google research award, Digital Infrastructure Fund, and Google Cloud research credits."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "All authors are from Carnegie Mellon University. They do not work for Cursor/Anysphere, so there is no direct product conflict."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "NSF and Google fund the research. Google has a competing product (Gemini Code Assist) but is not directly invested in Cursor's outcomes. The funders do not have a direct stake in whether Cursor is found to be beneficial or harmful."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement or financial interest declaration is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "The paper does not evaluate a pre-trained model's capability on any benchmark. It is an observational mining study of repository-level outcomes."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "No model benchmark evaluation is performed."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "No model benchmark evaluation is performed."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants. This is a repository mining study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants. Repository inclusion criteria are documented in Section 3.1."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "This is an observational study, not a method or tool. No inference is performed."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "The paper mentions Google Cloud research credits for BigQuery analysis but does not quantify the compute budget for SonarQube analysis of 2,000+ repositories or other computational work."
    290       }
    291     }
    292   },
    293   "claims": [
    294     {
    295       "claim": "Cursor adoption leads to a statistically significant but transient increase in development velocity, with lines added increasing by 281.3% in the first month but gains dissipating after two months.",
    296       "evidence": "Table 2 shows ATT of +28.58% for lines added overall; Figure 3 shows horizon-specific effects of 281.3% in month 0 (the first month) and 48.4% in month 1, with non-significant effects afterward. All five outcomes pass pre-trend tests, i.e., no significant pre-treatment trends are detected at the 0.05 level.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "Cursor adoption leads to a persistent increase of 30.3% in static analysis warnings and 41.6% in code complexity.",
    301       "evidence": "Table 2 reports ATT estimates with p<0.001. Figure 3 shows effects persisting through month +6. However, the Callaway-Sant'Anna estimator yields non-significant results for these quality outcomes (Table 6, Appendix B.3), weakening the evidence.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "Technical debt accumulation (static analysis warnings and code complexity) causally reduces future development velocity.",
    306       "evidence": "Table 3 panel GMM models show a 100% increase in code complexity causes 64.5% decrease in lines added, and 100% increase in static analysis warnings causes 50.3% decrease. Sargan tests (p>0.05) and AR(2) tests (p>0.05) validate instruments.",
    307       "supported": "strong"
    308     },
    309     {
    310       "claim": "Cursor adoption leads to inherently more complex code, even after controlling for velocity dynamics.",
    311       "evidence": "Table 3, column L→C shows Cursor coefficient of 0.086 (p<0.001) for code complexity even with velocity controls and codebase size dynamics accounted for.",
    312       "supported": "strong"
    313     },
    314     {
    315       "claim": "Cursor adoption has no significant effect on duplicate line density.",
    316       "evidence": "Table 2 shows ATT of +7.03% which is marked significant but Figure 3 shows inconsistent temporal patterns. Heavy adopters show modest increases (Figure 4).",
    317       "supported": "weak"
    318     },
    319     {
    320       "claim": "The effects are amplified in repositories with higher sustained Cursor usage.",
    321       "evidence": "Figure 4, Row 1 shows stronger quality effects in 'High Contributor Adoption' and 'Cursor Configuration Changes' subsets compared to the full sample.",
    322       "supported": "moderate"
    323     }
    324   ],
    325   "red_flags": [
    326     {
    327       "flag": "Estimator disagreement on quality outcomes",
    328       "detail": "The Callaway-Sant'Anna estimator finds non-significant effects for all three quality outcomes (static analysis warnings, code complexity, duplicate line density), while Borusyak et al. and TWFE find significant positive effects. The paper reports the most favorable estimator (Borusyak) in the main text. Appendix B.3 discusses this but the resolution is that Callaway-Sant'Anna has lower power due to small cohorts — this is plausible but the disagreement is non-trivial."
    329     },
    330     {
    331       "flag": "Proxy validity of .cursorrules for adoption",
    332       "detail": "Using .cursorrules files as a proxy for Cursor adoption introduces multiple biases: only repositories that committed configuration files are included, timing may not reflect actual adoption start, and the paper cannot distinguish between Composer (multi-file editing) and full agentic mode. The paper acknowledges this extensively but the estimates are fundamentally limited by this proxy."
    333     },
    334     {
    335       "flag": "SonarQube warning analysis is descriptive only",
    336       "detail": "Appendix D's breakdown of warning categories by pre/post adoption (Table 8) is described as a 'convenience sample' with known architectural limitations preventing precise tracking. These descriptive statistics should not be interpreted causally but may be read as such."
    337     }
    338   ],
    339   "cited_papers": [
    340     {
    341       "title": "Measuring the Impact of Early-2025 AI on Experienced Open-Source Developer Productivity",
    342       "authors": ["Joel Becker", "Nate Rush", "Elizabeth Barnes", "David Rein"],
    343       "year": 2025,
    344       "arxiv_id": "2507.09089",
    345       "relevance": "Controlled experiment finding that early-2025 AI tools including Cursor do not help experienced developers, contradicting self-reported productivity gains."
    346     },
    347     {
    348       "title": "The Impact of AI on Developer Productivity: Evidence from GitHub Copilot",
    349       "authors": ["Sida Peng", "Eirini Kalliamvakou", "Peter Cihon", "Mert Demirer"],
    350       "year": 2023,
    351       "arxiv_id": "2302.06590",
    352       "relevance": "RCT measuring Copilot's productivity impact (56% faster task completion), foundational reference for AI coding productivity studies."
    353     },
    354     {
    355       "title": "On the use of agentic coding: An empirical study of pull requests on GitHub",
    356       "authors": ["Miku Watanabe", "Hao Li", "Yutaro Kashiwa", "Brittany Reid", "Hajimu Iida", "Ahmed E Hassan"],
    357       "year": 2025,
    358       "arxiv_id": "2509.14745",
    359       "relevance": "Examines 567 Claude Code pull requests finding 83.8% acceptance rate, complementary to this paper's project-level analysis."
    360     },
    361     {
    362       "title": "The Impact of Large Language Models on Open-source Innovation: Evidence from GitHub Copilot",
    363       "authors": ["Doron Yeverechyahu", "Raveesh Mayya", "Gal Oestreicher-Singer"],
    364       "year": 2024,
    365       "relevance": "DiD study estimating 17.82% increase in Python package releases after Copilot availability, closest prior work to this study's methodology."
    366     },
    367     {
    368       "title": "The effects of generative AI on high skilled work: Evidence from three field experiments with software developers",
    369       "authors": ["Zheyuan Kevin Cui", "Mert Demirer", "Sonia Jaffe", "Leon Musolff", "Sida Peng", "Tobias Salz"],
    370       "year": 2024,
    371       "relevance": "Field experiments at Microsoft, Accenture, Cisco reporting 22-36% productivity gains from AI coding tools."
    372     },
    373     {
    374       "title": "Asleep at the Keyboard? Assessing the Security of GitHub Copilot's Code Contributions",
    375       "authors": ["Hammond Pearce", "Baleegh Ahmad", "Benjamin Tan", "Brendan Dolan-Gavitt", "Ramesh Karri"],
    376       "year": 2022,
    377       "relevance": "Foundational study on security vulnerabilities in AI-generated code from Copilot."
    378     },
    379     {
    380       "title": "Do Users Write More Insecure Code with AI Assistants?",
    381       "authors": ["Neil Perry", "Megha Srivastava", "Deepak Kumar", "Dan Boneh"],
    382       "year": 2023,
    383       "relevance": "Controlled study on security implications of AI coding assistants with human participants."
    384     },
    385     {
    386       "title": "The Impact of Generative AI on Collaborative Open-Source Software Development: Evidence from GitHub Copilot",
    387       "authors": ["Fangchen Song", "Ashish Agarwal", "Wen Wen"],
    388       "year": 2024,
    389       "arxiv_id": "2410.02091",
    390       "relevance": "Estimates only 6.5% project-level productivity increase from Copilot using proprietary backend data."
    391     },
    392     {
    393       "title": "How much does AI impact development speed? An enterprise-based randomized controlled trial",
    394       "authors": ["Elise Paradis", "Kate Grey", "Quinn Madison"],
    395       "year": 2025,
    396       "relevance": "Enterprise RCT on AI impact on development speed, one of few studies on agentic tools beyond Copilot."
    397     },
    398     {
    399       "title": "Who is using AI to code? Global diffusion and impact of generative AI",
    400       "authors": ["Simone Daniotti", "Johannes Wachs", "Xiangnan Feng", "Frank Neffke"],
    401       "year": 2025,
    402       "arxiv_id": "2506.08945",
    403       "relevance": "Large-scale study using neural classifier to identify AI-generated code on GitHub, finding 30% AI use raises quarterly commits by 2.4%."
    404     },
    405     {
    406       "title": "Code with Me or for Me? How Increasing AI Automation Transforms Developer Workflows",
    407       "authors": ["Valerie Chen", "Ameet Talwalkar", "Robert Brennan", "Graham Neubig"],
    408       "year": 2025,
    409       "arxiv_id": "2507.08149",
    410       "relevance": "Qualitative research documenting developer challenges and workflow changes with AI coding assistance at different automation levels."
    411     },
    412     {
    413       "title": "Intuition to Evidence: Measuring AI's True Impact on Developer Productivity",
    414       "authors": ["Anand Kumar"],
    415       "year": 2025,
    416       "arxiv_id": "2509.19708",
    417       "relevance": "Enterprise study on AI's impact on developer productivity with field evidence."
    418     }
    419   ]
    420 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs