scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (30727B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Speed at the Cost of Quality: How Cursor AI Increases Short-Term Velocity and Long-Term Complexity in Open-Source Projects",
      6     "authors": [
      7       "Hao He",
      8       "Courtney Miller",
      9       "Shyam Agarwal",
     10       "Christian Kästner",
     11       "Bogdan Vasilescu"
     12     ],
     13     "year": 2026,
     14     "venue": "MSR '26",
     15     "arxiv_id": "2511.04427",
     16     "doi": "10.1145/3793302.3793349"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Abstract claims of 'statistically significant, large, but transient increase in velocity' and 'substantial and persistent increase in static analysis warnings and code complexity' are supported by Tables 2-3 and Figures 3-4.",
     24         "source": "opus"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Causal claims justified via difference-in-differences design with staggered adoption (Borusyak et al. estimator), propensity score matching, pre-trend tests, and panel GMM with instrumental variables. Multiple robustness checks address alternative explanations (Section 4.3).",
     30         "source": "opus"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "Section 3.5.2 explicitly bounds generalization: 'Our results may not generalize to other LLM agent assistants, proprietary software projects, and programming languages beyond the three dominant ones.' Section 5.1.3 discusses open-source-specific contextual factors.",
     36         "source": "opus"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Multiple alternative explanations discussed: excitement-frustration-abandonment cycle (Section 5.1.1), inactive repositories driving transient gains (Section 4.3 Row 2), contamination from other AI tools (Section 4.3 Row 3), non-compliance (Section 4.3 Row 1). Each is tested with robustness checks.",
     42         "source": "opus"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper explicitly frames its measurements as proxies: 'development velocity' is operationalized as commits and lines added, 'software quality' as static-analysis warnings, cognitive complexity, and duplicate density. Figure 1 shows a theory diagram with labeled causal arrows. The limitations section acknowledges these are imperfect proxies, and the title itself ('Speed at the Cost of Quality') uses shorthand that the body unpacks with specific metrics.",
     48         "source": "opus"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 3.5 'Limitations and Threats to Validity' provides extensive discussion of internal validity (Section 3.5.1) and external validity (Section 3.5.2).",
     56         "source": "opus"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Specific threats discussed: observable adoption proxy (config files don't capture all users), usage intensity uncertainty, model/version heterogeneity, imperfect matching with unobserved confounders, contamination from other AI tools. Each is study-specific, not generic.",
     62         "source": "opus"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Explicit scope boundaries: estimates are ITT effects, results reflect Cursor adoption vs. state-of-practice (not vs. no AI), may not generalize to enterprise, other tools, or other languages. Section 3.5.1 states estimates 'should be interpreted as the impact of systematic Cursor adoption compared to the current state-of-the-practice.'",
     68         "source": "opus"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Acknowledgments section lists NSF awards (2206859, DGE214073, 2317168, 2120323), research awards from Google and the Digital Infrastructure Fund, and Google Cloud research credits.",
     76         "source": "opus"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All authors affiliated with Carnegie Mellon University, clearly stated. No affiliation with Cursor/Anysphere.",
     82         "source": "opus"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Funded by NSF and research awards. Google provided credits for BigQuery analysis. Neither NSF nor Google has a direct stake in whether Cursor improves or degrades code quality. The study evaluates a third-party product (Cursor).",
     88         "source": "opus"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interests statement found in the paper.",
     94         "source": "opus"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "'LLM agent assistant' is explicitly distinguished from completion tools and chat interfaces; 'development velocity' is operationalized as commits and lines added; quality metrics are defined through SonarQube outputs (cognitive complexity, static analysis warnings, duplicate line density) in Section 3.2.1.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Section 2 states two explicit contributions: (1) DiD design estimating additional project-level productivity gain from modern agentic Cursor vs. state-of-practice; (2) first comprehensive analysis of Cursor's impact on code quality.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 2 substantively distinguishes this work from prior Copilot RCTs and field experiments, code quality benchmark studies, and the Watanabe et al. Claude Code PR study — explaining the specific longitudinal project-level gap this work fills.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "Replication package provided at https://doi.org/10.5281/zenodo.18368661 (Data Availability section).",
    125           "source": "opus"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "Replication package at Zenodo includes the dataset. The paper states 'We provide a replication package for this paper.'",
    131           "source": "opus"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "No mention of requirements.txt, Dockerfile, or detailed environment setup. The paper mentions SonarQube Community server and BigQuery but does not specify software versions or dependencies for reproduction.",
    137           "source": "opus"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "The paper provides a replication package link but does not include step-by-step reproduction instructions in the paper itself. Whether the Zenodo archive contains them is not stated.",
    143           "source": "opus"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": true,
    150           "justification": "Standard errors reported in Tables 2 and 3. Figure 3 shows confidence intervals around treatment effect estimates. Table 2 includes ± percentage change intervals.",
    151           "source": "opus"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": true,
    156           "justification": "Statistical significance reported throughout with p-value thresholds (p<0.05, p<0.01, p<0.001). Pre-trend Wald tests, Sargan tests, and AR(2) tests all reported with p-values.",
    157           "source": "opus"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Effect sizes reported as percentage changes with baseline context (e.g., '28.58% increase in lines added', '41.64% increase in code complexity'). Table 2 provides log-transformed estimates and their percentage-change interpretations.",
    163           "source": "opus"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The sample of 806 treated repositories is determined by data availability (all qualifying repos found via GitHub search), not by power analysis. No power analysis or sample size justification is discussed.",
    169           "source": "opus"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": true,
    174           "justification": "Standard errors reported in parentheses for all regression estimates in Tables 2 and 3. Confidence bands shown in event study plots (Figures 3 and 4).",
    175           "source": "opus"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "The matched control group of 1,380 never-adopter repositories serves as the baseline comparison, constructed via propensity score matching (Section 3.1.3).",
    183           "source": "opus"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "Control repositories are contemporaneous — matched from the same time period and observed over the same January 2024 to August 2025 window.",
    189           "source": "opus"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Multiple robustness checks function as ablations: high contributor adoption subset, cursor configuration changes subset, activity-level subsets, and other-AI-tools subsets (Section 4.3, Figure 4).",
    195           "source": "opus"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Five outcome metrics: commits, lines added, static analysis warnings, duplicate line density, and code complexity (Section 3.2.1).",
    201           "source": "opus"
    202         },
    203         "human_evaluation": {
    204           "applies": false,
    205           "answer": false,
    206           "justification": "This is a large-scale observational mining study of repository metrics. Human evaluation of code quality outputs is not applicable.",
    207           "source": "opus"
    208         },
    209         "held_out_test_set": {
    210           "applies": false,
    211           "answer": false,
    212           "justification": "This is a causal inference study using observational data, not a prediction task. No train/test split is applicable.",
    213           "source": "opus"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Results broken down by outcome variable (5 metrics), by time horizon (Figures 3, 4), and by subgroup (high adoption, active repos, other AI tools). Table 1 provides descriptive statistics.",
    219           "source": "opus"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "The paper discusses where effects are weak or absent (e.g., no significant effect on commits, no significant effect on duplicate line density). Section 5.1 discusses the transient nature of velocity gains as a 'failure' of sustained benefit.",
    225           "source": "opus"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "Negative results reported: the velocity gain in lines added is not sustained beyond the first two months, the main effect on commits is insignificant, the duplicate line density effect is insignificant (Table 2), and the increase in lines added does not significantly cause more warnings in the GMM (Table 3).",
    231           "source": "opus"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": false,
    237           "answer": false,
    238           "justification": "The study does not use LLMs as part of its methodology. It studies the effect of Cursor adoption by others. The paper acknowledges it lacks information on which Cursor version or LLM backend each repository used (Section 3.5.1).",
    239           "source": "opus"
    240         },
    241         "prompts_provided": {
    242           "applies": false,
    243           "answer": false,
    244           "justification": "No prompting is used in the study methodology.",
    245           "source": "opus"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Key modeling parameters reported: 1:3 nearest-neighbor matching ratio, propensity score model specification (Equation 1), 6-month lag structure, 10-star threshold, lags 2-3 as instruments for GMM, AUC values 0.83-0.91 for propensity models.",
    251           "source": "opus"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "No agentic scaffolding used in the study methodology.",
    257           "source": "opus"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Data pipeline documented: GitHub code search API with adaptive partitioning (Section 3.1.2), 10-star filter yielding 806 repos, propensity score matching yielding 1,380 controls (Section 3.1.3), monthly metric collection from GHArchive and SonarQube (Section 3.2), multicollinearity check removing issue comments.",
    263           "source": "opus"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": true,
    270           "justification": "Replication package at Zenodo (doi.org/10.5281/zenodo.18368661). Data sourced from public GHArchive and GitHub API, which are independently verifiable.",
    271           "source": "opus"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Data collection described in detail: GitHub code search API with adaptive partitioning algorithm for .cursorrules files (Section 3.1.2), GHArchive for time series (Section 3.1.3), SonarQube for code quality metrics (Section 3.2.1). Time period: January 2024 to August 2025.",
    277           "source": "opus"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants. Data comes from mining public GitHub repositories, so participant recruitment is not applicable.",
    283           "source": "opus"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "Pipeline documented: 23,308 Cursor files across 3,306 non-fork repos → 10-star filter → 806 repos (Section 3.1.2). Control group: population of all ≥10 star repos → 10,000 candidates per month → propensity score matching → 1,380 controls (Section 3.1.3). Filtering counts provided at each stage.",
    289           "source": "opus"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": false,
    295           "answer": false,
    296           "justification": "The study does not evaluate a pre-trained model's capability on any benchmark. It is a mining study measuring the impact of tool adoption on repository-level metrics.",
    297           "source": "opus"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": false,
    301           "answer": false,
    302           "justification": "Not a benchmark evaluation study. No model is being tested on a benchmark.",
    303           "source": "opus"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": false,
    307           "answer": false,
    308           "justification": "Not a benchmark evaluation study.",
    309           "source": "opus"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants. This is a repository mining study.",
    317           "source": "opus"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants.",
    323           "source": "opus"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants.",
    329           "source": "opus"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants. Repository inclusion criteria are documented in data_preprocessing_documented.",
    335           "source": "opus"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants.",
    341           "source": "opus"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants.",
    347           "source": "opus"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants.",
    353           "source": "opus"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": false,
    359           "answer": false,
    360           "justification": "The study does not propose a method with inference costs. It is an empirical observational study.",
    361           "source": "opus"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "Google Cloud credits mentioned for BigQuery analysis and SonarQube used locally, but no quantification of total compute budget (e.g., how long SonarQube analysis took across all repos, BigQuery costs).",
    367           "source": "opus"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "Cursor adoption causes a 281.3% increase in lines added in the first month post-adoption, dissipating after two months",
    375       "evidence": "DiD horizon-average ATT: +281.3% lines added in month 1, +48.4% in month 2, returning to baseline thereafter (Figure 3, Section 4.1.1); robust across all three estimators (Table 6).",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Cursor adoption causes a persistent 30.3% increase in static analysis warnings",
    380       "evidence": "ATT = 0.2644*** (Borusyak et al., Table 2); sustained post-adoption in Figure 3. However, Callaway-Sant'Anna shows -10.49% (insignificant), creating an unresolved estimator disagreement on this key finding.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Cursor adoption causes a persistent 41.6% increase in code complexity",
    385       "evidence": "ATT = 0.3481*** (Borusyak et al., Table 2); GMM also shows Cursor has significant 9% baseline complexity increase controlling for codebase size. Again, Callaway-Sant'Anna shows -3.80% (insignificant).",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Accumulated technical debt causally reduces future development velocity, creating a self-reinforcing cycle",
    390       "evidence": "Panel GMM (Table 3): 100% increase in code complexity → 64.5% velocity decrease; 100% increase in warnings → 50.3% velocity decrease; both significant with valid Sargan (p > 0.05) and AR(2) tests (p > 0.05).",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "Quality degradation effects are stronger and more attributable to Cursor in repositories with sustained, intensive usage",
    395       "evidence": "Robustness checks (Figure 4, Row 1): the high-contributor-adoption and active-configuration-change subsets show amplified, rather than weaker, quality effects — ruling out spurious correlation from non-compliance.",
    396       "supported": "strong"
    397     },
    398     {
    399       "claim": "The velocity gain from Cursor adoption would be fully cancelled by a ~3x increase in code complexity or ~5x increase in static analysis warnings",
    400       "evidence": "Computed from GMM coefficients in Table 3: Cursor ATT on lines added (1.044) offset by quality coefficients (-0.718 for complexity, -0.588 for warnings); Section 4.2 derives the cancellation thresholds explicitly.",
    401       "supported": "moderate"
    402     }
    403   ],
    404   "methodology_tags": [
    405     "observational"
    406   ],
    407   "key_findings": "A difference-in-differences study on 806 Cursor-adopting GitHub repositories (matched to 1,380 controls) finds that Cursor adoption produces substantial but transient velocity gains (+281% lines added in month 1, dissipating after 2 months) alongside persistent quality degradation (+30% static analysis warnings, +41% code complexity per the Borusyak et al. estimator). Dynamic panel GMM reveals technical debt accumulation causally reduces future velocity, consistent with a self-reinforcing debt cycle. A critical caveat: the Callaway-Sant'Anna estimator shows insignificant negative effects on quality outcomes, so the quality degradation findings — the paper's primary contribution — rest on estimator choice and should be interpreted with caution. Appendix D shows the largest post-adoption warning increases are in naming conventions, code hygiene, and code complexity categories.",
    408   "red_flags": [
    409     {
    410       "flag": "Estimator disagreement on key quality findings",
    411       "detail": "The paper's primary claims (30% more warnings, 41% more complexity) are supported by the Borusyak et al. estimator but contradicted by the Callaway-Sant'Anna estimator (insignificant negative effects). The paper attributes this to lower statistical power in cohort-specific estimation but cannot definitively resolve the disagreement, which concerns the paper's novel contribution."
    412     },
    413     {
    414       "flag": "Adoption proxy validity",
    415       "detail": "Cursor adoption is identified via .cursorrules config files committed to git, missing developers who use Cursor without committing such files. This biases the treated sample toward more systematic adopters, and the resulting intent-to-treat (ITT) estimates may differ from the average treatment effect in an unknown direction."
    416     },
    417     {
    418       "flag": "Imperfect control group balance",
    419       "detail": "Balance checks (Table 5) show treatment repos are notably younger (496 vs 681 days), far more active (10,103 vs 2,443 events/month), and have more PRs (1,076 vs 266), with residual confounding possible despite DiD fixed-effects adjustment."
    420     },
    421     {
    422       "flag": "No competing interests statement",
    423       "detail": "The paper receives Google Research awards and Google Cloud credits; Google offers its own AI coding assistants (e.g., Gemini Code Assist) that compete with Cursor. No explicit competing interests declaration appears despite ACM publishing norms."
    424     }
    425   ],
    426   "cited_papers": [
    427     {
    428       "title": "Revisiting event-study designs: Robust and efficient estimation (Borusyak, Jaravel, Spiess, 2024)",
    429       "relevance": "Core DiD estimator used throughout; addresses heterogeneous treatment effect bias in staggered adoption designs"
    430     },
    431     {
    432       "title": "The Impact of AI on Developer Productivity: Evidence from GitHub Copilot (Peng et al., 2023)",
    433       "relevance": "Primary prior RCT on AI coding assistant productivity; baseline for comparison to earlier-generation tools"
    434     },
    435     {
    436       "title": "Measuring the Impact of Early-2025 AI on Experienced Open-Source Developer Productivity (Becker et al., 2025)",
    437       "relevance": "Concurrent controlled experiment showing LLM agents don't help experienced developers; complementary evidence contradicting self-reports"
    438     },
    439     {
    440       "title": "The Impact of Large Language Models on Open-source Innovation: Evidence from GitHub Copilot (Yeverechyahu et al., 2024)",
    441       "relevance": "Closest methodological predecessor: DiD design on Copilot adoption in Python packages, without code quality outcomes"
    442     },
    443     {
    444       "title": "Asleep at the Keyboard? Assessing the Security of GitHub Copilot's Code Contributions (Pearce et al., 2022)",
    445       "relevance": "Security vulnerability analysis of early Copilot completions; contrasted to this paper's observational quality findings"
    446     },
    447     {
    448       "title": "No Need to Lift a Finger Anymore? Assessing the Quality of Code Generation by ChatGPT (Liu et al., 2024)",
    449       "relevance": "Code quality analysis of ChatGPT-generated code; predecessor finding complexity issues in earlier AI tools"
    450     },
    451     {
    452       "title": "On the use of agentic coding: An empirical study of pull requests on GitHub (Watanabe et al., 2025)",
    453       "relevance": "Complementary study of Claude Code PR acceptance rates; contrasted as project-level vs. PR-level analysis"
    454     },
    455     {
    456       "title": "Difference-in-differences with multiple time periods (Callaway and Sant'Anna, 2021)",
    457       "relevance": "Alternative DiD estimator used in robustness checks; its disagreement with main estimator is a key unresolved tension"
    458     },
    459     {
    460       "title": "Technical debt cripples software developer productivity: A longitudinal study (Besker, Martini, Bosch, 2018)",
    461       "relevance": "Foundational prior work establishing technical debt's negative velocity impact, providing theoretical basis for GMM interpretation"
    462     },
    463     {
    464       "title": "The Impact of Generative AI on Collaborative Open-Source Software Development: Evidence from GitHub Copilot (Song et al., 2024)",
    465       "relevance": "Observational study estimating only 6.5% project-level productivity gain from Copilot; contrasted to the larger Cursor effect estimates"
    466     }
    467   ],
    468   "engagement_factors": {
    469     "practical_relevance": {
    470       "score": 2,
    471       "justification": "Directly informs teams adopting AI coding tools about quality tradeoffs and suggests concrete process adaptations like scaling QA with velocity."
    472     },
    473     "surprise_contrarian": {
    474       "score": 3,
    475       "justification": "Directly contradicts the widely-held '10x productivity' narrative around Cursor/AI coding tools, showing velocity gains vanish after two months while technical debt persists."
    476     },
    477     "fear_safety": {
    478       "score": 1,
    479       "justification": "Raises concerns about code quality degradation and security warnings increasing, but safety/risk is secondary to the productivity narrative."
    480     },
    481     "drama_conflict": {
    482       "score": 3,
    483       "justification": "Directly challenges Cursor's productivity claims and the broader AI coding hype with empirical evidence of a self-reinforcing technical debt cycle — a classic 'emperor has no clothes' paper."
    484     },
    485     "demo_ability": {
    486       "score": 2,
    487       "justification": "Replication package available on Zenodo with data and code, reproducible with moderate effort for researchers familiar with econometric methods."
    488     },
    489     "brand_recognition": {
    490       "score": 3,
    491       "justification": "Cursor is one of the most talked-about AI coding products with millions of users, and the paper is from Carnegie Mellon, a top-tier CS institution."
    492     }
    493   },
    494   "hn_data": {
    495     "threads": [
    496       {
    497         "hn_id": "47401734",
    498         "title": "Speed at the cost of quality: Study of use of Cursor AI in open source projects (2025)",
    499         "points": 147,
    500         "comments": 80,
    501         "url": "https://news.ycombinator.com/item?id=47401734",
    502         "created_at": "2026-03-16T17:07:37Z"
    503       },
    504       {
    505         "hn_id": "38283398",
    506         "title": "API-Driven Program Synthesis for Testing Static Typing Implementations",
    507         "points": 35,
    508         "comments": 1,
    509         "url": "https://news.ycombinator.com/item?id=38283398",
    510         "created_at": "2023-11-15T22:19:08Z"
    511       },
    512       {
    513         "hn_id": "45968758",
    514         "title": "Does AI-Assisted Coding Deliver? A Study of Cursor's Impact on Software Projects",
    515         "points": 14,
    516         "comments": 2,
    517         "url": "https://news.ycombinator.com/item?id=45968758",
    518         "created_at": "2025-11-18T16:50:19Z"
    519       },
    520       {
    521         "hn_id": "46730534",
    522         "title": "Does AI-Assisted Coding Deliver? A Study of Cursor on Software Projects",
    523         "points": 2,
    524         "comments": 0,
    525         "url": "https://news.ycombinator.com/item?id=46730534",
    526         "created_at": "2026-01-23T09:54:11Z"
    527       },
    528       {
    529         "hn_id": "46658985",
    530         "title": "Does AI-Assisted Coding Deliver? A Study of Cursor's Impact on Software Projects",
    531         "points": 2,
    532         "comments": 0,
    533         "url": "https://news.ycombinator.com/item?id=46658985",
    534         "created_at": "2026-01-17T15:53:22Z"
    535       },
    536       {
    537         "hn_id": "45998822",
    538         "title": "Does AI-Assisted Coding Deliver? A Difference-in-Differences Study",
    539         "points": 2,
    540         "comments": 0,
    541         "url": "https://news.ycombinator.com/item?id=45998822",
    542         "created_at": "2025-11-20T22:36:21Z"
    543       },
    544       {
    545         "hn_id": "45951387",
    546         "title": "Does AI-Assisted Coding Deliver? A Study of Cursor's Impact on Software Projects",
    547         "points": 2,
    548         "comments": 0,
    549         "url": "https://news.ycombinator.com/item?id=45951387",
    550         "created_at": "2025-11-17T06:57:28Z"
    551       },
    552       {
    553         "hn_id": "42127507",
    554         "title": "UniGAD: Unifying Multi-Level Graph Anomaly Detection",
    555         "points": 2,
    556         "comments": 0,
    557         "url": "https://news.ycombinator.com/item?id=42127507",
    558         "created_at": "2024-11-13T16:32:30Z"
    559       },
    560       {
    561         "hn_id": "46180812",
    562         "title": "Does AI-Assisted Coding Deliver? A Difference-in-Differences Study",
    563         "points": 1,
    564         "comments": 0,
    565         "url": "https://news.ycombinator.com/item?id=46180812",
    566         "created_at": "2025-12-07T10:54:26Z"
    567       },
    568       {
    569         "hn_id": "46070691",
    570         "title": "A Difference-in-Differences Study of Cursor's Impact on Software Projects",
    571         "points": 1,
    572         "comments": 0,
    573         "url": "https://news.ycombinator.com/item?id=46070691",
    574         "created_at": "2025-11-27T16:21:41Z"
    575       }
    576     ],
    577     "top_points": 147,
    578     "total_points": 208,
    579     "total_comments": 83
    580   }
    581 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs