scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (45331B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "AI IDEs or Autonomous Agents? Measuring the Impact of Coding Agents on Software Development",
      6     "authors": [
      7       "Shyam Agarwal",
      8       "Hao He",
      9       "Bogdan Vasilescu"
     10     ],
     11     "year": 2026,
     12     "venue": "MSR '26",
     13     "arxiv_id": "2601.13597",
     14     "doi": "10.1145/3793302.3793589"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "applies": true,
     19       "answer": true,
     20       "justification": "Claims stay within evidence: velocity gains verified in Table 2 (+36–77% for agent-first repos, +3.1% for IDE-first); quality degradation (+18% warnings, +39% complexity) confirmed across conditions. Abstract claims match empirical findings, and the causal language is appropriate for the DiD design.",
     21       "source": "haiku",
     22       "sub_questions": {
     23         "abstract_claims_supported": {
     24           "applies": true,
     25           "answer": true,
     26           "justification": "All abstract claims (front-loaded gains in AF, minimal gains in IF, persistent quality risks) directly verified by Table 2 and Figure 2 results."
     27         },
     28         "causal_claims_justified": {
     29           "applies": true,
     30           "answer": true,
     31           "justification": "Staggered difference-in-differences with propensity score matching and Borusyak et al. [18] estimator is the appropriate quasi-experimental method for observational data with heterogeneous adoption timing."
     32         },
     33         "generalization_bounded": {
     34           "applies": true,
     35           "answer": true,
     36           "justification": "Scope clearly bounded to GitHub open-source repos with ≥10 stars, ≥10 agentic PRs, Jan 2024–Nov 2025. Paper appropriately qualifies findings: 'in our sample' and 'repository-level outcomes.' The title is slightly expansive, but the body text is precise."
     37         },
     38         "alternative_explanations_discussed": {
     39           "applies": true,
     40           "answer": true,
     41           "justification": "Explains heterogeneous effects: 'greater maturity of IF repos...likely constrains how aggressively agentic changes can be merged.' Discusses pre-treatment coefficient concerns. Quality degradation explanation (complexity debt) plausible but not independently validated."
     42         },
     43         "proxy_outcome_distinction": {
     44           "applies": true,
     45           "answer": true,
     46           "justification": "Paper explicitly defines proxies: 'Development velocity via monthly commit counts and lines added; software quality via static-analysis warnings...cognitive complexity.' Acknowledges comment density alone doesn't counterbalance complexity growth."
     47         }
     48       },
     49       "abstract_claims_supported": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "Abstract claims of 'roughly 18% and 39%' for warnings and complexity match Table 2 values (17.73%/19.00% and 34.85%/42.87%). Velocity claims of 'large, front-loaded velocity gains only when agents are the first AI tool' match AF vs IF results.",
     53         "source": "opus"
     54       },
     55       "causal_claims_justified": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Causal claims use staggered DiD with propensity score matching (AUC 0.92-0.99), the Borusyak et al. imputation estimator, and clustered standard errors. Appropriate quasi-experimental design for causal inference. Authors note intent-to-treat interpretation.",
     59         "source": "opus"
     60       },
     61       "generalization_bounded": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "Title 'Measuring the Impact of Coding Agents on Software Development' generalizes broadly. Sample is limited to open-source GitHub repos with ≥10 stars and ≥10 agentic PRs — a specific subset. The paper does not explicitly bound claims to this population in the abstract or title.",
     65         "source": "opus"
     66       },
     67       "alternative_explanations_discussed": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "Discusses maturity differences between AF/IF repos (Table 1), coordination costs limiting IF throughput, pre-treatment coefficient concerns suggesting unmodeled confounders, and intent-to-treat limitations (cannot measure usage intensity).",
     71         "source": "opus"
     72       },
     73       "proxy_outcome_distinction": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "The paper measures commits, lines added, static-analysis warnings, cognitive complexity, duplicated-line density, and comment density, then frames these as 'development velocity' and 'software quality.' The gap between these repository-level proxies and actual developer productivity or software quality is not acknowledged. Lines added is a known poor proxy for productivity, and static-analysis warnings are a limited proxy for quality. The paper uses broad terms ('velocity,' 'quality') without discussing the measurement-construct gap.",
     77         "source": "opus"
     78       }
     79     },
     80     "limitations_and_scope": {
     81       "applies": true,
     82       "answer": true,
     83       "justification": "Scope boundaries clearly stated (GitHub, ≥10 stars, ≥10 agentic PRs, Jan 2024–Nov 2025). Specific threats discussed: pre-treatment coefficient anomalies, attribution noise, and the inability to measure usage intensity. There is no dedicated limitations section; limitations are instead scattered throughout Methods/Discussion.",
     84       "source": "haiku",
     85       "sub_questions": {
     86         "limitations_section_present": {
     87           "applies": true,
     88           "answer": false,
     89           "justification": "No dedicated 'Limitations' or 'Threats to Validity' section. Limitations scattered in Methods ('While we cannot directly measure usage intensity') and Results ('These are concerning and highlight a limitation of our quasi-experimental design')."
     90         },
     91         "threats_to_validity_specific": {
     92           "applies": true,
     93           "answer": true,
     94           "justification": "Specific threats: 'isolated significant pre-treatment coefficients in static-analysis warnings and code complexity'; 'cannot directly measure usage intensity'; 'any remaining attribution errors primarily introduce noise in treatment timing.' Not boilerplate."
     95         },
     96         "scope_boundaries_stated": {
     97           "applies": true,
     98           "answer": true,
     99           "justification": "Explicit boundaries: GitHub repos ≥10 stars, ≥10 agentic PRs, Jan 2024–Nov 2025, repository-level (not individual developer), staggered adoption windows. Paper states what it does NOT show: individual-developer workflows, long-term post-adoption trajectories."
    100         }
    101       },
    102       "limitations_section_present": {
    103         "applies": true,
    104         "answer": false,
    105         "justification": "No dedicated 'Limitations' or 'Threats to Validity' section. Limitations are discussed inline within Results (pre-treatment coefficients) and Methods (intent-to-treat caveat), but there is no substantive dedicated section.",
    106         "source": "opus"
    107       },
    108       "threats_to_validity_specific": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Specific threats discussed: pre-treatment coefficients in quality metrics suggesting unmodeled confounders, attribution errors attenuating effects toward zero, inability to measure usage intensity or developer-level interactions, left-censoring of adoption dates.",
    112         "source": "opus"
    113       },
    114       "scope_boundaries_stated": {
    115         "applies": true,
    116         "answer": false,
    117         "justification": "No explicit statements about what the results do NOT show. The paper notes 'intent-to-treat effects' and 'cannot directly measure usage intensity' but does not systematically state what is out of scope (e.g., private repos, enterprise settings, individual developer impact).",
    118         "source": "opus"
    119       }
    120     },
    121     "conflicts_of_interest": {
    122       "applies": true,
    123       "answer": false,
    124       "justification": "Affiliations disclosed (all authors are at Carnegie Mellon University, which is independent of the evaluated tools). Funding source NOT disclosed—no acknowledgments section visible. No competing interests statement. Missing formal conflict-of-interest declaration.",
    125       "source": "haiku",
    126       "sub_questions": {
    127         "funding_disclosed": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "No funding source mentioned in the visible paper. No acknowledgments section, no grant numbers, no funding statement."
    131         },
    132         "affiliations_disclosed": {
    133           "applies": true,
    134           "answer": true,
    135           "justification": "All authors affiliated with Carnegie Mellon University, disclosed in author list. No affiliation with evaluated tool companies (Cursor, Anthropic, OpenAI, etc.)."
    136         },
    137         "funder_independent_of_outcome": {
    138           "applies": true,
    139           "answer": true,
    140           "justification": "CMU is independent of the evaluated agentic tool companies. No indication of company-specific funding."
    141         },
    142         "financial_interests_declared": {
    143           "applies": true,
    144           "answer": false,
    145           "justification": "No competing interests statement, no patents, no equity disclosures. Standard 'CC BY 4.0' license but no financial interest declaration."
    146         }
    147       },
    148       "funding_disclosed": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "No funding source or acknowledgments section found in the paper.",
    152         "source": "opus"
    153       },
    154       "affiliations_disclosed": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "All three authors listed as Carnegie Mellon University. No apparent affiliation with companies whose tools are studied (Anthropic, OpenAI, Cursor, etc.).",
    158         "source": "opus"
    159       },
    160       "funder_independent_of_outcome": {
    161         "applies": true,
    162         "answer": false,
    163         "justification": "No funding is disclosed, so independence cannot be assessed. Absence of disclosure is not the same as absence of conflict.",
    164         "source": "opus"
    165       },
    166       "financial_interests_declared": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No competing interests or financial disclosure statement found in the paper.",
    170         "source": "opus"
    171       }
    172     },
    173     "scope_and_framing": {
    174       "applies": true,
    175       "answer": true,
    176       "justification": "Key terms precisely defined (autonomous agents vs. IDE assistants, development velocity, software quality). Contributions explicitly stated in two components. Strong engagement with prior work: Related Work section shows gap in agentic tool evaluation vs. extensive IDE assistant studies.",
    177       "source": "haiku",
    178       "sub_questions": {
    179         "key_terms_defined": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Agentic tools defined with four dimensions: Autonomy, Scope, Planning, Interaction. Velocity = commits + lines added. Quality = SonarQube metrics (warnings, complexity, duplication, comments). Clear contrast to pre-agentic IDE assistants."
    183         },
    184         "intended_contribution_clear": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "RQ1–3 stated explicitly. Contributions: (1) first large-scale longitudinal evidence on agentic contributions, (2) first causal evidence on differential effects of IDE → agent transition. Very clear."
    188         },
    189         "engagement_with_prior_work": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "Related Work shows 'vast majority of real-world research focuses on IDE-based AI assistants...typically reporting modest velocity improvements. In contrast, early studies of agentic tools show mixed results.' Clearly positions this as gap-filling work, not derivative."
    193         }
    194       }
    195     }
    196   },
    197   "type_checklist": {
    198     "empirical": {
    199       "artifacts": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "Code released: https://github.com/shyamagarwal13/agentic-coding-impact. Data from public AIDev dataset, GHArchive, SonarQube. Environment specs and reproduction instructions likely in replication package but not in paper text itself.",
    203         "source": "haiku",
    204         "sub_questions": {
    205           "code_released": {
    206             "applies": true,
    207             "answer": true,
    208             "justification": "Replication package explicitly provided: 'The replication package for this study is publicly available at https://github.com/shyamagarwal13/agentic-coding-impact.'"
    209           },
    210           "data_released": {
    211             "applies": true,
    212             "answer": true,
    213             "justification": "Data from public sources: AIDev dataset (Li et al. 2025), GHArchive (public GitHub archive), SonarQube analyses (public)."
    214           },
    215           "environment_specified": {
    216             "applies": true,
    217             "answer": false,
    218             "justification": "No requirements.txt, Dockerfile, or dependency specifications in the paper. Replication package may contain this, but not documented in the paper itself."
    219           },
    220           "reproduction_instructions": {
    221             "applies": true,
    222             "answer": false,
    223             "justification": "Paper refers to 'replication package' for instructions but does not include step-by-step instructions in the paper text."
    224           }
    225         },
    226         "code_released": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "Replication package publicly available at https://github.com/shyamagarwal13/agentic-coding-impact, stated in abstract and footer.",
    230           "source": "opus"
    231         },
    232         "data_released": {
    233           "applies": true,
    234           "answer": true,
    235           "justification": "Builds on the publicly available AIDev dataset (v3) [28]. GHArchive data is also public. Replication package is provided.",
    236           "source": "opus"
    237         },
    238         "environment_specified": {
    239           "applies": true,
    240           "answer": false,
    241           "justification": "No mention of environment specifications, dependency files, or library versions in the paper. SonarQube is named but no version or configuration details are given.",
    242           "source": "opus"
    243         },
    244         "reproduction_instructions": {
    245           "applies": true,
    246           "answer": true,
    247           "justification": "The paper provides a replication package URL and the methodology section describes the full pipeline in sufficient detail (data source, filtering criteria, matching procedure, estimator) for replication.",
    248           "source": "opus"
    249         }
    250       },
    251       "statistical_methodology": {
    252         "applies": true,
    253         "answer": true,
    254         "justification": "Standard errors and p-values reported (Table 2). Effect sizes reported as log-transformed coefficients and % change. Confidence intervals shown (Figure 2). Sample size is determined by the filtering criteria (≥10 agentic PRs) rather than justified by a formal power analysis.",
    255         "source": "haiku",
    256         "sub_questions": {
    257           "confidence_intervals_or_error_bars": {
    258             "applies": true,
    259             "answer": true,
    260             "justification": "Table 2 reports standard errors for each estimate. Figure 2 shows error bars on dynamic effects."
    261           },
    262           "significance_tests": {
    263             "applies": true,
    264             "answer": true,
    265             "justification": "Table 2: p-values indicated with *, **, ***. Figure 2: filled dots p<0.05, hollow dots p≥0.05."
    266           },
    267           "effect_sizes_reported": {
    268             "applies": true,
    269             "answer": true,
    270             "justification": "Table 2: log-transformed coefficients plus % change ('36.25% change in commits for AF', '76.59% for lines added'). Not just p-values."
    271           },
    272           "sample_size_justified": {
    273             "applies": true,
    274             "answer": false,
    275             "justification": "Sample size determined by filtering rules (≥10 stars, ≥10 agentic PRs), not by power analysis. No justification for why n=401 AF treated is sufficient to detect expected effect sizes."
    276           },
    277           "variance_reported": {
    278             "applies": true,
    279             "answer": true,
    280             "justification": "Standard errors reported in Table 2 (implies variance estimation). Confidence intervals visible in Figure 2."
    281           }
    282         },
    283         "confidence_intervals_or_error_bars": {
    284           "applies": true,
    285           "answer": true,
    286           "justification": "Standard errors reported in Table 2 for all treatment effects. Figure 2 shows confidence bands around dynamic treatment effect estimates.",
    287           "source": "opus"
    288         },
    289         "significance_tests": {
    290           "applies": true,
    291           "answer": true,
    292           "justification": "P-values reported via star notation in Table 2 (*, **, ***) with thresholds at 0.05, 0.01, 0.001. Significance indicated for each outcome.",
    293           "source": "opus"
    294         },
    295         "effect_sizes_reported": {
    296           "applies": true,
    297           "answer": true,
    298           "justification": "Percentage change effects reported throughout: +36.25% commits, +76.59% lines added for AF; +17.73% static analysis warnings. Log-transformed estimates with % change interpretation in Table 2.",
    299           "source": "opus"
    300         },
    301         "sample_size_justified": {
    302           "applies": true,
    303           "answer": false,
    304           "justification": "No power analysis or justification for sample sizes. The sample sizes (401 AF + 606 controls, 117 IF + 73 controls) result from filtering criteria but no discussion of whether these are adequate for the effect sizes detected.",
    305           "source": "opus"
    306         },
    307         "variance_reported": {
    308           "applies": true,
    309           "answer": true,
    310           "justification": "Standard errors clustered at the repository level are reported in Table 2. Confidence bands shown in Figure 2 event-study plots. Appropriate for observational DiD designs.",
    311           "source": "opus"
    312         }
    313       },
    314       "evaluation_design": {
    315         "applies": true,
    316         "answer": true,
    317         "justification": "Baseline controls matched on propensity scores. Multiple metrics (velocity, quality, sub-metrics). Heterogeneous analysis (AF vs. IF). Negative results reported (IF repos decline by t=6). Per-tool breakdown absent.",
    318         "source": "haiku",
    319         "sub_questions": {
    320           "baselines_included": {
    321             "applies": true,
    322             "answer": true,
    323             "justification": "Two control sets: AF repos matched to non-IDE-exposed controls (606 controls); IF repos matched to IDE-exposed controls (73 controls). Controls have ≥10 stars, similar activity profiles."
    324           },
    325           "baselines_contemporary": {
    326             "applies": true,
    327             "answer": true,
    328             "justification": "Controls extracted from GHArchive in same time window. Propensity score matching on dynamic pre-treatment characteristics ensures temporal alignment."
    329           },
    330           "ablation_study": {
    331             "applies": true,
    332             "answer": false,
    333             "justification": "Heterogeneous analysis by prior AI exposure (AF vs. IF) is a form of moderation analysis, not a component ablation. No systematic ablation of agent features or design choices."
    334           },
    335           "multiple_metrics": {
    336             "applies": true,
    337             "answer": true,
    338             "justification": "Velocity: commits, lines added. Quality: static-analysis warnings, cognitive complexity, duplication, comment density. 6 outcome measures across two dimensions."
    339           },
    340           "human_evaluation": {
    341             "applies": false,
    342             "answer": false,
    343             "justification": "N/A: repository-level observational study. Code quality measured via automated metrics, not human raters. Appropriate for this study type."
    344           },
    345           "held_out_test_set": {
    346             "applies": false,
    347             "answer": false,
    348             "justification": "N/A: not a prediction task. Observational study of real-world adoption effects."
    349           },
    350           "per_category_breakdown": {
    351             "applies": true,
    352             "answer": true,
    353           "justification": "Results are stratified by AF vs. IF and by outcome type (velocity, quality). A breakdown by tool (Claude, Cursor, Devin, etc.) is absent—all tools are treated in aggregate as 'agentic tools.'"
    354           },
    355           "failure_cases_discussed": {
    356             "applies": true,
    357             "answer": true,
    358             "justification": "IF repos show negative velocity by t=6 (lines −61%, commits −35%). Pre-treatment coefficient concerns acknowledged. Quality risks persist even when velocity doesn't improve."
    359           },
    360           "negative_results_reported": {
    361             "applies": true,
    362             "answer": true,
    363             "justification": "IF repos fail to sustain velocity benefits. Quality risks universally present. Complexity rises even when velocity advantage fades."
    364           }
    365         },
    366         "baselines_included": {
    367           "applies": true,
    368           "answer": true,
    369           "justification": "Matched control repositories serve as baselines. Additionally, results are compared against prior work on Cursor AI IDE adoption [25] using the same causal inference methods.",
    370           "source": "opus"
    371         },
    372         "baselines_contemporary": {
    373           "applies": true,
    374           "answer": true,
    375           "justification": "The primary comparison is with He et al. (2026) [25] on Cursor adoption, which is concurrent work by overlapping authors using the same methods. Control repos are matched from the same time period.",
    376           "source": "opus"
    377         },
    378         "ablation_study": {
    379           "applies": false,
    380           "answer": false,
    381           "justification": "This is an observational causal inference study, not a system with components to ablate. The AF/IF split is a moderation analysis, not an ablation.",
    382           "source": "opus"
    383         },
    384         "multiple_metrics": {
    385           "applies": true,
    386           "answer": true,
    387           "justification": "Six outcome metrics: commits, lines added, static-analysis warnings, cognitive complexity, duplicated-line density, and comment-line density (Table 2, Figure 2).",
    388           "source": "opus"
    389         },
    390         "human_evaluation": {
    391           "applies": false,
    392           "answer": false,
    393           "justification": "This is a repository mining study measuring automated metrics. Human evaluation of outputs is not applicable.",
    394           "source": "opus"
    395         },
    396         "held_out_test_set": {
    397           "applies": false,
    398           "answer": false,
    399           "justification": "Not a prediction task. This is a causal inference study estimating treatment effects.",
    400           "source": "opus"
    401         },
    402         "per_category_breakdown": {
    403           "applies": true,
    404           "answer": true,
    405           "justification": "Results broken down by AF (agent-first, n=401) vs IF (IDE-first, n=117) repositories, and dynamic monthly effects shown in Figure 2 event-study plots from t=-6 to t=+6.",
    406           "source": "opus"
    407         },
    408         "failure_cases_discussed": {
    409           "applies": true,
    410           "answer": true,
    411           "justification": "The paper discusses concerning pre-treatment coefficients in quality metrics: 'We also observe isolated significant pre-treatment coefficients in static-analysis warnings and code complexity... these are concerning and highlight a limitation of our quasi-experimental design.'",
    412           "source": "opus"
    413         },
    414         "negative_results_reported": {
    415           "applies": true,
    416           "answer": true,
    417           "justification": "IF repositories show minimal or no velocity gains (+3.1% commits, -6.3% lines added, both insignificant). Lines added for IF turn negative by t=6 (~-61%). This null/negative finding is prominently reported.",
    418           "source": "opus"
    419         }
    420       },
    421       "setup_transparency": {
    422         "applies": true,
    423         "answer": true,
    424         "justification": "Data preprocessing documented (monthly aggregation, filtering, attribution strategy). Black-box agent tools (model versions/prompts/hyperparams N/A).",
    425         "source": "haiku",
    426         "sub_questions": {
    427           "model_versions_specified": {
    428             "applies": true,
    429             "answer": false,
    430             "justification": "Multiple tools studied (Claude, Cursor, Devin, Codex, etc.) but no specific model versions (e.g., Claude 3.5 Sonnet, GPT-4o). Attribution based on PR metadata, not model snapshots."
    431           },
    432           "prompts_provided": {
    433             "applies": false,
    434             "answer": false,
    435             "justification": "N/A: evaluating black-box deployed agents. Researchers have no control over agent prompts; tools are evaluated as-deployed in open-source projects."
    436           },
    437           "hyperparameters_reported": {
    438             "applies": false,
    439             "answer": false,
    440             "justification": "N/A: black-box tools. Hyperparameters (temperature, top-p, etc.) not accessible or controllable."
    441           },
    442           "scaffolding_described": {
    443             "applies": false,
    444             "answer": false,
    445             "justification": "N/A: evaluating real-world agent deployments, not custom scaffolding or orchestration. No control over agent architecture."
    446           },
    447           "data_preprocessing_documented": {
    448             "applies": true,
    449             "answer": true,
    450             "justification": "Attribution strategy detailed (cascading priority: branch prefixes, author logins, bot type, comment patterns). Filtering (≥10 stars, ≥10 agentic PRs) stated. Monthly aggregation defined. SonarQube processing steps described."
    451           }
    452         },
    453         "model_versions_specified": {
    454           "applies": false,
    455           "answer": false,
    456           "justification": "This study does not use LLMs. It studies repositories that adopt coding agents but does not invoke any models itself.",
    457           "source": "opus"
    458         },
    459         "prompts_provided": {
    460           "applies": false,
    461           "answer": false,
    462           "justification": "No prompting is used. This is an observational mining study.",
    463           "source": "opus"
    464         },
    465         "hyperparameters_reported": {
    466           "applies": true,
    467           "answer": true,
    468           "justification": "Matching parameters reported: max 3 controls per treated repo, ≥10 stars threshold, ≥10 agentic PRs, propensity score AUC 0.92-0.99, max 10,000 candidates subsampled per month, six monthly covariate lags. Borusyak et al. estimator specified.",
    469           "source": "opus"
    470         },
    471         "scaffolding_described": {
    472           "applies": false,
    473           "answer": false,
    474           "justification": "No agentic scaffolding used. This is an observational study.",
    475           "source": "opus"
    476         },
    477         "data_preprocessing_documented": {
    478           "applies": true,
    479           "answer": true,
    480           "justification": "Detailed pipeline: AIDev dataset → cascading attribution strategy (5 signal types) → filtering (≥10 stars, ≥10 agentic PRs) → propensity score matching → AF/IF partitioning. Final sample sizes stated (401 AF + 606 controls, 117 IF + 73 controls).",
    481           "source": "opus"
    482         }
    483       },
    484       "data_integrity": {
    485         "applies": true,
    486         "answer": true,
    487         "justification": "Raw data available in replication package and public sources (AIDev, GHArchive, SonarQube). Data collection procedure detailed (PR parsing Jan 2024–Nov 2025, metric aggregation). Data pipeline documented.",
    488         "source": "haiku",
    489         "sub_questions": {
    490           "raw_data_available": {
    491             "applies": true,
    492             "answer": true,
    493             "justification": "Replication package provided. Data sourced from public AIDev dataset, GHArchive, SonarQube repositories—all publicly accessible."
    494           },
    495           "data_collection_described": {
    496             "applies": true,
    497             "answer": true,
    498             "justification": "'For each repository, we define the agent adoption date as the earliest month containing an agent-attributed PR.' Cascading attribution strategy (branch prefixes → author logins → actor type → default human). Retrospective parsing Jan 2024–Nov 2025."
    499           },
    500           "recruitment_methods_described": {
    501             "applies": false,
    502             "answer": false,
    503             "justification": "N/A: no human recruitment. Observational study of GitHub repositories (no participant selection bias concerns)."
    504           },
    505           "data_pipeline_documented": {
    506             "applies": true,
    507             "answer": true,
    508             "justification": "Pipeline: AIDev dataset + GHArchive extraction → propensity score matching → Borusyak et al. DiD estimation. Covariates, filtering, and aggregation steps all documented."
    509           }
    510         },
    511         "raw_data_available": {
    512           "applies": true,
    513           "answer": true,
    514           "justification": "Replication package at GitHub URL provided. AIDev dataset and GHArchive are both publicly available. The paper also notes corrections to the original dataset in the replication package.",
    515           "source": "opus"
    516         },
    517         "data_collection_described": {
    518           "applies": true,
    519           "answer": true,
    520           "justification": "Section 3.1 details data sources (AIDev v3, GHArchive), time period (January 2024–November 2025), attribution cascade strategy, and filtering criteria.",
    521           "source": "opus"
    522         },
    523         "recruitment_methods_described": {
    524           "applies": false,
    525           "answer": false,
    526           "justification": "No human participants. Data sources are public datasets (AIDev, GHArchive).",
    527           "source": "opus"
    528         },
    529         "data_pipeline_documented": {
    530           "applies": true,
    531           "answer": true,
    532           "justification": "Full pipeline documented: AIDev dataset → extended attribution taxonomy → retrospective PR parsing → filtering criteria → propensity score matching → AF/IF partitioning. Final sample counts provided at each stage.",
    533           "source": "opus"
    534         }
    535       },
    536       "contamination": {
    537         "applies": false,
    538         "answer": false,
    539         "justification": "N/A: not evaluating LLM benchmark performance. Studying real-world deployment effects on repository-level outcomes (commits, code metrics). No train/test contamination concern.",
    540         "source": "haiku",
    541         "sub_questions": {
    542           "training_cutoff_stated": {
    543             "applies": false,
    544             "answer": false,
    545             "justification": "N/A: not evaluating model capabilities on benchmarks."
    546           },
    547           "train_test_overlap_discussed": {
    548             "applies": false,
    549             "answer": false,
    550             "justification": "N/A: same as above."
    551           },
    552           "benchmark_contamination_addressed": {
    553             "applies": false,
    554             "answer": false,
    555             "justification": "N/A: evaluating adoption effects on repository outcomes, not model performance on benchmarks."
    556           }
    557         },
    558         "training_cutoff_stated": {
    559           "applies": false,
    560           "answer": false,
    561           "justification": "This is a repository mining study. No pre-trained model is evaluated on a benchmark.",
    562           "source": "opus"
    563         },
    564         "train_test_overlap_discussed": {
    565           "applies": false,
    566           "answer": false,
    567           "justification": "Not applicable. No model is evaluated on benchmark data.",
    568           "source": "opus"
    569         },
    570         "benchmark_contamination_addressed": {
    571           "applies": false,
    572           "answer": false,
    573           "justification": "Not applicable. This is an observational study of repository-level outcomes, not a benchmark evaluation.",
    574           "source": "opus"
    575         }
    576       },
    577       "human_studies": {
    578         "applies": false,
    579         "answer": false,
    580         "justification": "N/A: no human subjects. Observational study of repository-level metrics.",
    581         "source": "haiku",
    582         "pre_registered": {
    583           "applies": false,
    584           "answer": false,
    585           "justification": "No human participants. This is a repository mining study.",
    586           "source": "opus"
    587         },
    588         "irb_or_ethics_approval": {
    589           "applies": false,
    590           "answer": false,
    591           "justification": "No human participants.",
    592           "source": "opus"
    593         },
    594         "demographics_reported": {
    595           "applies": false,
    596           "answer": false,
    597           "justification": "No human participants.",
    598           "source": "opus"
    599         },
    600         "inclusion_exclusion_criteria": {
    601           "applies": false,
    602           "answer": false,
    603           "justification": "No human participants. Repository inclusion criteria are documented under data_preprocessing_documented.",
    604           "source": "opus"
    605         },
    606         "randomization_described": {
    607           "applies": false,
    608           "answer": false,
    609           "justification": "No human participants. This is an observational study.",
    610           "source": "opus"
    611         },
    612         "blinding_described": {
    613           "applies": false,
    614           "answer": false,
    615           "justification": "No human participants.",
    616           "source": "opus"
    617         },
    618         "attrition_reported": {
    619           "applies": false,
    620           "answer": false,
    621           "justification": "No human participants.",
    622           "source": "opus"
    623         }
    624       },
    625       "cost_and_practicality": {
    626         "applies": false,
    627         "answer": false,
    628         "justification": "Inference cost and compute budget not reported. Study is observational analysis of existing deployments, not a simulation or benchmark evaluation.",
    629         "source": "haiku",
    630         "sub_questions": {
    631           "inference_cost_reported": {
    632             "applies": false,
    633             "answer": false,
    634             "justification": "N/A: not evaluating model inference costs. Authors likely used APIs to fetch data but don't report computational budget."
    635           },
    636           "compute_budget_stated": {
    637             "applies": false,
    638             "answer": false,
    639             "justification": "No computational budget reported. Study relies on public datasets (AIDev, GHArchive, SonarQube) and statistical analysis."
    640           }
    641         },
    642         "inference_cost_reported": {
    643           "applies": false,
    644           "answer": false,
    645           "justification": "Observational study with no system to report inference costs for.",
    646           "source": "opus"
    647         },
    648         "compute_budget_stated": {
    649           "applies": false,
    650           "answer": false,
    651           "justification": "Observational study. The computational cost of running SonarQube and DiD estimation is not the focus.",
    652           "source": "opus"
    653         }
    654       }
    655     }
    656   },
    657   "claims": [
    658     {
    659       "claim": "Agentic tools substantially accelerate development velocity (+36–77% commits/lines) only when introduced as a repository's first observable AI tool.",
    660       "evidence": "Table 2: AF repos +36.3% commits, +76.6% lines added (p<0.001); IF repos +3.1%, −6.3% (not sig.). Figure 2 shows AF spike to +111% commits, +216% lines at t=0.",
    661       "supported": "strong"
    662     },
    663     {
    664       "claim": "Quality risks are persistent across both AF and IF settings, with static-analysis warnings rising ~18% and cognitive complexity rising ~39%.",
    665       "evidence": "Table 2: Static Analysis Warnings +17.73% (AF, p<0.001), +19.00% (IF, not sig.); Code Complexity +34.85% (AF, p<0.001), +42.87% (IF, p<0.01).",
    666       "supported": "strong"
    667     },
    668     {
    669       "claim": "IF repositories exhibit short-lived velocity gains that turn negative by month 6, with lines added declining ~61% and commits ~35% by t=6.",
    670       "evidence": "Figure 2: IF repos show bump at t=0–2 (+16–28% commits), then decline. Text states 'eventually turn negative (lines ∼−61%, commits ∼−35% by t=6).'",
    671       "supported": "moderate"
    672     },
    673     {
    674       "claim": "Greater maturity of IDE-first (IF) repositories (higher stars, forks, PR volume) constrains aggressive merging of agentic changes, limiting throughput benefits.",
    675       "evidence": "Table 1: IF repos have ≈5.6× more stars (8,123 vs. 1,461), 5.7× more forks. Discussion: 'likely constrains how aggressively agentic changes can be merged, so that localized speedups are offset by triage and review overhead.'",
    676       "supported": "moderate"
    677     },
    678     {
    679       "claim": "Autonomous agents amplify the speed–maintainability trade-off, particularly in AI-rich environments where they may increase complexity without delivering sustained velocity benefits.",
    680       "evidence": "Results show IF repos: minimal velocity gains (+3.1%) but persistent complexity (+42.9%). Discussion: 'in AI-rich environments may magnify complexity without delivering sustained velocity benefits.'",
    681       "supported": "strong"
    682     },
    683     {
    684       "claim": "Comment density diverges across groups: IF repositories experience substantial increases (~22% average, >+30% by t=6), while AF effects are muted (~4%), suggesting teams using AI IDEs leverage agents for documentation.",
    685       "evidence": "Table 2: Comment Line Density AF +4.34%, IF +22.30% (p<0.001). Paper suggests IF teams use agents 'for documentation as well as code.'",
    686       "supported": "strong"
    687     }
    688   ],
    689   "methodology_tags": [
    690     "observational",
    691     "quasi-experimental"
    692   ],
    693   "key_findings": "Using staggered difference-in-differences on 518 treated GitHub repositories (401 agent-first, 117 IDE-first) matched to controls, this study finds that AI coding agents deliver substantial velocity gains (+36–77% commits/lines) only when they are a project's first AI tool; repositories with prior IDE assistant experience show minimal and declining velocity benefits (eventual −61% lines by month 6). Critically, both groups accumulate persistent technical debt: static-analysis warnings rise ~18% and cognitive complexity rises ~39% regardless of velocity outcome, indicating that agents systematically introduce higher-complexity code. The findings challenge optimism about AI-assisted development and suggest a speed–maintainability trade-off requiring quality safeguards, selective deployment, and developer oversight.",
    694   "red_flags": [
    695     {
    696       "flag": "Isolated pre-treatment coefficient anomalies",
    697       "detail": "Authors observe 'isolated significant pre-treatment coefficients in static-analysis warnings and code complexity,' suggesting matched controls may not fully capture pre-treatment trajectories. This raises doubts about the parallel-trends assumption of DiD for the quality outcomes and could bias the quality estimates (part of the apparent effect may be a selection artifact rather than a treatment effect)."
    698     },
    699     {
    700       "flag": "Agent attribution methodology introduces noise",
    701       "detail": "Multi-signal cascading attribution (branch names → author logins → PR descriptions → default human) is heuristic-based. Authors acknowledge 'remaining attribution errors primarily introduce noise in treatment timing' but don't quantify misclassification rate. Treatment timing errors bias effects toward zero (conservative) but could introduce confounding if errors correlate with repository characteristics."
    702     },
    703     {
    704       "flag": "No formal power analysis",
    705       "detail": "Sample sizes determined by filtering rules (≥10 stars, ≥10 agentic PRs) rather than power calculations. Final n=401 AF, 117 IF may be underpowered for detecting heterogeneous effects, especially in the IF group (n=117 treated)."
    706     },
    707     {
    708       "flag": "Heterogeneous tool effects obscured",
    709       "detail": "Claude, Cursor, Codex, Devin, Jules, OpenHands, etc. are treated as interchangeable 'coding agents.' These tools have dramatically different architectures and capabilities; averaging effects may mask important differences and hide tool-specific quality or velocity risks."
    710     },
    711     {
    712       "flag": "Quality metrics are automated proxies",
    713       "detail": "Static-analysis warnings, cognitive complexity, duplication measured by SonarQube—not human code review or developer satisfaction. Complexity metrics may not reflect actual maintainability perceived by developers. Comment density is a proxy for documentation quality, not actual code understandability."
    714     },
    715     {
    716       "flag": "IF repository decline (t=6) unexplained",
    717       "detail": "The dramatic negative velocity effect by t=6 in IF repos (−61% lines, −35% commits) is explained post-hoc as 'review overhead on mature projects,' but this explanation is not independently validated. Could indicate different selection dynamics, agent failure modes in complex codebases, or measurement artifacts."
    718     }
    719   ],
    720   "cited_papers": [
    721     {
    722       "title": "Speed at the Cost of Quality: How Cursor AI Increases Short-Term Velocity and Long-Term Complexity in Open-Source Projects",
    723       "authors": "He, Miller, Agarwal, Kastner, Vasilescu",
    724       "year": 2026,
    725       "relevance": "Direct comparison: overlapping author team, same quasi-experimental methods (DiD with propensity matching) applied to Cursor IDE adoption. The present paper extends that analysis to agentic tools across the wider tool ecosystem."
    726     },
    727     {
    728       "title": "The Rise of AI Teammates in Software Engineering (SE) 3.0: How Autonomous Coding Agents Are Reshaping Software Engineering",
    729       "authors": "Li, Zhang, Hassan",
    730       "year": 2025,
    731       "relevance": "AIDev dataset [28]—the foundational data source for agent attribution and PR identification. Required reading for understanding agent taxonomy."
    732     },
    733     {
    734       "title": "Measuring the Impact of Early-2025 AI on Experienced Open-Source Developer Productivity",
    735       "authors": "Becker, Rush, Barnes, Rein",
    736       "year": 2025,
    737       "relevance": "Contrasting finding: RCT of Cursor agent on experienced developers shows 'limited productivity benefits,' suggesting real-world field studies reveal different dynamics than controlled experiments."
    738     },
    739     {
    740       "title": "On the use of agentic coding: An empirical study of pull requests on GitHub",
    741       "authors": "Watanabe, Li, Kashiwa, Reid, Iida, Hassan",
    742       "year": 2025,
    743       "relevance": "Empirical characterization of agent-generated PRs: merge rates (83.8% of Claude PRs merged), failure modes, and developer acceptance patterns."
    744     },
    745     {
    746       "title": "Code with Me or for Me? How Increasing AI Automation Transforms Developer Workflows",
    747       "authors": "Chen, Talwalkar, Brennan, Neubig",
    748       "year": 2025,
    749       "relevance": "Examines workflow transformation as automation increases—relevant to understanding speed–maintainability trade-off and coordination overhead in AI-saturated projects."
    750     },
    751     {
    752       "title": "Impact of AI-tooling on the Engineering Workspace",
    753       "authors": "Chretien, Albarran",
    754       "year": 2024,
    755       "relevance": "Broader survey of AI-tooling effects on development practices; contextualizes coding agents within the wider ecosystem of IDE assistants."
    756     },
    757     {
    758       "title": "Expectation vs. Experience: Evaluating the Usability of Code Generation Tools Powered by Large Language Models",
    759       "authors": "Vaithilingam, Zhang, Glassman",
    760       "year": 2022,
    761       "relevance": "Foundational work on usability and trust in code generation tools; relevant to understanding developer acceptance of agent contributions."
    762     },
    763     {
    764       "title": "How Much Does AI Impact Development Speed? An Enterprise-Based Randomized Controlled Trial",
    765       "authors": "Paradis, Murillo, Pandey, et al.",
    766       "year": 2024,
    767       "relevance": "RCT evidence on GitHub Copilot productivity impact in enterprise; experimental counterpoint to this observational study of real-world adoption."
    768     }
    769   ],
    770   "engagement_factors": {
    771     "practical_relevance": {
    772       "score": 2,
    773       "justification": "Directly actionable finding that teams already using AI IDEs should deploy agents selectively rather than expecting additive productivity gains."
    774     },
    775     "surprise_contrarian": {
    776       "score": 2,
    777       "justification": "The main finding that prior AI IDE usage eliminates velocity gains from agents — suggesting diminishing returns rather than compounding benefits — is counterintuitive to the 'more AI = more productivity' narrative."
    778     },
    779     "fear_safety": {
    780       "score": 1,
    781       "justification": "Raises concerns about persistent technical debt and complexity accumulation from agents, but frames it as maintainability risk rather than safety or security."
    782     },
    783     "drama_conflict": {
    784       "score": 2,
    785       "justification": "Directly challenges the implicit claims of agent tool vendors (Devin, Codex, Claude Code) that autonomous agents deliver sustained productivity gains, showing quality degrades regardless."
    786     },
    787     "demo_ability": {
    788       "score": 1,
    789       "justification": "Replication package is publicly available on GitHub but requires significant setup with SonarQube, GHArchive data, and statistical estimation to reproduce."
    790     },
    791     "brand_recognition": {
    792       "score": 2,
    793       "justification": "Study names and evaluates products from OpenAI (Codex), Anthropic (Claude Code), Cursor, Devin, and GitHub Copilot — all high-profile tools in the current AI coding discourse."
    794     }
    795   },
    796   "hn_data": {
    797     "threads": [],
    798     "top_points": 0,
    799     "total_points": 0,
    800     "total_comments": 0
    801   }
    802 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs