scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24738B)
      1 {
      2   "paper": {
      3     "title": "How Much Does AI Impact Development Speed? An Enterprise-Based Randomized Controlled Trial",
      4     "authors": ["Elise Paradis", "Kate Grey", "Quinn Madison", "Daye Nam", "Andrew Macvean", "Vahid Meimand", "Nan Zhang", "Ben Ferrari-Church", "Satish Chandra"],
      5     "year": 2024,
      6     "venue": "2025 IEEE/ACM 47th International Conference on Software Engineering: Software Engineering in Practice (ICSE-SEIP)",
      7     "arxiv_id": "2410.12944",
      8     "doi": "10.1109/ICSE-SEIP66354.2025.00060"
      9   },
     10   "scan_version": 2,
     11   "active_modules": [],
     12   "methodology_tags": ["rct"],
     13   "key_findings": "A randomized controlled trial with 96 Google software engineers found that AI coding features (code completion, smart paste, natural language to code) reduced task completion time by approximately 21% when controlling for developer-level factors, though the confidence interval was large and the controlled estimate was not statistically significant (p=0.086). Developers who code more hours per day and more senior developers were significantly faster. An interaction effect suggested developers who code 5+ hours/day may benefit more from AI, though this was not statistically significant.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No source code, analysis scripts, or repository URLs are provided in the paper."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No data is released. The study uses proprietary Google internal data and no anonymized dataset is shared."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No environment specifications, statistical software versions, or analysis tool details are provided."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No reproduction instructions are provided. The task instructions are available 'upon request' (footnote 1) but no step-by-step reproduction guide is included."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "95% confidence intervals are reported for the main effect estimate: 'β1 = −0.24; 95%CI = [−0.51, 0.03]' (Section VI-C). Standard errors are reported for all regression coefficients in Table III."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Student's t-test is used for the main comparison (t(83.6) = 2.11, p = .038), Welch two-sample t-tests for covariate balance checks (Table II), and p-values are reported for all regression models (Table III)."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Effect sizes are reported with context: '21% increase in development speed' from Model 2 (β1 = −0.24), with the formula for converting log estimates to percentages explicitly stated. Comparisons to Peng et al. (56%) and Cui et al. (26%) provide additional context."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No power analysis or formal sample size justification is provided. The paper asserts that the study 'was adequately powered to test our main effect' (Section V) but provides no calculation supporting this claim, and it acknowledges insufficient power to detect interaction effects."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Standard deviations are reported for all variables in Table I (e.g., LogToT M=4.46, SD=0.69). Standard errors of the mean are reported for time on task by condition (SE=9.3 for AI, SE=8.1 for control)."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The control group (no AI features, n=48) serves as the baseline comparison against the experimental group (AI features enabled, n=48)."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The study compares against a no-AI control condition, which is the appropriate contemporary baseline for an RCT evaluating AI tool impact. The paper also contextualizes results against Peng et al. (2023) and Cui et al. (2024)."
     74       },
     75       "ablation_study": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "The study tests the combined effect of three AI features as a bundle; ablation of individual features is not applicable to the RCT design as deployed."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "Only one primary metric is used: time on task (logged). No secondary metrics such as code quality, task completion rate, or developer satisfaction are analyzed as outcomes."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Human evaluation of system outputs is not relevant here; the study measures developer task completion time, not the quality of AI-generated code."
     89       },
     90       "held_out_test_set": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "This is an RCT, not a benchmark evaluation. There is no train/test split concept."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down by developer-level factors (seniority, hours coding per day, AI tool frequency) and task-level factors (C++ proficiency, domain expertise) across multiple regression models (Table III, Models 1-4)."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper discusses where AI did NOT help: interaction effects with seniority and AI tool frequency were not significant (Hypotheses 4 and 5 rejected), and task-level factors were insignificant predictors (Section VII)."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Multiple negative results are reported: the main effect lost significance when controlling for covariates (H2 partially supported), interaction effects for seniority and AI frequency were not significant (H4 and H5 rejected), and task-level factors were not significant predictors."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims ~21% speed improvement with a large confidence interval, which matches Model 2 results. It appropriately hedges with 'best estimate' and 'confidence interval is large.' The interesting interaction effect is described as tentative."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper uses a randomized controlled trial design, which is appropriate for causal inference. Participants were randomly assigned to conditions, covariate balance was verified (Table II), and the authors appropriately note the RCT provides 'unbiased estimates' (Section IV)."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper explicitly bounds generalization: results are specific to Google's internal tools, Google developers, and a specific task type. Section V states findings 'may not be directly comparable to those obtained from studies with other developer populations.' The abstract invites further research noting effects may not 'translate across tools and over time.'"
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Section VII discusses multiple alternative explanations: differences from Peng et al. attributed to population differences vs. tool differences; four possible reasons why AI frequency didn't predict speed (ceiling effect, low usability floor, lack of tool cohesion, learning curve); and possible explanations for seniority interaction."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper explicitly discusses the SPACE framework (Section II) and acknowledges that 'time on task' is one dimension of productivity, citing Hernández-López et al. for its use as a 'dominant measure.' Section VIII notes code quality was not explored. The distinction between speed and broader productivity is made."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "The study evaluates integrated IDE features (AI Code Completion, Smart Paste, Natural Language to Code) as products, not specific model versions. The underlying models are proprietary Google systems not identified by version."
    143       },
    144       "prompts_provided": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "The study does not use prompting as an experimental method; the AI features are built-in IDE tools that developers interact with through normal coding workflows."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": false,
    151         "answer": false,
    152         "justification": "The study evaluates production AI features as black boxes. Hyperparameters of the underlying models are not relevant to the RCT design."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "The AI features are evaluated as production tools; the authors cannot be expected to describe internal scaffolding of proprietary Google IDE features."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "The dependent variable is defined precisely: 'time research participants spent in Cider V working in the study-specific workspace' measured via telemetry, from workspace creation to last interaction (Section IV-B1). Variable recoding is documented (e.g., AvgProgHrsDay dichotomization, NbrHighFreqAreas computation)."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section V 'Limitations' is a dedicated section spanning a full page with substantive discussion of multiple threats to validity."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Specific threats discussed include: evaluating their own team's tools (internal validity), Google employees not representative of all developers (external validity), lab task cannot match real-world complexity, 3-hour feasibility constraint limits generalizability, and the rapidly evolving nature of AI tools making this 'only the beginning of an answer.'"
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "The paper explicitly states what results do NOT show: 'we cannot assume that the effect size obtained in our lab study will necessarily apply more broadly' (abstract), findings 'may not be directly comparable to those obtained from studies with other developer populations' (Section V), and 'questions about the impact of AI on code quality were not explored' (Section VIII)."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw data is made available. All data is proprietary Google internal data."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Data collection is described in detail: time on task measured via IDE telemetry (workspace creation to last interaction), pre/post questionnaires with cognitive testing, survey variables with specific coding schemes (Section IV-B), and the recruitment/task procedure (Section IV, Figure 4)."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Recruitment is described: 'full-time software engineers from across Google were recruited via email by a team that was independent from the research team' with specific eligibility criteria: 1+ year at Google, C++ proficient, Piper users, Cider V users, some domain experience (Section IV)."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The pipeline is documented: 96 participants recruited → random assignment after pre-task questionnaire and training → task completion → post-task questionnaire → 93 completed (96.9%). Variable operationalization and recoding are documented in Section IV-B."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding source is disclosed. All authors are Google employees conducting the study as part of their work, but no explicit funding statement or corporate sponsorship acknowledgment is provided."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "All authors are listed with Google affiliation prominently at the top of the paper."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "Google funds this research (all authors are Google employees) and has a direct financial interest in demonstrating that its AI coding tools improve developer productivity. The paper acknowledges this: 'we cannot fully eliminate the risk associated with us evaluating our own team's developer tools' (Section V)."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is provided. The authors work at Google and are evaluating Google's own AI tools, but no formal declaration is made."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "This is an RCT measuring developer speed, not an evaluation of a model's capability on a benchmark. Contamination concepts do not apply."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "Not a benchmark evaluation; this is a human-subjects RCT."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "Not a benchmark evaluation; this is a human-subjects RCT."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "No mention of pre-registration (OSF, AsPredicted, or any registry) is found in the paper."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": true,
    251         "answer": false,
    252         "justification": "No mention of IRB or ethics board approval despite the study involving 96 human participants."
    253       },
    254       "demographics_reported": {
    255         "applies": true,
    256         "answer": true,
    257         "justification": "Demographics are partially reported: seniority levels (3-7, mean 4.38), average programming hours per day, C++ proficiency, data logging expertise, and AI tool usage frequency (Table I). However, gender and geographic distribution are not reported."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": true,
    261         "answer": true,
    262         "justification": "Inclusion criteria are clearly stated: 'working at Google for at least one year, proficient in C++, submitted code to Piper, used Cider V as their main IDE, and had some experience with the task domain' (Section IV)."
    263       },
    264       "randomization_described": {
    265         "applies": true,
    266         "answer": true,
    267         "justification": "Randomization is described: 'Participants were randomly assigned to either the experimental or control group after they had completed both the pre-task questionnaire and the tool training' (Section IV). Balance checks are provided in Table II."
    268       },
    269       "blinding_described": {
    270         "applies": true,
    271         "answer": false,
    272         "justification": "Blinding is not described. Participants presumably knew whether they had AI features enabled or disabled, and it is unclear whether the researchers analyzing the data were blinded to condition. This is a potential source of bias not discussed."
    273       },
    274       "attrition_reported": {
    275         "applies": true,
    276         "answer": true,
    277         "justification": "Attrition is reported: '93 participants (96.9%) who started the task also completed it' out of 96 total. The N for each condition varies slightly (47 AI, 46 control for the t-test vs 48 assigned to each), suggesting 3 dropouts."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "This is an RCT measuring human developer speed, not a system proposing a new computational method. Inference cost is not relevant."
    285       },
    286       "compute_budget_stated": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "This is an RCT, not a computational experiment. Compute budget is not relevant."
    290       }
    291     }
    292   },
    293   "claims": [
    294     {
    295       "claim": "Developers using AI features were significantly faster at completing the task (t(83.6) = 2.11, p = .038).",
    296       "evidence": "Student's t-test on log time on task, AI group mean 96 min (n=47) vs control 114 min (n=46). Section VI-B.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "AI made developers approximately 21% faster when controlling for developer-level factors.",
    301       "evidence": "Model 2 regression: β1 = −0.24, 95% CI = [−0.51, 0.03], p = 0.086 (not significant at p<0.05). Table III.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "Developers who code 5+ hours per day were 32% faster than those coding 0-4 hours.",
    306       "evidence": "Model 2 regression: AvgProgHrsDay β = −0.38, p < 0.05. Table III.",
    307       "supported": "strong"
    308     },
    309     {
    310       "claim": "Higher seniority was associated with 15% decrease in time on task per level increase.",
    311       "evidence": "Model 2 regression: Level β = −0.16, p < 0.05. Table III.",
    312       "supported": "strong"
    313     },
    314     {
    315       "claim": "Developers who code more hours per day may benefit more from AI than those who code less.",
    316       "evidence": "Model 4 interaction term EC:APHD β = −0.29: a large effect, but the term itself is not statistically significant; the reported p = 0.018 is for the overall model, not the interaction. Table III.",
    317       "supported": "weak"
    318     }
    319   ],
    320   "red_flags": [
    321     {
    322       "flag": "Company evaluating its own product",
    323       "detail": "All authors are Google employees evaluating Google's own AI coding tools. The paper acknowledges this conflict ('we cannot fully eliminate the risk associated with us evaluating our own team's developer tools') but the risk remains. No independent evaluation was conducted."
    324     },
    325     {
    326       "flag": "Main controlled effect not significant",
    327       "detail": "The headline 21% estimate (Model 2) has p = 0.086 and 95% CI crossing zero [−0.51, 0.03]. The significant result (H1, p=.038) is from an unadjusted t-test. The paper appropriately reports this but the abstract leads with '21%' which could be misleading."
    328     },
    329     {
    330       "flag": "No pre-registration",
    331       "detail": "For an RCT with multiple hypotheses and interaction tests, lack of pre-registration raises concerns about potential outcome switching or selective reporting of interaction effects."
    332     },
    333     {
    334       "flag": "No IRB/ethics approval mentioned",
    335       "detail": "A study with 96 human participants in a controlled experiment makes no mention of institutional ethics review."
    336     },
    337     {
    338       "flag": "No blinding described",
    339       "detail": "Participants knew whether AI features were enabled, which could introduce demand characteristics or performance anxiety effects. Participant blinding is admittedly difficult to achieve in this design, but the paper does not discuss the lack of blinding as a limitation."
    340     },
    341     {
    342       "flag": "Three AI features bundled",
    343       "detail": "The study tests three features simultaneously (code completion, smart paste, NL to code) but cannot attribute the effect to any individual feature. The title and abstract suggest a general 'AI' effect."
    344     }
    345   ],
    346   "cited_papers": [
    347     {
    348       "title": "The impact of AI on developer productivity: Evidence from GitHub Copilot",
    349       "authors": ["S. Peng", "E. Kalliamvakou", "P. Cihon", "M. Demirer"],
    350       "year": 2023,
    351       "arxiv_id": "2302.06590",
    352       "relevance": "Key baseline RCT study finding 56% speed increase with GitHub Copilot, directly compared against in this paper."
    353     },
    354     {
    355       "title": "The effects of generative AI on high skilled work: Evidence from three field experiments with software developers",
    356       "authors": ["Z. K. Cui", "M. Demirer", "S. Jaffe", "L. Musolff", "S. Peng", "T. Salz"],
    357       "year": 2024,
    358       "relevance": "Large-scale field experiment (n=4,867) finding 26% throughput increase with Copilot, key comparison point for enterprise AI impact."
    359     },
    360     {
    361       "title": "Productivity assessment of neural code completion",
    362       "authors": ["A. Ziegler", "E. Kalliamvakou", "X. A. Li"],
    363       "year": 2022,
    364       "relevance": "Early study analyzing Copilot telemetry data and perceived productivity from code suggestion acceptance rates."
    365     },
    366     {
    367       "title": "Using an LLM to help with code understanding",
    368       "authors": ["D. Nam", "A. Macvean", "V. Hellendoorn", "B. Vasilescu", "B. Myers"],
    369       "year": 2024,
    370       "relevance": "Found coding experience may amplify value gained from AI; directly relevant to interaction effects explored in this paper."
    371     },
    372     {
    373       "title": "Reading between the lines: Modeling user behavior and costs in AI-assisted programming",
    374       "authors": ["H. Mozannar", "G. Bansal", "A. Fourney", "E. Horvitz"],
    375       "year": 2024,
    376       "relevance": "Proposes framework for understanding developer behavior with AI coding tools, cited as path for future mechanistic analysis."
    377     },
    378     {
    379       "title": "The SPACE of developer productivity: There's more to it than you think",
    380       "authors": ["N. Forsgren", "M.-A. Storey", "C. Maddila", "T. Zimmermann"],
    381       "year": 2021,
    382       "relevance": "Foundational framework for multi-dimensional developer productivity measurement (Satisfaction and well-being, Performance, Activity, Communication and collaboration, Efficiency and flow)."
    383     },
    384     {
    385       "title": "Randomized controlled trial for Microsoft Security Copilot",
    386       "authors": ["B. G. Edelman", "J. Bono", "S. Peng", "R. Rodriguez", "S. Ho"],
    387       "year": 2023,
    388       "relevance": "RCT suggesting beginners may benefit more from AI than experienced developers, providing contrasting evidence on seniority interaction."
    389     },
    390     {
    391       "title": "Is GitHub Copilot a substitute for human pair-programming? An empirical study",
    392       "authors": ["S. Imai"],
    393       "year": 2022,
    394       "relevance": "Small-scale study (n=21) finding AI may increase code churn, relevant to code quality concerns."
    395     },
    396     {
    397       "title": "Expectation vs. experience: Evaluating the usability of code generation tools powered by large language models",
    398       "authors": ["P. Vaithilingam", "T. Zhang", "E. L. Glassman"],
    399       "year": 2022,
    400       "relevance": "Found AI code generation may not improve task completion rates despite perceived benefits."
    401     },
    402     {
    403       "title": "Coding on Copilot: 2023 data suggests downward pressure on code quality",
    404       "authors": ["W. Harding", "M. Kloster"],
    405       "year": 2024,
    406       "relevance": "Evidence that AI coding assistants may lower code quality at the ecosystem level, important counterpoint to speed gains."
    407     },
    408     {
    409       "title": "The impact of large language models on open-source innovation: Evidence from GitHub Copilot",
    410       "authors": ["D. Yeverechyahu", "R. Mayya", "G. Oestreicher-Singer"],
    411       "year": 2024,
    412       "arxiv_id": "2409.08379",
    413       "relevance": "Evidence that AI increases total code contributions in open source, relevant to ecosystem-level AI impact."
    414     }
    415   ]
    416 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs