scan.json (27252B)
1 { 2 "paper": { 3 "title": "Beyond the Commit: Developer Perspectives on Productivity with AI Coding Assistants", 4 "authors": [ 5 "Valerie Chen", 6 "Jasmyn He", 7 "Behnjamin Williams", 8 "Jason Valentino", 9 "Ameet Talwalkar" 10 ], 11 "year": 2026, 12 "venue": "ICSE-SEIP '26", 13 "arxiv_id": "2602.03593", 14 "doi": "10.1145/3786583.3786848" 15 }, 16 "checklist": { 17 "artifacts": { 18 "code_released": { 19 "applies": true, 20 "answer": false, 21 "justification": "No repository URL, Zenodo archive, or any code/data artifact link is provided in the paper. The survey instrument and interview protocol are not released." 22 }, 23 "data_released": { 24 "applies": true, 25 "answer": false, 26 "justification": "The survey data and interview transcripts are not released. The paper notes the survey was conducted within BNY Mellon and certain questions are omitted 'to protect the confidentiality of company metrics and goals.' No anonymized dataset is provided." 27 }, 28 "environment_specified": { 29 "applies": true, 30 "answer": false, 31 "justification": "No environment or dependency specifications are provided. The analysis methods (Pearson correlation, thematic coding) are described conceptually but no analysis scripts or software environment details are given." 32 }, 33 "reproduction_instructions": { 34 "applies": true, 35 "answer": false, 36 "justification": "No reproduction instructions are provided. The survey was administered via the DX platform at BNY Mellon, a proprietary setting, and no steps for reproducing the analysis are included." 37 } 38 }, 39 "statistical_methodology": { 40 "confidence_intervals_or_error_bars": { 41 "applies": true, 42 "answer": false, 43 "justification": "The survey results are reported as percentages (e.g., '86% are either satisfied or very satisfied') without confidence intervals or error bars. The correlation is reported as r=0.34 with p<0.0001 but no confidence interval for the correlation coefficient." 
44 }, 45 "significance_tests": { 46 "applies": true, 47 "answer": true, 48 "justification": "The paper reports a Pearson Correlation Score of r=0.34 with p<0.0001 for the relationship between satisfaction and time savings (Section 4.1). This is a significance test for the correlation." 49 }, 50 "effect_sizes_reported": { 51 "applies": true, 52 "answer": true, 53 "justification": "The Pearson correlation r=0.34 is itself an effect size measure for the relationship between satisfaction and time savings. The paper characterizes it as 'a positive but weak correlation,' providing context for interpretation." 54 }, 55 "sample_size_justified": { 56 "applies": true, 57 "answer": false, 58 "justification": "The survey sample of N=2989 is large but not justified through power analysis. The interview sample of N=11 is not justified with any formal reasoning about saturation or adequacy for the claims made." 59 }, 60 "variance_reported": { 61 "applies": true, 62 "answer": false, 63 "justification": "No variance, standard deviation, or spread measures are reported for any quantitative results. The survey results are presented only as aggregate percentages and a single correlation value." 64 } 65 }, 66 "evaluation_design": { 67 "baselines_included": { 68 "applies": true, 69 "answer": true, 70 "justification": "The paper compares its identified factors against existing frameworks (SPACE, DORA) and prior studies in Table 3, showing which factors are and are not captured by existing work." 71 }, 72 "baselines_contemporary": { 73 "applies": true, 74 "answer": true, 75 "justification": "The comparison frameworks and prior studies cited are contemporary, including SPACE (2021), DORA (2018), and recent empirical studies from 2022-2025 (Table 3)." 76 }, 77 "ablation_study": { 78 "applies": false, 79 "answer": false, 80 "justification": "This is a mixed-methods qualitative/survey study, not a system with components to ablate. 
There is no system or tool being proposed that could be decomposed." 81 }, 82 "multiple_metrics": { 83 "applies": true, 84 "answer": true, 85 "justification": "The paper explicitly uses two survey metrics—general satisfaction and perceived time savings—and demonstrates that they capture different aspects of productivity (r=0.34 weak correlation). The qualitative component identifies six additional factors." 86 }, 87 "human_evaluation": { 88 "applies": false, 89 "answer": false, 90 "justification": "The paper does not produce system outputs that require human evaluation. It is itself a study of human perspectives. Human evaluation of system outputs is not applicable here." 91 }, 92 "held_out_test_set": { 93 "applies": false, 94 "answer": false, 95 "justification": "This is not a machine learning or benchmark study. There is no test set or training/test split applicable." 96 }, 97 "per_category_breakdown": { 98 "applies": true, 99 "answer": true, 100 "justification": "Figure 1 (right panel) provides a fine-grained breakdown of satisfaction vs. time savings categories. Table 1 breaks down interview participants by seniority, role, and department. Figure 2 maps use cases to impact factors." 101 }, 102 "failure_cases_discussed": { 103 "applies": true, 104 "answer": true, 105 "justification": "The paper discusses cases where AI tools fail: hallucinations ('asking it to fix it does not work and just returns the same wrong answer'), limitations with certain languages ('does not work as well for C#'), and difficulties with refactoring tasks (Section 4.3.2)." 106 }, 107 "negative_results_reported": { 108 "applies": true, 109 "answer": true, 110 "justification": "The paper reports several negative findings: 60% of engineers report less than one hour of time savings per week, satisfaction and time savings are only weakly correlated (r=0.34), AI may erode junior developer skills (Factor 5), and refactoring use cases are particularly challenging (Section 4.3.2).
Figure 2 explicitly marks negative impacts." 111 } 112 }, 113 "claims_and_evidence": { 114 "abstract_claims_supported": { 115 "applies": true, 116 "answer": true, 117 "justification": "The abstract claims are supported: 'survey results expose conflicting perspectives' is backed by the weak correlation finding (r=0.34); 'six distinct factors' are identified in Section 4.2; 'long-term metrics like technical expertise and ownership' are detailed in Section 4.2.3. No overclaiming detected." 118 }, 119 "causal_claims_justified": { 120 "applies": true, 121 "answer": true, 122 "justification": "The paper is careful to avoid strong causal claims. It uses language like 'our findings demonstrate that a multifaceted approach is needed' and 'these results suggest that developers can be satisfied...without necessarily getting significant time savings.' The claims are correlational and descriptive, matching the observational study design." 123 }, 124 "generalization_bounded": { 125 "applies": true, 126 "answer": true, 127 "justification": "Section 5.3 (Threats to Validity) explicitly states 'The surveys and interviews conducted in this work were limited to one company' and acknowledges results 'may not generalize to all other organizations or to independent developers.' The scope is bounded to BNY Mellon and GitHub Copilot." 128 }, 129 "alternative_explanations_discussed": { 130 "applies": true, 131 "answer": true, 132 "justification": "The paper discusses alternative explanations for the satisfaction-time savings disconnect: hallucinations, language-specific limitations, and the possibility that satisfaction reflects factors beyond time savings. Section 5.3 discusses that the findings may reflect BNY Mellon's specific adoption stage rather than general patterns." 133 } 134 }, 135 "setup_transparency": { 136 "model_versions_specified": { 137 "applies": false, 138 "answer": false, 139 "justification": "This paper does not run experiments with LLMs. 
It studies developers' experiences with GitHub Copilot as a black-box tool. There are no model versions to specify for the study itself." 140 }, 141 "prompts_provided": { 142 "applies": false, 143 "answer": false, 144 "justification": "The paper does not use prompting as part of its methodology. It studies how developers use GitHub Copilot but does not run any LLM experiments itself." 145 }, 146 "hyperparameters_reported": { 147 "applies": false, 148 "answer": false, 149 "justification": "No LLM experiments are conducted. The study is a survey and interview study, not a computational experiment with hyperparameters." 150 }, 151 "scaffolding_described": { 152 "applies": false, 153 "answer": false, 154 "justification": "No agentic scaffolding is used. The paper studies developer experiences with GitHub Copilot as a black-box third-party tool." 155 }, 156 "data_preprocessing_documented": { 157 "applies": true, 158 "answer": false, 159 "justification": "The paper mentions that 8000 total engineers completed the survey but only 2989 responded to the AI-specific questions, and the correlation analysis uses N=2754 who answered both questions. However, the filtering criteria for going from 8000 to 2989 are vague ('Engineers opted in to answering questions based on whether it was relevant to them'). The criteria for the interview coding process are described at a high level but lack specific detail about how codes were developed and refined." 160 } 161 }, 162 "limitations_and_scope": { 163 "limitations_section_present": { 164 "applies": true, 165 "answer": true, 166 "justification": "Section 5.3 is titled 'Threats to Validity' and provides a substantive discussion of limitations spanning an entire paragraph." 
167 }, 168 "threats_to_validity_specific": { 169 "applies": true, 170 "answer": true, 171 "justification": "Section 5.3 identifies specific threats: limitation to one company (BNY Mellon), access only to GitHub Copilot (not other AI coding assistants), exclusion of agentic workflows, varied response rates across subgroups, and the possibility that the identified factors are not fully comprehensive. These are specific to this study." 172 }, 173 "scope_boundaries_stated": { 174 "applies": true, 175 "answer": true, 176 "justification": "Section 5.3 explicitly states boundaries: results limited to one company, only GitHub Copilot was assessed, agentic workflows 'were out of scope for this work,' and the factors 'may not be fully comprehensive.' These are specific scope boundaries." 177 } 178 }, 179 "data_integrity": { 180 "raw_data_available": { 181 "applies": true, 182 "answer": false, 183 "justification": "No raw survey data or interview transcripts are released. The company setting likely prevents this due to confidentiality, but the data cannot be independently verified." 184 }, 185 "data_collection_described": { 186 "applies": true, 187 "answer": true, 188 "justification": "Section 3.1 describes the survey: conducted via the DX platform, targeting engineers who actively commit code at BNY Mellon, with specific questions listed. Section 3.2 describes the interview process: 30-45 minute semi-structured interviews via Teams, with the protocol topics listed and participant selection criteria specified." 189 }, 190 "recruitment_methods_described": { 191 "applies": true, 192 "answer": true, 193 "justification": "Survey recruitment: 'we leveraged the large pool of engineers throughout BNY Mellon who actively commit code' with opt-in participation. Interview recruitment: purposive sampling based on three criteria (seniority, business sector, development role), filtered from an internal list, contacted via Teams, supplemented by snowball sampling (Section 3.2)."
194 }, 195 "data_pipeline_documented": { 196 "applies": true, 197 "answer": false, 198 "justification": "The pipeline from 8000 total survey respondents to 2989 AI-related respondents to 2754 used in correlation analysis is mentioned but the filtering steps are not clearly documented (what caused the drop from 2989 to 2754 is explained, but the 8000→2989 drop relies on opt-in relevance with no formal criteria). The qualitative coding process is described at a high level ('inductive open coding,' 'iterating multiple times') but specific stages and counts are not provided." 199 } 200 }, 201 "conflicts_of_interest": { 202 "funding_disclosed": { 203 "applies": true, 204 "answer": false, 205 "justification": "No funding source or acknowledgments section is present in the paper. Three of the five authors are affiliated with BNY Mellon, but no funding disclosure is made." 206 }, 207 "affiliations_disclosed": { 208 "applies": true, 209 "answer": true, 210 "justification": "Author affiliations are clearly listed: Valerie Chen and Ameet Talwalkar at Carnegie Mellon University; Jasmyn He, Behnjamin Williams, and Jason Valentino at BNY Mellon. The company affiliation is prominent." 211 }, 212 "funder_independent_of_outcome": { 213 "applies": true, 214 "answer": false, 215 "justification": "Three authors are BNY Mellon employees and the study was conducted at BNY Mellon using their engineering population. BNY Mellon has a commercial interest in demonstrating value from their GitHub Copilot deployment. No disclosure is made about whether the funder (BNY Mellon, implicitly) has independence from the study outcomes." 216 }, 217 "financial_interests_declared": { 218 "applies": true, 219 "answer": false, 220 "justification": "No competing interests statement or financial disclosures are present in the paper. Absence of disclosure is not absence of conflict." 
221 } 222 }, 223 "contamination": { 224 "training_cutoff_stated": { 225 "applies": false, 226 "answer": false, 227 "justification": "This paper does not evaluate a pre-trained model on any benchmark. It is a survey and interview study of developer experiences." 228 }, 229 "train_test_overlap_discussed": { 230 "applies": false, 231 "answer": false, 232 "justification": "This paper does not evaluate a pre-trained model on any benchmark. Contamination is not applicable." 233 }, 234 "benchmark_contamination_addressed": { 235 "applies": false, 236 "answer": false, 237 "justification": "This paper does not evaluate a pre-trained model on any benchmark. Contamination is not applicable." 238 } 239 }, 240 "human_studies": { 241 "pre_registered": { 242 "applies": true, 243 "answer": false, 244 "justification": "No pre-registration is mentioned. No link to OSF, AsPredicted, or any registry is provided." 245 }, 246 "irb_or_ethics_approval": { 247 "applies": true, 248 "answer": false, 249 "justification": "No IRB or ethics board approval is mentioned in the paper, even though the study surveyed 2989 employees and conducted 11 interviews." 250 }, 251 "demographics_reported": { 252 "applies": true, 253 "answer": true, 254 "justification": "Table 1 reports interview participant demographics: seniority level (early career, mid career, management), developer role (backend, full stack, frontend), and department function. Survey respondents are described as 'engineers who actively commit code' with access to GitHub Copilot. However, detailed demographics for survey respondents (gender, experience years, etc.) are not provided." 255 }, 256 "inclusion_exclusion_criteria": { 257 "applies": true, 258 "answer": true, 259 "justification": "Interview participants were selected using three criteria: 'different levels of seniority,' 'business sector,' and 'varying roles in development' (Section 3.2). Survey participants were engineers who 'actively commit code' and had access to GitHub Copilot.
These constitute stated inclusion criteria." 260 }, 261 "randomization_described": { 262 "applies": false, 263 "answer": false, 264 "justification": "This is not an experimental study with treatment and control conditions. It is a cross-sectional survey and qualitative interview study. Randomization is not applicable." 265 }, 266 "blinding_described": { 267 "applies": false, 268 "answer": false, 269 "justification": "This is not an experimental study. There are no conditions to blind participants to. Blinding is not applicable for a survey and interview study." 270 }, 271 "attrition_reported": { 272 "applies": true, 273 "answer": true, 274 "justification": "The paper reports that 8000 engineers completed the broader survey, 2989 answered the AI-specific questions, and 2754 answered both questions for the correlation analysis. For interviews, they note 'the response rate varied' across subgroups and all 11 completed the study." 275 } 276 }, 277 "cost_and_practicality": { 278 "inference_cost_reported": { 279 "applies": false, 280 "answer": false, 281 "justification": "This is a survey and interview study, not a computational method. There is no inference cost applicable to the paper's own methodology." 282 }, 283 "compute_budget_stated": { 284 "applies": false, 285 "answer": false, 286 "justification": "This is a survey and interview study with no significant computational component. Compute budget is not applicable." 287 } 288 } 289 }, 290 "claims": [ 291 { 292 "claim": "86% of developers are satisfied or very satisfied with GitHub Copilot, but around 60% report less than one hour of time savings per week.", 293 "evidence": "Figure 1 left panel shows 50% very satisfied and 36% satisfied. 
Figure 1 middle panel shows 29% save 1-30 min, 23% save 31-60 min, and 8% report no time savings (Section 4.1).", 294 "supported": "strong" 295 }, 296 { 297 "claim": "The correlation between satisfaction and perceived time savings is weak (r=0.34, p<0.0001), demonstrating that different productivity metrics can give conflicting signals.", 298 "evidence": "Section 4.1 reports Pearson Correlation Score of r=0.34 with N=2754, p<0.0001. Figure 1 right panel shows nearly 400 developers very satisfied despite saving only 30 minutes per week.", 299 "supported": "strong" 300 }, 301 { 302 "claim": "Six factors capture the breadth of AI coding assistant productivity impacts: self-sufficiency, frustration/cognitive load, task completion rate, ease of peer review, technical expertise, and ownership of work.", 303 "evidence": "Section 4.2 details these six factors derived from 11 semi-structured interviews, with supporting quotes from participants. Table 2 provides operationalization questions for each factor.", 304 "supported": "moderate" 305 }, 306 { 307 "claim": "Long-term factors (technical expertise and ownership of work) are largely unaddressed in existing frameworks like SPACE and DORA and in prior empirical studies.", 308 "evidence": "Table 3 shows that Factors 5 (long-term technical expertise) and 6 (ownership of work) have 'No' for SPACE/DORA coverage and 'N/A' for prior studies (Section 5.2).", 309 "supported": "moderate" 310 }, 311 { 312 "claim": "The impact of GitHub Copilot on productivity factors varies by use case, with different use cases showing different positive and negative impacts across the six factors.", 313 "evidence": "Figure 2 maps three use cases (implementing new features, improving existing code, generating tests/documentation) to the six factors, showing varied positive and negative impacts. 
Sections 4.3.1-4.3.3 provide qualitative support.", 314 "supported": "moderate" 315 } 316 ], 317 "methodology_tags": [ 318 "qualitative", 319 "observational" 320 ], 321 "key_findings": "A survey of 2989 developers at BNY Mellon reveals that 86% are satisfied with GitHub Copilot but about 60% report less than one hour of weekly time savings, with only a weak correlation (r=0.34) between satisfaction and time savings. Semi-structured interviews with 11 developers identify six productivity factors spanning development (self-sufficiency, cognitive load), deployment (task completion, peer review), and long-term impacts (technical expertise, ownership of work). The study highlights that long-term factors like skill development and code ownership are largely absent from existing productivity frameworks (SPACE, DORA) and prior empirical studies of AI coding assistants.", 322 "red_flags": [ 323 { 324 "flag": "Small interview sample for broad claims", 325 "detail": "Only 11 interviews from a single company are used to derive a six-factor productivity framework presented as broadly applicable. While qualitative research can use small samples, the diversity of perspectives within BNY Mellon may not capture the full range of AI productivity dimensions across different organizations, team sizes, or development cultures." 326 }, 327 { 328 "flag": "Company-affiliated authors studying company's tool deployment", 329 "detail": "Three of five authors are BNY Mellon employees, and the study evaluates productivity of AI tools deployed at BNY Mellon. While the paper does report mixed/negative findings, no conflict of interest statement or funding disclosure is present." 330 }, 331 { 332 "flag": "Self-reported time savings rather than measured", 333 "detail": "The paper acknowledges this limitation: 'we considered the use of direct objective metrics, like acceptance rates or chat acceptance rates, that do not rely on self-reports, but were unable to reliably obtain these metrics.' 
Self-reported time savings are known to be unreliable, as the paper itself notes (Section 5.1)." 334 }, 335 { 336 "flag": "No IRB or ethics approval mentioned", 337 "detail": "The study collects survey data from 2989 employees and conducts 11 interviews at their employer, but no IRB or ethics board approval is mentioned. This raises questions about participant protections, especially given the employer-employee dynamic." 338 } 339 ], 340 "cited_papers": [ 341 { 342 "title": "The Impact of AI on Developer Productivity: Evidence from GitHub Copilot", 343 "authors": ["Sida Peng", "Eirini Kalliamvakou", "Peter Cihon", "Mert Demirer"], 344 "year": 2023, 345 "arxiv_id": "2306.08516", 346 "relevance": "RCT studying causal impact of GitHub Copilot on developer productivity, a key baseline for this survey's productivity claims." 347 }, 348 { 349 "title": "Measuring GitHub Copilot's Impact on Productivity", 350 "authors": ["Albert Ziegler", "Eirini Kalliamvakou", "X. Alice Li", "Andrew Rice", "Devon Rifkin", "Shawn Simister", "Ganesh Sittampalam", "Edward Aftandilian"], 351 "year": 2024, 352 "relevance": "Empirical study of GitHub Copilot's productivity impact using self-reported efficiency and flow measures." 353 }, 354 { 355 "title": "The SPACE of Developer Productivity: There's more to it than you think", 356 "authors": ["Nicole Forsgren", "Margaret-Anne Storey", "Chandra Maddila", "Thomas Zimmermann", "Brian Houck", "Jenna Butler"], 357 "year": 2021, 358 "relevance": "Foundational developer productivity framework (SPACE) that this paper extends for AI coding assistants." 359 }, 360 { 361 "title": "Better Together? An Evaluation of AI-Supported Code Translation", 362 "authors": ["Justin D. Weisz", "Michael Muller", "Steven I. Ross"], 363 "year": 2022, 364 "relevance": "Evaluates variation in developer reliance on AI assistance, relevant to the self-sufficiency factor identified in this paper." 
365 }, 366 { 367 "title": "The RealHumanEval: Evaluating Large Language Models' Abilities to Support Programmers", 368 "authors": ["Hussein Mozannar", "Valerie Chen", "Mohammed Alsobay"], 369 "year": 2024, 370 "arxiv_id": "2404.02806", 371 "relevance": "Evaluates LLMs' ability to support programmers through human evaluation, directly relevant to AI productivity measurement." 372 }, 373 { 374 "title": "An empirical evaluation of GitHub copilot's code suggestions", 375 "authors": ["Nhan Nguyen", "Sarah Nadi"], 376 "year": 2022, 377 "relevance": "Empirical evaluation of AI code quality and reviewability, relevant to the peer review factor identified in this paper." 378 }, 379 { 380 "title": "The Impact of Generative AI on Collaborative Open-Source Software Development: Evidence from GitHub Copilot", 381 "authors": ["Fangchen Song", "Ashish Agarwal", "Wen Wen"], 382 "year": 2024, 383 "arxiv_id": "2401.01812", 384 "relevance": "Field study measuring objective impacts of GitHub Copilot on open-source development, complementary evidence to this survey's subjective measures." 385 }, 386 { 387 "title": "Modeling User Behavior and Costs in AI-Assisted Programming", 388 "authors": ["Hussein Mozannar", "Gagan Bansal", "Adam Fourney", "Eric Horvitz"], 389 "year": 2024, 390 "doi": "10.1145/3613904.3641936", 391 "relevance": "Models developer behavior and cognitive costs with AI assistants, directly relevant to the cognitive load factor." 392 }, 393 { 394 "title": "A large-scale survey on the usability of ai programming assistants: Successes and challenges", 395 "authors": ["Jenny T Liang", "Chenyang Yang", "Brad A Myers"], 396 "year": 2024, 397 "relevance": "Large-scale survey on AI programming assistant usability, complementary methodology to this paper's BNY Mellon survey." 398 }, 399 { 400 "title": "Is GitHub copilot a substitute for human pair-programming? 
an empirical study", 401 "authors": ["Saki Imai"], 402 "year": 2022, 403 "doi": "10.1145/3510454.3522684", 404 "relevance": "Empirical study comparing AI assistance to pair programming in terms of productivity, relevant to task completion metrics." 405 }, 406 { 407 "title": "What's DAT? Three Case Studies of Measuring Software Development Productivity at Meta With Diff Authoring Time", 408 "authors": ["Moritz Beller", "Amanda Park", "Karim Nakad"], 409 "year": 2025, 410 "arxiv_id": "2503.10977", 411 "relevance": "Introduces Diff Authoring Time as an objective productivity metric at Meta, relevant to operationalizing AI productivity measurement." 412 }, 413 { 414 "title": "Code with me or for me? how increasing ai automation transforms developer workflows", 415 "authors": ["Valerie Chen", "Ameet Talwalkar", "Robert Brennan", "Graham Neubig"], 416 "year": 2025, 417 "arxiv_id": "2507.08149", 418 "relevance": "Studies how AI automation levels affect developer workflows, extending the productivity framework from this paper to agentic tools." 419 }, 420 { 421 "title": "Examining the use and impact of an ai code assistant on developer productivity and experience in the enterprise", 422 "authors": ["Justin D Weisz", "Shraddha Vijay Kumar", "Michael Muller"], 423 "year": 2025, 424 "relevance": "Enterprise study of AI code assistant impact on developer productivity, directly comparable to this paper's BNY Mellon study." 425 } 426 ] 427 }