scan.json (24516B)
1 { 2 "paper": { 3 "title": "Dear Diary: A randomized controlled trial of Generative AI coding tools in the workplace", 4 "authors": [ 5 "Jenna Butler", 6 "Jina Suh", 7 "Sankeerti Haniyur", 8 "Constance Hadley" 9 ], 10 "year": 2024, 11 "venue": "arXiv", 12 "arxiv_id": "2410.18334" 13 }, 14 "scan_version": 3, 15 "active_modules": [], 16 "methodology_tags": [ 17 "rct", 18 "qualitative" 19 ], 20 "key_findings": "A 3-week RCT of GitHub Copilot at a large software company (N=106 final) found that first-time use significantly increased beliefs that GenAI tools are useful (p=0.001) and enjoyable (p<0.0001), but did not change trust in AI-generated code. No statistically significant changes were found in telemetry metrics (code changes, PRs, development time). 84% of treatment participants reported positive changes in daily work practices, and developers found unexpected uses such as replacing web search and creative ideation.", 21 "checklist": { 22 "artifacts": { 23 "code_released": { 24 "applies": true, 25 "answer": false, 26 "justification": "No source code or analysis scripts are released. The paper references supplemental survey materials on Zenodo [5] but no analysis code." 27 }, 28 "data_released": { 29 "applies": true, 30 "answer": false, 31 "justification": "No raw data (survey responses, diary entries, telemetry) is released. Only supplemental survey questions are shared via Zenodo." 32 }, 33 "environment_specified": { 34 "applies": true, 35 "answer": false, 36 "justification": "No environment specifications or software dependencies are provided for reproducing the analysis." 37 }, 38 "reproduction_instructions": { 39 "applies": true, 40 "answer": false, 41 "justification": "No reproduction instructions are provided. The supplemental material contains only survey instruments, not analysis procedures." 
42 } 43 }, 44 "statistical_methodology": { 45 "confidence_intervals_or_error_bars": { 46 "applies": true, 47 "answer": false, 48 "justification": "No confidence intervals or error bars are reported. Results are presented as point estimates with p-values only (e.g., 'average rating rose from 2.72 to 3.61')." 49 }, 50 "significance_tests": { 51 "applies": true, 52 "answer": true, 53 "justification": "Paired t-tests are used for before/after Likert comparisons (Section 4.2), chi-square for randomization balance, Kruskal-Wallis for diary distributions, and difference-in-differences for telemetry (Section 4.2.1)." 54 }, 55 "effect_sizes_reported": { 56 "applies": true, 57 "answer": false, 58 "justification": "No standardized effect sizes (Cohen's d, etc.) are reported. Raw before/after means (e.g., 2.72 to 3.61) and correlation coefficients (r=0.691 vs 0.606) are given, but no formal effect size measures are reported for the main treatment comparisons." 59 }, 60 "sample_size_justified": { 61 "applies": true, 62 "answer": false, 63 "justification": "No power analysis or sample size justification is provided. The paper acknowledges low power post-hoc in Table 2 (power=0.06 for CodeChanges) but did not plan the sample size in advance." 64 }, 65 "variance_reported": { 66 "applies": true, 67 "answer": false, 68 "justification": "Standard deviation is reported only for diary submission counts (SD=4.819). No variance measures are reported for the main outcome variables (Likert scales, telemetry metrics)." 69 } 70 }, 71 "evaluation_design": { 72 "baselines_included": { 73 "applies": true, 74 "answer": true, 75 "justification": "The RCT includes a control group (no Copilot) and a continuing group (already using Copilot), providing baseline comparisons for the treatment group." 
76 }, 77 "baselines_contemporary": { 78 "applies": true, 79 "answer": true, 80 "justification": "The control condition (no GenAI tools) is the appropriate contemporary baseline for evaluating the introduction of a new tool." 81 }, 82 "ablation_study": { 83 "applies": false, 84 "answer": false, 85 "justification": "Not applicable — this is an RCT evaluating a single tool (GitHub Copilot), not a multi-component system." 86 }, 87 "multiple_metrics": { 88 "applies": true, 89 "answer": true, 90 "justification": "Multiple metrics are used: Likert belief scales (usefulness, trust, enjoyment), telemetry (code changes, PRs, development minutes, email minutes, build minutes), and qualitative diary coding." 91 }, 92 "human_evaluation": { 93 "applies": true, 94 "answer": true, 95 "justification": "The study centers on human evaluation — surveys, diary entries, and qualitative coding of developer responses about their experience with the tools." 96 }, 97 "held_out_test_set": { 98 "applies": false, 99 "answer": false, 100 "justification": "Not applicable — this is an RCT with human participants, not a benchmark evaluation with train/test splits." 101 }, 102 "per_category_breakdown": { 103 "applies": true, 104 "answer": true, 105 "justification": "Results are broken down by prior experience (experienced vs. inexperienced users), by group (treatment/control/continuing), and by demographic categories in Table 1." 106 }, 107 "failure_cases_discussed": { 108 "applies": true, 109 "answer": true, 110 "justification": "Section 4.3.2 'Challenges' discusses failure cases extensively: incorrect but plausible code, validation overhead negating productivity, lack of language support, with specific diary verbatims." 111 }, 112 "negative_results_reported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The paper reports null telemetry results (Table 2: no statistically significant DiD results for any metric), trust not changing, and 16% negative responses about work changes." 
116 } 117 }, 118 "claims_and_evidence": { 119 "abstract_claims_supported": { 120 "applies": true, 121 "answer": true, 122 "justification": "Abstract claims about increased usefulness/enjoyment perception (supported by paired t-tests), unchanged trustworthiness (supported by non-significant changes), unexpected uses (supported by diary coding), and 84%/66% positive change rates (supported by qualitative coding) are all backed by results." 123 }, 124 "causal_claims_justified": { 125 "applies": true, 126 "answer": false, 127 "justification": "The RCT design supports causal claims, but 25% non-compliance in both groups (some control participants used GenAI tools, and some treatment participants did not) and high attrition (228→106) undermine causal inference. No intention-to-treat analysis is performed; only compliant participants are analyzed, introducing selection bias." 128 }, 129 "generalization_bounded": { 130 "applies": true, 131 "answer": true, 132 "justification": "Section 5 (Limitations) explicitly bounds generalization to a single company, acknowledges the population characteristics, and discusses the single-company limitation with reference to Flyvbjerg's case study methodology." 133 }, 134 "alternative_explanations_discussed": { 135 "applies": true, 136 "answer": true, 137 "justification": "The paper discusses several alternative explanations: the 11-week tipping point for proficiency, confounding factors in telemetry (sprint planning, oncall, vacation), non-compliance, and the hypothesis that AI news cycle (not tool use) drove increased self-confidence in unique skills." 138 }, 139 "proxy_outcome_distinction": { 140 "applies": true, 141 "answer": false, 142 "justification": "The paper measures Likert scale beliefs and telemetry proxies (lines of code, PRs) but does not explicitly discuss the gap between these proxies and actual productivity or developer experience. Telemetry is presented as measuring 'impact' without discussing what these proxies actually capture." 
143 } 144 }, 145 "setup_transparency": { 146 "model_versions_specified": { 147 "applies": true, 148 "answer": false, 149 "justification": "The paper refers to 'Github Copilot' throughout without specifying which version, model backend, or snapshot was used. Given Copilot changed significantly over 2022-2023, this is a meaningful omission." 150 }, 151 "prompts_provided": { 152 "applies": false, 153 "answer": false, 154 "justification": "The paper evaluates GitHub Copilot as a black-box tool used by developers naturally — there are no researcher-designed prompts to report." 155 }, 156 "hyperparameters_reported": { 157 "applies": false, 158 "answer": false, 159 "justification": "Not applicable — Copilot is used as a black-box developer tool with default settings; there are no researcher-controlled hyperparameters." 160 }, 161 "scaffolding_described": { 162 "applies": false, 163 "answer": false, 164 "justification": "Copilot is evaluated as a third-party black-box tool. The authors cannot describe internal scaffolding." 165 }, 166 "data_preprocessing_documented": { 167 "applies": true, 168 "answer": false, 169 "justification": "The paper describes high-level filtering (228→106) but does not document the open coding procedure for qualitative data, inter-rater reliability for coding, or how telemetry was preprocessed for the DiD analysis." 170 } 171 }, 172 "limitations_and_scope": { 173 "limitations_section_present": { 174 "applies": true, 175 "answer": true, 176 "justification": "Section 5 is a dedicated 'Limitations' section discussing self-report bias, unvalidated survey instruments, and single-company generalizability." 177 }, 178 "threats_to_validity_specific": { 179 "applies": true, 180 "answer": true, 181 "justification": "The limitations section identifies specific threats: self-reported data introducing social desirability and recall bias, unvalidated survey instruments (novel questions with no prior validation), and single-company population." 
182 }, 183 "scope_boundaries_stated": { 184 "applies": true, 185 "answer": true, 186 "justification": "The paper explicitly states it was conducted at a single company with specific demographics, acknowledges the survey questions were not previously validated, and discusses what the study timeframe may not capture (citing the 11-week tipping point)." 187 } 188 }, 189 "data_integrity": { 190 "raw_data_available": { 191 "applies": true, 192 "answer": false, 193 "justification": "Raw survey responses, diary entries, and telemetry data are not released. Only supplemental survey instruments are available on Zenodo." 194 }, 195 "data_collection_described": { 196 "applies": true, 197 "answer": true, 198 "justification": "Data collection is described in detail: intake surveys (Section 3.4), daily diary via Teams messages (Section 3.6), closing survey (Section 3.7), and telemetry collection with consent (Section 3.8)." 199 }, 200 "recruitment_methods_described": { 201 "applies": true, 202 "answer": true, 203 "justification": "Section 3.3 describes recruitment: randomly chosen from 10,000 engineers, 337 completed survey, 269 agreed, 228 after country filtering, 106 final compliant population." 204 }, 205 "data_pipeline_documented": { 206 "applies": true, 207 "answer": true, 208 "justification": "The pipeline is documented: 10,000 randomly chosen → 337 completed survey → 269 consented → 228 after country exclusion → 106 final (compliant + completed diary + completed exit survey). Open coding is mentioned for qualitative analysis." 209 } 210 }, 211 "conflicts_of_interest": { 212 "funding_disclosed": { 213 "applies": true, 214 "answer": false, 215 "justification": "No funding statement is provided. Three of four authors are Microsoft employees, but there is no explicit funding disclosure." 
216 }, 217 "affiliations_disclosed": { 218 "applies": true, 219 "answer": true, 220 "justification": "Author affiliations are clearly listed: three authors at Microsoft (Redmond, WA) and one at Institute for Work Life (Boston, MA)." 221 }, 222 "funder_independent_of_outcome": { 223 "applies": true, 224 "answer": false, 225 "justification": "Three authors are Microsoft employees evaluating GitHub Copilot, a Microsoft product. Microsoft has a direct financial interest in positive Copilot findings. This is not disclosed as a conflict." 226 }, 227 "financial_interests_declared": { 228 "applies": true, 229 "answer": false, 230 "justification": "No competing interests or financial interests statement is present. Microsoft employees evaluating a Microsoft product without any COI declaration." 231 } 232 }, 233 "contamination": { 234 "training_cutoff_stated": { 235 "applies": false, 236 "answer": false, 237 "justification": "This is an RCT studying developer behavior and beliefs, not evaluating a model's capability on a benchmark." 238 }, 239 "train_test_overlap_discussed": { 240 "applies": false, 241 "answer": false, 242 "justification": "Not applicable — this study does not evaluate model performance on benchmark tasks." 243 }, 244 "benchmark_contamination_addressed": { 245 "applies": false, 246 "answer": false, 247 "justification": "Not applicable — no benchmark evaluation is performed." 248 } 249 }, 250 "human_studies": { 251 "pre_registered": { 252 "applies": true, 253 "answer": false, 254 "justification": "No pre-registration is mentioned. No link to OSF, AsPredicted, or any registry." 
255 }, 256 "irb_or_ethics_approval": { 257 "applies": true, 258 "answer": true, 259 "justification": "The Acknowledgments state: 'The ethics for this study were reviewed and approved by the Microsoft Research Institutional Review Board (MSRIRB), which is an IRB federally registered with the United States Department of Health & Human Services.'" 260 }, 261 "demographics_reported": { 262 "applies": true, 263 "answer": true, 264 "justification": "Section 3.3 reports gender, management level, seniority (junior/senior/principal), primary programming language, and Table 1 breaks down demographics by randomization group." 265 }, 266 "inclusion_exclusion_criteria": { 267 "applies": true, 268 "answer": true, 269 "justification": "Inclusion criteria: software engineers at the company, in allowed countries. Exclusion: participants not in allowed countries were removed. Final population required completing ≥1 diary, exit survey, and treatment compliance." 270 }, 271 "randomization_described": { 272 "applies": true, 273 "answer": true, 274 "justification": "Section 3.5 describes block randomization based on gender, initial perception of AI tools ('I like AI coding tools' and 'I trust AI coding tools'), with chi-square verification of balance." 275 }, 276 "blinding_described": { 277 "applies": true, 278 "answer": false, 279 "justification": "No mention of blinding. Participants knew which group they were in (treatment received Copilot access, control was told not to use GenAI). No discussion of whether this knowledge affected responses." 280 }, 281 "attrition_reported": { 282 "applies": true, 283 "answer": true, 284 "justification": "Attrition is documented: 228 intake → 106 final population (53% attrition). 25% non-compliance in both groups is also reported. However, reasons for dropout beyond non-compliance are not detailed." 
285 } 286 }, 287 "cost_and_practicality": { 288 "inference_cost_reported": { 289 "applies": false, 290 "answer": false, 291 "justification": "This is an RCT studying developer beliefs, not proposing a method with inference costs." 292 }, 293 "compute_budget_stated": { 294 "applies": false, 295 "answer": false, 296 "justification": "This is a human subjects study, not a computational experiment requiring compute budget reporting." 297 } 298 } 299 }, 300 "claims": [ 301 { 302 "claim": "Use of GitHub Copilot significantly increases developers' belief that GenAI tools are useful (from 2.93 to 3.51, p=0.001) and enjoyable (from 2.72 to 3.61, p<0.0001).", 303 "evidence": "Section 4.2: paired t-tests on treatment group Likert responses before and after 3-week intervention.", 304 "supported": "moderate" 305 }, 306 { 307 "claim": "Developers' trust in AI-generated code did not change after sustained use.", 308 "evidence": "Section 4.2: no statistically significant changes in trust-related Likert items.", 309 "supported": "moderate" 310 }, 311 { 312 "claim": "No statistically significant differences in developer telemetry (code changes, PRs, development time) between treatment and control groups.", 313 "evidence": "Table 2: DiD analysis showing non-significant p-values for all metrics. 
Power analysis shows most metrics were severely underpowered (CodeChanges power=0.06).", 314 "supported": "weak" 315 }, 316 { 317 "claim": "84% of treatment participants reported positive changes in daily work practices.", 318 "evidence": "Section 4.4: open coding of 99 verbatims, 84% of 129 coded responses were positive.", 319 "supported": "moderate" 320 }, 321 { 322 "claim": "Developers used GenAI tools as a replacement for web search, an unexpected use case not anticipated in intake surveys.", 323 "evidence": "Section 4.3.1: diary verbatims showing developers using Copilot/chat for searching information instead of Google/Stack Overflow.", 324 "supported": "moderate" 325 } 326 ], 327 "red_flags": [ 328 { 329 "flag": "Company evaluating own product", 330 "detail": "Three of four authors are Microsoft employees evaluating GitHub Copilot, a Microsoft product. No conflict of interest is declared despite the obvious financial stake in positive findings." 331 }, 332 { 333 "flag": "High attrition with per-protocol analysis only", 334 "detail": "53% attrition (228→106) and 25% non-compliance in both groups. Only compliant participants are analyzed (per-protocol), not intention-to-treat, which introduces selection bias favoring positive results." 335 }, 336 { 337 "flag": "Severely underpowered telemetry analysis", 338 "detail": "Table 2 shows power as low as 0.06 for CodeChanges. The null telemetry results cannot be interpreted as 'no effect' — the study simply lacked power to detect effects." 339 }, 340 { 341 "flag": "No blinding", 342 "detail": "Participants knew their group assignment. Treatment group received special instructions and daily prompts about AI tool use, creating demand characteristics that could inflate self-reported satisfaction and usefulness." 
343 }, 344 { 345 "flag": "Unvalidated survey instruments", 346 "detail": "The paper acknowledges the survey questions were original and not previously validated, which may affect reliability and construct validity of the belief measurements." 347 }, 348 { 349 "flag": "Short treatment duration", 350 "detail": "3-week intervention is short relative to the 11-week 'tipping point' the paper itself cites. The novelty effect cannot be distinguished from sustained value at this timeframe." 351 } 352 ], 353 "cited_papers": [ 354 { 355 "title": "The Impact of AI on Developer Productivity: Evidence from GitHub Copilot", 356 "authors": [ 357 "Sida Peng", 358 "Eirini Kalliamvakou", 359 "Peter Cihon", 360 "Mert Demirer" 361 ], 362 "year": 2023, 363 "arxiv_id": "2302.06590", 364 "relevance": "Seminal Copilot RCT showing 55% productivity gain in lab setting — the study this paper builds on and contrasts with." 365 }, 366 { 367 "title": "The Impact of AI Tool on Engineering at ANZ Bank: An Empirical Study on GitHub Copilot within Corporate Environment", 368 "authors": [ 369 "Sayan Chatterjee", 370 "Ching Louis Liu", 371 "Gareth Rowland", 372 "Tim Hogarth" 373 ], 374 "year": 2024, 375 "relevance": "Corporate Copilot productivity study finding 42.36% improvement, relevant to real-world GenAI tool evaluation." 376 }, 377 { 378 "title": "Asleep at the Keyboard? Assessing the Security of GitHub Copilot's Code Contributions", 379 "authors": [ 380 "Hammond Pearce", 381 "Baleegh Ahmad", 382 "Benjamin Tan", 383 "Brendan Dolan-Gavitt", 384 "Ramesh Karri" 385 ], 386 "year": 2022, 387 "doi": "10.1109/SP46214.2022.9833571", 388 "relevance": "Found ~40% of Copilot-generated programs contained security vulnerabilities — key evidence on AI code quality risks." 389 }, 390 { 391 "title": "GitHub Copilot AI pair programmer: Asset or Liability?", 392 "authors": [ 393 "Arghavan Moradi Dakhel", 394 "Vahid Majdinasab", 395 "Amin Nikanjam", 396 "Foutse Khomh", 397 "Michel C. 
Desmarais", 398 "Zhen Ming (Jack) Jiang" 399 ], 400 "year": 2023, 401 "doi": "10.1016/j.jss.2023.111734", 402 "relevance": "Compared GenAI vs human solutions to fundamental coding problems, finding humans still outperform but AI bugs are easier to fix." 403 }, 404 { 405 "title": "Taking Flight with Copilot: Early insights and opportunities of AI-powered pair-programming tools", 406 "authors": [ 407 "Christian Bird", 408 "Denae Ford", 409 "Thomas Zimmermann", 410 "Nicole Forsgren", 411 "Eirini Kalliamvakou", 412 "Travis Lowdermilk", 413 "Idan Gazit" 414 ], 415 "year": 2023, 416 "doi": "10.1145/3582083", 417 "relevance": "Early qualitative insights on Copilot adoption and AI pair programming in practice." 418 }, 419 { 420 "title": "Using AI-Based Coding Assistants in Practice: State of Affairs, Perceptions, and Ways Forward", 421 "authors": [ 422 "Agnia Sergeyuk", 423 "Yaroslav Golubev", 424 "Timofey Bryksin", 425 "Iftekhar Ahmed" 426 ], 427 "year": 2024, 428 "relevance": "Survey of developer perceptions of AI coding assistants, relevant to beliefs and adoption research." 429 }, 430 { 431 "title": "Is GitHub copilot a substitute for human pair-programming? An empirical study", 432 "authors": [ 433 "Saki Imai" 434 ], 435 "year": 2022, 436 "doi": "10.1145/3510454.3522684", 437 "relevance": "Compared Copilot vs human pair programming, finding Copilot increases code quantity but not quality." 438 }, 439 { 440 "title": "Transforming Software Development: Evaluating the Efficiency and Challenges of GitHub Copilot in Real-World Projects", 441 "authors": [ 442 "Ruchika Pandey", 443 "Prabhat Singh", 444 "Raymond Wei", 445 "Shaila Shankar" 446 ], 447 "year": 2024, 448 "relevance": "Evaluated Copilot in real codebases finding 33-50% time savings depending on task complexity." 
449 }, 450 { 451 "title": "Practices and Challenges of Using GitHub Copilot: An Empirical Study", 452 "authors": [ 453 "Beiqi Zhang", 454 "Peng Liang", 455 "Xiyu (Thomas) Zhou", 456 "Aakash Ahmad", 457 "Muhammad Waseem" 458 ], 459 "year": 2023, 460 "relevance": "Analyzed Stack Overflow and GitHub Discussions for Copilot usage patterns, benefits, and limitations." 461 } 462 ], 463 "engagement_factors": { 464 "practical_relevance": { 465 "score": 1, 466 "justification": "Findings about Copilot adoption barriers and use cases are interesting but don't give practitioners a new technique or tool to apply." 467 }, 468 "surprise_contrarian": { 469 "score": 2, 470 "justification": "The null telemetry result — no measurable productivity gain despite self-reported enthusiasm — directly undermines the widely-cited '55% faster' claim." 471 }, 472 "fear_safety": { 473 "score": 0, 474 "justification": "No safety, security, or risk angle beyond brief mentions of AI-generated code bugs." 475 }, 476 "drama_conflict": { 477 "score": 2, 478 "justification": "Microsoft employees finding no objective productivity gain from their own product, while the company markets it as transformative, creates an uncomfortable tension." 479 }, 480 "demo_ability": { 481 "score": 0, 482 "justification": "This is a workplace study with no code, tool, or demo to try." 483 }, 484 "brand_recognition": { 485 "score": 3, 486 "justification": "The study is directly about GitHub Copilot (millions of users) and was conducted at Microsoft — two of the most recognized names in developer tools." 487 } 488 } 489 }