scan.json (26473B)
1 { 2 "paper": { 3 "title": "Generative AI for Pull Request Descriptions: Adoption, Impact, and Developer Interventions", 4 "authors": [ 5 "Tao Xiao", 6 "Hideaki Hata", 7 "Christoph Treude", 8 "Kenichi Matsumoto" 9 ], 10 "year": 2024, 11 "venue": "Proc. ACM Softw. Eng.", 12 "arxiv_id": "2402.08967", 13 "doi": "10.1145/3643773" 14 }, 15 "scan_version": 3, 16 "active_modules": [], 17 "methodology_tags": ["observational", "qualitative"], 18 "key_findings": "This study of 18,256 GitHub PRs finds growing adoption of Copilot for Pull Requests during its early access period (March–August 2023), with copilot:summary as the most popular marker tag. Using propensity score weighting with 17 covariates and entropy balancing, the authors estimate Copilot for PRs reduces review time by 19.3 hours and increases the odds of a PR being merged by a factor of 1.57 (OR=1.57, 95% CI 1.35–1.84). Qualitative analysis of 1,437 revisions reveals developers frequently complement AI-generated descriptions with templates and links, while also partially deleting (22.9%) or refining (20.8%) Copilot's suggestions.", 19 "checklist": { 20 "artifacts": { 21 "code_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "A replication package is provided on GitHub (https://github.com/NAIST-SE/CopilotForPRsEarlyAdoption) and archived on Zenodo (doi: 10.5281/zenodo.10656106). The Data Availability section states it includes 'lists of studied PRs, features of PRs, coding results for RQ3, and scripts.'" 25 }, 26 "data_released": { 27 "applies": true, 28 "answer": true, 29 "justification": "The replication package includes lists of studied PRs from GitHub (both Copilot and non-Copilot), PR features used in RQ2, and coding results for RQ3. The underlying GitHub PR data is publicly accessible." 30 }, 31 "environment_specified": { 32 "applies": true, 33 "answer": false, 34 "justification": "No requirements.txt, Dockerfile, or environment specification is mentioned in the paper. 
The replication package contents described do not include environment setup details." 35 }, 36 "reproduction_instructions": { 37 "applies": true, 38 "answer": false, 39 "justification": "No step-by-step reproduction instructions are described in the paper. The replication package includes scripts but no documented procedure for reproducing the main results." 40 } 41 }, 42 "statistical_methodology": { 43 "confidence_intervals_or_error_bars": { 44 "applies": true, 45 "answer": true, 46 "justification": "The odds ratio for RQ2.2 is reported with a 95% confidence interval (1.35 to 1.84). Table 4 reports standard errors for all regression coefficients." 47 }, 48 "significance_tests": { 49 "applies": true, 50 "answer": true, 51 "justification": "Table 4 reports p-values for all regression coefficients. The treatment effect is significant at p=1.64e-17 for review time (RQ2.1) and p<0.001 for merge likelihood (RQ2.2). Inter-rater agreement is reported via Cohen's kappa." 52 }, 53 "effect_sizes_reported": { 54 "applies": true, 55 "answer": true, 56 "justification": "The Average Treatment Effect on the Treated (ATT) of -19.3 hours is reported for review time. For merge likelihood, an odds ratio of 1.57 is reported with baseline context (84% vs 71% merge rates). These provide meaningful magnitude context." 57 }, 58 "sample_size_justified": { 59 "applies": true, 60 "answer": false, 61 "justification": "No power analysis or formal sample size justification is provided. The study uses all available data from the observation period (18,256 Copilot PRs, 54,188 non-Copilot PRs) but does not discuss whether this is adequate for the claimed effects." 62 }, 63 "variance_reported": { 64 "applies": true, 65 "answer": true, 66 "justification": "Standard errors are reported for all regression coefficients in Table 4. For the qualitative analysis, inter-rater reliability is quantified via Cohen's kappa (0.64 for RQ3.1, 0.62 for RQ3.2). 
These are the appropriate uncertainty measures for this study design." 67 } 68 }, 69 "evaluation_design": { 70 "baselines_included": { 71 "applies": true, 72 "answer": true, 73 "justification": "The study compares 17,177 merged/closed Copilot PRs (treatment) against 50,695 merged/closed non-Copilot PRs (control) from the same 146 GitHub repositories during the same time period." 74 }, 75 "baselines_contemporary": { 76 "applies": true, 77 "answer": true, 78 "justification": "Control PRs are from the same time period (March–August 2023) and same repositories as the treatment PRs, ensuring temporal comparability." 79 }, 80 "ablation_study": { 81 "applies": false, 82 "answer": false, 83 "justification": "This is an observational study of a third-party service, not a system with decomposable components. There is no system to ablate." 84 }, 85 "multiple_metrics": { 86 "applies": true, 87 "answer": true, 88 "justification": "Two distinct outcome variables are used: review time in hours (RQ2.1) and binary merge likelihood (RQ2.2). The qualitative analysis (RQ3) adds complementary findings via categorical coding." 89 }, 90 "human_evaluation": { 91 "applies": true, 92 "answer": true, 93 "justification": "RQ3 involves qualitative analysis by three human raters who independently coded PR revisions. Two rounds of coding achieved substantial inter-rater agreement (κ=0.64 for RQ3.1, κ=0.62 for RQ3.2)." 94 }, 95 "held_out_test_set": { 96 "applies": false, 97 "answer": false, 98 "justification": "This is an observational study, not a predictive modeling task. There is no train/test split or held-out evaluation set." 99 }, 100 "per_category_breakdown": { 101 "applies": true, 102 "answer": true, 103 "justification": "Results are broken down by marker tag type (Table 3), 13 categories of supplementary information (Table 5), and 7 categories of editorial actions with subcategories (Table 6). Table 4 breaks down regression coefficients by language, purpose, and other covariates." 
104 }, 105 "failure_cases_discussed": { 106 "applies": true, 107 "answer": true, 108 "justification": "The qualitative analysis documents cases where developers deleted Copilot output (22.9%), replaced it (14.7%), or explicitly rejected it (e.g., developer commenting 'Nope, you didn't get it this time'). 24 false positives in the coding are also reported (5.8%)." 109 }, 110 "negative_results_reported": { 111 "applies": true, 112 "answer": false, 113 "justification": "All quantitative results show positive effects of Copilot for PRs (reduced review time, higher merge rate, growing adoption). No failed analyses, abandoned approaches, or configurations that didn't work are discussed." 114 } 115 }, 116 "claims_and_evidence": { 117 "abstract_claims_supported": { 118 "applies": true, 119 "answer": true, 120 "justification": "The abstract's three claims are supported: (1) growing adoption shown in Figures 3–4 and Table 3; (2) reduced review time (-19.3h) and higher merge likelihood (OR=1.57) in Table 4; (3) developer adaptations documented in Tables 5–6 with 13 categories of supplementary information and 7 editorial action types." 121 }, 122 "causal_claims_justified": { 123 "applies": true, 124 "answer": true, 125 "justification": "The paper explicitly frames RQ2 as causal inference and uses propensity score weighting with entropy balancing across 17 covariates. Figure 2 demonstrates covariate balance achieved post-weighting (all absolute mean differences <0.10). This is a recognized quasi-experimental design, though the paper acknowledges potential unobserved confounders in Section 6.2." 126 }, 127 "generalization_bounded": { 128 "applies": true, 129 "answer": true, 130 "justification": "Section 6.2 explicitly states: 'our results are not universally applicable to the broader open-source developer community, but are more pertinent to these early adopters. 
Developers who are less eager to adopt new technologies might use Copilot for PRs less or differently.'" 131 }, 132 "alternative_explanations_discussed": { 133 "applies": true, 134 "answer": true, 135 "justification": "Section 6.2 discusses specific threats: bot detection methods might miss bots (construct validity), 17 confounders are controlled but others may exist (internal validity), early adopters may differ from general developers (external validity). The PSW analysis explicitly addresses 17 potential confounding variables." 136 }, 137 "proxy_outcome_distinction": { 138 "applies": true, 139 "answer": false, 140 "justification": "The paper defines 'review time' as 'the time interval between the PR creation date and closed date in hours' but does not discuss that this measures calendar time to close, not actual review effort. Time-to-close includes idle time, timezone effects, and queuing time, which are distinct from active review engagement." 141 } 142 }, 143 "setup_transparency": { 144 "model_versions_specified": { 145 "applies": true, 146 "answer": false, 147 "justification": "Section 2.1 mentions 'the GPT-4 model by OpenAI' without specifying a version (e.g., gpt-4-0314 or gpt-4-0613). Since the paper studies the outputs of this model via Copilot for PRs, the exact version affects the study's findings." 148 }, 149 "prompts_provided": { 150 "applies": false, 151 "answer": false, 152 "justification": "The paper does not prompt any model directly. It studies an external service (Copilot for PRs) where the interface is marker tags (copilot:summary, copilot:walkthrough, etc.), which are fully specified." 153 }, 154 "hyperparameters_reported": { 155 "applies": true, 156 "answer": false, 157 "justification": "The underlying LLM hyperparameters (temperature, sampling strategy, etc.) used by Copilot for PRs are not reported. While the authors cannot control these, they affect the generated content being studied, and this limitation is not acknowledged." 
158 }, 159 "scaffolding_described": { 160 "applies": false, 161 "answer": false, 162 "justification": "The paper evaluates Copilot for PRs as a third-party black-box service. The authors cannot be expected to describe GitHub's internal scaffolding for the copilot4prs bot." 163 }, 164 "data_preprocessing_documented": { 165 "applies": true, 166 "answer": true, 167 "justification": "Section 3 thoroughly documents the data pipeline: initial GraphQL search yielding 18,858 PRs → removing false positives → excluding obsolete uses (18,322) → filtering bot PRs (18,256). For RQ3: 46,700 revisions → 18,486 post-Copilot edits → 4,391 after marker tag reapplication filter → 1,437 revisions from 311 PRs with substantive edits." 168 } 169 }, 170 "limitations_and_scope": { 171 "limitations_section_present": { 172 "applies": true, 173 "answer": true, 174 "justification": "Section 6.2 'Threats to validity' provides substantive discussion organized into construct validity, internal validity, and external validity subsections." 175 }, 176 "threats_to_validity_specific": { 177 "applies": true, 178 "answer": true, 179 "justification": "Specific threats are discussed: bot detection methods 'might not capture every true negative' (construct); 'there may be other confounding variables not accounted for' in the 17-covariate model (internal); early adopters have 'an inherent threat to the external validity' and results are 'more pertinent to these early adopters' (external)." 180 }, 181 "scope_boundaries_stated": { 182 "applies": true, 183 "answer": true, 184 "justification": "Section 6.2 explicitly states: 'our results are not universally applicable to the broader open-source developer community, but are more pertinent to these early adopters. 
Developers who are less eager to adopt new technologies might use Copilot for PRs less or differently compared to the early adopters studied in this work.'" 185 } 186 }, 187 "data_integrity": { 188 "raw_data_available": { 189 "applies": true, 190 "answer": true, 191 "justification": "The replication package (GitHub + Zenodo doi:10.5281/zenodo.10656106) includes 'lists of studied PRs from GitHub, both with and without the use of Copilot for PRs,' PR features for RQ2, and coding results for RQ3." 192 }, 193 "data_collection_described": { 194 "applies": true, 195 "answer": true, 196 "justification": "Section 3 describes the collection procedure in detail: GitHub GraphQL search for PRs containing 'Generated by Copilot', time period (up to 31 August 2023), the strategy for handling the 1000-result API limit by halving time periods, and filtering for copilot4prs bot edits." 197 }, 198 "recruitment_methods_described": { 199 "applies": true, 200 "answer": true, 201 "justification": "The paper describes how PRs and repositories were identified: GraphQL search for specific phrases, filtering by bot edits, time-period constraints, and bot exclusion using methods from Golzadeh et al. (2022). The selection process for both treatment (18,256 PRs) and control (54,188 PRs) groups is documented." 202 }, 203 "data_pipeline_documented": { 204 "applies": true, 205 "answer": true, 206 "justification": "The full pipeline is documented with counts at each stage: 18,858 initial PRs → false positive removal → 18,322 non-obsolete → 18,256 after bot filtering. For RQ3: 46,700 revisions → 18,486 post-Copilot → 4,391 after reapplication filter → 1,437 from 311 PRs. Exclusion criteria are stated at each stage." 
207 } 208 }, 209 "conflicts_of_interest": { 210 "funding_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "The Acknowledgments section discloses funding: JSPS Grant-in-Aid for JSPS Fellows JP23KJ1589, JSPS KAKENHI Grant Numbers JP20H05706, and JST PRESTO Grant Number JPMJPR22P6." 214 }, 215 "affiliations_disclosed": { 216 "applies": true, 217 "answer": true, 218 "justification": "All authors' affiliations are listed: Nara Institute of Science and Technology, Shinshu University, and Singapore Management University. None are affiliated with GitHub/Microsoft, so there is no product-evaluator conflict." 219 }, 220 "funder_independent_of_outcome": { 221 "applies": true, 222 "answer": true, 223 "justification": "Funding comes from JSPS and JST (Japanese government research agencies), which have no financial interest in the success or failure of GitHub Copilot for PRs." 224 }, 225 "financial_interests_declared": { 226 "applies": true, 227 "answer": false, 228 "justification": "No competing interests or financial interests statement is present in the paper. The absence of a formal declaration is not equivalent to the absence of conflicts." 229 } 230 }, 231 "contamination": { 232 "training_cutoff_stated": { 233 "applies": false, 234 "answer": false, 235 "justification": "This paper does not evaluate a pre-trained model's capability on a benchmark. It is an observational study of the adoption and impact of a third-party AI service on software development processes." 236 }, 237 "train_test_overlap_discussed": { 238 "applies": false, 239 "answer": false, 240 "justification": "Not applicable — the paper is an observational mining study of GitHub PRs, not a benchmark evaluation of a model." 241 }, 242 "benchmark_contamination_addressed": { 243 "applies": false, 244 "answer": false, 245 "justification": "Not applicable — no benchmark evaluation is performed in this study." 
246 } 247 }, 248 "human_studies": { 249 "pre_registered": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants are involved. The study mines public GitHub PR data and the qualitative coding is performed by the authors themselves, not study participants." 253 }, 254 "irb_or_ethics_approval": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants. The study analyzes publicly available GitHub data." 258 }, 259 "demographics_reported": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants. Developer characteristics are described at the repository level (e.g., repo age), not at the individual level." 263 }, 264 "inclusion_exclusion_criteria": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants. Inclusion/exclusion criteria for PRs and repositories are described in Section 3." 268 }, 269 "randomization_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants. This is an observational study with no experimental assignment of participants." 273 }, 274 "blinding_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants and not an experimental study. Blinding is not applicable." 278 }, 279 "attrition_reported": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants. Data attrition through filtering is documented in the data pipeline (Section 3)." 283 } 284 }, 285 "cost_and_practicality": { 286 "inference_cost_reported": { 287 "applies": false, 288 "answer": false, 289 "justification": "This is an observational study of an existing service, not a proposed method. The paper does not invoke any model or incur inference costs." 290 }, 291 "compute_budget_stated": { 292 "applies": false, 293 "answer": false, 294 "justification": "This is an observational study. 
The computational requirements (GraphQL queries, regression analysis) are negligible and not the focus of the work." 295 } 296 } 297 }, 298 "claims": [ 299 { 300 "claim": "Copilot for PRs shows growing adoption, with copilot:summary as the most popular marker tag (13,231 instances across 18,256 PRs).", 301 "evidence": "Figure 3 shows cumulative time-series of PRs growing steadily from March to August 2023. Table 3 shows 13,231 copilot:summary instances. 50 repositories have >50% of PRs powered by Copilot. Section 5.1.", 302 "supported": "strong" 303 }, 304 { 305 "claim": "Copilot for PRs reduces code review time by an average of 19.3 hours.", 306 "evidence": "Table 4 reports ATT of -19.3 hours (std error 2.27, p=1.64e-17) from propensity score weighted linear regression with entropy balancing across 17 covariates. Median treatment review time is 12.17h vs 16.09h control. Section 5.2.", 307 "supported": "moderate" 308 }, 309 { 310 "claim": "PRs generated with Copilot for PRs are approximately 1.57 times more likely to be merged (OR=1.57, 95% CI 1.35–1.84, p<0.001).", 311 "evidence": "Propensity score weighted logistic regression. Raw merge rates: 84% (Copilot) vs 71% (non-Copilot). Section 5.2.", 312 "supported": "moderate" 313 }, 314 { 315 "claim": "Developers frequently complement AI-generated descriptions with templates (22.8%) and associated links (22.7%), while the most common editorial action is partial deletion (22.9%).", 316 "evidence": "Qualitative coding of 1,437 revisions from 311 PRs by three independent raters achieving κ=0.64 (RQ3.1) and κ=0.62 (RQ3.2). Tables 5 and 6 detail frequencies. Section 5.3.", 317 "supported": "strong" 318 } 319 ], 320 "red_flags": [ 321 { 322 "flag": "Selection bias in treatment group", 323 "detail": "Early adopters of Copilot for PRs are likely more tech-savvy, productive developers in better-maintained repositories. 
This systematic difference could confound the treatment effect even after propensity score adjustment, as unobserved characteristics (developer skill, project management quality) may drive both Copilot adoption and faster reviews." 324 }, 325 { 326 "flag": "No sensitivity analysis for unobserved confounders", 327 "detail": "The causal claims rely on propensity score weighting, which assumes all relevant confounders are observed. No Rosenbaum bounds, E-value analysis, or other sensitivity analysis is performed to assess how robust the causal estimates are to potential unobserved confounding." 328 }, 329 { 330 "flag": "Large pre-weighting covariate imbalance", 331 "detail": "The median description length differs by 5× between groups (1,825 vs 343 characters, Table 2), indicating Copilot PRs are fundamentally different from non-Copilot PRs. While entropy balancing adjusts for this, extreme initial imbalances raise concerns about extrapolation and the reliability of weighted estimates." 332 }, 333 { 334 "flag": "ATT magnitude seems implausibly large", 335 "detail": "The 19.3-hour reduction in review time is larger than the control group median (16.09 hours) and treatment group median (12.17 hours), suggesting the effect may be driven by outliers or heavy tails rather than a typical improvement. The paper does not report the median treatment effect or discuss distributional sensitivity." 336 } 337 ], 338 "cited_papers": [ 339 { 340 "title": "Evaluating large language models trained on code", 341 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 342 "year": 2021, 343 "arxiv_id": "2107.03374", 344 "relevance": "Foundational Codex paper establishing LLM code generation evaluation methodology." 
345 }, 346 { 347 "title": "Large Language Models for Software Engineering: A Systematic Literature Review", 348 "authors": ["Xinyi Hou", "Yanjie Zhao", "Yue Liu"], 349 "year": 2023, 350 "arxiv_id": "2308.10620", 351 "relevance": "Comprehensive systematic review of 229 papers on LLMs in software engineering from 2017–2023." 352 }, 353 { 354 "title": "Is your code generated by chatgpt really correct? rigorous evaluation of large language models for code generation", 355 "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"], 356 "year": 2023, 357 "arxiv_id": "2305.01210", 358 "relevance": "Rigorous evaluation of LLM code generation correctness, relevant to understanding AI code quality." 359 }, 360 { 361 "title": "Conversational automated program repair", 362 "authors": ["Chunqiu Steven Xia", "Lingming Zhang"], 363 "year": 2023, 364 "arxiv_id": "2301.13246", 365 "relevance": "Applies conversational LLMs to program repair, demonstrating agentic AI programming capability." 366 }, 367 { 368 "title": "From Copilot to Pilot: Towards AI Supported Software Development", 369 "authors": ["Rohith Pudari", "Neil A Ernst"], 370 "year": 2023, 371 "arxiv_id": "2303.04142", 372 "relevance": "Directly examines the evolution of AI support in software development, including GitHub Copilot." 373 }, 374 { 375 "title": "Examining zero-shot vulnerability repair with large language models", 376 "authors": ["Hammond Pearce", "Benjamin Tan", "Baleegh Ahmad"], 377 "year": 2023, 378 "relevance": "Evaluates LLMs for security vulnerability repair, relevant to AI safety in code generation." 379 }, 380 { 381 "title": "Teaching large language models to self-debug", 382 "authors": ["Xinyun Chen", "Maxwell Lin", "Nathanael Schärli", "Denny Zhou"], 383 "year": 2023, 384 "arxiv_id": "2304.05128", 385 "relevance": "Proposes self-debugging for LLMs, a key agentic capability for autonomous code improvement." 
386 }, 387 { 388 "title": "ChatGPT: A Study on its Utility for Ubiquitous Software Engineering Tasks", 389 "authors": ["Giriprasad Sridhara", "Sourav Mazumdar"], 390 "year": 2023, 391 "arxiv_id": "2305.16837", 392 "relevance": "Evaluates ChatGPT across multiple software engineering tasks including code review." 393 }, 394 { 395 "title": "How Effective Are Neural Networks for Fixing Security Vulnerabilities", 396 "authors": ["Yi Wu", "Nan Jiang", "Hung Viet Pham"], 397 "year": 2023, 398 "arxiv_id": "2305.18607", 399 "relevance": "Assesses neural network effectiveness for security vulnerability repair in code." 400 }, 401 { 402 "title": "Evaluating the Code Quality of AI-Assisted Code Generation Tools: An Empirical Study on GitHub Copilot, Amazon CodeWhisperer, and ChatGPT", 403 "authors": ["Burak Yetiştiren", "Işık Özsoy", "Miray Ayerdem", "Eray Tüzün"], 404 "year": 2023, 405 "arxiv_id": "2304.10778", 406 "relevance": "Comparative evaluation of major AI code generation tools including GitHub Copilot." 407 }, 408 { 409 "title": "Self-collaboration Code Generation via ChatGPT", 410 "authors": ["Yihong Dong", "Xue Jiang", "Zhi Jin", "Ge Li"], 411 "year": 2023, 412 "arxiv_id": "2304.07590", 413 "relevance": "Explores multi-agent collaboration in LLM code generation." 414 }, 415 { 416 "title": "Automated program repair in the era of large pre-trained language models", 417 "authors": ["Chunqiu Steven Xia", "Yuxiang Wei", "Lingming Zhang"], 418 "year": 2023, 419 "relevance": "Examines how large pre-trained language models transform automated program repair." 420 } 421 ], 422 "engagement_factors": { 423 "practical_relevance": { 424 "score": 2, 425 "justification": "Provides actionable data for teams deciding whether to adopt Copilot for PRs, with specific metrics on time savings and merge rates." 
426 }, 427 "surprise_contrarian": { 428 "score": 1, 429 "justification": "Results largely confirm the expected narrative that AI tools help with development tasks; the finding that developers frequently delete AI suggestions is mildly surprising." 430 }, 431 "fear_safety": { 432 "score": 0, 433 "justification": "No safety, security, or risk concerns are raised by the findings." 434 }, 435 "drama_conflict": { 436 "score": 0, 437 "justification": "No controversial claims or conflicts; the study is a straightforward empirical investigation." 438 }, 439 "demo_ability": { 440 "score": 0, 441 "justification": "No tool or demo to try; the replication package contains analysis scripts but not an end-user tool." 442 }, 443 "brand_recognition": { 444 "score": 3, 445 "justification": "Directly studies GitHub Copilot, one of the most well-known AI developer tools from Microsoft/GitHub." 446 } 447 } 448 }