scan-v5.json (26769B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "The Impact of Large Language Models on Code Review Process", 6 "authors": [ 7 "Antonio Collante", 8 "Samuel Abedu", 9 "SayedHassan Khatoonabadi", 10 "Ahmad Abdellatif", 11 "Ebube Alor", 12 "Emad Shihab" 13 ], 14 "year": 2025, 15 "venue": "arXiv", 16 "arxiv_id": "2508.11034", 17 "doi": null 18 }, 19 "checklist": { 20 "claims_and_evidence": { 21 "abstract_claims_supported": { 22 "applies": true, 23 "answer": true, 24 "justification": "The abstract's quantitative claims (61% resolution time reduction, 66.7% review phase, 87.5% waiting time) are directly supported by Tables 2 and 3 and Figure 3. The 60% enhancement figure is backed by Table 4.", 25 "source": "haiku" 26 }, 27 "causal_claims_justified": { 28 "applies": true, 29 "answer": false, 30 "justification": "The paper repeatedly uses causal language ('GPT assistance reduces PR resolution time') but uses an observational, retrospective design. The internal validity section acknowledges residual confounding from developer skill and task complexity that regression controls cannot eliminate; self-selection is the core threat and is not adequately addressed.", 31 "source": "haiku" 32 }, 33 "generalization_bounded": { 34 "applies": true, 35 "answer": false, 36 "justification": "The abstract and conclusion make broad claims about GPT accelerating code reviews and improving collaboration, but the sample is restricted to GitHub repos with ≥10 stars from a single snapshot date (May 9, 2024). 
The threats section acknowledges these limits but the main text does not bound its headline claims accordingly.", 37 "source": "haiku" 38 }, 39 "alternative_explanations_discussed": { 40 "applies": true, 41 "answer": true, 42 "justification": "Section 6.2 explicitly discusses that GPT-assisted PRs may attract more skilled or efficient developers, that task simplicity could explain shorter times, and that team dynamics may differ — these alternative explanations are named even if not fully tested.", 43 "source": "haiku" 44 }, 45 "proxy_outcome_distinction": { 46 "applies": true, 47 "answer": false, 48 "justification": "The abstract states GPT can 'improve code quality' and 'address persistent bottlenecks,' but the paper only measures time-to-merge and phase durations; no code quality metric is collected or analyzed, and this gap is not acknowledged in the main findings.", 49 "source": "haiku" 50 } 51 }, 52 "limitations_and_scope": { 53 "limitations_section_present": { 54 "applies": true, 55 "answer": true, 56 "justification": "Section 6 'Threats to Validity' has three dedicated subsections (Construct, Internal, External) that go well beyond a single sentence.", 57 "source": "haiku" 58 }, 59 "threats_to_validity_specific": { 60 "applies": true, 61 "answer": true, 62 "justification": "Specific threats are named: keyword-based detection introduces false negatives/positives, self-selection of GPT users confounds results, developer experience and task complexity are uncontrolled confounders, and GitHub-only data limits platform generalizability.", 63 "source": "haiku" 64 }, 65 "scope_boundaries_stated": { 66 "applies": true, 67 "answer": true, 68 "justification": "Section 6.3 explicitly states that results are bounded to GitHub repos with ≥10 stars from one snapshot date, and may not generalize to GitLab/Bitbucket or private repositories.", 69 "source": "haiku" 70 } 71 }, 72 "conflicts_of_interest": { 73 "funding_disclosed": { 74 "applies": true, 75 "answer": true, 76 
"justification": "Acknowledgements state 'This work was supported by the NSERC CREATE grant number 555406, 2021.'", 77 "source": "haiku" 78 }, 79 "affiliations_disclosed": { 80 "applies": true, 81 "answer": true, 82 "justification": "Author affiliations (Concordia University and University of Calgary) are disclosed on the title page.", 83 "source": "haiku" 84 }, 85 "funder_independent_of_outcome": { 86 "applies": true, 87 "answer": true, 88 "justification": "NSERC is Canada's national science funding agency with no commercial interest in GPT/OpenAI outcomes.", 89 "source": "haiku" 90 }, 91 "financial_interests_declared": { 92 "applies": true, 93 "answer": false, 94 "justification": "There is no competing interests or financial disclosures statement anywhere in the paper; the acknowledgements only mention NSERC funding.", 95 "source": "haiku" 96 } 97 }, 98 "scope_and_framing": { 99 "key_terms_defined": { 100 "applies": true, 101 "answer": true, 102 "justification": "'GPT-assisted PRs' is precisely defined (developers interacting conversationally with GPT for review tasks, excluding automated API calls), and the four PR phases are operationally defined in Section 3.3.2 with timestamps.", 103 "source": "haiku" 104 }, 105 "intended_contribution_clear": { 106 "applies": true, 107 "answer": true, 108 "justification": "Three contributions are explicitly enumerated in Section 1: first comprehensive phase-specific GPT analysis, empirical evidence on GPT's phase impact, and public release of scripts and dataset.", 109 "source": "haiku" 110 }, 111 "engagement_with_prior_work": { 112 "applies": true, 113 "answer": true, 114 "justification": "Section 2 has three subsections that contrast this work with Tufano et al., Xiao et al., Cihan et al., and Hao et al., explaining what this study adds beyond task-specific analyses.", 115 "source": "haiku" 116 } 117 } 118 }, 119 "type_checklist": { 120 "empirical": { 121 "artifacts": { 122 "code_released": { 123 "applies": true, 124 
"answer": true, 125 "justification": "Section 1 states 'we publicly share our scripts and dataset online at https://github.com/acollant/GPT-Assistance-PR' — a live URL, not a promise.", 126 "source": "haiku" 127 }, 128 "data_released": { 129 "applies": true, 130 "answer": true, 131 "justification": "The dataset of 25,473 PRs is shared at the same GitHub repository.", 132 "source": "haiku" 133 }, 134 "environment_specified": { 135 "applies": true, 136 "answer": false, 137 "justification": "The paper mentions R and the stats package (footnote 2) and Python implicitly, but provides no requirements file, R session info, package versions, or Dockerfile.", 138 "source": "haiku" 139 }, 140 "reproduction_instructions": { 141 "applies": true, 142 "answer": false, 143 "justification": "No step-by-step reproduction instructions appear in the paper; the reader is pointed to the GitHub repo but specific usage instructions are not provided in the manuscript.", 144 "source": "haiku" 145 } 146 }, 147 "statistical_methodology": { 148 "confidence_intervals_or_error_bars": { 149 "applies": true, 150 "answer": false, 151 "justification": "Table 2 reports standard errors for regression coefficients, and Figure 3 is a box plot, but the headline percentage reductions (61%, 66.7%, 87.5%) are presented as point estimates without confidence intervals.", 152 "source": "haiku" 153 }, 154 "significance_tests": { 155 "applies": true, 156 "answer": true, 157 "justification": "Mann-Whitney U tests are used for phase comparisons (RQ2) and the multiple linear regression provides p-values for RQ1; all key comparative claims report p-values.", 158 "source": "haiku" 159 }, 160 "effect_sizes_reported": { 161 "applies": true, 162 "answer": true, 163 "justification": "Percentage reductions (61%, 66.7%, 87.5%) with baseline medians serve as effect sizes; the regression coefficient for is_gpt-assisted is also reported.", 164 "source": "haiku" 165 }, 166 "sample_size_justified": { 167 "applies": true, 168 
"answer": true, 169 "justification": "The 450-PR annotation sample is justified at 95% confidence, 5% margin of error; the 310-PR sample for RQ3 is justified with the same formula.", 170 "source": "haiku" 171 }, 172 "variance_reported": { 173 "applies": true, 174 "answer": true, 175 "justification": "Figure 3 presents box plots showing IQR and distribution spread for merge times in both groups.", 176 "source": "haiku" 177 } 178 }, 179 "evaluation_design": { 180 "baselines_included": { 181 "applies": true, 182 "answer": true, 183 "justification": "Matched GPT-non-assisted PRs (selected via Manhattan distance similarity) serve as the comparison baseline throughout.", 184 "source": "haiku" 185 }, 186 "baselines_contemporary": { 187 "applies": true, 188 "answer": true, 189 "justification": "The comparison PRs come from the same dataset, same time period (collected May 9, 2024), and are matched on structural features.", 190 "source": "haiku" 191 }, 192 "ablation_study": { 193 "applies": false, 194 "answer": false, 195 "justification": "This is an observational mining study, not a system evaluation; ablation studies are not applicable.", 196 "source": "haiku" 197 }, 198 "multiple_metrics": { 199 "applies": true, 200 "answer": true, 201 "justification": "The study reports total resolution time, phase-level durations (review, waiting, change), task-type frequency, and regression coefficients across multiple RQs.", 202 "source": "haiku" 203 }, 204 "human_evaluation": { 205 "applies": true, 206 "answer": true, 207 "justification": "Three authors independently annotated 450 PRs to develop heuristics (Krippendorff's α = 0.77); two authors independently labeled 310 PRs for task classification.", 208 "source": "haiku" 209 }, 210 "held_out_test_set": { 211 "applies": false, 212 "answer": false, 213 "justification": "This is an observational analysis, not a prediction task requiring a held-out test set.", 214 "source": "haiku" 215 }, 216 "per_category_breakdown": { 217 "applies": 
true, 218 "answer": true, 219 "justification": "Table 3 breaks down phase durations across four PR phases; Table 4 breaks down GPT task types across three phases.", 220 "source": "haiku" 221 }, 222 "failure_cases_discussed": { 223 "applies": true, 224 "answer": true, 225 "justification": "Section 4.3 cites specific PRs where reviewers rejected GPT suggestions (e.g., 'I will not blindly accept any AI-generated code') as failure or pushback cases.", 226 "source": "haiku" 227 }, 228 "negative_results_reported": { 229 "applies": true, 230 "answer": true, 231 "justification": "The paper explicitly reports no significant GPT effect in the At Submission and At Waiting After Acceptance phases, and no significant difference in the At Change phase (Table 3).", 232 "source": "haiku" 233 } 234 }, 235 "setup_transparency": { 236 "model_versions_specified": { 237 "applies": true, 238 "answer": false, 239 "justification": "The paper refers variously to 'GPT,' 'GPT-4,' and 'ChatGPT' without controlling for which model version developers used; model version heterogeneity across the 1,600 PRs is not documented.", 240 "source": "haiku" 241 }, 242 "prompts_provided": { 243 "applies": false, 244 "answer": false, 245 "justification": "The researchers are not constructing prompts; they analyze prompts developers organically wrote to GPT, which are not systematically documented.", 246 "source": "haiku" 247 }, 248 "hyperparameters_reported": { 249 "applies": false, 250 "answer": false, 251 "justification": "The researchers run no LLM inference; hyperparameters are not applicable to this observational study.", 252 "source": "haiku" 253 }, 254 "scaffolding_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No agentic scaffolding is used by the researchers; this is a retrospective mining study.", 258 "source": "haiku" 259 }, 260 "data_preprocessing_documented": { 261 "applies": true, 262 "answer": true, 263 "justification": "Filtering criteria are explicitly 
stated: ≥10 stars, ≥2 contributors, ≥1 event, sequential timestamps only, bots excluded; Figure 1 diagrams the full pipeline.", 264 "source": "haiku" 265 } 266 }, 267 "data_integrity": { 268 "raw_data_available": { 269 "applies": true, 270 "answer": true, 271 "justification": "The dataset is publicly released at https://github.com/acollant/GPT-Assistance-PR.", 272 "source": "haiku" 273 }, 274 "data_collection_described": { 275 "applies": true, 276 "answer": true, 277 "justification": "Section 3.1 details the GitHub REST API search on May 9, 2024, with keyword fields, filtering criteria, and the fields extracted per PR.", 278 "source": "haiku" 279 }, 280 "recruitment_methods_described": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants were recruited; the data is mined from public GitHub repositories.", 284 "source": "haiku" 285 }, 286 "data_pipeline_documented": { 287 "applies": true, 288 "answer": true, 289 "justification": "Figure 1 diagrams the full pipeline from API search through filtering, heuristic labeling, and phase mapping; each step is described in Sections 3.1–3.3.", 290 "source": "haiku" 291 } 292 }, 293 "contamination": { 294 "training_cutoff_stated": { 295 "applies": false, 296 "answer": false, 297 "justification": "The study is observational and does not evaluate model capabilities on benchmarks; training cutoff is not relevant.", 298 "source": "haiku" 299 }, 300 "train_test_overlap_discussed": { 301 "applies": false, 302 "answer": false, 303 "justification": "Not applicable; the paper does not train or evaluate an ML model on benchmark data.", 304 "source": "haiku" 305 }, 306 "benchmark_contamination_addressed": { 307 "applies": false, 308 "answer": false, 309 "justification": "No benchmark evaluation is conducted by the researchers.", 310 "source": "haiku" 311 } 312 }, 313 "human_studies": { 314 "pre_registered": { 315 "applies": false, 316 "answer": false, 317 "justification": "This is a mining study on 
public GitHub data; no human subjects were recruited and pre-registration is not applicable.", 318 "source": "haiku" 319 }, 320 "irb_or_ethics_approval": { 321 "applies": false, 322 "answer": false, 323 "justification": "Public GitHub data mining; no IRB/ethics review required or mentioned.", 324 "source": "haiku" 325 }, 326 "demographics_reported": { 327 "applies": false, 328 "answer": false, 329 "justification": "No recruited participants; developer demographics from GitHub are not collected.", 330 "source": "haiku" 331 }, 332 "inclusion_exclusion_criteria": { 333 "applies": false, 334 "answer": false, 335 "justification": "No human participant recruitment; project/PR inclusion criteria are stated but that is repository selection, not participant selection.", 336 "source": "haiku" 337 }, 338 "randomization_described": { 339 "applies": false, 340 "answer": false, 341 "justification": "No experimental treatment assigned to human participants.", 342 "source": "haiku" 343 }, 344 "blinding_described": { 345 "applies": false, 346 "answer": false, 347 "justification": "No human subjects experiment; blinding is not applicable.", 348 "source": "haiku" 349 }, 350 "attrition_reported": { 351 "applies": false, 352 "answer": false, 353 "justification": "No recruited participants who could drop out.", 354 "source": "haiku" 355 } 356 }, 357 "cost_and_practicality": { 358 "inference_cost_reported": { 359 "applies": false, 360 "answer": false, 361 "justification": "The researchers do not run LLM inference; they analyze existing PRs. 
Cost is not applicable.", 362 "source": "haiku" 363 }, 364 "compute_budget_stated": { 365 "applies": false, 366 "answer": false, 367 "justification": "No large-scale model training or inference is performed by the research team.", 368 "source": "haiku" 369 } 370 } 371 } 372 }, 373 "claims": [ 374 { 375 "claim": "GPT-assisted PRs have a 61% shorter median resolution time (9h vs 23h) compared to non-assisted PRs.", 376 "evidence": "Multiple linear regression (is_gpt-assisted coefficient p=4.88e-11) and Figure 3 showing median 8.9h vs 22.7h.", 377 "supported": "moderate" 378 }, 379 { 380 "claim": "GPT assistance reduces the review phase by 66.7% (1h vs 3h median).", 381 "evidence": "Table 3, Mann-Whitney U test with p < 0.05.", 382 "supported": "moderate" 383 }, 384 { 385 "claim": "GPT assistance reduces waiting-for-changes time by 87.5% (3h vs 24h median).", 386 "evidence": "Table 3, Mann-Whitney U test with p < 0.05.", 387 "supported": "moderate" 388 }, 389 { 390 "claim": "Developers primarily use GPT for code enhancement (60%) and bug fixing (26%) during the review phase.", 391 "evidence": "Table 4, based on manual classification of 310 randomly sampled GPT-assisted PRs by two authors.", 392 "supported": "moderate" 393 }, 394 { 395 "claim": "GPT-assisted PRs show no significant time difference during the submission and post-acceptance waiting phases.", 396 "evidence": "Section 4.2 explicitly states no evidence of GPT activity in these phases and excludes them from analysis.", 397 "supported": "strong" 398 }, 399 { 400 "claim": "GPT assistance improves code quality.", 401 "evidence": "Abstract claim; no code quality metric is measured — only time metrics. 
This is unsupported by the data.", 402 "supported": "unsupported" 403 } 404 ], 405 "methodology_tags": [ 406 "observational", 407 "case-study" 408 ], 409 "key_findings": "Analysis of 25,473 GitHub PRs shows GPT-assisted PRs merge 61% faster than matched non-assisted PRs (9h vs 23h median), with the largest gain in the waiting-for-changes phase (87.5% reduction, 3h vs 24h). Statistical significance is confirmed via Mann-Whitney U tests and multiple linear regression controlling for PR size, commits, files changed, and project age. Developers use GPT primarily for code enhancement (60%) and bug fixing (26%) during review, with negligible use during submission or post-acceptance phases. The study cannot establish causality due to self-selection bias: GPT-adopting developers may systematically differ in skill or task choice.", 410 "red_flags": [ 411 { 412 "flag": "Self-selection confound", 413 "detail": "GPT-assisted PRs are submitted by developers who chose to use GPT — likely more skilled, more familiar with the codebase, or working on simpler tasks. The regression controls (commits, PR size, project age) cannot capture developer ability, creating a plausible alternative explanation for all observed time reductions." 414 }, 415 { 416 "flag": "Causal language for correlational design", 417 "detail": "Throughout the paper, GPT assistance is described as 'reducing' and 'accelerating' code review, implying a causal relationship. The observational, retrospective design cannot support this framing; the internal validity section acknowledges it but the main findings do not hedge accordingly." 418 }, 419 { 420 "flag": "Unmeasured outcome claimed", 421 "detail": "The abstract and conclusion claim GPT 'improves code quality,' but the study measures only time-to-merge and phase durations. No defect rate, rework frequency, or quality metric is collected." 
422 }, 423 { 424 "flag": "Heterogeneous model versions uncontrolled", 425 "detail": "The 1,600 GPT-assisted PRs span multiple model versions (GPT-3.5, GPT-4, etc.) and time periods; the study does not stratify by model version, potentially mixing very different tool capabilities." 426 }, 427 { 428 "flag": "RQ3 annotation lacks formal IRR", 429 "detail": "For the 310-PR task classification (Table 4), the authors explicitly state they did not calculate inter-rater reliability metrics, resolving disagreements by discussion instead. This weakens the reliability of the task-type frequency estimates." 430 }, 431 { 432 "flag": "False negative detection bias", 433 "detail": "GPT use not explicitly mentioned in PR text is missed by keyword-based detection. If experienced developers use GPT silently (no public comments), their PRs are classified as non-assisted, so the non-assisted group is contaminated with undisclosed GPT usage and the comparison between the two groups is biased." 434 } 435 ], 436 "cited_papers": [ 437 { 438 "title": "Unveiling ChatGPT's Usage in Open Source Projects: A Mining-Based Study", 439 "relevance": "Direct methodological predecessor; this paper extends Tufano et al.'s keyword-based GPT detection to phase-specific PR analysis." 440 }, 441 { 442 "title": "Generative AI for Pull Request Descriptions: Adoption, Impact, and Developer Interventions", 443 "relevance": "Closely related work finding Copilot-assisted PRs have 1.57x higher merge likelihood; provides comparative context." 444 }, 445 { 446 "title": "Mining Code Review Data to Understand Waiting Times Between Acceptance and Merging", 447 "relevance": "Establishes the PR phase framework and baseline statistics on waiting times that this paper builds upon." 448 }, 449 { 450 "title": "An Empirical Study on Developers' Shared Conversations with ChatGPT in GitHub Pull Requests and Issues", 451 "relevance": "Directly overlapping study on developer–ChatGPT interactions in PRs; the most directly comparable prior work." 
452 }, 453 { 454 "title": "Pull Request Decisions Explained: An Empirical Overview", 455 "relevance": "Identifies structural factors (commits, PR size) affecting PR resolution; these are the control variables used in the regression model." 456 }, 457 { 458 "title": "Automated Code Review in Practice", 459 "relevance": "Industry study of LLM-based automated code review at 4,335 PRs; finds longer PR closure times — contrasts with this paper's findings of shorter times." 460 }, 461 { 462 "title": "Tales from the Trenches: Expectations and Challenges from Practice for Code Review in the Generative AI Era", 463 "relevance": "Identifies five challenges to AI adoption in code review; provides qualitative context for interpreting this paper's quantitative findings." 464 } 465 ], 466 "engagement_factors": { 467 "practical_relevance": { 468 "score": 3, 469 "justification": "Directly actionable for any engineering team using GitHub PRs — quantifies time savings by phase and identifies where GPT helps most." 470 }, 471 "surprise_contrarian": { 472 "score": 2, 473 "justification": "The 87.5% reduction in waiting-for-changes time is striking and counterintuitive; finding no effect at submission/post-acceptance narrows the claim usefully." 474 }, 475 "fear_safety": { 476 "score": 1, 477 "justification": "Briefly raises over-reliance risk in the discussion but does not develop safety implications." 478 }, 479 "drama_conflict": { 480 "score": 1, 481 "justification": "Quotes a reviewer saying 'I will not blindly accept any AI-generated code' — a hint of social tension but not a central theme." 482 }, 483 "demo_ability": { 484 "score": 2, 485 "justification": "Code and dataset are publicly available on GitHub, enabling replication and extension by other researchers." 486 }, 487 "brand_recognition": { 488 "score": 1, 489 "justification": "Concordia University and University of Calgary are reputable but not headline-grabbing institutions; no famous lab or product association." 
490 } 491 }, 492 "hn_data": { 493 "threads": [ 494 { 495 "hn_id": "41492077", 496 "title": "Transfusion: Predict the next token and diffuse images with one multimodal model", 497 "points": 122, 498 "comments": 10, 499 "url": "https://news.ycombinator.com/item?id=41492077", 500 "created_at": "2024-09-09T18:51:31Z" 501 }, 502 { 503 "hn_id": "41319553", 504 "title": "First open source Legal AI retrieval benchmark for RAG finally released", 505 "points": 9, 506 "comments": 0, 507 "url": "https://news.ycombinator.com/item?id=41319553", 508 "created_at": "2024-08-22T12:33:17Z" 509 }, 510 { 511 "hn_id": "44770561", 512 "title": "B-Splines and Fourier-Best Friends for Spatial-Temporal Video Super-Resolution", 513 "points": 4, 514 "comments": 0, 515 "url": "https://news.ycombinator.com/item?id=44770561", 516 "created_at": "2025-08-02T19:22:17Z" 517 }, 518 { 519 "hn_id": "37325986", 520 "title": "Ramulator 2.0: A Modern, Modular, and Extensible DRAM Simulator", 521 "points": 3, 522 "comments": 0, 523 "url": "https://news.ycombinator.com/item?id=37325986", 524 "created_at": "2023-08-30T17:46:58Z" 525 }, 526 { 527 "hn_id": "44533795", 528 "title": "Human-Like Forgetting Curves in Deep Neural Networks", 529 "points": 2, 530 "comments": 0, 531 "url": "https://news.ycombinator.com/item?id=44533795", 532 "created_at": "2025-07-11T16:05:13Z" 533 }, 534 { 535 "hn_id": "24376348", 536 "title": "Would there be any pattern in the death of Roman emperors?", 537 "points": 1, 538 "comments": 1, 539 "url": "https://news.ycombinator.com/item?id=24376348", 540 "created_at": "2020-09-04T16:22:13Z" 541 }, 542 { 543 "hn_id": "44686218", 544 "title": "The Heteronomy of Algorithms", 545 "points": 1, 546 "comments": 0, 547 "url": "https://news.ycombinator.com/item?id=44686218", 548 "created_at": "2025-07-25T18:04:05Z" 549 }, 550 { 551 "hn_id": "41310667", 552 "title": "Predict the Next Token and Diffuse Images with One Multi-Modal Model", 553 "points": 1, 554 "comments": 0, 555 "url": 
"https://news.ycombinator.com/item?id=41310667", 556 "created_at": "2024-08-21T14:29:45Z" 557 } 558 ], 559 "top_points": 122, 560 "total_points": 143, 561 "total_comments": 11 562 } 563 }