scan.json (29208B)
1 { 2 "paper": { 3 "title": "Does AI Code Review Lead to Code Changes? A Case Study of GitHub Actions", 4 "authors": [ 5 "Kexin Sun", 6 "Hongyu Kuang", 7 "Sebastian Baltes", 8 "Xin Zhou", 9 "He Zhang", 10 "Xiaoxing Ma", 11 "Guoping Rong", 12 "Dong Shao", 13 "Christoph Treude" 14 ], 15 "year": 2025, 16 "venue": "arXiv.org", 17 "arxiv_id": "2508.18771", 18 "doi": "10.48550/arXiv.2508.18771" 19 }, 20 "scan_version": 3, 21 "active_modules": [], 22 "methodology_tags": ["observational"], 23 "key_findings": "Among 178 mature GitHub repositories with AI-based code review actions, only 0.9%–19.2% of valid AI-generated review comments led to code changes, compared to 60% for human-authored comments. Usage is highly concentrated on four popular actions accounting for 98.9% of comments. SHAP analysis reveals that hunk-level granularity, manual triggering, code-rich concise comments, and targeting less experienced contributors are the strongest predictors of comment addressing. The best-performing tool (coderabbitai/ai-pr-reviewer) achieved a 19.2% addressing rate, suggesting design choices directly affect developer response.", 24 "checklist": { 25 "artifacts": { 26 "code_released": { 27 "applies": true, 28 "answer": true, 29 "justification": "Reference [12] provides a GitHub repository URL (brinnarlyne8585/AIReviewActionAnalysis) containing 'dataset, annotations, and scripts for LLM-assisted analysis.' The paper states it was accessed 30-05-2025, indicating the repository was live." 30 }, 31 "data_released": { 32 "applies": true, 33 "answer": true, 34 "justification": "The online appendix (reference [12]) includes the dataset and annotations. Section III states: 'We provide an online appendix, including our dataset, annotations, and scripts.'" 35 }, 36 "environment_specified": { 37 "applies": true, 38 "answer": false, 39 "justification": "No requirements.txt, Dockerfile, or environment specification is described. The paper mentions using PyYAML, FastText, difflib, and the GitHub REST API, but provides no comprehensive dependency or environment setup information." 40 }, 41 "reproduction_instructions": { 42 "applies": true, 43 "answer": false, 44 "justification": "Scripts are available in the online appendix but the paper does not describe step-by-step reproduction instructions. No README with commands or a 'Reproducing Results' section is described." 45 } 46 }, 47 "statistical_methodology": { 48 "confidence_intervals_or_error_bars": { 49 "applies": true, 50 "answer": false, 51 "justification": "All results are reported as point estimates. Table VII reports accuracy and Cohen's κ without confidence intervals or error bars. Despite running LLM evaluations five times, no uncertainty measures are provided." 52 }, 53 "significance_tests": { 54 "applies": true, 55 "answer": true, 56 "justification": "Fisher's exact test is used in Table XI to compare addressing rates across trigger modes and LLM series, with p-values reported (p≤0.05 and p>0.05)." 57 }, 58 "effect_sizes_reported": { 59 "applies": true, 60 "answer": true, 61 "justification": "Effect sizes are reported with sufficient context throughout: absolute addressing rates by tool (0.9%–19.2% for AI vs 60% for human), Cohen's κ for inter-rater agreement (0.674–0.764), and Macro-F1 (0.854) for the Random Forest. Comparisons include base rates enabling magnitude assessment." 62 }, 63 "sample_size_justified": { 64 "applies": true, 65 "answer": false, 66 "justification": "The annotation sample of 150 comments (50 per category) is not justified with a power analysis or explicit rationale for why 50 per category is sufficient. The ≥50 PRs maturity threshold references prior work [19] but no justification is given for the annotation sample size." 67 }, 68 "variance_reported": { 69 "applies": true, 70 "answer": false, 71 "justification": "LLM evaluations were run five times 'for robust evaluation' but no variance, standard deviation, or range across runs is reported. Table VII shows only point estimates. The Random Forest reports single accuracy/F1 values without cross-validation variance." 72 } 73 }, 74 "evaluation_design": { 75 "baselines_included": { 76 "applies": true, 77 "answer": true, 78 "justification": "Human-authored review comments serve as a comparison baseline, collected from the same 51 repositories during the same time period (Section IV-B, Phase II). Cross-tool comparisons across four actions also serve as baselines against each other." 79 }, 80 "baselines_contemporary": { 81 "applies": true, 82 "answer": true, 83 "justification": "Human review comments are drawn from the same repositories and time windows as the AI comments (Section IV-B). The four AI tools studied are current and popular as of January 2025." 84 }, 85 "ablation_study": { 86 "applies": true, 87 "answer": false, 88 "justification": "No ablation study is performed. Different LLM models are compared for the classification framework (model selection), but no components of the two-stage framework or the feature engineering pipeline are systematically removed to assess their contribution." 89 }, 90 "multiple_metrics": { 91 "applies": true, 92 "answer": true, 93 "justification": "Multiple metrics are used: overall accuracy and Cohen's κ for the LLM classification (Table VII), and accuracy and Macro-F1 for the Random Forest (Section IV-C). SHAP importance and directionality are also reported." 94 }, 95 "human_evaluation": { 96 "applies": true, 97 "answer": true, 98 "justification": "150 comments were manually annotated by two independent raters with a third resolving disagreements (Section IV-B, Phase III). Inter-rater agreement (Cohen's κ) reached 0.674–0.764. An additional 250 samples were examined by the first author for representativeness." 99 }, 100 "held_out_test_set": { 101 "applies": true, 102 "answer": true, 103 "justification": "For the Random Forest classifier in RQ3, an 80/20 train/test split was used: 'we trained a Random Forest classifier (80% training data) that achieved 88.5% overall accuracy on the test set' (Section IV-C)." 104 }, 105 "per_category_breakdown": { 106 "applies": true, 107 "answer": true, 108 "justification": "Results are broken down by review granularity (PR/file/hunk-level), by individual action (ID-1 through ID-4), by trigger type (auto/manual), by LLM series (GPT-3.5/GPT-4), by code-text ratio bins, and by author experience bins (Tables IV, V, VIII, XI, XII)." 109 }, 110 "failure_cases_discussed": { 111 "applies": true, 112 "answer": true, 113 "justification": "Section V-A discusses common failure modes: vague comments like 'Without more context, it is difficult to provide further suggestions,' overly generic summaries, hallucinated style warnings, and redundant reviews. The causes of invalid comments and the 'one-in-one-out paradigm' are analyzed." 114 }, 115 "negative_results_reported": { 116 "applies": true, 117 "answer": true, 118 "justification": "Multiple negative results are prominently reported: mattzcarey/code-review-gpt had only 0.9% valid comments addressed; 37.1% of repositories declared an action but showed no generated comments; most AI comments are not addressed; and automatically triggered comments perform worse than manually triggered ones." 119 } 120 }, 121 "claims_and_evidence": { 122 "abstract_claims_supported": { 123 "applies": true, 124 "answer": true, 125 "justification": "The abstract's claims about growing adoption (Table IV), effectiveness variation (Table VIII), and that concise, code-rich, manually triggered hunk-level comments are more effective (Table X, SHAP analysis) are all supported by the results in Sections IV-A through IV-C." 126 }, 127 "causal_claims_justified": { 128 "applies": true, 129 "answer": false, 130 "justification": "The title asks 'Does AI Code Review Lead to Code Changes?' — causal framing. The paper uses associational language ('associated with,' 'more likely') but also causal language throughout ('lead to,' 'influence,' 'impact'). The study design is purely observational with no causal identification strategy. While the authors acknowledge in Section VI that 'these interpretations describe associations...not causal effects,' the overall framing exceeds what the observational design supports." 131 }, 132 "generalization_bounded": { 133 "applies": true, 134 "answer": true, 135 "justification": "Section VI explicitly bounds generalization: English-language comments only (75% excluded), repositories with ≥50 PRs, primarily small-to-medium projects (≤50 non-bot contributors), only GitHub Actions platform, data from early February 2025. The authors state 'our findings may not generalize to very large-scale projects.'" 136 }, 137 "alternative_explanations_discussed": { 138 "applies": true, 139 "answer": true, 140 "justification": "Section VI discusses multiple alternative explanations: github-actions[bot] misattribution, file-level change detection insufficiency, language filtering bias, concentration on four popular actions, incomplete feature engineering, Random Forest vs logistic regression model choice, and association vs causation distinction." 141 }, 142 "proxy_outcome_distinction": { 143 "applies": true, 144 "answer": true, 145 "justification": "The paper measures file-level code changes as a proxy for comment effectiveness. Section V explicitly acknowledges: 'not all valid comments necessarily need to result in immediate code changes to be useful. Some AI-generated suggestions, even if ultimately not adopted, may still prompt reflection, discussion, or future improvements.' Section VI also notes that 'comment addressing based on file-level code changes may be insufficient for certain edge cases.'" 146 } 147 }, 148 "setup_transparency": { 149 "model_versions_specified": { 150 "applies": true, 151 "answer": false, 152 "justification": "The paper lists model names — gpt-4.1, gpt-4o, o4-mini, o3-mini, claude-3-sonnet, claude-3-haiku, deepseek-r1, deepseek-v3 — but defers exact version details to the appendix: 'The specific API endpoints and model versions used are documented in our online appendix scripts.' Some names (e.g., 'gpt-4o') are marketing names without snapshot dates." 153 }, 154 "prompts_provided": { 155 "applies": true, 156 "answer": true, 157 "justification": "The paper states: 'The details of the LLM-assisted framework with specific prompts are available in the online appendix for other researchers to use' (Section IV-B). The online appendix GitHub repository includes the scripts containing prompts." 158 }, 159 "hyperparameters_reported": { 160 "applies": true, 161 "answer": true, 162 "justification": "Temperature is set to 0 for LLM evaluations (Section IV-B). Reasoning effort is 'medium' for o3-mini and o4-mini. LDA topic count is 6, selected via hyperparameter tuning following prior work [25]. Random Forest uses 80/20 split. While not exhaustive, key generation parameters are stated." 163 }, 164 "scaffolding_described": { 165 "applies": false, 166 "answer": false, 167 "justification": "No agentic scaffolding is used in this study. The LLMs are called directly for classification with single prompts, not through agentic workflows." 168 }, 169 "data_preprocessing_documented": { 170 "applies": true, 171 "answer": true, 172 "justification": "The data pipeline is documented in detail across multiple phases: 718 repos → 178 mature (≥50 PRs), 16,762 comments → 4,229 English (FastText), → 4,195 first-in-thread, reconstruction of reviewed changes and subsequent modifications, categorization of file changes (Table V). Each filtering step includes counts and criteria." 173 } 174 }, 175 "limitations_and_scope": { 176 "limitations_section_present": { 177 "applies": true, 178 "answer": true, 179 "justification": "Section VI 'Threats to Validity' is a substantial section organized into Construct, Internal, External, and Conclusion validity subsections, spanning approximately one full page." 180 }, 181 "threats_to_validity_specific": { 182 "applies": true, 183 "answer": true, 184 "justification": "Threats are specific to this study: github-actions[bot] may include unrelated comments, 150-comment annotation sample from 22,000+, English-only filtering removed 75% of comments and 47 repositories (32 Korean-only), focus on four popular actions limits generalizability, small-to-medium projects only, binary addressed/not-addressed loses nuance, feature engineering may be incomplete." 185 }, 186 "scope_boundaries_stated": { 187 "applies": true, 188 "answer": true, 189 "justification": "Explicit boundaries stated: only GitHub Actions (not other CI/CD platforms), only English comments, only repositories with ≥50 PRs, only four actions analyzed in depth for RQ2-3, primarily small-to-medium projects, data collected through early February 2025, findings 'may not fully reflect the current landscape.'" 190 } 191 }, 192 "data_integrity": { 193 "raw_data_available": { 194 "applies": true, 195 "answer": true, 196 "justification": "The online appendix (reference [12]) includes the dataset and annotations. Section III states the appendix contains 'our dataset, annotations, and scripts.'" 197 }, 198 "data_collection_described": { 199 "applies": true, 200 "answer": true, 201 "justification": "Data collection is described in detail: GitHub REST API queries for workflow files, PR search queries ('repo:{repo_name} reviewed-by:github-actions[bot] is:pr'), specific API endpoints for inline and general comments, matching workflow files to actions, filtering by maturity criterion (≥50 PRs). Action selection from ~240 candidates is also described." 202 }, 203 "recruitment_methods_described": { 204 "applies": true, 205 "answer": true, 206 "justification": "Repository selection: top-ranked code review actions from GitHub Marketplace sorted by popularity, 240 examined → 20 candidates → 16 after exclusion. Repositories identified via GitHub API search for workflow files referencing target actions. For annotations: one co-author and an external graduate student independently labeled, with a third co-author resolving disagreements." 207 }, 208 "data_pipeline_documented": { 209 "applies": true, 210 "answer": true, 211 "justification": "The full pipeline is documented with counts at each stage: 718 matched repos → 178 mature → 22,326 comments (Table IV). For addressing analysis: 16,762 from merged PRs → 4,229 English → 4,195 first-in-thread → 4,486 with valid context → 5,652 with human comments added. Table V shows file change distributions. Each filtering criterion is specified." 212 } 213 }, 214 "conflicts_of_interest": { 215 "funding_disclosed": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding sources, grants, or acknowledgments are mentioned anywhere in the paper." 219 }, 220 "affiliations_disclosed": { 221 "applies": true, 222 "answer": true, 223 "justification": "All author affiliations are clearly listed: Nanjing University (China), University of Bayreuth (Germany), and Singapore Management University (Singapore). None of the authors are affiliated with the tools being studied." 224 }, 225 "funder_independent_of_outcome": { 226 "applies": true, 227 "answer": false, 228 "justification": "No funding is disclosed, making it impossible to assess funder independence. The authors are academic researchers with no apparent commercial interest in the tools studied, but without explicit disclosure this cannot be verified." 229 }, 230 "financial_interests_declared": { 231 "applies": true, 232 "answer": false, 233 "justification": "No competing interests or financial interests statement is included in the paper." 234 } 235 }, 236 "contamination": { 237 "training_cutoff_stated": { 238 "applies": false, 239 "answer": false, 240 "justification": "This paper does not evaluate a pre-trained model's capability on a benchmark. LLMs are used as classification tools for annotation automation, not evaluated for their intrinsic knowledge or capability." 241 }, 242 "train_test_overlap_discussed": { 243 "applies": false, 244 "answer": false, 245 "justification": "Not applicable — the paper is a mining study that uses LLMs as classification tools, not a benchmark evaluation of model capability." 246 }, 247 "benchmark_contamination_addressed": { 248 "applies": false, 249 "answer": false, 250 "justification": "Not applicable — no benchmark evaluation of model capability is conducted. The 150-comment annotated dataset is newly created and specific to this study." 251 } 252 }, 253 "human_studies": { 254 "pre_registered": { 255 "applies": false, 256 "answer": false, 257 "justification": "This is a mining study of GitHub repositories with no human participants. Annotators coding data are researchers, not study participants." 258 }, 259 "irb_or_ethics_approval": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants — the study mines publicly available GitHub data." 263 }, 264 "demographics_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in the study." 268 }, 269 "inclusion_exclusion_criteria": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in the study." 273 }, 274 "randomization_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants or experimental conditions." 278 }, 279 "blinding_described": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants or experimental conditions." 283 }, 284 "attrition_reported": { 285 "applies": false, 286 "answer": false, 287 "justification": "No human participants in the study." 288 } 289 }, 290 "cost_and_practicality": { 291 "inference_cost_reported": { 292 "applies": true, 293 "answer": false, 294 "justification": "The study uses multiple LLM APIs (gpt-4.1, o3-mini, etc.) to classify 5,652 comments, plus 5 evaluation runs on 150 comments across 7 models. No API costs, token counts, or wall-clock times are reported." 295 }, 296 "compute_budget_stated": { 297 "applies": true, 298 "answer": false, 299 "justification": "No computational budget is stated. The hardware used for running the Random Forest, LDA, and LLM API calls is not described." 300 } 301 } 302 }, 303 "claims": [ 304 { 305 "claim": "37.1% of mature repositories declared an AI code review action but showed no generated comments, indicating a gap between declaration and actual use.", 306 "evidence": "Table IV: 178 mature repositories, 66 (37.1%) had no observed review activity despite having workflow files referencing AI review actions (Section IV-A).", 307 "supported": "strong" 308 }, 309 { 310 "claim": "Usage is highly concentrated on four popular actions, which account for 91.1% of reviewed repositories, 95.2% of PRs, and 98.9% of generated comments.", 311 "evidence": "Table IV aggregates show Actions ID-1 through ID-4 dominate across all metrics (Section IV-A).", 312 "supported": "strong" 313 }, 314 { 315 "claim": "The two-stage LLM-assisted framework achieves 86.1% overall accuracy and 76.7% Cohen's κ on the full 6-class comment addressing classification task.", 316 "evidence": "Table VII reports performance of gpt-4.1 (Stage-1) + o3-mini (Stage-2) combination on 150 annotated comments across three sources (Section IV-B).", 317 "supported": "moderate" 318 }, 319 { 320 "claim": "Human-authored review comments are much more likely to be addressed (60%) than AI-generated ones (0.9%–19.2% depending on the tool).", 321 "evidence": "Table VIII shows Valid-Fully addressed rates: human 56.0%, coderabbitai/ai-pr-reviewer 15.0%, anc95/ChatGPT-CodeReview 0.5%, mattzcarey/code-review-gpt 0.1% (Section IV-C).", 322 "supported": "strong" 323 }, 324 { 325 "claim": "Comments that are concise, contain code snippets, and are manually triggered — particularly from hunk-level review tools — are more likely to result in code changes.", 326 "evidence": "SHAP analysis in Table X shows Source Features (Is Human, Trigger auto, Is File Level Action) and Comment Features (Code Text Ratio, Text Length) are top predictors. Table XII shows addressing rate jumps when Code Text Ratio exceeds 0.5 (Section IV-C).", 327 "supported": "moderate" 328 }, 329 { 330 "claim": "AI-generated reviews may be more helpful for newcomers: the addressing rate for comments on code by the least experienced contributors was five times higher than for the most experienced.", 331 "evidence": "Table XII shows 16.1% addressing rate for Author Prior Commits ≤30 vs 3.3% for >1013 commits. SHAP directionality for Author Prior Commits is ρ=-0.67 (Section IV-C).", 332 "supported": "moderate" 333 }, 334 { 335 "claim": "82.6% of mature repositories customized at least one optional parameter, and configuration evolution is an ongoing process with 22.5% continuing adjustments beyond one month.", 336 "evidence": "Section IV-A reports 147/178 repositories customized parameters; temporal analysis shows 40 repositories continued adjustments beyond one month.", 337 "supported": "strong" 338 } 339 ], 340 "red_flags": [ 341 { 342 "flag": "Very small annotation sample relative to dataset", 343 "detail": "Only 150 comments were manually annotated from a pool of 22,326 AI-generated and 1,166 human comments (<1%). The 150 comments were sampled only from those with subsequent file modifications, introducing survivorship bias in the ground truth. The first author examined an additional 250 samples informally but this is not a formal validation." 344 }, 345 { 346 "flag": "Language filtering removes 75% of data", 347 "detail": "Filtering to English-only comments removed approximately 75% of comments (12,533 from 16,762) and 47 of 100 repositories. Thirty-two excluded repositories contained only Korean comments. The remaining sample may not represent the broader user population of these tools." 348 }, 349 { 350 "flag": "No variance reported despite multiple runs", 351 "detail": "LLM evaluations were run five times each 'for robust evaluation,' but no variance, standard deviation, or range across runs is reported. Only point estimates appear in Table VII. This makes it impossible to assess the stability of the automated classification results." 352 }, 353 { 354 "flag": "Causal framing exceeds observational evidence", 355 "detail": "The title asks 'Does AI Code Review Lead to Code Changes?' using causal language, but the study is purely observational. While the authors acknowledge this in Section VI, the framing throughout (including 'factors that influence,' 'impact the likelihood') implies causal relationships that the study design cannot establish." 356 } 357 ], 358 "cited_papers": [ 359 { 360 "title": "Llama-reviewer: Advancing code review automation with large language models through parameter-efficient fine-tuning", 361 "authors": ["J. Lu", "L. Yu", "X. Li", "L. Yang", "C. Zuo"], 362 "year": 2023, 363 "relevance": "Directly relevant as an example of LLM-based automated code review, demonstrating parameter-efficient fine-tuning of LLaMA for review comment generation." 364 }, 365 { 366 "title": "Automating code review activities by large-scale pre-training", 367 "authors": ["Z. Li", "S. Lu", "D. Guo", "N. Duan"], 368 "year": 2022, 369 "relevance": "Proposes large-scale pre-training for automating code review activities including determining if changes need review and generating comments." 370 }, 371 { 372 "title": "Using pre-trained models to boost code review automation", 373 "authors": ["R. Tufano", "S. Masiero", "A. Mastropaolo", "L. Pascarella", "D. Poshyvanyk", "G. Bavota"], 374 "year": 2022, 375 "relevance": "Explores pre-trained models (T5) for code review automation including comment generation and revised code generation." 376 }, 377 { 378 "title": "Improving automated code reviews: Learning from experience", 379 "authors": ["H. Y. Lin", "P. Thongtanunam", "C. Treude", "W. Charoenwet"], 380 "year": 2024, 381 "relevance": "Proposes improvements to automated code review through experience-based learning, contributing benchmark datasets." 382 }, 383 { 384 "title": "Vulrepair: a t5-based automated software vulnerability repair", 385 "authors": ["M. Fu", "C. Tantithamthavorn", "T. Le", "V. Nguyen", "D. Phung"], 386 "year": 2022, 387 "relevance": "T5-based automated vulnerability repair demonstrating LLM application in code quality improvement." 388 }, 389 { 390 "title": "Inferfix: End-to-end program repair with llms", 391 "authors": ["M. Jin", "S. Shahriar", "M. Tufano", "X. Shi", "S. Lu", "N. Sundaresan", "A. Svyatkovskiy"], 392 "year": 2023, 393 "relevance": "End-to-end LLM-based program repair system demonstrating practical deployment of AI code fixing." 394 }, 395 { 396 "title": "Expectation vs. experience: Evaluating the usability of code generation tools powered by large language models", 397 "authors": ["P. Vaithilingam", "T. Zhang", "E. L. Glassman"], 398 "year": 2022, 399 "relevance": "Studies developer expectations and experiences with GitHub Copilot, directly relevant to understanding developer response to AI coding tools." 400 }, 401 { 402 "title": "Automated code review in practice", 403 "authors": ["U. Cihan", "V. Haratian", "A. İçöz", "M. K. Gül"], 404 "year": 2024, 405 "arxiv_id": "2412.18531", 406 "relevance": "Industrial study of automated code review tools providing complementary evidence on AI code review adoption and effectiveness in practice." 407 }, 408 { 409 "title": "Characteristics of useful code reviews: An empirical study at microsoft", 410 "authors": ["A. Bosu", "M. Greiler", "C. Bird"], 411 "year": 2015, 412 "relevance": "Foundational study on what makes code review comments useful, providing baseline understanding of human code review effectiveness." 413 }, 414 { 415 "title": "Out of sight, out of mind: Better automatic vulnerability repair by broadening input ranges and sources", 416 "authors": ["X. Zhou", "K. Kim", "B. Xu", "D. Han", "D. Lo"], 417 "year": 2024, 418 "relevance": "Advances LLM-based vulnerability repair methodology relevant to automated code quality improvement." 419 }, 420 { 421 "title": "The code review comprehension assessment for large language models", 422 "authors": ["H. Y. Lin", "C. Liu", "H. Gao", "P. Thongtanunam", "C. Treude"], 423 "year": 2025, 424 "relevance": "Assesses LLM comprehension capability for code review tasks, directly relevant to understanding AI code review quality." 425 }, 426 { 427 "title": "Predicting usefulness of code review comments using textual features and developer experience", 428 "authors": ["M. M. Rahman", "C. K. Roy", "R. G. Kula"], 429 "year": 2017, 430 "relevance": "Predicts code review comment usefulness using features similar to those in this study, providing methodological foundation for the SHAP analysis approach." 431 } 432 ], 433 "engagement_factors": { 434 "practical_relevance": { 435 "score": 2, 436 "justification": "Provides concrete, actionable design recommendations for AI code review tool builders and adopters, including specific feature priorities like hunk-level granularity and manual triggering." 437 }, 438 "surprise_contrarian": { 439 "score": 1, 440 "justification": "Finding that most AI code review tools have near-zero impact (0.9% addressing) is mildly surprising, though the general suspicion about AI tool effectiveness is not new." 441 }, 442 "fear_safety": { 443 "score": 0, 444 "justification": "No AI risk, security, or safety concerns are raised by this study." 445 }, 446 "drama_conflict": { 447 "score": 1, 448 "justification": "Reveals a large gap between AI code review marketing promises and actual developer response (0.9%–19.2% vs 60% human addressing rate), which could generate discussion about AI tool hype." 449 }, 450 "demo_ability": { 451 "score": 1, 452 "justification": "Scripts and dataset are released in an online appendix but there is no interactive demo or installable tool." 453 }, 454 "brand_recognition": { 455 "score": 1, 456 "justification": "Studies well-known GitHub ecosystem tools (CodeRabbit, ChatGPT-CodeReview) but authors are from universities without major brand recognition." 457 } 458 } 459 }