scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (30904B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Does AI Code Review Lead to Code Changes? A Case Study of GitHub Actions",
      6     "authors": [
      7       "Kexin Sun",
      8       "Hongyu Kuang",
      9       "Sebastian Baltes",
     10       "Xin Zhou",
     11       "He Zhang",
     12       "Xiaoxing Ma",
     13       "Guoping Rong",
     14       "Dong Shao",
     15       "Christoph Treude"
     16     ],
     17     "year": 2025,
     18     "venue": "arXiv.org",
     19     "arxiv_id": "2508.18771",
     20     "doi": "10.48550/arXiv.2508.18771"
     21   },
     22   "checklist": {
     23     "claims_and_evidence": {
     24       "abstract_claims_supported": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The abstract's claims about growing adoption (Table IV), effectiveness variation (Table VIII), and that concise, code-rich, manually triggered hunk-level comments are more effective (Table X, SHAP analysis) are all supported by the results in Sections IV-A through IV-C.",
     28         "source": "opus"
     29       },
     30       "causal_claims_justified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The title asks 'Does AI Code Review Lead to Code Changes?' — causal framing. The paper uses associational language ('associated with,' 'more likely') but also causal language throughout ('lead to,' 'influence,' 'impact'). The study design is purely observational with no causal identification strategy. While the authors acknowledge in Section VI that 'these interpretations describe associations...not causal effects,' the overall framing exceeds what the observational design supports.",
     34         "source": "opus"
     35       },
     36       "generalization_bounded": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "Section VI explicitly bounds generalization: English-language comments only (75% excluded), repositories with ≥50 PRs, primarily small-to-medium projects (≤50 non-bot contributors), only GitHub Actions platform, data from early February 2025. The authors state 'our findings may not generalize to very large-scale projects.'",
     40         "source": "opus"
     41       },
     42       "alternative_explanations_discussed": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "Section VI discusses multiple alternative explanations: github-actions[bot] misattribution, file-level change detection insufficiency, language filtering bias, concentration on four popular actions, incomplete feature engineering, Random Forest vs logistic regression model choice, and association vs causation distinction.",
     46         "source": "opus"
     47       },
     48       "proxy_outcome_distinction": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper measures file-level code changes as a proxy for comment effectiveness. Section V explicitly acknowledges: 'not all valid comments necessarily need to result in immediate code changes to be useful. Some AI-generated suggestions, even if ultimately not adopted, may still prompt reflection, discussion, or future improvements.' Section VI also notes that 'comment addressing based on file-level code changes may be insufficient for certain edge cases.'",
     52         "source": "opus"
     53       }
     54     },
     55     "limitations_and_scope": {
     56       "limitations_section_present": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Section VI 'Threats to Validity' is a substantial section organized into Construct, Internal, External, and Conclusion validity subsections, spanning approximately one full page.",
     60         "source": "opus"
     61       },
     62       "threats_to_validity_specific": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "Threats are specific to this study: github-actions[bot] may include unrelated comments, 150-comment annotation sample from 22,000+, English-only filtering removed 75% of comments and 47 repositories (32 Korean-only), focus on four popular actions limits generalizability, small-to-medium projects only, binary addressed/not-addressed loses nuance, feature engineering may be incomplete.",
     66         "source": "opus"
     67       },
     68       "scope_boundaries_stated": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Explicit boundaries stated: only GitHub Actions (not other CI/CD platforms), only English comments, only repositories with ≥50 PRs, only four actions analyzed in depth for RQ2-3, primarily small-to-medium projects, data collected through early February 2025, findings 'may not fully reflect the current landscape.'",
     72         "source": "opus"
     73       }
     74     },
     75     "conflicts_of_interest": {
     76       "funding_disclosed": {
     77         "applies": true,
     78         "answer": false,
     79         "justification": "No funding sources, grants, or acknowledgments are mentioned anywhere in the paper.",
     80         "source": "opus"
     81       },
     82       "affiliations_disclosed": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "All author affiliations are clearly listed: Nanjing University (China), University of Bayreuth (Germany), and Singapore Management University (Singapore). None of the authors are affiliated with the tools being studied.",
     86         "source": "opus"
     87       },
     88       "funder_independent_of_outcome": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No funding is disclosed, making it impossible to assess funder independence. The authors are academic researchers with no apparent commercial interest in the tools studied, but without explicit disclosure this cannot be verified.",
     92         "source": "opus"
     93       },
     94       "financial_interests_declared": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "No competing interests or financial interests statement is included in the paper.",
     98         "source": "opus"
     99       }
    100     },
    101     "scope_and_framing": {
    102       "key_terms_defined": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Key terms are defined: 'hunk' is defined via GNU diffutils reference, 'addressed' is defined through a precise two-stage annotation scheme (None/General/Valid-Uncertain/Valid-Unaddressed/Valid-Partially/Valid-Fully), and review granularity levels are defined with examples.",
    106         "source": "haiku"
    107       },
    108       "intended_contribution_clear": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Three contributions are explicitly listed: first systematic study of AI code review action adoption, an LLM-assisted addressing detection framework, and an interpretable analysis of influencing factors.",
    112         "source": "haiku"
    113       },
    114       "engagement_with_prior_work": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "Section VII explicitly positions the work relative to prior research on comment generation quality vs. downstream impact, and on human developer response to feedback, explaining how this study fills the gap between these lines.",
    118         "source": "haiku"
    119       }
    120     }
    121   },
    122   "type_checklist": {
    123     "empirical": {
    124       "artifacts": {
    125         "code_released": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "Reference [12] provides a GitHub repository URL (brinnarlyne8585/AIReviewActionAnalysis) containing 'dataset, annotations, and scripts for LLM-assisted analysis.' The paper states it was accessed 30-05-2025, indicating the repository was live.",
    129           "source": "opus"
    130         },
    131         "data_released": {
    132           "applies": true,
    133           "answer": true,
    134           "justification": "The online appendix (reference [12]) includes the dataset and annotations. Section III states: 'We provide an online appendix, including our dataset, annotations, and scripts.'",
    135           "source": "opus"
    136         },
    137         "environment_specified": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "No requirements.txt, Dockerfile, or environment specification is described. The paper mentions using PyYAML, FastText, difflib, and the GitHub REST API, but provides no comprehensive dependency or environment setup information.",
    141           "source": "opus"
    142         },
    143         "reproduction_instructions": {
    144           "applies": true,
    145           "answer": false,
    146           "justification": "Scripts are available in the online appendix but the paper does not describe step-by-step reproduction instructions. No README with commands or a 'Reproducing Results' section is described.",
    147           "source": "opus"
    148         }
    149       },
    150       "statistical_methodology": {
    151         "confidence_intervals_or_error_bars": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "All results are reported as point estimates. Table VII reports accuracy and Cohen's κ without confidence intervals or error bars. Despite running LLM evaluations five times, no uncertainty measures are provided.",
    155           "source": "opus"
    156         },
    157         "significance_tests": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "Fisher's exact test is used in Table XI to compare addressing rates across trigger modes and LLM series, with p-values reported (p≤0.05 and p>0.05).",
    161           "source": "opus"
    162         },
    163         "effect_sizes_reported": {
    164           "applies": true,
    165           "answer": true,
    166           "justification": "Effect sizes are reported with sufficient context throughout: absolute addressing rates by tool (0.9%–19.2% for AI vs 60% for human), Cohen's κ for inter-rater agreement (0.674–0.764), and Macro-F1 (0.854) for the Random Forest. Comparisons include base rates enabling magnitude assessment.",
    167           "source": "opus"
    168         },
    169         "sample_size_justified": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "The annotation sample of 150 comments (50 per category) is not justified with a power analysis or explicit rationale for why 50 per category is sufficient. The ≥50 PRs maturity threshold references prior work [19] but no justification is given for the annotation sample size.",
    173           "source": "opus"
    174         },
    175         "variance_reported": {
    176           "applies": true,
    177           "answer": false,
    178           "justification": "LLM evaluations were run five times 'for robust evaluation' but no variance, standard deviation, or range across runs is reported. Table VII shows only point estimates. The Random Forest reports single accuracy/F1 values without cross-validation variance.",
    179           "source": "opus"
    180         }
    181       },
    182       "evaluation_design": {
    183         "baselines_included": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "Human-authored review comments serve as a comparison baseline, collected from the same 51 repositories during the same time period (Section IV-B, Phase II). Cross-tool comparisons across four actions also serve as baselines against each other.",
    187           "source": "opus"
    188         },
    189         "baselines_contemporary": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "Human review comments are drawn from the same repositories and time windows as the AI comments (Section IV-B). The four AI tools studied are current and popular as of January 2025.",
    193           "source": "opus"
    194         },
    195         "ablation_study": {
    196           "applies": true,
    197           "answer": false,
    198           "justification": "No ablation study is performed. Different LLM models are compared for the classification framework (model selection), but no components of the two-stage framework or the feature engineering pipeline are systematically removed to assess their contribution.",
    199           "source": "opus"
    200         },
    201         "multiple_metrics": {
    202           "applies": true,
    203           "answer": true,
    204           "justification": "Multiple metrics are used: overall accuracy and Cohen's κ for the LLM classification (Table VII), and accuracy and Macro-F1 for the Random Forest (Section IV-C). SHAP importance and directionality are also reported.",
    205           "source": "opus"
    206         },
    207         "human_evaluation": {
    208           "applies": true,
    209           "answer": true,
    210           "justification": "150 comments were manually annotated by two independent raters with a third resolving disagreements (Section IV-B, Phase III). Inter-rater agreement (Cohen's κ) reached 0.674–0.764. An additional 250 samples were examined by the first author for representativeness.",
    211           "source": "opus"
    212         },
    213         "held_out_test_set": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "For the Random Forest classifier in RQ3, an 80/20 train/test split was used: 'we trained a Random Forest classifier (80% training data) that achieved 88.5% overall accuracy on the test set' (Section IV-C).",
    217           "source": "opus"
    218         },
    219         "per_category_breakdown": {
    220           "applies": true,
    221           "answer": true,
    222           "justification": "Results are broken down by review granularity (PR/file/hunk-level), by individual action (ID-1 through ID-4), by trigger type (auto/manual), by LLM series (GPT-3.5/GPT-4), by code-text ratio bins, and by author experience bins (Tables IV, V, VIII, XI, XII).",
    223           "source": "opus"
    224         },
    225         "failure_cases_discussed": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "Section V-A discusses common failure modes: vague comments like 'Without more context, it is difficult to provide further suggestions,' overly generic summaries, hallucinated style warnings, and redundant reviews. The causes of invalid comments and the 'one-in-one-out paradigm' are analyzed.",
    229           "source": "opus"
    230         },
    231         "negative_results_reported": {
    232           "applies": true,
    233           "answer": true,
    234           "justification": "Multiple negative results are prominently reported: mattzcarey/code-review-gpt had only 0.9% valid comments addressed; 37.1% of repositories declared an action but showed no generated comments; most AI comments are not addressed; and automatically triggered comments perform worse than manually triggered ones.",
    235           "source": "opus"
    236         }
    237       },
    238       "setup_transparency": {
    239         "model_versions_specified": {
    240           "applies": true,
    241           "answer": false,
    242           "justification": "The paper lists model names — gpt-4.1, gpt-4o, o4-mini, o3-mini, claude-3-sonnet, claude-3-haiku, deepseek-r1, deepseek-v3 — but defers exact version details to the appendix: 'The specific API endpoints and model versions used are documented in our online appendix scripts.' Some names (e.g., 'gpt-4o') are marketing names without snapshot dates.",
    243           "source": "opus"
    244         },
    245         "prompts_provided": {
    246           "applies": true,
    247           "answer": true,
    248           "justification": "The paper states: 'The details of the LLM-assisted framework with specific prompts are available in the online appendix for other researchers to use' (Section IV-B). The online appendix GitHub repository includes the scripts containing prompts.",
    249           "source": "opus"
    250         },
    251         "hyperparameters_reported": {
    252           "applies": true,
    253           "answer": true,
    254           "justification": "Temperature is set to 0 for LLM evaluations (Section IV-B). Reasoning effort is 'medium' for o3-mini and o4-mini. LDA topic count is 6, selected via hyperparameter tuning following prior work [25]. Random Forest uses 80/20 split. While not exhaustive, key generation parameters are stated.",
    255           "source": "opus"
    256         },
    257         "scaffolding_described": {
    258           "applies": false,
    259           "answer": false,
    260           "justification": "No agentic scaffolding is used in this study. The LLMs are called directly for classification with single prompts, not through agentic workflows.",
    261           "source": "opus"
    262         },
    263         "data_preprocessing_documented": {
    264           "applies": true,
    265           "answer": true,
    266           "justification": "The data pipeline is documented in detail across multiple phases: 718 repos → 178 mature (≥50 PRs), 16,762 comments → 4,229 English (FastText), → 4,195 first-in-thread, reconstruction of reviewed changes and subsequent modifications, categorization of file changes (Table V). Each filtering step includes counts and criteria.",
    267           "source": "opus"
    268         }
    269       },
    270       "data_integrity": {
    271         "raw_data_available": {
    272           "applies": true,
    273           "answer": true,
    274           "justification": "The online appendix (reference [12]) includes the dataset and annotations. Section III states the appendix contains 'our dataset, annotations, and scripts.'",
    275           "source": "opus"
    276         },
    277         "data_collection_described": {
    278           "applies": true,
    279           "answer": true,
    280           "justification": "Data collection is described in detail: GitHub REST API queries for workflow files, PR search queries ('repo:{repo_name} reviewed-by:github-actions[bot] is:pr'), specific API endpoints for inline and general comments, matching workflow files to actions, filtering by maturity criterion (≥50 PRs). Action selection from ~240 candidates is also described.",
    281           "source": "opus"
    282         },
    283         "recruitment_methods_described": {
    284           "applies": true,
    285           "answer": true,
    286           "justification": "Repository selection: top-ranked code review actions from GitHub Marketplace sorted by popularity, 240 examined → 20 candidates → 16 after exclusion. Repositories identified via GitHub API search for workflow files referencing target actions. For annotations: one co-author and an external graduate student independently labeled, with a third co-author resolving disagreements.",
    287           "source": "opus"
    288         },
    289         "data_pipeline_documented": {
    290           "applies": true,
    291           "answer": true,
    292           "justification": "The full pipeline is documented with counts at each stage: 718 matched repos → 178 mature → 22,326 comments (Table IV). For addressing analysis: 16,762 from merged PRs → 4,229 English → 4,195 first-in-thread → 4,486 with valid context → 5,652 with human comments added. Table V shows file change distributions. Each filtering criterion is specified.",
    293           "source": "opus"
    294         }
    295       },
    296       "contamination": {
    297         "training_cutoff_stated": {
    298           "applies": false,
    299           "answer": false,
    300           "justification": "This paper does not evaluate a pre-trained model's capability on a benchmark. LLMs are used as classification tools for annotation automation, not evaluated for their intrinsic knowledge or capability.",
    301           "source": "opus"
    302         },
    303         "train_test_overlap_discussed": {
    304           "applies": false,
    305           "answer": false,
    306           "justification": "Not applicable — the paper is a mining study that uses LLMs as classification tools, not a benchmark evaluation of model capability.",
    307           "source": "opus"
    308         },
    309         "benchmark_contamination_addressed": {
    310           "applies": false,
    311           "answer": false,
    312           "justification": "Not applicable — no benchmark evaluation of model capability is conducted. The 150-comment annotated dataset is newly created and specific to this study.",
    313           "source": "opus"
    314         }
    315       },
    316       "human_studies": {
    317         "pre_registered": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "This is a mining study of GitHub repositories with no human participants. Annotators coding data are researchers, not study participants.",
    321           "source": "opus"
    322         },
    323         "irb_or_ethics_approval": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human participants — the study mines publicly available GitHub data.",
    327           "source": "opus"
    328         },
    329         "demographics_reported": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human participants in the study.",
    333           "source": "opus"
    334         },
    335         "inclusion_exclusion_criteria": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human participants in the study.",
    339           "source": "opus"
    340         },
    341         "randomization_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human participants or experimental conditions.",
    345           "source": "opus"
    346         },
    347         "blinding_described": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human participants or experimental conditions.",
    351           "source": "opus"
    352         },
    353         "attrition_reported": {
    354           "applies": false,
    355           "answer": false,
    356           "justification": "No human participants in the study.",
    357           "source": "opus"
    358         }
    359       },
    360       "cost_and_practicality": {
    361         "inference_cost_reported": {
    362           "applies": true,
    363           "answer": false,
    364           "justification": "The study uses multiple LLM APIs (gpt-4.1, o3-mini, etc.) to classify 5,652 comments, plus 5 evaluation runs on 150 comments across 7 models. No API costs, token counts, or wall-clock times are reported.",
    365           "source": "opus"
    366         },
    367         "compute_budget_stated": {
    368           "applies": true,
    369           "answer": false,
    370           "justification": "No computational budget is stated. The hardware used for running the Random Forest, LDA, and LLM API calls is not described.",
    371           "source": "opus"
    372         }
    373       }
    374     }
    375   },
    376   "claims": [
    377     {
    378       "claim": "Human-authored review comments are addressed at 60% rate, versus only 0.9–19.2% for AI-generated comments depending on tool.",
    379       "evidence": "Table VIII reports valid-fully + valid-partially rates: human 60%, coderabbitai 19.2%, aidar-freeed 6.5%, anc95 4.2%, mattzcarey 0.9%.",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "Adoption of AI code review actions is highly concentrated: 4 actions account for 91.1% of repositories, 95.2% of PRs, and 98.9% of comments.",
    384       "evidence": "Table IV shows totals for all 16 actions; the top 4 (ID-1 to ID-4) dominate usage counts.",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "Manually triggered AI reviews achieve significantly higher addressing rates than automatically triggered ones.",
    389       "evidence": "Table XI: for ID-1, manual 12.8% vs auto 6.8% (p≤0.05); for ID-2, manual 22.2% vs auto 0.5% (p≤0.05).",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "Comments with higher code-to-text ratio (>0.52) are substantially more likely to be addressed.",
    394       "evidence": "Table XII (left): addressing rate for code ratio >0.52 is 23.2% for AI actions vs 4.2% for lowest bin; SHAP directionality ρ=0.89.",
    395       "supported": "moderate"
    396     },
    397     {
    398       "claim": "AI-generated review comments targeting less experienced contributors (≤124 prior commits) achieve 16% addressing rate vs 3.3% for the most experienced.",
    399       "evidence": "Table XII (right): AI comments in bottom three experience bins show 12.1–16.1% vs 3.3–4.5% for top bins.",
    400       "supported": "moderate"
    401     },
    402     {
    403       "claim": "The two-stage LLM framework achieves 86.1% overall accuracy and a Cohen's κ of 0.767 on the 6-class addressing classification task.",
    404       "evidence": "Table VII reports these figures for the optimal gpt-4.1 (Stage-1) + o3-mini (Stage-2) combination averaged across three comment sources.",
    405       "supported": "strong"
    406     },
    407     {
    408       "claim": "Hunk-level review actions outperform file-level actions; source features are the strongest predictor group of comment addressing (SHAP importance 0.1646 vs 0.1059 for comment features).",
    409       "evidence": "Table X shows Source Features as the top-ranked group; Is_File_Level_Action has ρ=-0.96 correlation with addressing.",
    410       "supported": "strong"
    411     }
    412   ],
    413   "methodology_tags": [
    414     "observational",
    415     "case-study"
    416   ],
    417   "key_findings": "AI-generated code review comments are addressed at dramatically lower rates (0.9–19.2%) than human-authored comments (60%), with large variation driven by tool design choices rather than the underlying LLM model. A reliable two-stage LLM classification framework (86.1% accuracy, κ=0.767) enables scalable measurement of whether comments lead to code changes. The strongest predictors of addressing are comment source (human vs. AI, hunk-level vs. file-level granularity), manual vs. automatic triggering, and high code-to-text ratio; notably, AI reviews are disproportionately useful for less experienced contributors. The paper explicitly frames all associations as correlational and recommends treating AI reviewers as complements to, not replacements for, human review.",
    418   "red_flags": [
    419     {
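    420       "flag": "Reproducibility cannot be verified from the paper alone",
    421       "detail": "The dataset, scripts, and prompts are provided only via the online appendix (reference [12], a GitHub repository); the paper gives no environment specification or step-by-step reproduction instructions, so reproducibility is difficult to verify."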
    422     },
    423     {
    424       "flag": "75% comment filtering creates severe selection bias",
    425       "detail": "Language filtering (FastText, English only) removed 12,533 of 16,762 comments, including 11,744 from the most-used action (anc95/ChatGPT-CodeReview, mostly Korean). The retained sample is unrepresentative of actual global usage."
    426     },
    427     {
    428       "flag": "LLM-derived labels drive main analysis",
    429       "detail": "The RQ3 factor analysis is built on 3,879 LLM-classified labels rather than human annotations; systematic LLM classification biases could propagate into the Random Forest and SHAP findings."
    430     },
    431     {
    432       "flag": "Observational confounding in design recommendations",
    433       "detail": "Key recommendations (use hunk-level, use manual triggering) are drawn from associations that may reflect confounds — e.g., hunk-level tools attracting more engaged projects, or manual triggering correlating with developer intent rather than causally improving addressing."
    434     },
    435     {
    436       "flag": "Annotation sample size unjustified",
    437       "detail": "The 150-comment gold-standard annotation (50 per category) has no power analysis; given 6-class classification and observed κ values (0.674–0.764), this may be underpowered for subgroup conclusions."
    438     }
    439   ],
    440   "cited_papers": [
    441     {
    442       "title": "Modern code review: a case study at Google",
    443       "relevance": "Foundational context on human code review practices, used as motivation for studying AI code review impact."
    444     },
    445     {
    446       "title": "Automating code review activities by large-scale pre-training",
    447       "relevance": "Prior LLM-based code review automation work that this study extends by measuring downstream real-world impact."
    448     },
    449     {
    450       "title": "GitHub actions: the impact on the pull request process",
    451       "relevance": "Direct precedent studying GitHub Actions' effect on development workflows, methodologically relevant."
    452     },
    453     {
    454       "title": "Predicting usefulness of code review comments using textual features and developer experience",
    455       "relevance": "Feature engineering inspiration for RQ3; directly cited as the basis for the 45-feature set design."
    456     },
    457     {
    458       "title": "Automated code review in practice",
    459       "relevance": "Industrial study of AI code review tools; cited for the finding on redundant feedback causing developer frustration."
    460     },
    461     {
    462       "title": "Characteristics of useful code reviews: An empirical study at Microsoft",
    463       "relevance": "Prior work on what makes human code review comments useful, providing baseline comparison context."
    464     },
    465     {
    466       "title": "Expectations, outcomes, and challenges of modern code review",
    467       "relevance": "Classic code review study establishing human review baselines and factors affecting responsiveness."
    468     },
    469     {
    470       "title": "AutoSpearman: Automatically mitigating correlated software metrics for interpreting defect models",
    471       "relevance": "Method used for redundant feature removal in the 45→36 feature reduction step in RQ3."
    472     }
    473   ],
    474   "engagement_factors": {
    475     "practical_relevance": {
    476       "score": 2,
    477       "justification": "Provides concrete, actionable design recommendations for AI code review tool builders and adopters, including specific feature priorities like hunk-level granularity and manual triggering."
    478     },
    479     "surprise_contrarian": {
    480       "score": 1,
    481       "justification": "Finding that most AI code review tools have near-zero impact (0.9% addressing) is mildly surprising, though the general suspicion about AI tool effectiveness is not new."
    482     },
    483     "fear_safety": {
    484       "score": 0,
    485       "justification": "No AI risk, security, or safety concerns are raised by this study."
    486     },
    487     "drama_conflict": {
    488       "score": 1,
    489       "justification": "Reveals a large gap between AI code review marketing promises and actual developer response (0.9%–19.2% vs 60% human addressing rate), which could generate discussion about AI tool hype."
    490     },
    491     "demo_ability": {
    492       "score": 1,
    493       "justification": "Scripts and dataset are released in an online appendix but there is no interactive demo or installable tool."
    494     },
    495     "brand_recognition": {
    496       "score": 1,
    497       "justification": "Studies well-known GitHub ecosystem tools (CodeRabbit, ChatGPT-CodeReview) but authors are from universities without major brand recognition."
    498     }
    499   },
    500   "hn_data": {
    501     "threads": [
    502       {
    503         "hn_id": "43198812",
    504         "title": "Symmetries of Living Systems",
    505         "points": 8,
    506         "comments": 0,
    507         "url": "https://news.ycombinator.com/item?id=43198812"
    508       },
    509       {
    510         "hn_id": "45367764",
    511         "title": "Fill probability estimates in institutional bond trading with quantum computers",
    512         "points": 2,
    513         "comments": 2,
    514         "url": "https://news.ycombinator.com/item?id=45367764"
    515       },
    516       {
    517         "hn_id": "44961416",
    518         "title": "Group Sequence Policy Optimization",
    519         "points": 2,
    520         "comments": 1,
    521         "url": "https://news.ycombinator.com/item?id=44961416"
    522       },
    523       {
    524         "hn_id": "44041341",
    525         "title": "Grounded in Context: Retrieval-Based Method for Hallucination Detection",
    526         "points": 1,
    527         "comments": 0,
    528         "url": "https://news.ycombinator.com/item?id=44041341"
    529       },
    530       {
    531         "hn_id": "43242677",
    532         "title": "FastAtlas: Real-Time Compact Atlases for Texture Space Shading",
    533         "points": 1,
    534         "comments": 0,
    535         "url": "https://news.ycombinator.com/item?id=43242677"
    536       },
    537       {
    538         "hn_id": "29567026",
    539         "title": "Transient execution flaws found in AMD Zen CPUs",
    540         "points": 1,
    541         "comments": 0,
    542         "url": "https://news.ycombinator.com/item?id=29567026"
    543       }
    544     ],
    545     "top_points": 8,
    546     "total_points": 15,
    547     "total_comments": 3
    548   }
    549 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs