scan.json (24867B)
1 { 2 "paper": { 3 "title": "More Code, Less Reuse: Investigating Code Quality and Reviewer Sentiment towards AI-generated Pull Requests", 4 "authors": [ 5 "Haoming Huang", 6 "Pongchai Jaisri", 7 "Shota Shimizu", 8 "Lingfeng Chen", 9 "Sota Nakashima", 10 "Gema Rodríguez-Pérez" 11 ], 12 "year": 2026, 13 "venue": "MSR 2026 (23rd International Conference on Mining Software Repositories)", 14 "arxiv_id": "2601.21276", 15 "doi": "10.1145/3793302.3793622" 16 }, 17 "checklist": { 18 "artifacts": { 19 "code_released": { 20 "applies": true, 21 "answer": false, 22 "justification": "No repository URL, GitHub link, or Zenodo archive is provided anywhere in the paper. The paper uses tools like Radon, CodeSage-Large, and PyRef but does not release its own analysis code." 23 }, 24 "data_released": { 25 "applies": true, 26 "answer": true, 27 "justification": "The study uses the publicly available AIDev dataset [13] and the publicly available Emotion English DistilRoBERTa-base model [8]. The underlying data source is referenced with citations and is accessible." 28 }, 29 "environment_specified": { 30 "applies": true, 31 "answer": false, 32 "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided. The paper mentions tools (Radon, CodeSage-Large, PyRef) but does not specify versions or dependencies needed to replicate the analysis." 33 }, 34 "reproduction_instructions": { 35 "applies": true, 36 "answer": false, 37 "justification": "No step-by-step reproduction instructions are provided. The methodology section describes the approach conceptually but does not include commands, scripts, or a README for replication." 38 } 39 }, 40 "statistical_methodology": { 41 "confidence_intervals_or_error_bars": { 42 "applies": true, 43 "answer": false, 44 "justification": "The paper reports point estimates (e.g., AMR of 0.2867 vs. 0.1532) and p-values but no confidence intervals or error bars. Table 1 shows averages without uncertainty bounds." 
45 }, 46 "significance_tests": { 47 "applies": true, 48 "answer": true, 49 "justification": "The paper uses the Mann-Whitney test for the redundancy comparison (p < 0.001, Section 3.1) and reports significance levels in Table 1 with * (p < 0.05) and *** (p < 0.001) annotations." 50 }, 51 "effect_sizes_reported": { 52 "applies": true, 53 "answer": true, 54 "justification": "The paper reports the AMR for AI agents as 0.2867 vs. 0.1532 for humans, explicitly noting 'a nearly 1.87x increase' (Section 3.1). Table 1 also provides absolute differences. This provides enough context to assess magnitude, though no formal effect size metric (e.g., Cohen's d) is used." 55 }, 56 "sample_size_justified": { 57 "applies": true, 58 "answer": false, 59 "justification": "No power analysis or justification for sample sizes is provided. Dataset A has 3,858 PRs and Dataset B has 617 PRs from a single repository, but no rationale is given for why these sizes are sufficient. The choice of crewAI as the sole repository for redundancy analysis is acknowledged as a limitation but not justified with a power calculation." 60 }, 61 "variance_reported": { 62 "applies": true, 63 "answer": false, 64 "justification": "No standard deviations, interquartile ranges, or variance measures are reported for the main results. Table 1 shows only averages. The AMR values are reported as single numbers without spread measures." 65 } 66 }, 67 "evaluation_design": { 68 "baselines_included": { 69 "applies": true, 70 "answer": true, 71 "justification": "The study compares AI-generated PRs against human-generated PRs as the baseline across all metrics (code quality, redundancy, and sentiment). Human-PRs serve as the comparison group throughout." 72 }, 73 "baselines_contemporary": { 74 "applies": true, 75 "answer": true, 76 "justification": "The comparison is between AI-generated and human-generated PRs from the same dataset (AIDev, 2025), making the baselines contemporary and relevant." 
77 }, 78 "ablation_study": { 79 "applies": true, 80 "answer": false, 81 "justification": "No ablation study is conducted. The MRS metric pipeline includes several components (CodeSage-Large embeddings, PyRef filtering, cosine similarity) but none are ablated to measure their individual contribution. It would be informative to know how results change without the PyRef refactoring filter, for example." 82 }, 83 "multiple_metrics": { 84 "applies": true, 85 "answer": true, 86 "justification": "The paper uses multiple metrics: LOC, multi-line strings, blank lines, cyclomatic complexity, the proposed Max Redundancy Score/AMR, and sentiment scores across seven emotion categories." 87 }, 88 "human_evaluation": { 89 "applies": true, 90 "answer": false, 91 "justification": "No human evaluation of the system's outputs is conducted. The sentiment analysis is automated using DistilRoBERTa. The paper analyzes existing human review comments as data, but humans do not evaluate the paper's own method outputs (e.g., whether detected redundancies are true positives)." 92 }, 93 "held_out_test_set": { 94 "applies": false, 95 "answer": false, 96 "justification": "This is an observational mining study analyzing an existing dataset, not a machine learning evaluation requiring train/test splits." 97 }, 98 "per_category_breakdown": { 99 "applies": true, 100 "answer": true, 101 "justification": "Results are broken down by PR type (Human vs. Agent) across multiple categories: addition vs. removal in Table 1, complexity levels in Table 2, MRS distribution in Figure 2, and per-emotion sentiment scores in Figure 3." 102 }, 103 "failure_cases_discussed": { 104 "applies": true, 105 "answer": false, 106 "justification": "No failure cases of the MRS approach are discussed. The paper does not show examples where the redundancy detection produced false positives or false negatives, nor does it discuss cases where the sentiment classifier may have been wrong." 
107 }, 108 "negative_results_reported": { 109 "applies": true, 110 "answer": true, 111 "justification": "The paper reports that traditional code metrics (LOC, CC) show minimal differences between AI and human PRs (Table 1, Table 2), which is effectively a null result for those metrics. The interesting finding emerged only from the novel redundancy metric." 112 } 113 }, 114 "claims_and_evidence": { 115 "abstract_claims_supported": { 116 "applies": true, 117 "answer": true, 118 "justification": "The abstract claims that LLM Agents 'disregard code reuse opportunities, resulting in higher levels of redundancy' (supported by AMR 0.2867 vs. 0.1532, p < 0.001) and that 'reviewers tend to express more neutral or positive emotions towards AI-generated contributions' (supported by Figure 3 sentiment analysis). These claims are backed by the results sections." 119 }, 120 "causal_claims_justified": { 121 "applies": true, 122 "answer": false, 123 "justification": "The paper makes causal claims such as 'the surface-level plausibility of AI code masks redundancy, leading to the silent accumulation of technical debt' (abstract) and 'reviewers may lower their vigilance' (Section 4.1). These are causal mechanisms proposed from observational data without controlling for confounders. The study design is correlational (comparing two groups on existing data) and cannot establish that plausibility causes lower vigilance." 124 }, 125 "generalization_bounded": { 126 "applies": true, 127 "answer": false, 128 "justification": "The title and abstract make broad claims about 'AI-generated Pull Requests' and 'AI agents' generally, but the redundancy analysis (RQ1) is conducted on a single Python repository (crewAI). The paper acknowledges this in threats to validity but the title and abstract do not bound claims to Python or to crewAI specifically." 
129 }, 130 "alternative_explanations_discussed": { 131 "applies": true, 132 "answer": false, 133 "justification": "The paper proposes that RLHF sycophancy explains the disconnect (Section 4.1) but does not consider alternative explanations for the observed patterns. For example, AI-generated code might receive more neutral sentiment simply because it is more uniform in style, or because reviewers apply different standards to bot-authored PRs. No alternative explanations for the redundancy finding are discussed either (e.g., whether the crewAI repository structure encourages certain patterns)." 134 } 135 }, 136 "setup_transparency": { 137 "model_versions_specified": { 138 "applies": true, 139 "answer": true, 140 "justification": "The paper specifies CodeSage-Large [24] for code embeddings and Emotion English DistilRoBERTa-base [8] for sentiment analysis, with citations to the specific model releases. However, it does not specify which LLM agents generated the Agentic-PRs (this is inherited from the AIDev dataset)." 141 }, 142 "prompts_provided": { 143 "applies": false, 144 "answer": false, 145 "justification": "The paper does not use prompting. It uses CodeSage-Large for embeddings and DistilRoBERTa for sentiment classification, neither of which requires prompt engineering in this context." 146 }, 147 "hyperparameters_reported": { 148 "applies": true, 149 "answer": false, 150 "justification": "No hyperparameters are reported. The cosine similarity threshold for redundancy is not stated (MRS uses max similarity with no threshold). The DistilRoBERTa model's maximum token length of 512 is mentioned but no other parameters. No temperature or embedding generation settings are specified." 151 }, 152 "scaffolding_described": { 153 "applies": false, 154 "answer": false, 155 "justification": "The paper does not use agentic scaffolding. It runs standard analysis tools (Radon, CodeSage-Large, PyRef, DistilRoBERTa) in a pipeline, not an agent framework." 
156 }, 157 "data_preprocessing_documented": { 158 "applies": true, 159 "answer": true, 160 "justification": "Section 2.1 documents the data strategy: filtering to Python repositories with 500+ stars, separation into Human-PRs and Agentic-PRs, and the two-level dataset strategy (Dataset A: 3,858 PRs; Dataset B: 617 PRs from crewAI). Section 2.2 describes extracting pre/post file pairs from PRs. Section 2.3 describes filtering out comments longer than 512 tokens and excluding bot comments." 161 } 162 }, 163 "limitations_and_scope": { 164 "limitations_section_present": { 165 "applies": true, 166 "answer": true, 167 "justification": "Section 6, 'Threats to Validity', is dedicated to discussing internal and external validity threats." 168 }, 169 "threats_to_validity_specific": { 170 "applies": true, 171 "answer": true, 172 "justification": "The threats are specific to this study: PyRef may miss refactorings, leading to false redundancy labels, crewAI may not represent all coding styles, the Python-only focus may not generalize to other languages, and the sentiment model may misinterpret technical code review language (Section 6)." 173 }, 174 "scope_boundaries_stated": { 175 "applies": true, 176 "answer": false, 177 "justification": "While the threats section mentions limitations, the paper does not explicitly state what the results do NOT show. The broad title and abstract claims are not bounded to the tested setting. There is no explicit statement like 'our results should not be interpreted as...' or 'we did not test...' beyond the brief threats section." 178 } 179 }, 180 "data_integrity": { 181 "raw_data_available": { 182 "applies": true, 183 "answer": true, 184 "justification": "The study uses the publicly available AIDev dataset [13], which provides the underlying PR data. An independent researcher could access the same data to verify the analysis." 
185 }, 186 "data_collection_described": { 187 "applies": true, 188 "answer": true, 189 "justification": "Section 2.1 describes how the dataset was obtained (AIDev dataset), the filtering criteria (Python repositories, 500+ stars), and the two-level data strategy with specific counts (3,858 PRs for Dataset A, 617 PRs from crewAI for Dataset B)." 190 }, 191 "recruitment_methods_described": { 192 "applies": false, 193 "answer": false, 194 "justification": "No human participants were recruited. The study mines public GitHub data from the AIDev dataset." 195 }, 196 "data_pipeline_documented": { 197 "applies": true, 198 "answer": true, 199 "justification": "The data pipeline is documented: AIDev dataset -> filter to Python repos with 500+ stars -> separate Human-PRs and Agentic-PRs -> extract file pairs (Figure 1) -> compute metrics. For redundancy: extract functions -> generate embeddings -> filter refactorings with PyRef -> compute MRS. For sentiment: extract review comments -> filter bots -> filter by token length -> classify with DistilRoBERTa." 200 } 201 }, 202 "conflicts_of_interest": { 203 "funding_disclosed": { 204 "applies": true, 205 "answer": false, 206 "justification": "No funding is disclosed. The Acknowledgments section mentions 'discussions held during the AI-Driven SE Summit 2025' but does not mention any grants, sponsors, or funding agencies." 207 }, 208 "affiliations_disclosed": { 209 "applies": true, 210 "answer": true, 211 "justification": "Author affiliations are clearly listed: Institute of Science Tokyo, Nara Institute of Science and Technology, Ritsumeikan University, Kyushu University, and University of British Columbia. These are academic institutions with no obvious conflict with the products being studied." 212 }, 213 "funder_independent_of_outcome": { 214 "applies": true, 215 "answer": false, 216 "justification": "No funding source is disclosed, so independence cannot be assessed. 
The absence of a funding disclosure means this criterion is not satisfied." 217 }, 218 "financial_interests_declared": { 219 "applies": true, 220 "answer": false, 221 "justification": "No competing interests statement or financial interests declaration is present in the paper." 222 } 223 }, 224 "contamination": { 225 "training_cutoff_stated": { 226 "applies": false, 227 "answer": false, 228 "justification": "This is a mining study analyzing PR data. It does not evaluate a pre-trained model's capability on a benchmark. The models used (CodeSage-Large, DistilRoBERTa) are analysis tools, not subjects being benchmarked." 229 }, 230 "train_test_overlap_discussed": { 231 "applies": false, 232 "answer": false, 233 "justification": "Not applicable for the same reason: this is a mining study, not a benchmark evaluation of model capabilities." 234 }, 235 "benchmark_contamination_addressed": { 236 "applies": false, 237 "answer": false, 238 "justification": "Not applicable: the study does not evaluate a pre-trained model on a benchmark." 239 } 240 }, 241 "human_studies": { 242 "pre_registered": { 243 "applies": false, 244 "answer": false, 245 "justification": "No human participants. This is a mining study of public GitHub data." 246 }, 247 "irb_or_ethics_approval": { 248 "applies": false, 249 "answer": false, 250 "justification": "No human participants. The study mines publicly available repository and review data." 251 }, 252 "demographics_reported": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants recruited for the study." 256 }, 257 "inclusion_exclusion_criteria": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants recruited for the study." 261 }, 262 "randomization_described": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants and not an experimental study with treatment assignment." 
266 }, 267 "blinding_described": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants and not an experimental study." 271 }, 272 "attrition_reported": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants." 276 } 277 }, 278 "cost_and_practicality": { 279 "inference_cost_reported": { 280 "applies": true, 281 "answer": false, 282 "justification": "No inference cost or computation time is reported for the CodeSage-Large embedding generation, PyRef refactoring detection, or DistilRoBERTa sentiment classification. The paper mentions 'computational costs' as a reason for restricting RQ1's redundancy analysis to one repository (Section 6), implying significant cost, but does not quantify it." 283 }, 284 "compute_budget_stated": { 285 "applies": true, 286 "answer": false, 287 "justification": "No compute budget, GPU hours, or hardware specifications are stated. The paper acknowledges computational constraints drove the Dataset B scoping decision but never quantifies the actual resources used." 288 } 289 } 290 }, 291 "claims": [ 292 { 293 "claim": "AI agents produce significantly more redundant code than human developers, with Average Max Redundancy (AMR) of 0.2867 for AI vs. 0.1532 for humans (1.87x increase), statistically significant at p < 0.001.", 294 "evidence": "Section 3.1, Figure 2: Mann-Whitney test on MRS distributions from 617 PRs in the crewAI repository (Dataset B).", 295 "supported": "moderate" 296 }, 297 { 298 "claim": "Traditional code metrics (LOC, cyclomatic complexity) show minimal differences between AI-generated and human-generated PRs.", 299 "evidence": "Section 3.1, Tables 1 and 2: Most metrics show small or non-significant differences. 
85% of code changes have a complexity score of zero for both groups.", 300 "supported": "strong" 301 }, 302 { 303 "claim": "Reviewers tend to express more neutral or positive emotions towards AI-generated contributions than human ones.", 304 "evidence": "Section 3.2, Figure 3: Sentiment analysis using DistilRoBERTa on review comments from Dataset A (3,858 PRs). Agentic-PRs had a higher proportion of neutral, joy, and sadness; Human-PRs had higher disgust, anger, fear, and surprise.", 305 "supported": "moderate" 306 }, 307 { 308 "claim": "The surface-level plausibility of AI code masks redundancy, leading to silent accumulation of technical debt.", 309 "evidence": "Sections 4.1-4.2: This is an interpretive claim connecting the RQ1 (higher redundancy) and RQ2 (positive/neutral sentiment) findings. No direct causal evidence is provided.", 310 "supported": "weak" 311 } 312 ], 313 "methodology_tags": [ 314 "observational" 315 ], 316 "key_findings": "AI-generated pull requests contain significantly more semantic code redundancy than human-written PRs (AMR 0.2867 vs. 0.1532, p < 0.001) despite similar traditional code metrics. However, human reviewers express more neutral or positive sentiment toward AI-generated code compared to human-written code. The authors argue this disconnect reveals a blind spot where AI-generated code's surface plausibility masks underlying redundancy issues, potentially leading to silent technical debt accumulation.", 317 "red_flags": [ 318 { 319 "flag": "Single-repository redundancy analysis", 320 "detail": "The key redundancy finding (RQ1, MRS/AMR) is based entirely on a single repository (crewAI). The paper acknowledges this but makes broad claims about 'AI agents' in the title and abstract without bounding the generalization." 321 }, 322 { 323 "flag": "No validation of the MRS metric", 324 "detail": "The Max Redundancy Score is a novel metric proposed in this paper, but no validation is provided. 
There is no ground truth evaluation, no precision/recall analysis, and only 10 manually verified samples for PyRef filtering. It is unknown how well cosine similarity of CodeSage-Large embeddings captures true semantic redundancy." 325 }, 326 { 327 "flag": "Causal interpretation from observational data", 328 "detail": "The paper interprets the disconnect between quality and sentiment causally (RLHF sycophancy -> plausible code -> reduced reviewer vigilance -> technical debt) from purely observational, cross-sectional data. Many confounders are possible but not discussed." 329 }, 330 { 331 "flag": "No variance or uncertainty quantification", 332 "detail": "Main results (AMR values, average metrics in Table 1) are reported as point estimates without confidence intervals, standard deviations, or error bars, making it impossible to assess result stability." 333 }, 334 { 335 "flag": "Sentiment classifier not validated for code review domain", 336 "detail": "The Emotion English DistilRoBERTa-base model was trained on general English text. The paper acknowledges it 'may misinterpret technical discussions' but does not validate it on code review comments. Domain mismatch could systematically bias sentiment results." 337 } 338 ], 339 "cited_papers": [ 340 { 341 "title": "The Rise of AI Teammates in Software Engineering (SE) 3.0: How Autonomous Coding Agents Are Reshaping Software Engineering", 342 "authors": ["Hao Li", "Haoxiang Zhang", "Ahmed E. Hassan"], 343 "year": 2025, 344 "arxiv_id": "2507.15003", 345 "doi": "10.48550/arXiv.2507.15003", 346 "relevance": "Introduces the AIDev dataset of autonomous AI agent pull requests used as the primary data source in this study." 
347 }, 348 { 349 "title": "Measuring the Impact of Early-2025 AI on Experienced Open-Source Developer Productivity", 350 "authors": ["Joel Becker", "Nate Rush", "Elizabeth Barnes", "David Rein"], 351 "year": 2025, 352 "arxiv_id": "2507.09089", 353 "doi": "10.48550/arXiv.2507.09089", 354 "relevance": "Studies adverse effects of AI on developer productivity, directly relevant to the survey's scope of LLM impact on software development." 355 }, 356 { 357 "title": "SWE-bench: Can Language Models Resolve Real-world Github Issues?", 358 "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik R. Narasimhan"], 359 "year": 2023, 360 "relevance": "Key benchmark for evaluating LLM coding agents; this paper critiques its sole focus on pass rates rather than code quality." 361 }, 362 { 363 "title": "Do Users Write More Insecure Code with AI Assistants?", 364 "authors": ["Neil Perry", "Megha Srivastava", "Deepak Kumar", "Dan Boneh"], 365 "year": 2023, 366 "doi": "10.1145/3576915.3623157", 367 "relevance": "Studies adverse effects of AI coding assistants on code security, directly relevant to evaluating AI-assisted code quality." 368 }, 369 { 370 "title": "SycEval: Evaluating LLM Sycophancy", 371 "authors": ["Aaron Fanous", "Jacob Goldberg", "Ank A. Agarwal", "Joanna Lin", "Anson Zhou", "Roxana Daneshjou", "Sanmi Koyejo"], 372 "year": 2025, 373 "arxiv_id": "2502.08177", 374 "doi": "10.48550/arXiv.2502.08177", 375 "relevance": "Evaluates LLM sycophancy behavior, relevant to the paper's claim that RLHF-trained models produce plausible but low-quality code." 376 }, 377 { 378 "title": "Towards Understanding Sycophancy in Language Models", 379 "authors": ["Mrinank Sharma", "Meg Tong", "Tomasz Korbak", "David Duvenaud", "Amanda Askell"], 380 "year": 2023, 381 "relevance": "Foundational work on LLM sycophancy, cited as explanation for why AI-generated code appears superficially correct." 
382 }, 383 { 384 "title": "Training Language Models to Follow Instructions with Human Feedback", 385 "authors": ["Long Ouyang", "Jeff Wu", "Xu Jiang"], 386 "year": 2022, 387 "relevance": "Core RLHF paper cited as the mechanism behind AI code's tendency to optimize for human preference over correctness." 388 }, 389 { 390 "title": "Open Problems and Fundamental Limitations of Reinforcement Learning from Human Feedback", 391 "authors": ["Stephen Casper", "Xander Davies", "Claudia Shi"], 392 "year": 2023, 393 "relevance": "Discusses fundamental limitations of RLHF that may cause AI agents to prioritize perceived helpfulness over code quality." 394 }, 395 { 396 "title": "Code Representation Learning At Scale", 397 "authors": ["Dejiao Zhang", "Wasi Ahmad", "Ming Tan", "Hantian Ding"], 398 "year": 2024, 399 "arxiv_id": "2402.01935", 400 "doi": "10.48550/arXiv.2402.01935", 401 "relevance": "Describes CodeSage-Large, the code embedding model used for semantic redundancy detection in this paper." 402 } 403 ] 404 }