scan.json (20939B)
1 { 2 "paper": { 3 "title": "A Task-Level Evaluation of AI Agents in Open-Source Projects", 4 "authors": ["Shojibur Rahman", "Md Fazle Rabbi", "Minhaz F. Zibran"], 5 "year": 2026, 6 "venue": "MSR '26", 7 "arxiv_id": "2602.02345" 8 }, 9 "scan_version": 2, 10 "active_modules": [], 11 "methodology_tags": ["observational"], 12 "key_findings": "Codex achieves the highest PR acceptance rate (0.83 average) with the lowest variability across task types, while Copilot has the lowest acceptance rate (0.45) and triggers the most review comments. Commit message quality does not track acceptance: Claude produces the highest-quality commit messages (0.68) with only moderate acceptance (0.66), while Codex, the most-accepted agent, produces the lowest (0.32). Most PRs (90.6%) receive zero review comments, with 98.2% of Codex PRs having no recorded review discussion.", 13 "checklist": { 14 "artifacts": { 15 "code_released": { 16 "applies": true, 17 "answer": true, 18 "justification": "A replication package is provided on Figshare (ref [16], https://figshare.com/s/4bbb22ff17f9250659a3?file=60502964)." 19 }, 20 "data_released": { 21 "applies": true, 22 "answer": true, 23 "justification": "The study uses the publicly available AIDev-pop dataset (refs [10, 11]), and a replication package is provided." 24 }, 25 "environment_specified": { 26 "applies": true, 27 "answer": false, 28 "justification": "No environment specifications, requirements files, or dependency versions are mentioned in the paper." 29 }, 30 "reproduction_instructions": { 31 "applies": true, 32 "answer": false, 33 "justification": "The paper references a replication package but does not include step-by-step reproduction instructions in the paper itself. The Figshare link points to a file but no instructions are described." 
34 } 35 }, 36 "statistical_methodology": { 37 "confidence_intervals_or_error_bars": { 38 "applies": true, 39 "answer": false, 40 "justification": "Standard deviations are reported for acceptance rates and commit quality across task types, but no confidence intervals or error bars are provided for the main results." 41 }, 42 "significance_tests": { 43 "applies": true, 44 "answer": true, 45 "justification": "Mann-Whitney-Wilcoxon tests are used for pairwise comparisons in all three RQs (e.g., Codex vs Cursor for RQ1, p = 4.27 × 10^-63; Copilot vs Devin for RQ2, p = 7.09 × 10^-50; Claude vs Cursor for RQ3, p = 4.55 × 10^-31)." 46 }, 47 "effect_sizes_reported": { 48 "applies": true, 49 "answer": false, 50 "justification": "Raw acceptance rates and proportions are reported but no formal effect size measures (Cohen's d, odds ratios, etc.) are provided alongside the significance tests." 51 }, 52 "sample_size_justified": { 53 "applies": true, 54 "answer": false, 55 "justification": "The dataset size (33,549 PRs) is stated but not justified. No power analysis or discussion of whether sample sizes per agent-task cell are adequate (e.g., Claude has only 3 perf PRs, 5 ci PRs)." 56 }, 57 "variance_reported": { 58 "applies": true, 59 "answer": true, 60 "justification": "Standard deviations across task types are reported for each agent in Tables 2 and 4 (e.g., Codex SD = 0.06 for acceptance, SD = 0.16 for commit quality)." 61 } 62 }, 63 "evaluation_design": { 64 "baselines_included": { 65 "applies": true, 66 "answer": true, 67 "justification": "The five agents serve as mutual baselines/comparisons against each other. The study is comparative by design." 68 }, 69 "baselines_contemporary": { 70 "applies": true, 71 "answer": true, 72 "justification": "All five agents (Codex, Devin, Copilot, Cursor, Claude) are contemporary autonomous coding agents as of 2025-2026." 
73 }, 74 "ablation_study": { 75 "applies": false, 76 "answer": false, 77 "justification": "No system with components to ablate — this is an observational comparison of existing tools." 78 }, 79 "multiple_metrics": { 80 "applies": true, 81 "answer": true, 82 "justification": "Three metrics are used: PR acceptance rate (RQ1), review comment volume (RQ2), and commit message quality via C-Good classifier (RQ3)." 83 }, 84 "human_evaluation": { 85 "applies": true, 86 "answer": false, 87 "justification": "No human evaluation of outputs. Commit message quality is assessed entirely via the automated C-Good classifier. No manual inspection of PRs or code quality." 88 }, 89 "held_out_test_set": { 90 "applies": false, 91 "answer": false, 92 "justification": "This is an observational study of real-world PR data, not a predictive modeling study requiring train/test splits." 93 }, 94 "per_category_breakdown": { 95 "applies": true, 96 "answer": true, 97 "justification": "Results are broken down by both agent and task type (10 task categories) in Tables 2, 3, and 4." 98 }, 99 "failure_cases_discussed": { 100 "applies": true, 101 "answer": false, 102 "justification": "No qualitative analysis of why specific PRs were rejected, why commit messages scored low, or what went wrong in specific cases." 103 }, 104 "negative_results_reported": { 105 "applies": true, 106 "answer": true, 107 "justification": "The paper reports that Codex has the lowest commit quality despite highest acceptance, and that 90.6% of PRs receive zero review comments, questioning the review metric's utility." 108 } 109 }, 110 "claims_and_evidence": { 111 "abstract_claims_supported": { 112 "applies": true, 113 "answer": true, 114 "justification": "Abstract claims about Codex's high acceptance, Copilot's high review volume, and Claude/Cursor's commit quality are supported by Tables 2-4 and statistical tests." 
115 }, 116 "causal_claims_justified": { 117 "applies": true, 118 "answer": false, 119 "justification": "The paper uses language like 'Copilot PRs trigger the highest volume of review discussions' (abstract) implying causality, but this is observational data with many confounds (different repositories, task complexity, project norms). No causal identification strategy is used." 120 }, 121 "generalization_bounded": { 122 "applies": true, 123 "answer": false, 124 "justification": "Title says 'AI Agents in Open-Source Projects' broadly, but results are limited to the AIDev-pop dataset (repos with 100+ stars, data up to August 2025). The abstract says findings 'inform selection and improvements of AI agents' without bounding to the tested context. Threats section mentions open-source GitHub limitation but the abstract and conclusions overreach." 125 }, 126 "alternative_explanations_discussed": { 127 "applies": true, 128 "answer": true, 129 "justification": "Section 4 discusses that task complexity varies within task types, reviewer practices differ across projects, accepted PRs may contain quality issues, and comment volume doesn't distinguish positive/negative feedback. These are substantive alternative explanations." 130 }, 131 "proxy_outcome_distinction": { 132 "applies": true, 133 "answer": false, 134 "justification": "PR acceptance rate is used as a proxy for agent quality/performance, but the paper does not discuss what acceptance actually measures vs. what they claim it shows. The 90.6% zero-comment finding suggests many PRs are rubber-stamped, undermining acceptance as a quality signal, yet the paper still draws conclusions about agent performance from it." 135 } 136 }, 137 "setup_transparency": { 138 "model_versions_specified": { 139 "applies": false, 140 "answer": false, 141 "justification": "The paper evaluates third-party agents' PRs from a dataset; it does not run models itself. The agents' internal model versions are not something the authors can specify." 
142 }, 143 "prompts_provided": { 144 "applies": false, 145 "answer": false, 146 "justification": "The paper does not use prompting — it analyzes existing PR data from the AIDev dataset." 147 }, 148 "hyperparameters_reported": { 149 "applies": false, 150 "answer": false, 151 "justification": "No models are run by the authors. The C-Good classifier is used as-is from prior work." 152 }, 153 "scaffolding_described": { 154 "applies": false, 155 "answer": false, 156 "justification": "The paper evaluates third-party agents as black boxes from their PR outputs; no scaffolding is used or controllable by the authors." 157 }, 158 "data_preprocessing_documented": { 159 "applies": true, 160 "answer": true, 161 "justification": "Section 2.1 describes using AIDev-pop subset (100+ star repos), excluding 'revert' and 'other' task types (16 and 31 PRs respectively), leaving 33,549 PRs. Filtering criteria are stated." 162 } 163 }, 164 "limitations_and_scope": { 165 "limitations_section_present": { 166 "applies": true, 167 "answer": true, 168 "justification": "Section 4 'Threats to Validity' provides a dedicated discussion of limitations." 169 }, 170 "threats_to_validity_specific": { 171 "applies": true, 172 "answer": true, 173 "justification": "Section 4 discusses specific threats: task complexity varying within type, reviewer practices varying across projects, accepted PRs potentially containing quality issues, comment volume not distinguishing feedback types, and the dataset capturing a specific temporal snapshot." 174 }, 175 "scope_boundaries_stated": { 176 "applies": true, 177 "answer": true, 178 "justification": "Section 4 states findings may not generalize to proprietary software, other platforms, or human-AI collaboration settings, and notes the dataset captures agent behavior at a specific time." 
179 } 180 }, 181 "data_integrity": { 182 "raw_data_available": { 183 "applies": true, 184 "answer": true, 185 "justification": "The AIDev dataset is publicly available (refs [10, 11]) and a replication package is provided (ref [16])." 186 }, 187 "data_collection_described": { 188 "applies": true, 189 "answer": true, 190 "justification": "Section 2.1 describes the AIDev dataset: PRs by five agents in open-source GitHub projects with 100+ stars, covering data up to August 1, 2025. The dataset source paper (ref [11]) is cited for full collection details." 191 }, 192 "recruitment_methods_described": { 193 "applies": false, 194 "answer": false, 195 "justification": "No human participants. The data source is a standard public dataset of AI-generated PRs." 196 }, 197 "data_pipeline_documented": { 198 "applies": true, 199 "answer": true, 200 "justification": "Section 2 describes: AIDev-pop subset selected → revert/other excluded (with counts: 16 and 31) → 33,549 PRs analyzed. Section 2.2 details the three measurement approaches." 201 } 202 }, 203 "conflicts_of_interest": { 204 "funding_disclosed": { 205 "applies": true, 206 "answer": false, 207 "justification": "No funding information or acknowledgments section is present in the paper." 208 }, 209 "affiliations_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "All three authors are affiliated with Idaho State University Department of Computer Science, clearly stated. No affiliation with any of the evaluated agents' companies." 213 }, 214 "funder_independent_of_outcome": { 215 "applies": true, 216 "answer": false, 217 "justification": "No funding source is disclosed, so independence cannot be assessed." 218 }, 219 "financial_interests_declared": { 220 "applies": true, 221 "answer": false, 222 "justification": "No competing interests statement or financial disclosure is included in the paper." 
223 } 224 }, 225 "contamination": { 226 "training_cutoff_stated": { 227 "applies": false, 228 "answer": false, 229 "justification": "This is a mining study analyzing PR data. It does not evaluate a pre-trained model's capability on a benchmark." 230 }, 231 "train_test_overlap_discussed": { 232 "applies": false, 233 "answer": false, 234 "justification": "Not a benchmark evaluation of model capability." 235 }, 236 "benchmark_contamination_addressed": { 237 "applies": false, 238 "answer": false, 239 "justification": "Not a benchmark evaluation of model capability." 240 } 241 }, 242 "human_studies": { 243 "pre_registered": { 244 "applies": false, 245 "answer": false, 246 "justification": "No human participants — mining study of public PR data." 247 }, 248 "irb_or_ethics_approval": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants." 252 }, 253 "demographics_reported": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants." 257 }, 258 "inclusion_exclusion_criteria": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants." 262 }, 263 "randomization_described": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants." 267 }, 268 "blinding_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants." 272 }, 273 "attrition_reported": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants." 277 } 278 }, 279 "cost_and_practicality": { 280 "inference_cost_reported": { 281 "applies": false, 282 "answer": false, 283 "justification": "Observational mining study — no inference or compute by the authors beyond running a classifier on commit messages." 284 }, 285 "compute_budget_stated": { 286 "applies": false, 287 "answer": false, 288 "justification": "Observational mining study with minimal compute requirements." 
289 } 290 } 291 }, 292 "claims": [ 293 { 294 "claim": "Codex achieves the highest and most stable PR acceptance rate (0.83 average, SD 0.06) across task types.", 295 "evidence": "Table 2 shows Codex acceptance rates ranging from 0.68 (perf) to 0.92 (docs). MWW test vs Cursor: p = 4.27 × 10^-63.", 296 "supported": "strong" 297 }, 298 { 299 "claim": "Copilot PRs receive the highest volume of both bot-generated (1.25 avg) and human-generated (1.31 avg) review comments.", 300 "evidence": "Table 3 shows Copilot averages. MWW test vs Devin: p = 7.09 × 10^-50. All other agents below 1.0 total comments.", 301 "supported": "strong" 302 }, 303 { 304 "claim": "90.6% of all PRs receive zero review comments, with 98.2% for Codex specifically.", 305 "evidence": "Reported in Section 3.2 with specific percentages.", 306 "supported": "strong" 307 }, 308 { 309 "claim": "Claude achieves the highest commit message quality (0.68 average) while Codex has the lowest (0.32).", 310 "evidence": "Table 4 shows per-task good commit rates. MWW test Claude vs Cursor: p = 4.55 × 10^-31.", 311 "supported": "moderate" 312 }, 313 { 314 "claim": "PR acceptance does not strongly depend on commit message quality.", 315 "evidence": "Codex has highest acceptance (0.83) but lowest commit quality (0.32); Claude has high commit quality (0.68) but moderate acceptance (0.66). Section 3.3 discusses this inverse pattern.", 316 "supported": "moderate" 317 } 318 ], 319 "red_flags": [ 320 { 321 "flag": "Massive confound: repository selection bias across agents", 322 "detail": "Different agents operate in different repositories with different review cultures, maintainer standards, and project norms. Codex's 21,799 PRs dwarf Claude's 459 — the agent comparison conflates agent capability with repository selection. Codex may simply target easier/more receptive repos." 
323 }, 324 { 325 "flag": "98.2% zero-comment rate undermines review metric", 326 "detail": "The paper's own finding that 90.6% of all PRs (98.2% for Codex) receive zero review comments suggests most PRs are auto-merged or rubber-stamped. This limits review-comment volume as a metric and raises questions about whether acceptance rate is a meaningful quality signal, yet the paper still draws performance conclusions from these measures." 327 }, 328 { 329 "flag": "No code-level quality assessment", 330 "detail": "The study evaluates acceptance rate and commit message quality but never examines the actual code in PRs. An accepted PR with a good commit message could still contain poor code. This is acknowledged as future work but limits current conclusions about agent effectiveness." 331 }, 332 { 333 "flag": "Extreme sample size imbalance across agents", 334 "detail": "Codex has 21,799 PRs while Claude has only 459. Several agent-task cells have very small counts (Claude: 3 perf, 5 ci, 0 style). Statistical comparisons across agents with such imbalanced samples risk misleading conclusions." 335 }, 336 { 337 "flag": "Causal language from observational data", 338 "detail": "The abstract says Copilot's PRs 'trigger' the highest review volume, implying causation. But the observational design cannot distinguish whether Copilot PRs cause more discussion or whether Copilot is used in repositories with more active review cultures." 
Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik Narasimhan"], 352 "year": 2023, 353 "arxiv_id": "2310.06770", 354 "relevance": "Major benchmark for evaluating autonomous coding agents on real GitHub issues." 355 }, 356 { 357 "title": "The Rise of AI Teammates in Software Engineering (SE) 3.0", 358 "authors": ["Hao Li", "Haoxiang Zhang", "Ahmed E. Hassan"], 359 "year": 2025, 360 "arxiv_id": "2507.15003", 361 "relevance": "Source of the AIDev dataset used in this study; characterizes autonomous AI agents as software contributors." 362 }, 363 { 364 "title": "The SWE-Bench Illusion: When State-of-the-Art LLMs Remember Instead of Reason", 365 "authors": ["Shanchao Liang", "Spandan Garg", "Roshanak Zilouchian Moghaddam"], 366 "year": 2025, 367 "arxiv_id": "2506.12286", 368 "relevance": "Challenges SWE-bench validity by showing LLMs may memorize rather than reason about benchmark solutions." 369 }, 370 { 371 "title": "Agentic AI: Autonomous intelligence for complex goals – a comprehensive survey", 372 "authors": ["Deepak Bhaskar Acharya", "Karthigeyan Kuppan", "B Divya"], 373 "year": 2025, 374 "relevance": "Survey of autonomous AI agent capabilities relevant to understanding the agent landscape." 375 }, 376 { 377 "title": "LLM-Based Multi-Agent Systems for Software Engineering: Literature Review, Vision, and the Road Ahead", 378 "authors": ["Junda He", "Christoph Treude", "David Lo"], 379 "year": 2025, 380 "relevance": "Literature review of LLM-based multi-agent systems for software engineering tasks." 381 }, 382 { 383 "title": "Agentic AI Software Engineer: Programming with Trust", 384 "authors": ["Abhik Roychoudhury", "Corina Pasareanu", "Michael Pradel", "Baishakhi Ray"], 385 "year": 2025, 386 "arxiv_id": "2502.13767", 387 "relevance": "Examines trust and reliability in autonomous coding agents." 
388 }, 389 { 390 "title": "On the use of agentic coding: An empirical study of pull requests on GitHub", 391 "authors": ["Miku Watanabe", "Hao Li", "Yutaro Kashiwa", "Brittany Reid", "Hajimu Iida", "Ahmed E. Hassan"], 392 "year": 2025, 393 "arxiv_id": "2509.14745", 394 "relevance": "Empirical study of agentic coding PRs on GitHub, source of task type classification used in this paper." 395 }, 396 { 397 "title": "RepairAgent: An autonomous, LLM-based agent for program repair", 398 "authors": ["Islem Bouzenia", "Premkumar Devanbu", "Michael Pradel"], 399 "year": 2024, 400 "arxiv_id": "2403.17134", 401 "relevance": "Autonomous LLM-based agent for automated program repair." 402 }, 403 { 404 "title": "Measuring the Impact of Early-2025 AI on Experienced Open-Source Developer Productivity", 405 "authors": ["METR"], 406 "year": 2025, 407 "relevance": "RCT measuring AI tool impact on developer productivity in open-source projects." 408 }, 409 { 410 "title": "What makes a good commit message?", 411 "authors": ["Yingchen Tian", "Yuxia Zhang", "Klaas-Jan Stol", "Lin Jiang", "Hui Liu"], 412 "year": 2022, 413 "relevance": "Source of the C-Good classifier used to assess commit message quality in this study." 414 } 415 ] 416 }