scan.json (25272B)
1 { 2 "paper": { 3 "title": "Benchmarking and Studying the LLM-based Code Review", 4 "authors": [ 5 "Zhengran Zeng", 6 "Ruikai Shi", 7 "Keke Han", 8 "Yixin Li", 9 "Kaicheng Sun", 10 "Yidong Wang", 11 "Zhuohao Yu", 12 "Rui Xie", 13 "Wei Ye", 14 "Shikun Zhang" 15 ], 16 "year": 2025, 17 "venue": "arXiv", 18 "arxiv_id": "2509.01494" 19 }, 20 "scan_version": 2, 21 "active_modules": ["experimental_rigor", "data_leakage"], 22 "methodology_tags": ["benchmark-eval"], 23 "key_findings": "SWR-Bench, a 1000-PR benchmark with full project context, reveals that current ACR tools achieve at most 19.38% F1, with low precision (high false positives) as the primary limitation. ACR tools detect functional errors better than evolutionary/stylistic issues. A Multi-Review aggregation strategy improves F1 by up to 43.67% in relative terms (e.g., from 15.25% to 21.91%) by synthesizing multiple independent review passes. Reasoning-enhanced LLMs perform better at code review than standard LLMs.", 24 "checklist": { 25 "artifacts": { 26 "code_released": { 27 "applies": true, 28 "answer": true, 29 "justification": "An anonymous repository link is provided (reference [32]: anonymous.4open.science/status/swrbench-1D0E) for code and benchmark." 30 }, 31 "data_released": { 32 "applies": true, 33 "answer": true, 34 "justification": "SWR-Bench is released via the same anonymous link. The benchmark includes 1000 PRs with metadata, change-points, and codebase checkpoints." 35 }, 36 "environment_specified": { 37 "applies": true, 38 "answer": false, 39 "justification": "No environment specifications (requirements.txt, Dockerfile, dependency versions) are mentioned in the paper." 40 }, 41 "reproduction_instructions": { 42 "applies": true, 43 "answer": false, 44 "justification": "No step-by-step reproduction instructions are provided in the paper. The anonymous repo may contain them, but the paper itself does not describe how to replicate experiments."
45 } 46 }, 47 "statistical_methodology": { 48 "confidence_intervals_or_error_bars": { 49 "applies": true, 50 "answer": false, 51 "justification": "All results in Tables III and IV are point estimates with no confidence intervals or error bars." 52 }, 53 "significance_tests": { 54 "applies": true, 55 "answer": false, 56 "justification": "The paper makes comparative claims (e.g., PR-Review outperforms others, reasoning LLMs are better) based solely on comparing point estimates with no statistical tests." 57 }, 58 "effect_sizes_reported": { 59 "applies": true, 60 "answer": true, 61 "justification": "Percentage improvements are reported with baseline context, e.g., 'increasing issue detection F1 score by up to 43.67%' and absolute F1 values are provided (from 15.25% to 21.91%)." 62 }, 63 "sample_size_justified": { 64 "applies": true, 65 "answer": false, 66 "justification": "The benchmark size of 1000 PRs (500 Change, 500 Clean) is stated but no justification is given for why this number is sufficient." 67 }, 68 "variance_reported": { 69 "applies": true, 70 "answer": false, 71 "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. Results appear to be single-run point estimates." 72 } 73 }, 74 "evaluation_design": { 75 "baselines_included": { 76 "applies": true, 77 "answer": true, 78 "justification": "Five ACR approaches are compared: LLM-Review, SWR-Agent, CR-Agent, Hybrid-Review, and PR-Review (Section IV-B)." 79 }, 80 "baselines_contemporary": { 81 "applies": true, 82 "answer": true, 83 "justification": "Baselines include recent tools (PR-Agent/PR-Review, CR-Agent from EMNLP 2024) and SOTA LLMs (GPT-o3, Gemini-2.5-Pro, Claude-4-Opus, DeepSeek-R1)." 
84 }, 85 "ablation_study": { 86 "applies": true, 87 "answer": true, 88 "justification": "The Multi-Review experiment varies the number of aggregated reports (n=0,1,3,5,10) and compares Self-Agg vs Multi-Agg strategies (Figure 7), effectively ablating the aggregation component." 89 }, 90 "multiple_metrics": { 91 "applies": true, 92 "answer": true, 93 "justification": "Results report Precision, Recall, F1, average count, and average severity for both overall and functional change-points." 94 }, 95 "human_evaluation": { 96 "applies": true, 97 "answer": true, 98 "justification": "RQ1 validates the LLM-based evaluation method with human experts: 3 human annotators and 2 LLMs independently evaluated 100 code review reports, achieving ~90% agreement on the Hit metric (Figure 5)." 99 }, 100 "held_out_test_set": { 101 "applies": true, 102 "answer": true, 103 "justification": "SWR-Bench is used solely for evaluation, not for tuning any of the evaluated systems. All ACR tools used their default configurations." 104 }, 105 "per_category_breakdown": { 106 "applies": true, 107 "answer": true, 108 "justification": "Table V provides per-change-type performance breakdown across all 11 change types (E.1.1 through F.6)." 109 }, 110 "failure_cases_discussed": { 111 "applies": true, 112 "answer": true, 113 "justification": "The paper discusses failure modes: high false positive rates, low precision, Hybrid-Review's poor performance due to unprocessed static analysis, and agent interaction overhead causing error propagation in CR-Agent." 114 }, 115 "negative_results_reported": { 116 "applies": true, 117 "answer": true, 118 "justification": "Several negative results: Hybrid-Review performs worst (4.48% F1), CR-Agent's multi-agent approach underperforms simpler methods, Claude-4-Opus/Sonnet underperform Claude-3.7-Sonnet, and Qwen-2.5-R1-7B fails due to output formatting issues." 
119 } 120 }, 121 "claims_and_evidence": { 122 "abstract_claims_supported": { 123 "applies": true, 124 "answer": true, 125 "justification": "Abstract claims are supported: ~90% agreement validated in Figure 5a, 43.67% F1 improvement shown in Figure 7, and ACR underperformance demonstrated in Tables III-IV." 126 }, 127 "causal_claims_justified": { 128 "applies": true, 129 "answer": true, 130 "justification": "The claim that Multi-Review 'improves' performance is supported by controlled experiments varying n with other factors held constant. The claim about reasoning enhancement is supported by comparing matched model pairs (Qwen-2.5-Chat vs Qwen-2.5-R1)." 131 }, 132 "generalization_bounded": { 133 "applies": true, 134 "answer": false, 135 "justification": "The title claims to benchmark 'LLM-based Code Review' generally, but the benchmark uses only 12 Python projects aligned with SWE-Bench. This language-specific limitation is not explicitly acknowledged as a scope boundary." 136 }, 137 "alternative_explanations_discussed": { 138 "applies": true, 139 "answer": false, 140 "justification": "The Threats to Validity section (Section V) is brief and does not discuss alternative explanations for observed results. For example, Claude-4's underperformance vs Claude-3.7 is noted as speculative without analysis of confounds." 141 }, 142 "proxy_outcome_distinction": { 143 "applies": true, 144 "answer": true, 145 "justification": "The paper measures change-point detection (hit/not-hit) and frames results in terms of issue detection F1. The claims match the measurement granularity — they do not claim to measure broader 'code review quality' beyond issue detection." 
146 } 147 }, 148 "setup_transparency": { 149 "model_versions_specified": { 150 "applies": true, 151 "answer": true, 152 "justification": "Specific model versions with snapshot dates are provided: 'o3-0416', 'o4-mini-0416', 'GPT-4o-1120', 'Gemini-2.5-Pro-0325', 'Gemini-2.5-Flash-0427', 'Claude-3.7-Sonnet-0219', 'Claude-4-Sonnet-0514', 'Claude-4-Opus-0514', 'DeepSeek-R1-0120', 'DeepSeek-V3-0324'." 153 }, 154 "prompts_provided": { 155 "applies": true, 156 "answer": false, 157 "justification": "Prompts are described in natural language (e.g., 'instructing it to identify instances corresponding to the 11 predefined change-point types') but actual prompt text is not provided in the paper." 158 }, 159 "hyperparameters_reported": { 160 "applies": true, 161 "answer": true, 162 "justification": "Temperature is set to 0.2 for the two baselines they built. ACR tools used official default configurations." 163 }, 164 "scaffolding_described": { 165 "applies": true, 166 "answer": true, 167 "justification": "SWR-Agent's scaffold is described (adapted from SWE-Agent, uses tools to explore codebase and execute snippets). CR-Agent's multi-agent architecture is described (two agents for format vs functional defects with inter-agent debate). PR-Review's prompt engineering approach is described." 168 }, 169 "data_preprocessing_documented": { 170 "applies": true, 171 "answer": true, 172 "justification": "Section III-A documents a detailed 4-step pipeline: source collection and filtering (21,350 PRs), LLM-based verification with majority voting, SZZ filtering and resampling, and manual verification by 5 annotators." 173 } 174 }, 175 "limitations_and_scope": { 176 "limitations_section_present": { 177 "applies": true, 178 "answer": true, 179 "justification": "Section V 'Threats to Validity' addresses internal, external, and construct validity." 
180 }, 181 "threats_to_validity_specific": { 182 "applies": true, 183 "answer": true, 184 "justification": "Threats are somewhat specific: 'potential bugs in our implementation', choosing 'representative tools based on an extensive literature review', selecting '12 popular, well-maintained open-source GitHub projects', and human validation confirming evaluation method reliability." 185 }, 186 "scope_boundaries_stated": { 187 "applies": true, 188 "answer": false, 189 "justification": "The paper does not explicitly state what the results do NOT show. Missing: Python-only limitation, restriction to open-source projects, exclusion of non-LLM and proprietary tools (acknowledged in tool selection but not as a scope boundary on conclusions)." 190 } 191 }, 192 "data_integrity": { 193 "raw_data_available": { 194 "applies": true, 195 "answer": true, 196 "justification": "The full SWR-Bench dataset including PR source data, ground truth change-points, and codebase checkpoints is available via the anonymous repository link." 197 }, 198 "data_collection_described": { 199 "applies": true, 200 "answer": true, 201 "justification": "Section III-A describes data collection in detail: GitHub API crawling of 12 projects, PR metadata collected (titles, descriptions, commits, diffs, comments, statuses), with specific filtering criteria." 202 }, 203 "recruitment_methods_described": { 204 "applies": true, 205 "answer": false, 206 "justification": "Human annotators are described only as 'five experienced graduate students majoring in computer science' with no details on recruitment, selection criteria, or potential biases. Project selection is described as 'aligned with SWE-Bench' but without explaining why these 12 specifically." 
207 }, 208 "data_pipeline_documented": { 209 "applies": true, 210 "answer": true, 211 "justification": "The pipeline is documented with counts: 21,350 PRs after initial filtering → 3,500 after LLM verification and quality filtering → 1,000 Change + 1,000 Clean manually verified → final 500+500. Figure 2 shows the pipeline." 212 } 213 }, 214 "conflicts_of_interest": { 215 "funding_disclosed": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding information or acknowledgments section is present in the paper." 219 }, 220 "affiliations_disclosed": { 221 "applies": true, 222 "answer": true, 223 "justification": "Author affiliations are clearly listed: Peking University and Northwestern Polytechnical University. No company affiliations with evaluated products." 224 }, 225 "funder_independent_of_outcome": { 226 "applies": true, 227 "answer": false, 228 "justification": "No funding is disclosed, so independence cannot be assessed." 229 }, 230 "financial_interests_declared": { 231 "applies": true, 232 "answer": false, 233 "justification": "No competing interests or financial interests statement is present in the paper." 234 } 235 }, 236 "contamination": { 237 "training_cutoff_stated": { 238 "applies": true, 239 "answer": false, 240 "justification": "Training data cutoff dates are not stated for any of the evaluated models, despite using real GitHub PRs that may appear in training data." 241 }, 242 "train_test_overlap_discussed": { 243 "applies": true, 244 "answer": false, 245 "justification": "No discussion of whether the GitHub PRs in SWR-Bench appeared in the training data of evaluated LLMs, despite the projects being popular open-source repos." 246 }, 247 "benchmark_contamination_addressed": { 248 "applies": true, 249 "answer": false, 250 "justification": "SWR-Bench uses PRs from well-known GitHub projects (aligned with SWE-Bench). While the benchmark is new, the underlying PR data existed publicly before model training. 
This contamination risk is not discussed." 251 } 252 }, 253 "human_studies": { 254 "pre_registered": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants study. The human annotators were used for benchmark construction and evaluation method validation, not as study subjects." 258 }, 259 "irb_or_ethics_approval": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants study." 263 }, 264 "demographics_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants study." 268 }, 269 "inclusion_exclusion_criteria": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants study." 273 }, 274 "randomization_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants study." 278 }, 279 "blinding_described": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants study." 283 }, 284 "attrition_reported": { 285 "applies": false, 286 "answer": false, 287 "justification": "No human participants study." 288 } 289 }, 290 "cost_and_practicality": { 291 "inference_cost_reported": { 292 "applies": true, 293 "answer": true, 294 "justification": "The paper reports that 'a full SWR-Bench evaluation (evaluating all 1000 PRs) costs approximately $1.57 using Gemini-2.5-Flash.'" 295 }, 296 "compute_budget_stated": { 297 "applies": true, 298 "answer": false, 299 "justification": "No total computational budget is stated for the full experimental evaluation across all models and tools." 300 } 301 }, 302 "experimental_rigor": { 303 "seed_sensitivity_reported": { 304 "applies": true, 305 "answer": false, 306 "justification": "Main results appear to be single-run. The Venn diagram analysis (Figure 6b) shows variability across 5 runs of the same model but this is used to motivate Multi-Review, not to report result stability." 
307 }, 308 "number_of_runs_stated": { 309 "applies": true, 310 "answer": false, 311 "justification": "The number of runs for main results in Tables III and IV is not stated. Only the Multi-Review experiment explicitly varies run count." 312 }, 313 "hyperparameter_search_budget": { 314 "applies": true, 315 "answer": false, 316 "justification": "No hyperparameter search is described. Tools use default configurations, but no justification is given for not exploring alternatives." 317 }, 318 "best_config_selection_justified": { 319 "applies": true, 320 "answer": true, 321 "justification": "The paper states 'all evaluated ACR tools used their official default configurations' and sets temperature to 0.2 for baselines, avoiding cherry-picking of configurations." 322 }, 323 "multiple_comparison_correction": { 324 "applies": true, 325 "answer": false, 326 "justification": "Many comparisons are made across tools and models with no statistical tests, let alone corrections for multiple comparisons." 327 }, 328 "self_comparison_bias_addressed": { 329 "applies": true, 330 "answer": false, 331 "justification": "The authors built LLM-Review and SWR-Agent as baselines and constructed SWR-Bench itself. They do not acknowledge the bias of evaluating their own constructions." 332 }, 333 "compute_budget_vs_performance": { 334 "applies": true, 335 "answer": false, 336 "justification": "Different tools have vastly different compute profiles (e.g., Multi-Review with n=10 uses 10x the API calls) but performance is not normalized by compute." 337 }, 338 "benchmark_construct_validity": { 339 "applies": true, 340 "answer": true, 341 "justification": "RQ1 validates the evaluation method against human judgment (~90% agreement). The paper discusses what the benchmark measures (change-point detection) and why existing benchmarks are inadequate."
342 }, 343 "scaffold_confound_addressed": { 344 "applies": true, 345 "answer": false, 346 "justification": "Table III compares tools with different scaffolds (single-turn vs agent vs multi-agent) using different LLMs, confounding scaffold and model effects. Table IV isolates models by fixing PR-Review as the scaffold, but Table III's conclusions about tools don't control for this." 347 } 348 }, 349 "data_leakage": { 350 "temporal_leakage_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of whether the GitHub PRs in SWR-Bench predate the training data of evaluated models, which could mean models have seen the PR discussions and solutions." 354 }, 355 "feature_leakage_addressed": { 356 "applies": true, 357 "answer": false, 358 "justification": "No discussion of whether providing full PR metadata (including review comments in the construction phase) could leak information about what change-points exist." 359 }, 360 "non_independence_addressed": { 361 "applies": true, 362 "answer": false, 363 "justification": "PRs come from 12 projects aligned with SWE-Bench. No discussion of whether models trained on SWE-Bench data have exposure to these same repositories." 364 }, 365 "leakage_detection_method": { 366 "applies": true, 367 "answer": false, 368 "justification": "No leakage detection or prevention method is applied beyond the SZZ algorithm for filtering missed changes." 
369 } 370 } 371 }, 372 "claims": [ 373 { 374 "claim": "The objective LLM-as-Judge evaluation method achieves ~90% agreement with human expert judgment on the Hit metric.", 375 "evidence": "Figure 5a shows Hit-Agreement between 89.2% and 94.9% across all pairwise combinations of 3 human annotators and 2 LLMs (RQ1).", 376 "supported": "strong" 377 }, 378 { 379 "claim": "Current ACR tools achieve at most 19.38% F1 on SWR-Bench, limited primarily by high false positive rates.", 380 "evidence": "Table III shows PR-Review with Gemini-2.5-Pro achieves the highest F1 of 19.38%, with most tools having precision below 10%.", 381 "supported": "strong" 382 }, 383 { 384 "claim": "ACR tools are more adept at detecting functional errors than evolutionary changes.", 385 "evidence": "Table V shows functional change types (F.1-F.6) achieve F1 scores of 19.60-27.65%, while evolutionary types (E.1.1-E.3.2) achieve 6.05-16.45%.", 386 "supported": "strong" 387 }, 388 { 389 "claim": "Reasoning-enhanced LLMs perform better at code review than standard LLMs.", 390 "evidence": "Table IV shows Qwen-2.5-R1-14B achieves 15.95% F1 vs Qwen-2.5-14B at 9.01%. Gemini-2.5-Pro (reasoning) achieves 19.38% F1. 
However, the comparison is confounded by other model differences.", 391 "supported": "moderate" 392 }, 393 { 394 "claim": "Multi-Review aggregation strategy increases F1 scores by up to 43.67%.", 395 "evidence": "Figure 7 shows Gemini-2.5-Flash Self-Agg (n=10) achieves 21.91% F1, up from baseline 15.25% — a 43.67% relative increase.", 396 "supported": "strong" 397 }, 398 { 399 "claim": "Claude-4-Opus and Claude-4-Sonnet underperform Claude-3.7-Sonnet on code review.", 400 "evidence": "Table IV: Claude-3.7-Sonnet achieves 18.23% F1 vs Claude-4-Opus at 16.99% and Claude-4-Sonnet at 16.61%.", 401 "supported": "weak" 402 } 403 ], 404 "red_flags": [ 405 { 406 "flag": "No statistical tests for any comparisons", 407 "detail": "All comparative claims (tool rankings, reasoning enhancement benefits, model comparisons) are based on what appear to be single-run point estimates (the number of runs is not stated) with no significance tests, confidence intervals, or variance measures. The Venn diagram analysis (Figure 6b) demonstrates substantial run-to-run variability, yet the main results give no indication of being averaged over multiple runs." 408 }, 409 { 410 "flag": "Contamination risk from SWE-Bench-aligned projects", 411 "detail": "SWR-Bench uses PRs from 12 projects aligned with SWE-Bench. Models trained on data including these repositories may have seen the PR discussions, code changes, and reviewer comments. This is not discussed." 412 }, 413 { 414 "flag": "Python-only scope not acknowledged as limitation", 415 "detail": "All 12 projects are Python. The paper titles itself as benchmarking 'LLM-based Code Review' without bounding conclusions to Python." 416 }, 417 { 418 "flag": "Self-evaluation bias", 419 "detail": "Authors built SWR-Bench, the evaluation method, and two of the five baselines (LLM-Review, SWR-Agent). Their benchmark design choices directly affect which tools look better or worse." 420 } 421 ], 422 "cited_papers": [ 423 { 424 "title": "SWE-bench: Can language models resolve real-world GitHub issues?", 425 "authors": ["Carlos E. 
Jimenez", "John Yang", "Alexander Wettig"], 426 "year": 2024, 427 "relevance": "Foundational benchmark for LLM-based software engineering that SWR-Bench's project selection is aligned with." 428 }, 429 { 430 "title": "CodeAgent: Autonomous Communicative Agents for Code Review", 431 "authors": ["Xunzhu Tang", "Kisub Kim"], 432 "year": 2024, 433 "relevance": "Multi-agent ACR system evaluated as the CR-Agent baseline, representing agent-based approaches to code review." 434 }, 435 { 436 "title": "Automating code review activities by large-scale pre-training", 437 "authors": ["Zhiyu Li", "Shuai Lu", "Daya Guo"], 438 "year": 2022, 439 "doi": "10.1145/3540250.3549081", 440 "relevance": "CodeReviewer dataset and model — one of the main prior benchmarks that SWR-Bench aims to supersede." 441 }, 442 { 443 "title": "SWE-Agent: Agent-Computer Interfaces Enable Automated Software Engineering", 444 "authors": ["John Yang", "Carlos E. Jimenez"], 445 "year": 2024, 446 "relevance": "Agent framework adapted to create the SWR-Agent baseline for code review evaluation." 447 }, 448 { 449 "title": "Large Language Model-Based Agents for Software Engineering: A Survey", 450 "authors": ["Junwei Liu", "Kaixin Wang"], 451 "year": 2024, 452 "arxiv_id": "2409.02977", 453 "relevance": "Survey of LLM-based agents in software engineering, providing context for ACR tools." 454 }, 455 { 456 "title": "Combining Large Language Models with Static Analyzers for Code Review Generation", 457 "authors": ["Imen Jaoua", "Oussama Ben Sghaier"], 458 "year": 2025, 459 "arxiv_id": "2502.06633", 460 "relevance": "Hybrid-Review approach combining LLMs with static analysis, evaluated as a baseline." 461 }, 462 { 463 "title": "BitsAI-CR: Automated Code Review via LLM in Practice", 464 "authors": ["Tao Sun", "Jiaqi Xu"], 465 "year": 2025, 466 "arxiv_id": "2501.15134", 467 "relevance": "Industrial ACR tool deployment study, representing proprietary approaches excluded from benchmark evaluation."
468 }, 469 { 470 "title": "Why do Multi-Agent LLM Systems Fail?", 471 "authors": ["Mert Cemri"], 472 "year": 2025, 473 "arxiv_id": "2503.13657", 474 "relevance": "Analysis of multi-agent system failures, cited to explain CR-Agent's underperformance." 475 }, 476 { 477 "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code", 478 "authors": ["Naman Jain", "King Han"], 479 "year": 2025, 480 "relevance": "Contamination-aware code benchmark; paper notes LLM rankings on SWR-Bench diverge from LiveCodeBench." 481 }, 482 { 483 "title": "Deep Assessment of Code Review Generation Approaches: Beyond Lexical Similarity", 484 "authors": ["Yanjie Jiang", "Hui Liu"], 485 "year": 2025, 486 "arxiv_id": "2501.05176", 487 "relevance": "Critiques text-similarity metrics for code review evaluation, motivating SWR-Bench's objective evaluation method." 488 } 489 ] 490 }