scan.json (25176B)
1 { 2 "paper": { 3 "title": "A Survey of Code Review Benchmarks and Evaluation Practices in Pre-LLM and LLM Era", 4 "authors": [ 5 "Taufiqul Islam Khan", 6 "Shaowei Wang", 7 "Haoxiang Zhang", 8 "Tse-Hsun Chen" 9 ], 10 "year": 2026, 11 "venue": "arXiv", 12 "arxiv_id": "2602.13377", 13 "doi": "10.1145/nnnnnnn.nnnnnnn" 14 }, 15 "checklist": { 16 "artifacts": { 17 "code_released": { 18 "applies": true, 19 "answer": false, 20 "justification": "No repository URL, Zenodo archive, or any code release is mentioned in the paper. The survey could have released its data extraction scripts or analysis code but did not." 21 }, 22 "data_released": { 23 "applies": true, 24 "answer": false, 25 "justification": "No dataset download link or supplementary data file is provided. The survey's extracted metadata (datasets, metrics, task classifications for 99 papers) is not released as a structured artifact." 26 }, 27 "environment_specified": { 28 "applies": true, 29 "answer": false, 30 "justification": "No environment or dependency specifications are provided. As a survey with potential analysis scripts, this could have been documented but was not." 31 }, 32 "reproduction_instructions": { 33 "applies": true, 34 "answer": false, 35 "justification": "No step-by-step reproduction instructions are included. The paper describes the methodology at a high level but does not provide instructions for reproducing the search, screening, or classification pipeline." 36 } 37 }, 38 "statistical_methodology": { 39 "confidence_intervals_or_error_bars": { 40 "applies": false, 41 "answer": false, 42 "justification": "This is a survey paper that does not run experiments or report quantitative results requiring confidence intervals." 43 }, 44 "significance_tests": { 45 "applies": false, 46 "answer": false, 47 "justification": "This is a survey paper. It does not make comparative claims between systems that would require significance tests." 
48 }, 49 "effect_sizes_reported": { 50 "applies": false, 51 "answer": false, 52 "justification": "This is a survey paper that does not run experiments and therefore has no effect sizes to report." 53 }, 54 "sample_size_justified": { 55 "applies": false, 56 "answer": false, 57 "justification": "This is a survey paper. The 99 papers analyzed are determined by the search and screening process, not a statistical sampling procedure." 58 }, 59 "variance_reported": { 60 "applies": false, 61 "answer": false, 62 "justification": "This is a survey paper that does not run experiments across multiple runs." 63 } 64 }, 65 "evaluation_design": { 66 "baselines_included": { 67 "applies": true, 68 "answer": true, 69 "justification": "The survey explicitly compares against prior survey work in Section 2.2, discussing Wang et al. [99] (2011-2019 code review studies), Paul et al. [69] (code generation benchmarks), and Wang et al. [101] (SDLC benchmarks). These serve as baselines for the survey's contribution." 70 }, 71 "baselines_contemporary": { 72 "applies": true, 73 "answer": true, 74 "justification": "The related surveys discussed include Wang et al. [101] from 2025, Paul et al. [69] from 2024, and Wang et al. [99] from 2021, which are reasonably contemporary." 75 }, 76 "ablation_study": { 77 "applies": false, 78 "answer": false, 79 "justification": "A survey paper has no system components to ablate." 80 }, 81 "multiple_metrics": { 82 "applies": false, 83 "answer": false, 84 "justification": "This is a survey paper without a system evaluation requiring metrics." 85 }, 86 "human_evaluation": { 87 "applies": false, 88 "answer": false, 89 "justification": "This is a survey paper that does not produce outputs requiring human evaluation." 90 }, 91 "held_out_test_set": { 92 "applies": false, 93 "answer": false, 94 "justification": "This is a survey paper with no experimental evaluation requiring train/test splits." 
95 }, 96 "per_category_breakdown": { 97 "applies": true, 98 "answer": true, 99 "justification": "The paper provides extensive per-category breakdowns across its taxonomy: 5 high-level domains, 18 sub-tasks, with separate detailed tables for each sub-task showing datasets, metrics, and data points for both Pre-LLM and LLM eras (Tables 3-28). Figure 6 provides comparative distribution of datasets across sub-tasks." 100 }, 101 "failure_cases_discussed": { 102 "applies": true, 103 "answer": true, 104 "justification": "Section 7.1 discusses limitations of existing benchmarks in the LLM era, identifying specific failure cases: missing task coverage (impact analysis, commit decomposition vanished), reliance on static metrics that fail to capture functional correctness (e.g., 'An LLM might suggest a fix that introduces a deadlock but still receives a high score'), and lack of granular benchmarking." 105 }, 106 "negative_results_reported": { 107 "applies": true, 108 "answer": true, 109 "justification": "The paper reports several negative findings: Change Understanding and Analysis tasks have 'nearly vanished' in the LLM era (from 14 datasets to 1), single-language datasets decreased from 59% to 24%, and current benchmarks fail to capture functional correctness through static metrics. The dominance of Peer Review (60% of LLM-era datasets) is presented as a problematic imbalance." 110 } 111 }, 112 "claims_and_evidence": { 113 "abstract_claims_supported": { 114 "applies": true, 115 "answer": true, 116 "justification": "The abstract claims about analyzing 99 papers (58 Pre-LLM, 41 LLM), proposing a multi-level taxonomy with 5 domains and 18 tasks, observing a shift toward generative peer review, increasing multilingual coverage, and declining change understanding tasks are all supported with data in the body. Section 6 (RQ3) provides the comparative analysis backing abstract claims." 
117 }, 118 "causal_claims_justified": { 119 "applies": true, 120 "answer": false, 121 "justification": "The paper makes implicit causal claims such as 'researchers probably consider that understanding is no longer a separate research goal, instead it is a prerequisite that is now bundled into Comment Generation' (Section 6.1). This causal interpretation of why Change Understanding tasks declined is speculative and not supported by evidence (e.g., interviews with researchers or analysis of paper content). The language 'moving from helping humans understand code to having machines perform the task directly end-to-end' implies causation from LLM availability." 122 }, 123 "generalization_bounded": { 124 "applies": true, 125 "answer": false, 126 "justification": "The title 'A Survey of Code Review Benchmarks and Evaluation Practices in Pre-LLM and LLM Era' suggests comprehensive coverage, but the search is limited to specific databases and keywords (Section 3.1). The paper acknowledges 'the rapid growth of AI-driven code review means new datasets are published weekly, making absolute exhaustiveness a moving target' (Section 7.2) but does not bound its generalizations to the specific search scope or time window." 127 }, 128 "alternative_explanations_discussed": { 129 "applies": true, 130 "answer": false, 131 "justification": "The paper offers one interpretation for trends (e.g., Change Understanding vanished because LLMs bundle it into generation) without considering alternatives. For instance, the decline could be due to publication venue bias, keyword search limitations, or researcher familiarity. The threats-to-validity section (Section 7.2) mentions potential omissions but does not discuss alternative explanations for the observed trends." 132 } 133 }, 134 "setup_transparency": { 135 "model_versions_specified": { 136 "applies": false, 137 "answer": false, 138 "justification": "This is a survey paper that does not use any AI models for its analysis." 
139 }, 140 "prompts_provided": { 141 "applies": false, 142 "answer": false, 143 "justification": "This is a survey paper that does not use prompting." 144 }, 145 "hyperparameters_reported": { 146 "applies": false, 147 "answer": false, 148 "justification": "This is a survey paper that does not use any models requiring hyperparameters." 149 }, 150 "scaffolding_described": { 151 "applies": false, 152 "answer": false, 153 "justification": "No agentic scaffolding is used in this survey paper." 154 }, 155 "data_preprocessing_documented": { 156 "applies": true, 157 "answer": true, 158 "justification": "Section 3.1 describes the four-stage paper collection process: (1) Terminology Summary with 20 search terms listed, (2) Literature Retrieval across 7 databases yielding 160 publications, (3) Literature Screening with four explicit inclusion criteria reducing to 77 papers, (4) Snowball Expansion adding 25 studies for a final 99 papers. The criteria at each stage are stated." 159 } 160 }, 161 "limitations_and_scope": { 162 "limitations_section_present": { 163 "applies": true, 164 "answer": true, 165 "justification": "Section 7.2 'Threats to Validity' discusses internal and construct validity threats in a dedicated subsection." 166 }, 167 "threats_to_validity_specific": { 168 "applies": true, 169 "answer": true, 170 "justification": "Section 7.2 discusses specific threats: 'there is a risk that certain papers might have been missed' despite snowballing, 'the rapid growth of AI-driven code review means new datasets are published weekly', and 'Human bias or fatigue could lead to inconsistent labeling across the survey.' The paper describes specific mitigation steps (dual-reviewer protocol, third arbitrator, cross-referencing with prior surveys)." 171 }, 172 "scope_boundaries_stated": { 173 "applies": true, 174 "answer": false, 175 "justification": "The paper does not explicitly state what is out of scope or what the results do NOT show. 
It does not clarify boundaries such as which types of code review work are excluded (e.g., purely industrial tools without published papers), or acknowledge that findings are bounded to the specific databases searched. The abstract and title imply comprehensive coverage without stating explicit boundaries." 176 } 177 }, 178 "data_integrity": { 179 "raw_data_available": { 180 "applies": true, 181 "answer": false, 182 "justification": "The raw data (list of 99 papers with extracted metadata, task labels, dataset details, metric classifications) is not available for download. Only aggregated results in tables and figures are presented." 183 }, 184 "data_collection_described": { 185 "applies": true, 186 "answer": true, 187 "justification": "Section 3.1 describes the data collection in detail: 20 search terms used, 7 academic databases searched (IEEE Xplore, ACM DL, Elsevier ScienceDirect, ACL Anthology, arXiv, Google Scholar, SpringerLink), time period 2015-2025, four-stage filtering process with counts at each stage (160 → 77 → 99 with snowballing)." 188 }, 189 "recruitment_methods_described": { 190 "applies": false, 191 "answer": false, 192 "justification": "This is a survey of papers, not a study with human participants. No recruitment is applicable." 193 }, 194 "data_pipeline_documented": { 195 "applies": true, 196 "answer": true, 197 "justification": "Section 3.1 documents the pipeline: keyword search → 160 papers → title/abstract/full-text screening with 4 criteria → 77 papers → snowball expansion → 99 papers. Section 3.3 documents the taxonomy construction pipeline with open coding, selective coding, and inter-rater agreement (Cohen's Kappa = 0.81)." 198 } 199 }, 200 "conflicts_of_interest": { 201 "funding_disclosed": { 202 "applies": true, 203 "answer": false, 204 "justification": "No funding information, grants, or acknowledgments section mentioning funding sources is present in the paper." 
205 }, 206 "affiliations_disclosed": { 207 "applies": true, 208 "answer": true, 209 "justification": "Author affiliations are clearly listed: University of Manitoba (Khan, Wang), Huawei Canada Centre for Software Excellence (Zhang), Concordia University (Chen). One author is from Huawei, a major technology company with interests in code review tools." 210 }, 211 "funder_independent_of_outcome": { 212 "applies": true, 213 "answer": false, 214 "justification": "No funding source is disclosed at all, so independence cannot be assessed. One author is affiliated with Huawei Canada, a company that develops code review tools and has a potential interest in the survey's findings about LLM-based code review." 215 }, 216 "financial_interests_declared": { 217 "applies": true, 218 "answer": false, 219 "justification": "No competing interests or financial interests statement is present in the paper." 220 } 221 }, 222 "contamination": { 223 "training_cutoff_stated": { 224 "applies": false, 225 "answer": false, 226 "justification": "This is a survey paper that does not evaluate any pre-trained model on a benchmark." 227 }, 228 "train_test_overlap_discussed": { 229 "applies": false, 230 "answer": false, 231 "justification": "This is a survey paper that does not evaluate any pre-trained model on a benchmark." 232 }, 233 "benchmark_contamination_addressed": { 234 "applies": false, 235 "answer": false, 236 "justification": "This is a survey paper that does not evaluate any pre-trained model on a benchmark." 237 } 238 }, 239 "human_studies": { 240 "pre_registered": { 241 "applies": false, 242 "answer": false, 243 "justification": "This is a survey of papers, not a human subjects study." 244 }, 245 "irb_or_ethics_approval": { 246 "applies": false, 247 "answer": false, 248 "justification": "This is a survey of papers, not a human subjects study." 
249 }, 250 "demographics_reported": { 251 "applies": false, 252 "answer": false, 253 "justification": "This is a survey of papers, not a human subjects study." 254 }, 255 "inclusion_exclusion_criteria": { 256 "applies": false, 257 "answer": false, 258 "justification": "This is a survey of papers, not a human subjects study. Paper inclusion/exclusion criteria are assessed under data_preprocessing_documented." 259 }, 260 "randomization_described": { 261 "applies": false, 262 "answer": false, 263 "justification": "This is a survey of papers, not an experimental study with human participants." 264 }, 265 "blinding_described": { 266 "applies": false, 267 "answer": false, 268 "justification": "This is a survey of papers, not an experimental study with human participants." 269 }, 270 "attrition_reported": { 271 "applies": false, 272 "answer": false, 273 "justification": "This is a survey of papers, not a human subjects study." 274 } 275 }, 276 "cost_and_practicality": { 277 "inference_cost_reported": { 278 "applies": false, 279 "answer": false, 280 "justification": "This is a survey paper with no computational method whose cost would be relevant." 281 }, 282 "compute_budget_stated": { 283 "applies": false, 284 "answer": false, 285 "justification": "This is a survey paper with no significant computational requirements." 286 } 287 } 288 }, 289 "claims": [ 290 { 291 "claim": "The survey analyzed 99 research papers (58 Pre-LLM era and 41 LLM era) spanning 2015-2025.", 292 "evidence": "Section 3.1 describes the collection process yielding 99 papers. Section 3.2 and Table 1 list all papers by venue. The abstract states '99 research papers (58 Pre-LLM era and 41 LLM era).'", 293 "supported": "strong" 294 }, 295 { 296 "claim": "Change Understanding and Analysis has transitioned from a research cornerstone (14 datasets in Pre-LLM) to nearly absent (1 dataset in LLM era).", 297 "evidence": "Figure 6 shows the comparative distribution. 
Section 6.1 states: 'In the Pre-LLM era, this was a cornerstone of research (14 datasets), while in the LLM era, it has nearly vanished as a standalone topic (1 dataset).'", 298 "supported": "strong" 299 }, 300 { 301 "claim": "Peer Review tasks dominate the LLM era, accounting for nearly 60% of all datasets.", 302 "evidence": "Section 6.1 and Figure 6 show the distribution. The paper states: 'The most obvious trend is the sheer dominance of Peer Review, which accounts for nearly 60% of datasets in LLM era.'", 303 "supported": "strong" 304 }, 305 { 306 "claim": "The LLM era shows a significant shift from language-specific to cross-language generalization, with single-language datasets dropping from 59% to 24%.", 307 "evidence": "Section 6.2 and Figure 7: 'In the Pre-LLM era, research was highly concentrated on single-language studies, with nearly three-fifths (59%) of all datasets restricted to a single programming language.' 'single-language datasets remain present, their relative proportion has plummeted to 24%.'", 308 "supported": "strong" 309 }, 310 { 311 "claim": "Datasets covering nine or more programming languages increased from 2% (Pre-LLM) to 34% (LLM era).", 312 "evidence": "Section 6.2 and Figure 7: 'datasets covering nine or more languages have become a dominant category in the LLM era (34%), whereas they were virtually non-existent (2%) in the earlier period.'", 313 "supported": "strong" 314 }, 315 { 316 "claim": "The decline in Change Understanding tasks is because researchers consider understanding is now bundled into end-to-end generation.", 317 "evidence": "Section 6.1 states: 'researchers probably consider that understanding is no longer a separate research goal, instead it is a prerequisite that is now bundled into Comment Generation.' 
The word 'probably' acknowledges this is speculative.", 318 "supported": "weak" 319 } 320 ], 321 "methodology_tags": [ 322 "meta-analysis", 323 "qualitative" 324 ], 325 "key_findings": "This survey of 99 code review papers (58 Pre-LLM, 41 LLM era, 2015-2025) proposes a multi-level taxonomy of 5 domains and 18 sub-tasks. The key finding is a dramatic structural shift: Change Understanding and Analysis collapsed from 14 datasets to 1, while Peer Review tasks grew to dominate 60% of LLM-era datasets, suggesting a move from human-centric assistance to end-to-end generation. The LLM era also shows a shift from single-language studies (59% to 24%) to highly multilingual benchmarks covering 9+ languages (2% to 34%). The authors identify critical gaps including missing macro-level review tasks, over-reliance on static text-matching metrics, and lack of runtime/dynamic evaluation in current benchmarks.", 326 "red_flags": [ 327 { 328 "flag": "Industry affiliation without conflict disclosure", 329 "detail": "One author (Haoxiang Zhang) is affiliated with Huawei Canada's Centre for Software Excellence. Huawei develops code review tools and has commercial interests in LLM-based code review. No conflicts of interest statement or funding disclosure is provided." 330 }, 331 { 332 "flag": "No structured quality assessment of surveyed papers", 333 "detail": "The survey collects and organizes 99 papers but does not document how their methodological quality was assessed. The paper states that the authors conducted a 'quality assessment' to verify papers were 'methodologically sound' (Section 3.1) but provides no details on how quality was assessed or what criteria were used. As a result, weaknesses in the underlying sources may be carried into the survey's conclusions unexamined." 334 }, 335 { 336 "flag": "Causal interpretations unsupported by evidence", 337 "detail": "The paper offers causal explanations for observed trends (e.g., Change Understanding declined because LLMs bundle it into generation) without evidence. 
These are post-hoc rationalizations based on counting dataset frequencies, not supported by analysis of paper content or researcher interviews." 338 }, 339 { 340 "flag": "Inconsistent paper counts between abstract and body", 341 "detail": "The abstract says '99 research papers (58 Pre-LLM era and 41 LLM era)' but Section 3.1 says 'we collected 61 Pre-LLM code review papers and 45 LLM-based papers' before filtering. The final count of 99 = 58 + 41 is clear, but the intermediate numbers (61 + 45 = 106) are not reconciled with the 77 from screening plus 25 from snowballing = 102. Neither intermediate total (102 or 106) is reconciled with the final count of 99, so the reported pipeline (160 → 77, plus 25 from snowballing, → 99) does not add up as stated." 342 } 343 ], 344 "cited_papers": [ 345 { 346 "title": "Automating code review activities by large-scale pre-training", 347 "authors": ["Zhiyu Li", "Shuai Lu", "Daya Guo"], 348 "year": 2022, 349 "relevance": "Introduces CodeReviewer, a foundational LLM pre-training approach for code review that is extensively referenced as a benchmark dataset source in this survey." 350 }, 351 { 352 "title": "CodeReviewQA: The Code Review Comprehension Assessment for Large Language Models", 353 "authors": ["Hong Yi Lin", "Chunhua Liu", "Haoyu Gao"], 354 "year": 2025, 355 "relevance": "Proposes a code review comprehension benchmark for LLMs, directly relevant to evaluating LLM capabilities in code review tasks." 356 }, 357 { 358 "title": "CodeFuse-CR-Bench: A Comprehensiveness-aware Benchmark for End-to-End Code Review Evaluation in Python Projects", 359 "authors": ["Hanyang Guo", "Xunjin Zheng", "Zihan Liao"], 360 "year": 2025, 361 "arxiv_id": "2509.14856", 362 "relevance": "End-to-end code review benchmark for LLMs covering structured review generation across multiple dimensions." 
363 }, 364 { 365 "title": "BitsAI-CR: Automated Code Review via LLM in Practice", 366 "authors": ["Tao Sun", "Jian Xu", "Yuanpeng Li"], 367 "year": 2025, 368 "relevance": "Industrial-scale LLM-based code review system deployed in practice, relevant to understanding real-world LLM code review deployment." 369 }, 370 { 371 "title": "Deep Learning-Based Code Reviews: A Paradigm Shift or a Double-Edged Sword?", 372 "authors": ["Rosalia Tufano", "Alberto Martin-Lopez", "Ahmad Tayeb"], 373 "year": 2025, 374 "relevance": "Empirical study on how DL-based code reviews affect human reviewer performance, relevant to evaluating AI-assisted code review impact." 375 }, 376 { 377 "title": "CodeAgent: Autonomous Communicative Agents for Code Review", 378 "authors": ["Xunzhu Tang", "Kisub Kim", "Yewei Song"], 379 "year": 2024, 380 "relevance": "Proposes autonomous agent-based approach to code review with security focus, relevant to agentic AI code review evaluation." 381 }, 382 { 383 "title": "AI-Assisted Fixes to Code Review Comments at Scale", 384 "authors": ["Chandra Maddila", "Negar Ghorbani", "James Saindon"], 385 "year": 2025, 386 "arxiv_id": "2507.13499", 387 "relevance": "Large-scale industrial study of AI-assisted code revision from review comments, relevant to understanding AI productivity in code review workflows." 388 }, 389 { 390 "title": "Benchmarking and Studying the LLM-based Code Review", 391 "authors": ["Zhengran Zeng", "Ruikai Shi", "Keke Han"], 392 "year": 2025, 393 "arxiv_id": "2509.01494", 394 "relevance": "Benchmark study specifically for LLM-based code review with line-level issue localization evaluation." 395 }, 396 { 397 "title": "Can we benchmark Code Review studies? 
A systematic mapping study of methodology, dataset, and metric", 398 "authors": ["Dong Wang", "Yuki Ueda", "Raula Gaikovina Kula"], 399 "year": 2021, 400 "doi": "10.1016/j.jss.2021.111009", 401 "relevance": "Prior systematic mapping study of code review methodologies that serves as a baseline for this survey's contribution." 402 }, 403 { 404 "title": "Benchmarks and Metrics for Evaluations of Code Generation: A Critical Review", 405 "authors": ["Debalina Ghosh Paul", "Hong Zhu", "Ian Bayley"], 406 "year": 2024, 407 "relevance": "Critical review of code generation evaluation practices, providing context for how benchmark quality issues extend beyond code review." 408 }, 409 { 410 "title": "Software Development Life Cycle Perspective: A Survey of Benchmarks for Code Large Language Models and Agents", 411 "authors": ["Kaixin Wang", "Tianlin Li", "Xiaoyu Zhang"], 412 "year": 2025, 413 "arxiv_id": "2505.05283", 414 "relevance": "SDLC-wide benchmark survey identifying code review as a major gap, directly motivating this survey's contribution." 415 }, 416 { 417 "title": "Fine-tuning and prompt engineering for large language models-based code review automation", 418 "authors": ["Chanathip Pornprasit", "Chakkrit Tantithamthavorn"], 419 "year": 2024, 420 "doi": "10.1016/j.infsof.2024.107523", 421 "relevance": "Study on fine-tuning and prompting strategies for LLM-based code review, relevant to understanding LLM evaluation methodology." 422 } 423 ] 424 }