scan.json (23366B)
1 { 2 "paper": { 3 "title": "Code Review Automation using Retrieval Augmented Generation", 4 "authors": ["Qianru Meng", "Xiao Zhang", "Zhaochun Ren", "Joost Visser"], 5 "year": 2025, 6 "venue": "arXiv preprint", 7 "arxiv_id": "2511.05302" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": true, 14 "justification": "An anonymous repository link is provided: https://anonymous.4open.science/r/GAR-9EE2, mentioned in both Section 1 (contributions) and Section 10 (Data Availability)." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The paper uses two publicly available datasets: the CodeReviewer dataset and the Tufano dataset, both cited with original references. The anonymous repository also claims to contain the data." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "The paper mentions using 4 NVIDIA A100 GPUs (Section 4.6) and specific model links on HuggingFace, but does not provide a requirements.txt, Dockerfile, or detailed environment specification with library versions." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "No step-by-step reproduction instructions are provided in the paper. The anonymous repository is referenced but no README or reproduction guide is described in the paper itself." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "All results in Tables 4, 5, and 6 are reported as point estimates (e.g., BLEU-4 of 12.32) with no confidence intervals or error bars." 37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper claims RARe 'outperforms state-of-the-art methods' and reports percentage improvements, but no statistical significance tests (p-values, t-tests, etc.) are conducted to validate these differences." 
42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "The paper reports percentage improvements with baseline context throughout, e.g., '153% increase in BLEU-4' from 5.12 to 12.96 (Table 5), and absolute scores for all methods, allowing readers to assess effect magnitude." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "No justification is given for why 100 samples were chosen for the human evaluation (Section 6.2), nor is there a power analysis or acknowledgment that this may be insufficient." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "Section 4.6 states 'we averaged the results across three runs for each experiment,' but no standard deviations, error bars, or variance measures are reported in any table." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "Table 6 compares RARe against six baselines: Tufano et al., CodeT5, CodeReviewer, CommentFinder, AUGER, and LLaMA-Reviewer." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "Baselines include recent methods such as LLaMA-Reviewer (2023), CommentFinder (2022), AUGER (2022), and CodeReviewer (2022), which represent the state of the art for this task." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": true, 73 "justification": "Table 5 provides a detailed ablation comparing the contributions of retrieval augmentation across three models and two settings (direct inference vs. fine-tuning). Table 6 also ablates retrieval strategy (random, top-1, top-3, top-5)." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "Three evaluation metrics are used: BLEU-4, ROUGE-L, and METEOR (Section 4.3). Human evaluation adds a fourth dimension." 
79 }, 80 "human_evaluation": { 81 "applies": true, 82 "answer": true, 83 "justification": "Section 6.2 reports a human evaluation on 100 samples from each dataset, with two expert annotators categorizing reviews as Perfect Prediction, Semantically Equivalent, Alternative Solution, or Incorrect." 84 }, 85 "held_out_test_set": { 86 "applies": true, 87 "answer": true, 88 "justification": "Table 1 shows explicit train/test/validation splits for both datasets. Section 4.2 states 'The data split ratios are consistent with those used in previous works.'" 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Results are broken down by dataset (CRer. vs. Tuf.), by retriever type (Table 4), by generator model and setting (Table 5), and by retrieval strategy (Table 6)." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "Section 6.1 discusses cases where BLEU scores are misleading, Section 6.2 shows 'Incorrect' review counts, and Section 9 (Conclusion) discusses dependencies on retrieval accuracy as a failure mode." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "The paper reports that CodeGemma benefits very little from retrieval augmentation (Section 5.2: 'the benefits of retrieval are very limited'), and that increasing retrieved reviews from 1 to 5 decreases performance (Section 5.3)." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "The abstract claims BLEU-4 scores of 12.32 and 12.96 on two benchmarks, which match Table 6 (RARe top1: 12.32 on CRer., 12.96 on Tuf.). Human evaluation and interpretability claims are supported in Section 6." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": true, 115 "justification": "The paper makes causal claims about retrieval augmentation improving performance. 
These are supported by controlled ablation: same models with and without retrieval (Table 5), and random vs. top-k retrieval (Table 6), constituting adequate single-variable manipulation." 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": true, 120 "justification": "Section 8 (External Validity) explicitly states that 'the data for each language in the multi-language dataset are relatively limited' and raises 'concerns about the model's ability to generalize effectively to other programming languages or broader contexts.'" 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": true, 125 "justification": "Section 8 (Internal Validity) discusses alternative explanations: pre-training data biases, quality of the external knowledge base, and the construct validity of BLEU/ROUGE/METEOR metrics for measuring review quality." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": true, 132 "justification": "Section 4.5.2 provides specific HuggingFace model links: meta-llama/Meta-Llama-3.1-8B-Instruct, mistralai/Mistral-7B-Instruct-v0.3, and google/codegemma-7b-it, which identify exact model versions." 133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": true, 137 "justification": "Table 3 provides the actual prompt templates for both direct inference and RARe, including the instruction text and the structure of the retrieved review placeholder." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": false, 142 "justification": "Section 4.6 reports LoRA fine-tuning hyperparameters (5 epochs, learning rate 10^-4, batch size 16), but does not report inference-time parameters such as temperature, top-p, or max tokens for the LLM generation." 
143 }, 144 "scaffolding_described": { 145 "applies": true, 146 "answer": true, 147 "justification": "Section 3 describes the full RARe pipeline: the retriever component (NDR, GDR, DPR) and the generator component, with prompt construction detailed in Section 4.5.3 and Table 3. The architecture is shown in Figure 1." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": true, 152 "justification": "Section 4.2 states 'We do not apply any preprocessing to these datasets, since they have been already cleaned to reduce duplicates. The data split ratios are consistent with those used in previous works.' This is sufficient transparency about preprocessing." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": true, 159 "justification": "Section 7 includes a dedicated 'Limitations' subsection, and Section 8 provides a full 'Threats to Validity' section covering internal, construct, and external validity." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": true, 164 "justification": "Section 8 discusses specific threats: pre-training data opacity for the LLMs used, reliance on training data as knowledge base rather than a purpose-built corpus, limitations of BLEU/ROUGE/METEOR for measuring review quality, and limited language coverage in the multi-language dataset." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 7 (Limitations) explicitly states scope boundaries: limited to three LLMs and dense retrieval only, limited case studies, and only two annotators. Section 8 (External Validity) notes limited per-language data in the multi-language dataset." 
170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": true, 176 "justification": "Section 10 (Data Availability) states 'The data and code that support the findings of this study are available in anonymous repository.' The datasets are publicly available benchmarks (CodeReviewer and Tufano datasets)." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "Section 4.2 describes the datasets: CRer. collects code from GitHub across nine languages; Tuf. consists of Java code from GitHub and Gerrit. Dataset statistics with train/test/val splits are in Table 1." 182 }, 183 "recruitment_methods_described": { 184 "applies": true, 185 "answer": false, 186 "justification": "For the human evaluation (Section 6.2), two expert annotators are mentioned (6 and 8 years of SE experience), but their recruitment method is not described — how they were selected and whether this could introduce bias is not discussed." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": true, 191 "justification": "The paper documents the pipeline: datasets are taken as-is from prior work (no preprocessing), the retriever selects top-k reviews, these are fed to the generator via a prompt. Each stage is described in Section 3 and 4." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": false, 198 "justification": "No funding sources, grants, or acknowledgments section is present in the paper." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Author affiliations are clearly stated: Leiden University and University of Groningen. No commercial product is being evaluated, so there is no product-affiliation conflict." 
204 }, 205 "funder_independent_of_outcome": { 206 "applies": false, 207 "answer": false, 208 "justification": "No funding is disclosed, and the authors are at academic institutions with no apparent financial stake in the outcome. Likely unfunded academic work." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests statement or financial interests declaration is present in the paper." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": true, 219 "answer": false, 220 "justification": "The paper uses Llama 3.1, Mistral, and CodeGemma without stating their training data cutoff dates. These models may have been pre-trained on the benchmark datasets." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": true, 224 "answer": false, 225 "justification": "No discussion of whether the CodeReviewer or Tufano datasets (both publicly available since 2022) appeared in the pre-training data of the LLMs used." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": true, 229 "answer": false, 230 "justification": "Both benchmark datasets were published in 2022 and are publicly available on GitHub. All three LLMs were trained after 2022 and could have seen this data. The paper does not address this contamination risk. Section 8 (Internal Validity) mentions pre-training data opacity but does not specifically address benchmark contamination." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": true, 236 "answer": false, 237 "justification": "The human evaluation in Section 6.2 involves two annotators rating generated reviews. No pre-registration is mentioned." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": true, 241 "answer": false, 242 "justification": "No IRB or ethics board approval is mentioned for the human evaluation study." 
243 }, 244 "demographics_reported": { 245 "applies": true, 246 "answer": true, 247 "justification": "Section 6.2 reports that the two annotators have '6 or 8 years of software engineering experience,' providing relevant professional demographics." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": true, 251 "answer": false, 252 "justification": "No inclusion or exclusion criteria for the annotators are stated. The paper does not describe how they were selected or what qualifications were required beyond SE experience." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "This is not an experimental study with treatment/control groups for human participants. The annotators are evaluating outputs, not being assigned to experimental conditions." 258 }, 259 "blinding_described": { 260 "applies": true, 261 "answer": false, 262 "justification": "It is not stated whether the annotators knew which method (DI, RADI, FT, RAFT) generated each review. If they did, this could bias their ratings." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "With only two annotators performing a rating task, attrition is not a meaningful concern. This is not a participant study with dropout risk." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": false, 274 "justification": "No inference cost, latency, or tokens consumed are reported. The paper does not mention the practical cost of running RARe despite it requiring both retrieval and LLM generation steps." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "Section 4.6 mentions '4 NVIDIA A100 GPUs' for training, but does not state total GPU hours, training time, or total computational budget." 
280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "RARe achieves state-of-the-art BLEU-4 scores of 12.32 and 12.96 on the CodeReviewer and Tufano datasets, respectively.", 286 "evidence": "Table 6 shows RARe (top1) achieving 12.32 BLEU-4 on CRer. and 12.96 on Tuf., compared to the next best baselines CommentFinder (9.47/12.71) and Tufano et al. (-/12.31).", 287 "supported": "strong" 288 }, 289 { 290 "claim": "Retrieval augmentation improves Llama 3.1 direct inference by 153% BLEU-4, 240% ROUGE-L, and 228% METEOR on the Tufano dataset.", 291 "evidence": "Table 5 shows Llama 3.1 DI scores of 5.12/2.55/2.67 increasing to 12.96/8.67/8.76 with retrieval augmentation on the Tuf. dataset.", 292 "supported": "moderate" 293 }, 294 { 295 "claim": "RARe significantly increases the proportion of valuable human-evaluated reviews, with 275% increase on CRer. and 266% on Tuf. for direct inference.", 296 "evidence": "Table 8 shows valuable reviews (PP+SE+AS) increasing from 12 to 45 on CRer. (275%) and from 15 to 55 on Tuf. (266.7%, which the claim truncates to 266%) when adding retrieval to direct inference.", 297 "supported": "moderate" 298 }, 299 { 300 "claim": "The most relevant single retrieved review (top-1) contributes most to RARe's performance; more retrieved reviews decrease performance.", 301 "evidence": "Table 6 shows RARe (top1) outperforming RARe (top3) and RARe (top5) on all metrics across both datasets.", 302 "supported": "strong" 303 }, 304 { 305 "claim": "RARe is the first to apply RAG to the code review generation task.", 306 "evidence": "Section 1 contributions and Section 2 literature review claim no prior work combines retrieval-based and generative methods for code review. 
This is a novelty claim that cannot be fully verified.", 307 "supported": "moderate" 308 } 309 ], 310 "methodology_tags": ["benchmark-eval"], 311 "key_findings": "RARe, a retrieval-augmented generation system for automated code review, outperforms existing retrieval-only and generation-only baselines on two benchmark datasets (CodeReviewer and Tufano), achieving BLEU-4 scores of 12.32 and 12.96 respectively. Retrieval augmentation yields the largest improvements when applied to direct inference (up to 153% BLEU-4 increase), while fine-tuned models see more modest gains (up to 24%). Human evaluation on 200 samples (100 per dataset) confirms that retrieval augmentation substantially reduces incorrect reviews and increases the proportion of semantically equivalent or alternative solution reviews. Notably, using only the single most relevant retrieved review (top-1) outperforms using multiple retrieved reviews (top-3, top-5).", 312 "red_flags": [ 313 { 314 "flag": "No variance reported despite multiple runs", 315 "detail": "Section 4.6 states results are averaged over three runs, but no standard deviation, confidence interval, or variance measure is reported in any table. This makes it impossible to assess result stability or whether differences between methods are meaningful." 316 }, 317 { 318 "flag": "No statistical significance testing", 319 "detail": "All comparative claims ('outperforms', 'significantly outperforms') are based on comparing point estimates without any significance tests. Given the lack of variance reporting, the claimed improvements may not be statistically significant." 320 }, 321 { 322 "flag": "Benchmark contamination risk unaddressed", 323 "detail": "Both datasets (CodeReviewer 2022, Tufano 2022) are publicly available and were published before the training of Llama 3.1, Mistral, and CodeGemma. The paper does not discuss whether these models may have seen the benchmark data during pre-training, which could inflate scores." 
324 }, 325 { 326 "flag": "Very small human evaluation sample", 327 "detail": "Only 100 samples per dataset with only 2 annotators. No inter-annotator agreement metric (e.g., Cohen's kappa) is reported. Disagreements are resolved by discussion, which may introduce bias. No power analysis justifies this sample size." 328 }, 329 { 330 "flag": "Missing inference hyperparameters", 331 "detail": "Temperature, top-p, and max token settings for LLM inference are not reported. These parameters significantly affect generation quality and reproducibility." 332 } 333 ], 334 "cited_papers": [ 335 { 336 "title": "Automating code review activities by large-scale pre-training", 337 "authors": ["Zhiyu Li", "Shuai Lu", "Daya Guo", "Nan Duan"], 338 "year": 2022, 339 "relevance": "Key baseline for automated code review using pre-trained models; provides one of the two benchmark datasets used." 340 }, 341 { 342 "title": "Using pre-trained models to boost code review automation", 343 "authors": ["Rosalia Tufano", "Simone Masiero", "Antonio Mastropaolo"], 344 "year": 2022, 345 "relevance": "Provides the Tufano benchmark dataset and T5-based code review generation baseline." 346 }, 347 { 348 "title": "LLaMA-Reviewer: Advancing Code Review Automation with Large Language Models through Parameter-Efficient Fine-Tuning", 349 "authors": ["Junyi Lu", "Lei Yu", "Xiaojia Li"], 350 "year": 2023, 351 "relevance": "Baseline using LLM fine-tuning (LoRA) for code review generation; demonstrates LLM applicability to code review." 352 }, 353 { 354 "title": "AUGER: automatically generating review comments with pre-training models", 355 "authors": ["Lingwei Li", "Li Yang", "Huaxi Jiang"], 356 "year": 2022, 357 "relevance": "Pre-training-based code review generation baseline, part of the state-of-the-art comparison." 
358 }, 359 { 360 "title": "Commentfinder: a simpler, faster, more accurate code review comments recommendation", 361 "authors": ["Yang Hong", "Chakkrit Tantithamthavorn", "Patanamon Thongtanunam", "Aldeida Aleti"], 362 "year": 2022, 363 "relevance": "Retrieval-based code review baseline; strongest competitor to RARe on the Tufano dataset." 364 }, 365 { 366 "title": "Retrieval-augmented generation for large language models: A survey", 367 "authors": ["Yunfan Gao", "Yun Xiong", "Xinyu Gao"], 368 "year": 2023, 369 "arxiv_id": "2312.10997", 370 "relevance": "Survey of RAG techniques for LLMs; provides theoretical foundation for the RAG approach used in RARe." 371 }, 372 { 373 "title": "Retrieval Augmented Code Generation and Summarization", 374 "authors": ["Md Rizwan Parvez", "Wasi Ahmad", "Saikat Chakraborty"], 375 "year": 2021, 376 "relevance": "Applies RAG to code generation and summarization tasks; provides DPR hyperparameters used in this work." 377 }, 378 { 379 "title": "Retrieval-based prompt selection for code-related few-shot learning", 380 "authors": ["Noor Nashid", "Mifta Sintaha", "Ali Mesbah"], 381 "year": 2023, 382 "relevance": "CEDAR framework using retrieval-augmented strategies for code-related tasks including program repair." 383 }, 384 { 385 "title": "No more fine-tuning? an experimental evaluation of prompt tuning in code intelligence", 386 "authors": ["Chaozheng Wang", "Yuanhang Yang", "Cuiyun Gao"], 387 "year": 2022, 388 "relevance": "Evaluates prompt tuning vs. fine-tuning for code intelligence tasks; informs the generation approach in RARe." 389 }, 390 { 391 "title": "CodeBERT: A Pre-Trained Model for Programming and Natural Languages", 392 "authors": ["Zhangyin Feng", "Daya Guo", "Duyu Tang"], 393 "year": 2020, 394 "relevance": "Pre-trained code model used as the encoder in the NDR and DPR retrievers in this work." 395 } 396 ] 397 }