scan.json (25104B)
1 { 2 "paper": { 3 "title": "Combining Large Language Models with Static Analyzers for Code Review Generation", 4 "authors": [ 5 "Imen Jaoua", 6 "Oussama Ben Sghaier", 7 "Houari Sahraoui" 8 ], 9 "year": 2025, 10 "venue": "arXiv preprint", 11 "arxiv_id": "2502.06633" 12 }, 13 "checklist": { 14 "artifacts": { 15 "code_released": { 16 "applies": true, 17 "answer": true, 18 "justification": "A replication package is provided at https://github.com/ImenJaoua/Hybrid-Code-Review, mentioned in the footnote on page 1 and in references [48]." 19 }, 20 "data_released": { 21 "applies": true, 22 "answer": true, 23 "justification": "The dataset is available at https://zenodo.org/records/14061110, mentioned in the footnote on page 1 and in references [49]." 24 }, 25 "environment_specified": { 26 "applies": true, 27 "answer": false, 28 "justification": "The paper mentions four NVIDIA RTX 3090 GPUs and 4-bit quantization with QLoRA, but does not provide a requirements.txt, Dockerfile, or detailed library versions sufficient to recreate the environment." 29 }, 30 "reproduction_instructions": { 31 "applies": true, 32 "answer": false, 33 "justification": "While the replication package is referenced, the paper itself does not contain step-by-step reproduction instructions. The reader must rely on the external repository without guidance in the paper." 34 } 35 }, 36 "statistical_methodology": { 37 "confidence_intervals_or_error_bars": { 38 "applies": true, 39 "answer": false, 40 "justification": "Results are reported as percentages in bar charts (Figures 8, 9, 10) without confidence intervals or error bars." 41 }, 42 "significance_tests": { 43 "applies": true, 44 "answer": false, 45 "justification": "The paper makes comparative claims (e.g., 'RAG significantly outperforms Mi') but does not use any statistical significance tests. Cohen's kappa is used to measure human-LLM agreement (RQ2), but no significance tests compare the approaches." 
46 }, 47 "effect_sizes_reported": { 48 "applies": true, 49 "answer": false, 50 "justification": "Results are presented as bar charts showing percentage breakdowns of accuracy categories, but no formal effect sizes (Cohen's d, odds ratios, etc.) are reported. The differences are shown visually but not quantified with magnitude measures." 51 }, 52 "sample_size_justified": { 53 "applies": true, 54 "answer": false, 55 "justification": "The test set of 1,245 samples and the 10% manual evaluation subset (~125 samples) are used without justification of why these sizes are sufficient. No power analysis is discussed." 56 }, 57 "variance_reported": { 58 "applies": true, 59 "answer": false, 60 "justification": "No variance, standard deviation, or spread measures are reported across any experiments. The LLM-based evaluation results are single-run numbers with no indication of variability." 61 } 62 }, 63 "evaluation_design": { 64 "baselines_included": { 65 "applies": true, 66 "answer": true, 67 "justification": "Two baselines are included: the standalone KBS (static analyzers PMD and Checkstyle) and the standalone LBS (fine-tuned CodeLlama-7b model Mi). All three hybrid approaches are compared against both baselines." 68 }, 69 "baselines_contemporary": { 70 "applies": true, 71 "answer": false, 72 "justification": "The LBS baseline uses CodeLlama-7b fine-tuned on the dataset from Li et al. [25] (2022). No comparison is made against more recent code review approaches or larger LLMs. PMD and Checkstyle are well-established but not the most recent static analysis tools." 73 }, 74 "ablation_study": { 75 "applies": true, 76 "answer": true, 77 "justification": "The three hybrid strategies (DAT, RAG, NCO) effectively serve as ablations showing different integration points for combining KBS and LBS. Each strategy modifies a different component of the pipeline (training data, inference prompt, post-processing)." 
78 }, 79 "multiple_metrics": { 80 "applies": true, 81 "answer": true, 82 "justification": "Two distinct evaluation dimensions are used: accuracy (RQ1, RQ3) classifying reviews as accurate/partially accurate/not accurate, and coverage (RQ4) ranking approaches by issue detection breadth." 83 }, 84 "human_evaluation": { 85 "applies": true, 86 "answer": true, 87 "justification": "RQ1 involves manual evaluation by two human evaluators on a 10% sample of the test set, classifying reviews as accurate, partially accurate, or not accurate. Reviews were anonymized and randomized." 88 }, 89 "held_out_test_set": { 90 "applies": true, 91 "answer": true, 92 "justification": "The augmented dataset Da was split into 80% training, 10% validation, and 10% test. The evaluation was conducted on a test set of 1,245 samples unseen by the fine-tuned model MFT (Section IV.B)." 93 }, 94 "per_category_breakdown": { 95 "applies": true, 96 "answer": false, 97 "justification": "Results are presented as aggregate distributions across all samples. There is no breakdown by issue type, rule category, project, or code complexity." 98 }, 99 "failure_cases_discussed": { 100 "applies": true, 101 "answer": true, 102 "justification": "Section IV.D discusses cases where NCO's coverage suffers due to conflicting LBS and KBS feedback, with a concrete example of an unused import case. The bimodal distribution of DAT is also discussed as a failure pattern." 103 }, 104 "negative_results_reported": { 105 "applies": true, 106 "answer": true, 107 "justification": "The paper reports that DAT and NCO did not improve accuracy over Mi (RQ1, RQ3), and that DAT shows a polarized bimodal performance pattern with many Rank 5 results (Section IV.C.4). NCO showed only marginal coverage improvement." 
108 } 109 }, 110 "claims_and_evidence": { 111 "abstract_claims_supported": { 112 "applies": true, 113 "answer": true, 114 "justification": "The abstract claims that 'hybrid strategies enhance the relevance, completeness, and overall quality of review comments.' The results support this for RAG (accuracy improvement) and DAT/RAG (coverage improvement), though NCO shows only modest gains. The abstract is somewhat generous but not contradicted." 115 }, 116 "causal_claims_justified": { 117 "applies": true, 118 "answer": true, 119 "justification": "The paper makes causal claims through ablation-style comparisons (DAT, RAG, NCO each modify one aspect of the pipeline). The controlled experimental design — same test set, same base model, varying only the integration strategy — supports these causal claims adequately." 120 }, 121 "generalization_bounded": { 122 "applies": true, 123 "answer": false, 124 "justification": "The title says 'Code Review Generation' broadly, but the study tests only on Java code with PMD and Checkstyle using CodeLlama-7b. The threats to validity mention Java specificity, but the title and abstract do not bound the claims to this setting." 125 }, 126 "alternative_explanations_discussed": { 127 "applies": true, 128 "answer": false, 129 "justification": "The paper does not discuss alternative explanations for the results. For example, the accuracy improvement from RAG could be partly due to the static analyzer output providing correct answers that the model parrots without understanding. The threats section discusses evaluator bias and tool choice but not alternative explanations for the observed performance differences." 130 } 131 }, 132 "setup_transparency": { 133 "model_versions_specified": { 134 "applies": true, 135 "answer": false, 136 "justification": "The paper specifies 'CodeLlama-7b' and 'Llama3-70B' but does not provide exact model versions, snapshot dates, or API versions. These are marketing-level names without version specificity." 
137 }, 138 "prompts_provided": { 139 "applies": true, 140 "answer": true, 141 "justification": "Figure 6 shows the prompt used for RAG-based review generation. Figure 4 shows the LLM-as-judge prompt structure. The prompts include the actual text with placeholders clearly indicated and filled from the dataset." 142 }, 143 "hyperparameters_reported": { 144 "applies": true, 145 "answer": true, 146 "justification": "Section III.B reports detailed hyperparameters: batch size 4 per device, gradient accumulation step size 4, 4-bit quantization, QLoRA with r=16, alpha=32, dropout=0.05. The LLM-as-judge uses a 10-point scale with threshold 8." 147 }, 148 "scaffolding_described": { 149 "applies": false, 150 "answer": false, 151 "justification": "No agentic scaffolding is used. The approach uses standard fine-tuning and inference pipelines without agent loops, tool use, or retry logic." 152 }, 153 "data_preprocessing_documented": { 154 "applies": true, 155 "answer": true, 156 "justification": "Section III.C describes the data augmentation pipeline in detail: generation of 4 reviews per code change, static analysis application, merging, filtering using Llama3-70B with a threshold of 8/10, balancing KBS/LBS reviews, and discarding overrepresented rules. The final dataset Da has 78,776 samples." 157 } 158 }, 159 "limitations_and_scope": { 160 "limitations_section_present": { 161 "applies": true, 162 "answer": true, 163 "justification": "Section IV.E 'Threats to Validity' is a dedicated subsection discussing evaluator bias, generalizability, dependency on LLM judgment, choice of static analyzers, and evaluation metrics limitations." 
164 }, 165 "threats_to_validity_specific": { 166 "applies": true, 167 "answer": true, 168 "justification": "The threats are specific to this study: Java-only evaluation limiting generalizability, reliance on Llama3-70B as evaluator with potential misalignment, the specific choice of PMD and Checkstyle, and the win-tie-loss ranking system requiring a two-level difference threshold." 169 }, 170 "scope_boundaries_stated": { 171 "applies": true, 172 "answer": false, 173 "justification": "The threats section mentions Java specificity as a limitation, but does not explicitly state what the results do NOT show. There is no explicit bounding of claims to the tested models, dataset, or languages — the conclusion speaks broadly about 'combining static analysis tools with LLMs' without qualifying." 174 } 175 }, 176 "data_integrity": { 177 "raw_data_available": { 178 "applies": true, 179 "answer": true, 180 "justification": "The dataset is available at https://zenodo.org/records/14061110, and the replication package at https://github.com/ImenJaoua/Hybrid-Code-Review, enabling independent verification." 181 }, 182 "data_collection_described": { 183 "applies": true, 184 "answer": true, 185 "justification": "The data collection is described: the original dataset DMi from Li et al. [25] is a real-world code review dataset. The augmented dataset Da is generated through the pipeline in Section III.C with explicit filtering criteria." 186 }, 187 "recruitment_methods_described": { 188 "applies": true, 189 "answer": false, 190 "justification": "Two human evaluators assessed accuracy in RQ1, described only as having 'good expertise in code review.' No details on how they were recruited, their backgrounds, or potential selection bias." 
191 }, 192 "data_pipeline_documented": { 193 "applies": true, 194 "answer": true, 195 "justification": "The data pipeline is well-documented: from the original 27,267 Java entries in Do, to augmented data generation, Llama3-70B filtering with threshold 8, rule balancing, and final 78,776 samples in Da (Section III.C, Figures 3, 5). The test set filtering to 1,245 common samples is also explained." 196 } 197 }, 198 "conflicts_of_interest": { 199 "funding_disclosed": { 200 "applies": true, 201 "answer": false, 202 "justification": "No funding sources are mentioned in the paper. There is no acknowledgments section disclosing grants or funding." 203 }, 204 "affiliations_disclosed": { 205 "applies": true, 206 "answer": true, 207 "justification": "All three authors are affiliated with Université de Montréal, clearly disclosed on the first page. They are not evaluating a commercial product they are affiliated with." 208 }, 209 "funder_independent_of_outcome": { 210 "applies": false, 211 "answer": false, 212 "justification": "No funding is disclosed, and the authors are at a university with no apparent commercial stake in the outcome. This appears to be unfunded academic work." 213 }, 214 "financial_interests_declared": { 215 "applies": true, 216 "answer": false, 217 "justification": "No competing interests or financial interests statement is present in the paper." 218 } 219 }, 220 "contamination": { 221 "training_cutoff_stated": { 222 "applies": true, 223 "answer": false, 224 "justification": "The paper fine-tunes CodeLlama-7b and uses Llama3-70B as an evaluator, but does not state the training data cutoff dates for either model. This is relevant since the code review dataset could overlap with training data." 225 }, 226 "train_test_overlap_discussed": { 227 "applies": true, 228 "answer": false, 229 "justification": "No discussion of whether CodeLlama-7b or Llama3-70B may have seen the code review dataset during pre-training. The test data from Li et al. 
[25] was published in 2022, likely before these models' training cutoffs." 230 }, 231 "benchmark_contamination_addressed": { 232 "applies": true, 233 "answer": false, 234 "justification": "The code review dataset from Li et al. [25] was published in 2022, likely before the training cutoffs of both CodeLlama and Llama3 (which the paper does not state). This contamination risk is not addressed." 235 } 236 }, 237 "human_studies": { 238 "pre_registered": { 239 "applies": true, 240 "answer": false, 241 "justification": "The study involves two human evaluators rating code reviews (RQ1). No pre-registration is mentioned." 242 }, 243 "irb_or_ethics_approval": { 244 "applies": true, 245 "answer": false, 246 "justification": "Human evaluators participate in the study but no IRB or ethics board approval is mentioned." 247 }, 248 "demographics_reported": { 249 "applies": true, 250 "answer": false, 251 "justification": "The two evaluators are described only as having 'good expertise in code review.' No demographics, years of experience, or other characterization is provided." 252 }, 253 "inclusion_exclusion_criteria": { 254 "applies": true, 255 "answer": false, 256 "justification": "No inclusion or exclusion criteria for the evaluators are stated. They were not recruited from a broader pool with stated criteria." 257 }, 258 "randomization_described": { 259 "applies": false, 260 "answer": false, 261 "justification": "This is not an experimental study with treatment/control assignment for participants. The evaluators both assessed the same reviews." 262 }, 263 "blinding_described": { 264 "applies": true, 265 "answer": true, 266 "justification": "Section IV.B states: 'All reviews were anonymized and presented to the evaluators in a randomized order. 
This approach minimizes confirmation bias, preventing evaluators from unconsciously favoring certain methods by being unaware of the source of each review.'" 267 }, 268 "attrition_reported": { 269 "applies": false, 270 "answer": false, 271 "justification": "Only two evaluators participated in a single assessment task. Attrition is not applicable in this context." 272 } 273 }, 274 "cost_and_practicality": { 275 "inference_cost_reported": { 276 "applies": true, 277 "answer": false, 278 "justification": "No inference cost, API cost, tokens consumed, or wall-clock time for generating reviews is reported, despite the method involving multiple LLM inference calls per code change." 279 }, 280 "compute_budget_stated": { 281 "applies": true, 282 "answer": false, 283 "justification": "The paper mentions four NVIDIA RTX 3090 GPUs were used for training, but does not report total training time, GPU hours, or the computational cost of the Llama3-70B evaluation runs." 284 } 285 } 286 }, 287 "claims": [ 288 { 289 "claim": "The RAG approach significantly outperforms the standalone LBS (Mi) in terms of accuracy for code review generation.", 290 "evidence": "Figure 8 (manual evaluation, RQ1) and Figure 9 (LLM evaluation, RQ3) show RAG has a higher proportion of accurate reviews compared to Mi. Cohen's kappa of 0.72 validates LLM-human agreement.", 291 "supported": "moderate" 292 }, 293 { 294 "claim": "DAT and RAG achieve the best coverage, together accounting for nearly 80% of Rank 1 reviews.", 295 "evidence": "Figure 10 shows DAT achieving 49% Rank 1 and RAG achieving approximately 30% Rank 1 (70% in top 2 ranks). Win-tie-loss analysis in Figure 11 confirms DAT and RAG outperform baselines in coverage.", 296 "supported": "moderate" 297 }, 298 { 299 "claim": "NCO does not notably improve accuracy and offers only marginal increase in coverage compared to the standalone LBS.", 300 "evidence": "Figures 8 and 9 show NCO accuracy comparable to Mi. 
Figure 10 shows NCO primarily in middle ranks (Rank 3). Section IV.D discusses how NCO's LBS component can contradict KBS findings.", 301 "supported": "moderate" 302 }, 303 { 304 "claim": "Llama3-70B closely mirrors human evaluations with Cohen's kappa of 0.72 (substantial agreement).", 305 "evidence": "Section IV.C.2 reports Cohen's kappa of 0.72 on the 10% manual evaluation subset, comparing human and LLM ratings on the same accuracy scale.", 306 "supported": "moderate" 307 }, 308 { 309 "claim": "Hybrid approaches bridge the gap between rule-based tools and deep learning models for code review.", 310 "evidence": "The combined results across RQ1-RQ4 show RAG improves accuracy over LBS while maintaining broader coverage than KBS. However, the claim is broad and the evidence is limited to one dataset and one programming language.", 311 "supported": "weak" 312 } 313 ], 314 "methodology_tags": [ 315 "benchmark-eval" 316 ], 317 "key_findings": "The paper proposes three strategies for combining static analyzers (KBS) with fine-tuned LLMs (LBS) for automated code review: Data-Augmented Training (DAT), Retrieval-Augmented Generation (RAG), and Naive Concatenation of Outputs (NCO). RAG emerged as the most effective strategy, improving both accuracy and coverage over standalone LBS by incorporating static analysis results into inference prompts. DAT achieved broad coverage (49% Rank 1) but showed a polarized bimodal performance pattern, while NCO provided only marginal improvements. The evaluation used both human assessment and LLM-as-a-judge (Llama3-70B) with Cohen's kappa of 0.72 indicating substantial agreement between the two evaluation methods.", 318 "red_flags": [ 319 { 320 "flag": "No statistical significance tests", 321 "detail": "The paper claims RAG 'significantly outperforms' Mi but provides no statistical tests to support this. All comparisons are based on visual inspection of bar charts and percentage distributions without any hypothesis testing." 
322 }, 323 { 324 "flag": "Small evaluator pool", 325 "detail": "Only two human evaluators assessed accuracy, with no characterization beyond 'good expertise in code review.' This is insufficient for establishing the reliability of the manual evaluation, and inter-rater agreement between the two humans is not reported (only human-LLM agreement via Cohen's kappa)." 326 }, 327 { 328 "flag": "Contamination risk unaddressed", 329 "detail": "The code review dataset from Li et al. (2022) was published before CodeLlama and Llama3 training cutoffs. Neither model's training cutoff is stated, and no contamination analysis is performed. The evaluation results could be inflated if the models saw this data during pre-training." 330 }, 331 { 332 "flag": "LLM evaluating LLM outputs", 333 "detail": "Llama3-70B is used both as the data filtering judge (threshold 8/10 for DAT data quality) and as the evaluation judge (RQ3, RQ4). This creates a circular dependency: the training data was curated by a model from the same family as the evaluator, potentially inflating scores for approaches trained on LLM-curated data." 334 }, 335 { 336 "flag": "No variance or reproducibility measures", 337 "detail": "All results appear to be single-run. No standard deviations, confidence intervals, or multi-seed experiments are reported, making it impossible to assess result stability." 338 } 339 ], 340 "cited_papers": [ 341 { 342 "title": "Code review automation: strengths and weaknesses of the state of the art", 343 "authors": ["R. Tufano", "O. Dabic", "A. Mastropaolo", "M. Ciniselli", "G. Bavota"], 344 "year": 2024, 345 "relevance": "Evaluates state-of-the-art code review automation approaches, directly relevant to assessing methodology in LLM-based code review research." 346 }, 347 { 348 "title": "Core: Resolving code quality issues using llms", 349 "authors": ["N. Wadhwa", "J. Pradhan", "A. Sonwane", "S. P. Sahu", "N. Natarajan", "A. Kanade", "S. Parthasarathy", "S. 
Rajamani"], 350 "year": 2024, 351 "relevance": "Uses LLMs for code quality issue resolution, relevant to understanding LLM capabilities in code review tasks." 352 }, 353 { 354 "title": "Automating code review activities by large-scale pre-training", 355 "authors": ["Z. Li", "S. Lu", "D. Guo", "N. Duan", "S. Jannu", "G. Jenks", "D. Majumder", "J. Green", "A. Svyatkovskiy", "S. Fu"], 356 "year": 2022, 357 "relevance": "Pre-trains CodeT5 for code review tasks; the dataset used in this paper originates from this work." 358 }, 359 { 360 "title": "Llama-reviewer: Advancing code review automation with large language models through parameter-efficient fine-tuning", 361 "authors": ["J. Lu", "L. Yu", "X. Li", "L. Yang", "C. Zuo"], 362 "year": 2023, 363 "relevance": "Applies parameter-efficient fine-tuning of LLMs specifically for code review, directly comparable methodology." 364 }, 365 { 366 "title": "RepairAgent: An autonomous, llm-based agent for program repair", 367 "authors": ["I. Bouzenia", "P. Devanbu", "M. Pradel"], 368 "year": 2024, 369 "arxiv_id": "2403.17134", 370 "relevance": "Combines LLMs with static analysis for autonomous program repair, demonstrating agentic approaches in software engineering." 371 }, 372 { 373 "title": "STALL+: Boosting llm-based repository-level code completion with static analysis", 374 "authors": ["J. Liu", "Y. Chen", "M. Liu", "X. Peng", "Y. Lou"], 375 "year": 2024, 376 "arxiv_id": "2406.10018", 377 "relevance": "Integrates static analyzers with LLMs for code completion through multi-phase approach, closely related hybrid methodology." 378 }, 379 { 380 "title": "Judging llm-as-a-judge with mt-bench and chatbot arena", 381 "authors": ["L. Zheng", "W.-L. Chiang", "Y. Sheng", "S. Zhuang", "Z. Wu", "Y. Zhuang", "Z. Lin", "Z. Li", "D. Li", "E. Xing"], 382 "year": 2023, 383 "relevance": "Foundational work on LLM-as-a-judge methodology used in this paper's evaluation design." 
384 }, 385 { 386 "title": "Improving the learning of code review successive tasks with cross-task knowledge distillation", 387 "authors": ["O. Ben Sghaier", "H. Sahraoui"], 388 "year": 2024, 389 "relevance": "Applies knowledge distillation to code review tasks, from the same research group, relevant to code review automation methodology." 390 }, 391 { 392 "title": "SkipAnalyzer: A tool for static code analysis with large language models", 393 "authors": ["M. M. Mohajer", "R. Aleithan", "N. S. Harzevili", "M. Wei", "A. B. Belle", "H. V. Pham", "S. Wang"], 394 "year": 2023, 395 "arxiv_id": "2310.18532", 396 "relevance": "Combines static analysis with LLMs to reduce false positives in bug detection, relevant hybrid approach for code quality." 397 }, 398 { 399 "title": "No more manual tests? Evaluating and improving ChatGPT for unit test generation", 400 "authors": ["Z. Yuan", "Y. Lou", "M. Liu", "S. Ding", "K. Wang", "Y. Chen", "X. Peng"], 401 "year": 2023, 402 "arxiv_id": "2305.04207", 403 "relevance": "Evaluates LLM capabilities in test generation with iterative repair, relevant to understanding LLM limitations in SE tasks." 404 }, 405 { 406 "title": "Adaptive test generation using a large language model", 407 "authors": ["M. Schäfer", "S. Nadi", "A. Eghbali", "F. Tip"], 408 "year": 2023, 409 "arxiv_id": "2302.06527", 410 "relevance": "Uses LLMs for adaptive test generation combining static analysis context, relevant hybrid approach for software testing." 411 } 412 ] 413 }