scan.json (25563B)
1 { 2 "paper": { 3 "title": "Improving Automated Secure Code Reviews: A Synthetic Dataset for Code Vulnerability Flaws", 4 "authors": [ 5 "Leonardo Centellas-Claros", 6 "Juan J. Alonso-Lecaros", 7 "Juan Pablo Sandoval Alcocer", 8 "Andres Neyem" 9 ], 10 "year": 2025, 11 "venue": "arXiv.org", 12 "arxiv_id": "2504.16310", 13 "doi": "10.48550/arXiv.2504.16310" 14 }, 15 "scan_version": 3, 16 "active_modules": [], 17 "methodology_tags": ["theoretical"], 18 "key_findings": "This paper proposes a methodology for generating synthetic vulnerability-focused code review datasets using LLMs, but presents no completed experiments. Early data collection identified 5,973 Java repositories from GitHub and mined 3,827,517 commits, filtering to 35,950 potentially security-related commits via keyword matching. The authors plan to evaluate GPT-4o, Claude 3.5 Sonnet, Flan-T5, and Qwen 2.5 with three prompting strategies (zero-shot, CoT, self-reflection) and fine-tune CodeReviewer models, but none of this work has been executed.", 19 "checklist": { 20 "artifacts": { 21 "code_released": { 22 "applies": true, 23 "answer": false, 24 "justification": "No source code, scripts, or repository URLs are provided in the paper. The data collection pipeline and planned experiments have no associated code release." 25 }, 26 "data_released": { 27 "applies": true, 28 "answer": false, 29 "justification": "The synthetic dataset is the proposed contribution but has not been created or released. The filtered commits (35,950) are not made available. No dataset download links are provided." 30 }, 31 "environment_specified": { 32 "applies": true, 33 "answer": false, 34 "justification": "No environment specifications, dependency files, or setup instructions are provided. The paper does not describe any technical environment details." 35 }, 36 "reproduction_instructions": { 37 "applies": true, 38 "answer": false, 39 "justification": "No reproduction instructions are included. The methodology section describes planned steps but provides no scripts, commands, or concrete instructions for reproducing the work." 40 } 41 }, 42 "statistical_methodology": { 43 "confidence_intervals_or_error_bars": { 44 "applies": false, 45 "answer": false, 46 "justification": "The paper presents no experimental results — only descriptive counts from data collection (5,973 repos, 3.8M commits, 35,950 filtered). No quantitative results require uncertainty measures." 47 }, 48 "significance_tests": { 49 "applies": false, 50 "answer": false, 51 "justification": "No comparative claims are made based on data. The paper is a proposal with no experimental results to test for significance." 52 }, 53 "effect_sizes_reported": { 54 "applies": false, 55 "answer": false, 56 "justification": "No effects have been measured. The paper is a research proposal with no completed experiments." 57 }, 58 "sample_size_justified": { 59 "applies": false, 60 "answer": false, 61 "justification": "This is a proposal/theoretical paper with no completed experiments. The planned sample of 100 commits for evaluation is justified contextually ('to make manual review feasible') but no power analysis or formal justification is provided." 62 }, 63 "variance_reported": { 64 "applies": false, 65 "answer": false, 66 "justification": "No experimental runs have been conducted. The paper is a research proposal with no results to report variance for." 67 } 68 }, 69 "evaluation_design": { 70 "baselines_included": { 71 "applies": true, 72 "answer": false, 73 "justification": "The paper describes planned baselines (CodeReviewer, Lin et al.'s version, GPT-4o) in Section III.C.1, but no baseline comparisons have been executed. Only plans are presented." 74 }, 75 "baselines_contemporary": { 76 "applies": true, 77 "answer": false, 78 "justification": "Planned baselines include CodeReviewer (2022) and Lin et al. (2024), which are reasonably contemporary. However, since no comparisons have been run, this criterion is not satisfied." 79 }, 80 "ablation_study": { 81 "applies": true, 82 "answer": false, 83 "justification": "The system has multiple components (4 LLMs × 3 prompting strategies). The planned comparison across these combinations could serve as an ablation, but no experiments have been conducted." 84 }, 85 "multiple_metrics": { 86 "applies": true, 87 "answer": false, 88 "justification": "The paper plans to use BLEU-4 and manual evaluation (semantic equivalence, applicability) per Section III.C.4, but no metrics have been computed since no experiments were run." 89 }, 90 "human_evaluation": { 91 "applies": true, 92 "answer": false, 93 "justification": "Human evaluation by two authors is planned for both RQ1 (1,200 reviews) and RQ2 (manual assessment), with Cohen's Kappa for inter-rater agreement. However, none of this evaluation has been conducted." 94 }, 95 "held_out_test_set": { 96 "applies": true, 97 "answer": false, 98 "justification": "The paper plans to use a filtered subset of Li et al.'s test partition (Section III.C.3), identifying 43 and 63 potentially security-related samples. This test set has been preliminarily identified but not used in any evaluation." 99 }, 100 "per_category_breakdown": { 101 "applies": true, 102 "answer": false, 103 "justification": "No results are presented, so no per-category breakdowns exist. The planned evaluation across 4 LLMs and 3 prompt strategies would provide breakdowns, but this work has not been done." 104 }, 105 "failure_cases_discussed": { 106 "applies": true, 107 "answer": false, 108 "justification": "No failure cases are discussed since no experiments have been conducted. The paper only presents data collection counts." 109 }, 110 "negative_results_reported": { 111 "applies": true, 112 "answer": false, 113 "justification": "No results of any kind (positive or negative) are reported. The paper is entirely a proposal." 114 } 115 }, 116 "claims_and_evidence": { 117 "abstract_claims_supported": { 118 "applies": true, 119 "answer": false, 120 "justification": "The abstract states 'We anticipate that the synthetic dataset will improve the performance of the original code review models.' This anticipatory claim has no supporting evidence — no experiments have been run to test this hypothesis." 121 }, 122 "causal_claims_justified": { 123 "applies": true, 124 "answer": false, 125 "justification": "The abstract claims the synthetic dataset will 'improve the performance' of code review models, which is a causal claim. No experimental evidence supports this, and no causal identification strategy is described beyond a planned before/after comparison." 126 }, 127 "generalization_bounded": { 128 "applies": true, 129 "answer": false, 130 "justification": "The title 'Improving Automated Secure Code Reviews: A Synthetic Dataset for Code Vulnerability Flaws' and the abstract frame the approach as general, mentioning no language restriction. The Java-only scope is revealed only in Section III.B.1, and the title does not reflect this substantial limitation." 131 }, 132 "alternative_explanations_discussed": { 133 "applies": true, 134 "answer": false, 135 "justification": "No alternative explanations are discussed since no results exist. The threats to validity section (Section V) discusses methodological limitations but does not consider alternative explanations for any findings." 136 }, 137 "proxy_outcome_distinction": { 138 "applies": true, 139 "answer": false, 140 "justification": "The paper plans to use BLEU-4 as a primary metric for code review quality. BLEU is a well-known imperfect proxy for text generation quality, and no discussion is provided of the gap between BLEU scores and actual usefulness of security code reviews." 141 } 142 }, 143 "setup_transparency": { 144 "model_versions_specified": { 145 "applies": true, 146 "answer": false, 147 "justification": "The paper lists 'GPT-4o from OpenAI', 'Claude 3.5 Sonnet from Anthropic', 'Flan-T5 from Google' and 'Qwen 2.5' (Section III.B.4). These are marketing names without specific version identifiers, snapshot dates, or model sizes (especially for Flan-T5 and Qwen 2.5)." 148 }, 149 "prompts_provided": { 150 "applies": true, 151 "answer": false, 152 "justification": "One zero-shot prompt template is provided in Section III.B.5 with {{Diff}} and {{Message}} placeholders. However, the CoT and Self-Reflection prompts are only described in natural language ('a technique designed to enhance the model's performance'), and the actual prompt texts are not provided." 153 }, 154 "hyperparameters_reported": { 155 "applies": true, 156 "answer": false, 157 "justification": "No hyperparameters are reported for any of the planned LLM experiments (temperature, top-p, max tokens). For the planned fine-tuning, the paper states they will 'use the same hyperparameters as those employed in the original CodeReviewer fine-tuning tasks' without listing them." 158 }, 159 "scaffolding_described": { 160 "applies": false, 161 "answer": false, 162 "justification": "No agentic scaffolding is used. The proposed approach involves direct prompting of LLMs without tool use, memory, or multi-step agent workflows." 163 }, 164 "data_preprocessing_documented": { 165 "applies": true, 166 "answer": true, 167 "justification": "The data preprocessing pipeline is well-documented across Sections III.B.1-3 and IV: project selection (Java, ≥50 PRs), commit filtering (single .java file, no merge commits, no non-source changes), keyword-based vulnerability filtering from Alfadel et al. [23], and test file exclusion. Counts at each stage: 5,973 repos → 3,827,517 commits → 43,131 keyword-filtered → 35,950 after test exclusion." 168 } 169 }, 170 "limitations_and_scope": { 171 "limitations_section_present": { 172 "applies": true, 173 "answer": true, 174 "justification": "Section V 'Threats to Validity' is a dedicated section discussing both internal and external validity threats." 175 }, 176 "threats_to_validity_specific": { 177 "applies": true, 178 "answer": true, 179 "justification": "Specific threats are discussed: subjective evaluation bias from author-based review (mitigated by two independent reviewers), keyword-based filtering potentially missing security commits, prompt refinement sample of 50 commits potentially unrepresentative, and Java-only limitation. These are specific to this study." 180 }, 181 "scope_boundaries_stated": { 182 "applies": true, 183 "answer": true, 184 "justification": "Section V explicitly states: 'The study's focus on Java projects inherently limits the generalizability of its findings to other programming languages' and 'the study targets vulnerability-fixing commits—a specific subset of code review scenarios.' These are specific scope boundaries." 185 } 186 }, 187 "data_integrity": { 188 "raw_data_available": { 189 "applies": true, 190 "answer": false, 191 "justification": "No raw data is released. The 35,950 filtered commits, keyword lists, or repository identifiers are not made available for independent verification." 192 }, 193 "data_collection_described": { 194 "applies": true, 195 "answer": true, 196 "justification": "Data collection is described in detail in Sections III.B.1-3: Java projects from GitHub with ≥50 PRs (Tufano et al.'s criteria), single-file commits excluding merge commits and non-source files, keyword-based vulnerability filtering with iterative refinement by two authors." 197 }, 198 "recruitment_methods_described": { 199 "applies": true, 200 "answer": true, 201 "justification": "Repository selection criteria are described: GitHub Java projects with at least 50 pull requests, following criteria from Tufano et al. [13]. The selection method and its rationale ('targeting repositories with significant development activity') are documented." 202 }, 203 "data_pipeline_documented": { 204 "applies": true, 205 "answer": true, 206 "justification": "The pipeline is documented with counts at each stage in Section IV: 5,973 repos → 3,827,517 commits → 43,131 keyword-filtered → 35,950 after test file exclusion. Filtering criteria are described for each step." 207 } 208 }, 209 "conflicts_of_interest": { 210 "funding_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "The Acknowledgments section discloses funding: ANID Doctoral Scholarship (2024-21240734), CENIA (BASAL FB210017), and Programa de Inserción Académica 2022 at PUC Chile." 214 }, 215 "affiliations_disclosed": { 216 "applies": true, 217 "answer": true, 218 "justification": "All four authors are listed as affiliated with the Department of Computer Science, Pontificia Universidad Católica de Chile. No product from the authors' institution is being evaluated." 219 }, 220 "funder_independent_of_outcome": { 221 "applies": true, 222 "answer": true, 223 "justification": "Funding comes from Chilean national research agencies (ANID) and an academic AI center (CENIA). These funders have no commercial interest in the outcome of this code review research." 224 }, 225 "financial_interests_declared": { 226 "applies": true, 227 "answer": false, 228 "justification": "No competing interests or financial interests statement is included in the paper. Absence of disclosure is not absence of conflict." 229 } 230 }, 231 "contamination": { 232 "training_cutoff_stated": { 233 "applies": false, 234 "answer": false, 235 "justification": "No model evaluation on any benchmark has been conducted. The paper is a proposal; the LLMs are planned for use as synthetic data generators, and no evaluation results are presented." 236 }, 237 "train_test_overlap_discussed": { 238 "applies": false, 239 "answer": false, 240 "justification": "No model evaluation has been conducted. The paper proposes future experiments but has not executed them, so train/test overlap is not yet a concern to address." 241 }, 242 "benchmark_contamination_addressed": { 243 "applies": false, 244 "answer": false, 245 "justification": "No benchmark evaluation has been performed. The paper is a research proposal with only data collection counts reported." 246 } 247 }, 248 "human_studies": { 249 "pre_registered": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study. The planned evaluations involve the paper's authors reviewing generated outputs, not external human subjects." 253 }, 254 "irb_or_ethics_approval": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants. The study mines public GitHub repositories and plans author-based evaluation of generated text." 258 }, 259 "demographics_reported": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in the study." 263 }, 264 "inclusion_exclusion_criteria": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in the study." 268 }, 269 "randomization_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants and not an experimental study with conditions." 273 }, 274 "blinding_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants and not an experimental study." 278 }, 279 "attrition_reported": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants in the study." 283 } 284 }, 285 "cost_and_practicality": { 286 "inference_cost_reported": { 287 "applies": false, 288 "answer": false, 289 "justification": "This is a proposal/theoretical paper with no completed experiments. No inference or API costs are applicable since no models have been run." 290 }, 291 "compute_budget_stated": { 292 "applies": false, 293 "answer": false, 294 "justification": "This is a proposal paper with no completed experiments requiring compute budget reporting." 295 } 296 } 297 }, 298 "claims": [ 299 { 300 "claim": "LLMs can produce human-like code reviews from vulnerability-related commits with high accuracy, creating a valuable resource for enhancing automated code-to-comment review models.", 301 "evidence": "This is the paper's overarching hypothesis stated in Section III. No experimental evidence is provided — the claim is entirely aspirational.", 302 "supported": "unsupported" 303 }, 304 { 305 "claim": "The synthetic dataset will improve the performance of existing code review models when used for fine-tuning.", 306 "evidence": "Abstract states 'We anticipate that the synthetic dataset will improve the performance of the original code review models.' No experiments have been conducted to test this.", 307 "supported": "unsupported" 308 }, 309 { 310 "claim": "Security-focused reviews comprise less than 4% of existing code review datasets.", 311 "evidence": "Section I cites Yu et al. [8] who found only 614 security-related comments out of 20,000 code review comments (3.07%). This is based on cited literature, not the authors' own analysis.", 312 "supported": "moderate" 313 }, 314 { 315 "claim": "35,950 potentially security-related commits were identified from 5,973 Java repositories on GitHub.", 316 "evidence": "Section IV reports: 5,973 repositories → 3,827,517 commits → 43,131 keyword-filtered → 35,950 after test file exclusion. These are descriptive counts from the data collection pipeline.", 317 "supported": "moderate" 318 } 319 ], 320 "red_flags": [ 321 { 322 "flag": "Proposal paper with no completed experiments", 323 "detail": "The paper presents no experimental results — only a planned methodology and early data collection counts. All research questions (RQ1, RQ2) remain unanswered. This is a research proposal, not a completed study." 324 }, 325 { 326 "flag": "Claims significantly outrun evidence", 327 "detail": "The abstract claims 'We anticipate that the synthetic dataset will improve the performance of the original code review models' despite having zero experimental evidence. The entire evaluation framework is planned but unexecuted." 328 }, 329 { 330 "flag": "Title and abstract overgeneralize", 331 "detail": "The title 'Improving Automated Secure Code Reviews: A Synthetic Dataset for Code Vulnerability Flaws' and abstract present the approach as general, but the study is restricted to Java-only projects. This limitation is only revealed in Section III.B.1." 332 }, 333 { 334 "flag": "Unvalidated filtering methodology", 335 "detail": "The keyword-based commit filtering (yielding 35,950 commits) has not been validated. The planned iterative refinement with two independent reviewers and Cohen's Kappa measurement has not been executed, so the precision of the current keyword list is unknown." 336 } 337 ], 338 "cited_papers": [ 339 { 340 "title": "Automating code review activities by large-scale pre-training", 341 "authors": ["Z. Li", "S. Lu", "D. Guo", "N. Duan", "S. Jannu", "G. Jenks", "D. Majumder", "J. Green", "A. Svyatkovskiy", "S. Fu", "N. Sundaresan"], 342 "year": 2022, 343 "doi": "10.1145/3540250.3549081", 344 "relevance": "Introduces CodeReviewer, a large-scale pre-trained model for code review automation, used as a baseline in this work." 345 }, 346 { 347 "title": "Auger: automatically generating review comments with pre-training models", 348 "authors": ["L. Li", "L. Yang", "H. Jiang", "J. Yan", "T. Luo", "Z. Hua", "G. Liang", "C. Zuo"], 349 "year": 2022, 350 "doi": "10.1145/3540250.3549099", 351 "relevance": "Proposes automated review comment generation using GitHub data and data augmentation, directly related to code review automation." 352 }, 353 { 354 "title": "LLaMA-Reviewer: Advancing Code Review Automation with Large Language Models through Parameter-Efficient Fine-Tuning", 355 "authors": ["J. Lu", "L. Yu", "X. Li", "L. Yang", "C. Zuo"], 356 "year": 2023, 357 "doi": "10.1109/ISSRE59848.2023.00026", 358 "relevance": "Demonstrates parameter-efficient fine-tuning of LLMs for code review, relevant to LLM-based code review automation." 359 }, 360 { 361 "title": "D-act: Towards diff-aware code transformation for code review under a time-wise evaluation", 362 "authors": ["C. Pornprasit", "C. Tantithamthavorn", "P. Thongtanunam", "C. Chen"], 363 "year": 2023, 364 "relevance": "Proposes diff-aware code transformation for automated code review, contributing to the code review automation literature." 365 }, 366 { 367 "title": "Code review automation: Strengths and weaknesses of the state of the art", 368 "authors": ["R. Tufano", "O. Dabić", "A. Mastropaolo", "M. Ciniselli", "G. Bavota"], 369 "year": 2024, 370 "doi": "10.1109/TSE.2023.3348172", 371 "relevance": "Evaluates state-of-the-art code review tools, finding up to 25% training data noise and poor performance on specific tasks." 372 }, 373 { 374 "title": "Improving automated code reviews: Learning from experience", 375 "authors": ["H. Y. Lin", "P. Thongtanunam", "C. Treude", "W. Charoenwet"], 376 "year": 2024, 377 "doi": "10.1145/3643991.3644910", 378 "relevance": "Proposes improvements to CodeReviewer through better training data, used as a baseline model in this work." 379 }, 380 { 381 "title": "Using pre-trained models to boost code review automation", 382 "authors": ["R. Tufano", "S. Masiero", "A. Mastropaolo", "L. Pascarella", "D. Poshyvanyk", "G. Bavota"], 383 "year": 2022, 384 "doi": "10.1145/3510003.3510621", 385 "relevance": "Demonstrates pre-training techniques for code review models with 1.4M samples, foundational work in automated code review." 386 }, 387 { 388 "title": "Security Defect Detection via Code Review: A Study of the OpenStack and Qt Communities", 389 "authors": ["J. Yu", "L. Fu", "P. Liang", "A. Tahir", "M. Shahin"], 390 "year": 2023, 391 "doi": "10.1109/ESEM56168.2023.10304852", 392 "relevance": "Quantifies the scarcity of security-related code reviews (614/20,000), directly motivating the need for security-focused review datasets." 393 }, 394 { 395 "title": "Exploring the potential of ChatGPT in automated code refinement: An empirical study", 396 "authors": ["Q. Guo", "J. Cao", "X. Xie", "S. Liu", "X. Li", "B. Chen", "X. Peng"], 397 "year": 2024, 398 "doi": "10.1145/3597503.3623306", 399 "relevance": "Empirical study of ChatGPT for code refinement, relevant to LLM capabilities in code review tasks." 400 }, 401 { 402 "title": "Synthetic dialogue dataset generation using LLM agents", 403 "authors": ["Y. Abdullin", "D. Molla-Aliod", "B. Ofoghi", "J. Yearwood", "Q. Li"], 404 "year": 2024, 405 "arxiv_id": "2401.17461", 406 "relevance": "Demonstrates LLM-based synthetic dataset generation via agent dialogues, directly relevant to the synthetic data generation methodology proposed." 407 }, 408 { 409 "title": "Unnatural instructions: Tuning language models with (almost) no human labor", 410 "authors": ["O. Honovich", "T. Scialom", "O. Levy", "T. Schick"], 411 "year": 2023, 412 "relevance": "Shows effectiveness of training models on synthetic instruction data, supporting the viability of synthetic dataset approaches." 413 }, 414 { 415 "title": "ZeroGen: Efficient zero-shot learning via dataset generation", 416 "authors": ["J. Ye", "J. Gao", "Q. Li", "H. Xu", "J. Feng", "Z. Wu", "T. Yu", "L. Kong"], 417 "year": 2022, 418 "arxiv_id": "2202.07922", 419 "relevance": "Introduces LLM-based dataset generation for zero-shot learning with task-specific prompts, relevant to synthetic data generation methodology." 420 } 421 ], 422 "engagement_factors": { 423 "practical_relevance": { 424 "score": 1, 425 "justification": "Proposes a method to generate security-focused review data, potentially useful for practitioners building code review tools, but nothing is released or available yet." 426 }, 427 "surprise_contrarian": { 428 "score": 0, 429 "justification": "Confirms the well-known observation that security reviews are underrepresented in datasets and that LLMs can generate synthetic data — neither claim is surprising." 430 }, 431 "fear_safety": { 432 "score": 1, 433 "justification": "Touches on code security vulnerabilities but does not demonstrate novel attacks or raise new safety concerns beyond the known gap in security review coverage." 434 }, 435 "drama_conflict": { 436 "score": 0, 437 "justification": "No controversy, no challenges to existing work, no adversarial framing." 438 }, 439 "demo_ability": { 440 "score": 0, 441 "justification": "No code, dataset, model, or demo is released. The paper is purely a research proposal." 442 }, 443 "brand_recognition": { 444 "score": 0, 445 "justification": "From Pontificia Universidad Católica de Chile, not a widely recognized AI lab. No famous products or models are the focus." 446 } 447 } 448 }