scan.json (30889B)
1 { 2 "paper": { 3 "title": "The Right Prompts for the Job: Repair Code-Review Defects with Large Language Model", 4 "authors": [ 5 "Zelin Zhao", 6 "Zhaogui Xu", 7 "Jialong Zhu", 8 "Peng Di", 9 "Yuan Yao", 10 "Xiaoxing Ma" 11 ], 12 "year": 2023, 13 "venue": "ICSE 2024", 14 "arxiv_id": "2312.17485", 15 "doi": "10.48550/arXiv.2312.17485" 16 }, 17 "scan_version": 3, 18 "active_modules": ["experimental_rigor", "data_leakage"], 19 "methodology_tags": ["benchmark-eval"], 20 "key_findings": "CodeLLaMA finetuned with the most informative prompt (combining review comments and fix range) achieves 72.97% exact-code-match repair rate on code review defects, a 25pp improvement over the baseline prompt with no hints. Fix range information provides the largest single improvement (avg +7.69pp), while cross-dataset transferability is poor—models finetuned on reviewer comments lose ~50% accuracy when tested on automated checker defects and vice versa. The 6-7B parameter range offers the best efficiency-effectiveness tradeoff across the CodeT5+ model family.", 21 "checklist": { 22 "artifacts": { 23 "code_released": { 24 "applies": true, 25 "answer": false, 26 "justification": "No repository URL, code archive, or link to source code is provided anywhere in the paper. No mention of releasing the finetuning code or evaluation scripts." 27 }, 28 "data_released": { 29 "applies": true, 30 "answer": false, 31 "justification": "The reviewer dataset (RD) is sourced from Tufano et al. [32] which is public, but the PMD dataset (PD) of 14,935 entries was custom-collected by the authors and is not released. No download link or archive is provided for PD." 32 }, 33 "environment_specified": { 34 "applies": true, 35 "answer": false, 36 "justification": "Hardware is mentioned (2x NVIDIA A100-SXM4-80GB, AMD EPYC 7T83 CPU, 200GB memory) and HuggingFace transformers library is referenced, but no software versions, requirements.txt, Dockerfile, or detailed dependency list is provided." 
37 }, 38 "reproduction_instructions": { 39 "applies": true, 40 "answer": false, 41 "justification": "No step-by-step reproduction instructions, README, or scripts are provided. A reader would have to reverse-engineer the experimental setup from the paper text." 42 } 43 }, 44 "statistical_methodology": { 45 "confidence_intervals_or_error_bars": { 46 "applies": true, 47 "answer": false, 48 "justification": "All results in Tables 1, 2, and 5 are reported as point estimates (e.g., '72.97%') with no confidence intervals, error bars, or uncertainty quantification." 49 }, 50 "significance_tests": { 51 "applies": true, 52 "answer": false, 53 "justification": "The paper makes numerous comparative claims (e.g., 'CodeLLaMA outperformed the others', 'ChatGPT-4 surpasses ChatGPT-3.5') based solely on comparing raw percentages without any statistical significance tests." 54 }, 55 "effect_sizes_reported": { 56 "applies": true, 57 "answer": true, 58 "justification": "Effect sizes are reported as absolute ECM improvements with baseline context. Table 4 shows improvements between adjacent prompts (e.g., average +7.69pp from P4 to P5). The last row of Table 2 shows total improvement (#P7-#P3) for each model." 59 }, 60 "sample_size_justified": { 61 "applies": true, 62 "answer": false, 63 "justification": "No power analysis or justification for the dataset sizes (~16K reviewer comments, ~15K PMD entries). The RD dataset size is inherited from Tufano et al., and the PD dataset was sized 'to maintain a similar magnitude' without further justification." 64 }, 65 "variance_reported": { 66 "applies": true, 67 "answer": false, 68 "justification": "No standard deviation, variance, or spread measures are reported. There is no mention of multiple training runs or seeds. All results appear to be from single runs." 
69 } 70 }, 71 "evaluation_design": { 72 "baselines_included": { 73 "applies": true, 74 "answer": true, 75 "justification": "Nine models are compared (ChatGPT-3.5, ChatGPT-4, InCoder, CodeT5+, CodeFuse, LLaMA, CodeGen-2, CodeLLaMA, CodeReviewer). Zero-shot ChatGPT serves as a baseline, and the minimal prompt P3 serves as the baseline prompt configuration." 76 }, 77 "baselines_contemporary": { 78 "applies": true, 79 "answer": true, 80 "justification": "The models were contemporary at the time of writing (2023): ChatGPT-4 (March 2023), CodeLLaMA (2023), LLaMA (2023), CodeGen-2 (2023). These represent the state of the art for code generation at that time." 81 }, 82 "ablation_study": { 83 "applies": true, 84 "answer": true, 85 "justification": "The prompt comparison (P3→P7) is a systematic ablation that incrementally adds information components (bug location, fix range, review comments). Table 4 quantifies the contribution of each added component. RQ4 ablates the dataset composition." 86 }, 87 "multiple_metrics": { 88 "applies": true, 89 "answer": true, 90 "justification": "Two metrics are used: exact-code-match (ECM) based on AST comparison, and Code BLEU. Both are reported in Tables 1 and 2." 91 }, 92 "human_evaluation": { 93 "applies": true, 94 "answer": false, 95 "justification": "No human evaluation of the generated patches is performed. Evaluation is entirely automated using ECM and Code BLEU. The authors acknowledge that ECM may miss semantically correct but syntactically different fixes, but do not address this with human review." 96 }, 97 "held_out_test_set": { 98 "applies": true, 99 "answer": true, 100 "justification": "The datasets are split 9:1 into training and validation sets (28,063 training, 3,100 validation). The same hyperparameters are used across all models ('the same hyperparameters, including two training epochs, a learning rate of 2e-4, and a batch size of 16'), so the validation set is not used for tuning and effectively serves as the held-out test set on which all results are reported."
101 }, 102 "per_category_breakdown": { 103 "applies": true, 104 "answer": true, 105 "justification": "Table 3 provides per-defect-type ECM breakdowns for all 30 PMD defect types, showing lowest and highest scores across all finetuning configurations. This reveals substantial variation (0% to 100%)." 106 }, 107 "failure_cases_discussed": { 108 "applies": true, 109 "answer": true, 110 "justification": "Figure 7 shows an incorrect fix generated by CodeLLaMA. Figure 6 and Table 3 discuss the 'catch exception when parsing double' defect type with 0-6.7% fix rate. The paper explains why certain defect types are hard to fix." 111 }, 112 "negative_results_reported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Several negative results are reported: ChatGPT-4 performs worse than ChatGPT-3.5 on P1 (6.61% vs 14.10%), cross-dataset transferability is poor (~50% drop in Table 5), CodeLLaMA-13B shows a minor decrease vs 7B (-0.29%), and some defect types have 0% fix rate." 116 } 117 }, 118 "claims_and_evidence": { 119 "abstract_claims_supported": { 120 "applies": true, 121 "answer": true, 122 "justification": "The abstract's main claim of '72.97% repair rate with the best prompt' is directly supported by Table 2 (CodeLLaMA with P7). The claim of 'substantial improvement in effectiveness' is supported by the P3→P7 comparison showing >20pp gains." 123 }, 124 "causal_claims_justified": { 125 "applies": true, 126 "answer": true, 127 "justification": "The primary causal claims concern prompt components causing performance changes. The experimental design—same models tested with systematically varying prompts (P3-P7)—is a controlled single-variable manipulation adequate for these causal claims. Table 4 quantifies each component's contribution." 
128 }, 129 "generalization_bounded": { 130 "applies": true, 131 "answer": false, 132 "justification": "The paper tests exclusively on Java code (PMD checker for Java, reviewer dataset primarily Java from GitHub/Gerrit), but the title and claims generalize to 'Code-Review Defects' without language qualification. The abstract makes no mention of the Java-only scope." 133 }, 134 "alternative_explanations_discussed": { 135 "applies": true, 136 "answer": false, 137 "justification": "The Threats section (Section 5) is brief and does not discuss specific alternative explanations for the results. For instance, it does not consider whether the results might be driven by data memorization, the simplicity of PMD-type fixes, or the quality of review comments rather than model capabilities." 138 }, 139 "proxy_outcome_distinction": { 140 "applies": true, 141 "answer": false, 142 "justification": "ECM measures syntactic equivalence (via AST comparison) but the paper frames this as 'repair rate' and 'fix rate.' The authors acknowledge that string comparison is unreliable and use AST-based comparison, but do not discuss that semantically correct fixes with different syntax are counted as failures, so the reported failure rate overstates the true one." 143 } 144 }, 145 "setup_transparency": { 146 "model_versions_specified": { 147 "applies": true, 148 "answer": false, 149 "justification": "ChatGPT-3.5 and ChatGPT-4 are referenced by marketing names without API version strings or snapshot dates. Open-source models specify parameter counts (e.g., 'CodeLLaMA 7B', 'CodeT5+ 6B') but not specific checkpoint versions or release dates." 150 }, 151 "prompts_provided": { 152 "applies": true, 153 "answer": true, 154 "justification": "Full prompt text is provided for all 7 prompts in Section 3.2. Figure 4 shows the prompt template, and each prompt variant (P1-P7) has its exact task instruction text quoted. Example inputs are shown in Figures 2-5."
155 }, 156 "hyperparameters_reported": { 157 "applies": true, 158 "answer": false, 159 "justification": "Finetuning hyperparameters are reported: 'two training epochs, a learning rate of 2e-4, and a batch size of 16' with LoRA and mixed precision. However, inference parameters (temperature, top-p, max tokens) are not reported for any model, including the ChatGPT API calls." 160 }, 161 "scaffolding_described": { 162 "applies": false, 163 "answer": false, 164 "justification": "No agentic scaffolding is used. The approach is direct single-turn prompting of LLMs." 165 }, 166 "data_preprocessing_documented": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 3.3.1 describes RD dataset pruning rules (removing entries with no code intersection, eliminating meaningless comments like 'done' or 'fixed'), reducing from ~17K to 16,228. Section 3.3.2 describes PD dataset creation: 30 PMD rules selected, 20K snippets scanned, 6 developers created fixes, resulting in 14,935 entries." 170 } 171 }, 172 "limitations_and_scope": { 173 "limitations_section_present": { 174 "applies": true, 175 "answer": true, 176 "justification": "Section 5 'THREATS' provides a dedicated discussion of validity threats, covering model size choices and data collection biases." 177 }, 178 "threats_to_validity_specific": { 179 "applies": true, 180 "answer": false, 181 "justification": "The threats section is brief (one paragraph) and mostly generic. It acknowledges focusing on 6-7B models and potential data biases but does not discuss specific threats like contamination risk, Java-only generalization limits, single-run reliability, or the impact of ECM as a proxy metric." 182 }, 183 "scope_boundaries_stated": { 184 "applies": true, 185 "answer": false, 186 "justification": "The paper does not explicitly state what the results do NOT show. 
There is no mention that results are limited to Java, that ECM underestimates true fix rates, or that the approach has not been validated in actual CI/CD pipelines." 187 } 188 }, 189 "data_integrity": { 190 "raw_data_available": { 191 "applies": true, 192 "answer": false, 193 "justification": "The RD dataset references Tufano et al. [32] which is publicly available, but the authors' pruned version (16,228 entries) is not released. The PD dataset of 14,935 entries is not available for download." 194 }, 195 "data_collection_described": { 196 "applies": true, 197 "answer": true, 198 "justification": "Section 3.3 describes both datasets: RD from Tufano et al. with manual pruning rules, PD from scanning open-source Java projects on GitHub with 30 PMD rules and having 6 developers create fixes. The five components of each data instance are listed." 199 }, 200 "recruitment_methods_described": { 201 "applies": true, 202 "answer": false, 203 "justification": "The six developers who created fixes for the PD dataset are described only as 'experienced developers' with no information about how they were selected, their expertise level, or potential biases. The selection of 'popular open-source Java projects' is not further specified." 204 }, 205 "data_pipeline_documented": { 206 "applies": true, 207 "answer": true, 208 "justification": "The pipeline is documented with counts: RD goes from ~17K raw to 16,228 after pruning (with rules described). PD goes from 20,000 selected snippets to 14,935 after developer review. The 9:1 train/validation split yields 28,063 training and 3,100 validation samples." 209 } 210 }, 211 "conflicts_of_interest": { 212 "funding_disclosed": { 213 "applies": true, 214 "answer": false, 215 "justification": "No funding statement, acknowledgments section, or grant numbers are present in the paper." 
216 }, 217 "affiliations_disclosed": { 218 "applies": true, 219 "answer": true, 220 "justification": "Author affiliations are clearly listed: of the six authors, four are from Ant Group and two from Nanjing University, with one author holding a dual affiliation. Ant Group develops CodeFuse, one of the evaluated models." 221 }, 222 "funder_independent_of_outcome": { 223 "applies": true, 224 "answer": false, 225 "justification": "Four of six authors are from Ant Group, which develops CodeFuse—one of the models evaluated. Ant Group has a commercial interest in demonstrating LLM capabilities for code repair. No independence statement is provided." 226 }, 227 "financial_interests_declared": { 228 "applies": true, 229 "answer": false, 230 "justification": "No competing interests or financial disclosure statement is present in the paper. The conflict between Ant Group affiliation and CodeFuse evaluation is not explicitly acknowledged." 231 } 232 }, 233 "contamination": { 234 "training_cutoff_stated": { 235 "applies": true, 236 "answer": false, 237 "justification": "No training data cutoff dates are stated for any of the models. ChatGPT-3.5/4 release dates are mentioned but not their training data cutoffs. Open-source model pre-training data temporal scope is not discussed." 238 }, 239 "train_test_overlap_discussed": { 240 "applies": true, 241 "answer": false, 242 "justification": "No discussion of whether the Tufano et al. dataset (from GitHub/Gerrit) or the PMD dataset (from GitHub Java projects) could overlap with the pre-training data of the evaluated models, many of which were trained on GitHub code." 243 }, 244 "benchmark_contamination_addressed": { 245 "applies": true, 246 "answer": false, 247 "justification": "The Tufano et al. dataset was published in 2021, before the training cutoffs of most evaluated models. ChatGPT and others may have seen these code review examples during pre-training. This contamination risk is not discussed."
248 } 249 }, 250 "human_studies": { 251 "pre_registered": { 252 "applies": false, 253 "answer": false, 254 "justification": "No human participants in the study. The six developers who annotated the PMD dataset are data creators, not study participants." 255 }, 256 "irb_or_ethics_approval": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human participants in the study." 260 }, 261 "demographics_reported": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants in the study." 265 }, 266 "inclusion_exclusion_criteria": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants in the study." 270 }, 271 "randomization_described": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants in the study." 275 }, 276 "blinding_described": { 277 "applies": false, 278 "answer": false, 279 "justification": "No human participants in the study." 280 }, 281 "attrition_reported": { 282 "applies": false, 283 "answer": false, 284 "justification": "No human participants in the study." 285 } 286 }, 287 "cost_and_practicality": { 288 "inference_cost_reported": { 289 "applies": true, 290 "answer": false, 291 "justification": "No inference costs, latency measurements, or per-example timing are reported. The paper mentions models can 'infer within seconds' in the threats section but provides no specific numbers despite the stated goal of CI/CD integration." 292 }, 293 "compute_budget_stated": { 294 "applies": true, 295 "answer": false, 296 "justification": "Hardware is specified (2x A100-80GB GPUs, 200GB RAM, AMD EPYC 7T83) and training hyperparameters are given, but total GPU hours, training time, or API costs are not reported." 297 } 298 }, 299 "experimental_rigor": { 300 "seed_sensitivity_reported": { 301 "applies": true, 302 "answer": false, 303 "justification": "No mention of random seeds, seed sensitivity analysis, or results across multiple seeds. 
All results appear to be from single training runs." 304 }, 305 "number_of_runs_stated": { 306 "applies": true, 307 "answer": false, 308 "justification": "The number of experimental runs is never stated. There is no indication of whether results are from single or multiple runs." 309 }, 310 "hyperparameter_search_budget": { 311 "applies": true, 312 "answer": false, 313 "justification": "The paper states 'the same hyperparameters' were used across all models (lr=2e-4, batch=16, 2 epochs) but does not explain how these values were selected or report any search budget." 314 }, 315 "best_config_selection_justified": { 316 "applies": true, 317 "answer": false, 318 "justification": "The hyperparameters appear to be pre-selected without justification. The paper does not explain why lr=2e-4, batch=16, and 2 epochs were chosen or whether alternative configurations were explored." 319 }, 320 "multiple_comparison_correction": { 321 "applies": false, 322 "answer": false, 323 "justification": "No statistical tests are performed, so multiple comparison correction is not applicable." 324 }, 325 "self_comparison_bias_addressed": { 326 "applies": true, 327 "answer": false, 328 "justification": "The authors from Ant Group evaluate CodeFuse (Ant Group's model) alongside competitors. The potential bias of evaluating their own company's product is not acknowledged or addressed." 329 }, 330 "compute_budget_vs_performance": { 331 "applies": true, 332 "answer": false, 333 "justification": "Models range from 220M (CodeReviewer) to 175B (ChatGPT-3.5) parameters with vastly different compute requirements, but performance is compared without normalizing for compute budget. Only the CodeT5+ size comparison (RQ3) touches on this." 334 }, 335 "benchmark_construct_validity": { 336 "applies": true, 337 "answer": false, 338 "justification": "The paper does not discuss whether ECM (exact code match via AST comparison) is a valid measure of repair capability. 
Semantically correct but syntactically different fixes are counted as failures, but this validity gap is not analyzed." 339 }, 340 "scaffold_confound_addressed": { 341 "applies": false, 342 "answer": false, 343 "justification": "No scaffolding is involved—all evaluations use direct single-turn prompting." 344 } 345 }, 346 "data_leakage": { 347 "temporal_leakage_addressed": { 348 "applies": true, 349 "answer": false, 350 "justification": "The Tufano et al. dataset was published in 2021, and the GitHub code used for the PMD dataset predates the models' training. No discussion of whether these examples could be in the pre-training data of ChatGPT, CodeLLaMA, or other models." 351 }, 352 "feature_leakage_addressed": { 353 "applies": true, 354 "answer": false, 355 "justification": "The fix range markers ([FIX_START], [FIX_END]) in P5 and P7 provide oracle information about where the fix should go. While review comments would be available in practice, the exact fix range is typically not known a priori. This potential leakage is not discussed." 356 }, 357 "non_independence_addressed": { 358 "applies": true, 359 "answer": false, 360 "justification": "No discussion of whether training and validation examples share structural similarities (e.g., from the same repositories, similar PMD rule violations, or near-duplicate code patterns). The 9:1 split method is not described—random vs. repository-level splitting would affect independence." 361 }, 362 "leakage_detection_method": { 363 "applies": true, 364 "answer": false, 365 "justification": "No leakage detection or prevention methods are applied. No canary strings, membership inference tests, decontamination pipelines, or n-gram overlap analysis." 
366 } 367 } 368 }, 369 "claims": [ 370 { 371 "claim": "CodeLLaMA achieves a 72.97% exact-code-match repair rate with the most informative prompt (P7) combining review comments and fix range.", 372 "evidence": "Table 2 shows CodeLLaMA with P7 achieves 72.97% ECM and 0.9394 Code BLEU, the highest among all model-prompt combinations (Section 4.1.2).", 373 "supported": "strong" 374 }, 375 { 376 "claim": "Review comments and fix range information substantially enhance repair performance, with all models showing >20pp ECM improvement from P3 to P7.", 377 "evidence": "Table 2 last row shows improvements of 21.51% to 25.10% from P3 to P7 across all six finetuned models. Table 4 breaks down the incremental contribution of each prompt component (Section 4.2).", 378 "supported": "strong" 379 }, 380 { 381 "claim": "Fix range information provides the largest single improvement, with an average +7.69pp ECM increase from P4 to P5.", 382 "evidence": "Table 4 shows the P4→P5 transition (replacing bug location with fix range) yields the highest average improvement at 7.69%, compared to 5.77% for P3→P4, 3.56% for P5→P6, and 5.77% for P6→P7 (Section 4.2).", 383 "supported": "strong" 384 }, 385 { 386 "claim": "6-7B parameter models represent the optimal balance between efficiency and effectiveness for this task.", 387 "evidence": "Figure 8 shows ECM improvements plateau from 6B to 16B in CodeT5+. Section 4.3 reports CodeFuse-13B and LLaMA-13B show only minor gains (+2.2%, +1.87%) over 7B versions, while CodeLLaMA-13B slightly decreases (-0.29%).", 388 "supported": "moderate" 389 }, 390 { 391 "claim": "Cross-dataset transferability is poor—models finetuned on one dataset lose approximately half their accuracy when tested on the other.", 392 "evidence": "Table 5 shows CodeLLaMA_RD achieves 65.10% on RD_t but only 44.18% on PD_t. 
CodeLLaMA_PD achieves 80.65% on PD_t but only 36.31% on RD_t (Section 4.4).", 393 "supported": "strong" 394 }, 395 { 396 "claim": "ChatGPT-4 outperforms ChatGPT-3.5 when review comments are provided (P2), but underperforms without them (P1).", 397 "evidence": "Table 1 shows P1: ChatGPT-3.5 14.10% vs ChatGPT-4 6.61%; P2: ChatGPT-3.5 49.42% vs ChatGPT-4 55.10% (Section 4.1.1).", 398 "supported": "moderate" 399 } 400 ], 401 "red_flags": [ 402 { 403 "flag": "Company evaluating its own product", 404 "detail": "Four of six authors are from Ant Group, which develops CodeFuse—one of the nine evaluated models. While CodeFuse does not top the rankings (CodeLLaMA does), the potential for bias in experimental setup, hyperparameter tuning, or data selection favoring their own product is not acknowledged." 405 }, 406 { 407 "flag": "No uncertainty quantification", 408 "detail": "All results are single-run point estimates with no error bars, confidence intervals, standard deviations, or multiple-seed analysis. Given that finetuning results can vary substantially across seeds, the reported rankings may not be stable." 409 }, 410 { 411 "flag": "Contamination risk unaddressed", 412 "detail": "The Tufano et al. dataset (from GitHub/Gerrit, published 2021) and the PMD dataset (from GitHub Java projects) could overlap with the pre-training data of ChatGPT, CodeLLaMA, and other models trained on GitHub code. This is never discussed despite being a fundamental threat to validity." 413 }, 414 { 415 "flag": "Overclaiming scope", 416 "detail": "The paper tests exclusively on Java code review defects but makes general claims about 'code-review defects' and 'program repair.' The title, abstract, and conclusions do not qualify results as Java-specific." 417 }, 418 { 419 "flag": "No statistical significance tests", 420 "detail": "All comparative claims ('CodeLLaMA outperformed the others', 'ChatGPT-4 surpasses ChatGPT-3.5') are based on raw percentage comparisons without any statistical testing. 
Small differences (e.g., 1.48pp between P5 and P6 for CodeGen-2) may not be significant." 421 } 422 ], 423 "cited_papers": [ 424 { 425 "title": "Evaluating Large Language Models Trained on Code", 426 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 427 "year": 2021, 428 "arxiv_id": "2107.03374", 429 "relevance": "Foundational LLM-for-code paper (Codex); relevant to understanding code generation model capabilities and evaluation." 430 }, 431 { 432 "title": "Code Llama: Open Foundation Models for Code", 433 "authors": ["Baptiste Rozière", "Jonas Gehring", "Fabian Gloeckle"], 434 "year": 2023, 435 "arxiv_id": "2308.12950", 436 "relevance": "Describes CodeLLaMA, the best-performing model in this study; key to understanding LLM architecture for code tasks." 437 }, 438 { 439 "title": "InferFix: End-to-End Program Repair with LLMs", 440 "authors": ["Matthew Jin", "Syed Shahriar", "Michele Tufano"], 441 "year": 2023, 442 "arxiv_id": "2303.07263", 443 "relevance": "Closely related work using LLMs for automated program repair of checker-identified bugs, with prompt engineering and Codex finetuning." 444 }, 445 { 446 "title": "CIRCLE: Continual Repair across Programming Languages", 447 "authors": ["Wei Yuan", "Quanjun Zhang", "Tieke He"], 448 "year": 2022, 449 "doi": "10.1145/3533767.3534219", 450 "relevance": "LLM-based program repair approach using prompts with context and buggy/fixed code; directly comparable approach to CR defect repair." 451 }, 452 { 453 "title": "Automating Code Review Activities by Large-Scale Pre-Training", 454 "authors": ["Zhiyu Li", "Shuai Lu", "Daya Guo"], 455 "year": 2022, 456 "doi": "10.1145/3540250.3549081", 457 "relevance": "Describes CodeReviewer, a model pre-trained specifically for code review tasks, evaluated in this paper." 
458 }, 459 { 460 "title": "Towards Automating Code Review Activities", 461 "authors": ["Rosalia Tufano", "Luca Pascarella", "Michele Tufano"], 462 "year": 2021, 463 "doi": "10.1109/ICSE43902.2021.00027", 464 "relevance": "Provides the reviewer dataset (RD) used in this study; foundational work on automating code review." 465 }, 466 { 467 "title": "An Empirical Study on Learning Bug-Fixing Patches in the Wild via Neural Machine Translation", 468 "authors": ["Michele Tufano", "Cody Watson", "Gabriele Bavota"], 469 "year": 2019, 470 "doi": "10.1145/3340544", 471 "relevance": "Early neural machine translation approach to automated program repair, establishing the buggy→fixed code translation paradigm." 472 }, 473 { 474 "title": "A Survey of Learning-based Automated Program Repair", 475 "authors": ["Quanjun Zhang", "Chunrong Fang", "Yuxiang Ma"], 476 "year": 2023, 477 "arxiv_id": "2301.03270", 478 "relevance": "Comprehensive survey of learning-based APR techniques providing context for LLM-based repair approaches." 479 }, 480 { 481 "title": "LLaMA: Open and Efficient Foundation Language Models", 482 "authors": ["Hugo Touvron", "Thibaut Lavril", "Gautier Izacard"], 483 "year": 2023, 484 "arxiv_id": "2302.13971", 485 "relevance": "Describes the LLaMA model family used as a baseline; foundational open-source LLM for code and language tasks." 486 }, 487 { 488 "title": "CodeGen2: Lessons for Training LLMs on Programming and Natural Languages", 489 "authors": ["Erik Nijkamp", "Hiroaki Hayashi", "Caiming Xiong"], 490 "year": 2023, 491 "arxiv_id": "2305.02309", 492 "relevance": "Describes CodeGen-2 model evaluated in this study; insights on training LLMs for code generation tasks." 
493 }, 494 { 495 "title": "Out of the BLEU: How Should We Assess Quality of the Code Generation Models?", 496 "authors": ["Mikhail Evtikhiev", "Egor Bogomolov", "Yaroslav Sokolov"], 497 "year": 2023, 498 "doi": "10.1016/j.jss.2023.111741", 499 "relevance": "Evaluates code generation metrics including Code BLEU; supports this paper's finding that Code BLEU may not be ideal for evaluating code repair." 500 } 501 ], 502 "engagement_factors": { 503 "practical_relevance": { 504 "score": 2, 505 "justification": "Directly applicable to CI/CD pipelines for semi-automated code review defect repair, though no released tool or code limits immediate adoption." 506 }, 507 "surprise_contrarian": { 508 "score": 0, 509 "justification": "Confirms expected finding that more context (review comments, fix range) helps LLMs produce better fixes." 510 }, 511 "fear_safety": { 512 "score": 0, 513 "justification": "No security or safety concerns raised; the work addresses developer productivity, not AI risk." 514 }, 515 "drama_conflict": { 516 "score": 0, 517 "justification": "No controversy or provocative claims; straightforward empirical comparison." 518 }, 519 "demo_ability": { 520 "score": 0, 521 "justification": "No code, tool, or demo is released; results cannot be independently tried." 522 }, 523 "brand_recognition": { 524 "score": 1, 525 "justification": "Uses well-known models (ChatGPT, LLaMA) but authors are from Ant Group and Nanjing University, not top-tier ML research labs." 526 } 527 } 528 }