scan.json (30853B)
1 { 2 "paper": { 3 "title": "Harnessing Large Language Models for Curated Code Reviews", 4 "authors": [ 5 "Oussama Ben Sghaier", 6 "Martin Weyssow", 7 "Houari Sahraoui" 8 ], 9 "year": 2025, 10 "venue": "IEEE Working Conference on Mining Software Repositories", 11 "arxiv_id": "2502.03425", 12 "doi": "10.1109/MSR66628.2025.00039" 13 }, 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "Replication package released at https://github.com/OussamaSghaier/CuREV, mentioned in footnote and Data Availability section: 'We publicly release all the code, models, data, and results of our experiments.'" 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "Dataset released on Zenodo at https://zenodo.org/records/14812107, explicitly stated in the abstract footnote and Data Availability section." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "The paper mentions hardware (four NVIDIA RTX A5000 GPUs, 24GB each) and LoRA settings, but provides no requirements.txt, Dockerfile, or library version specifications sufficient to recreate the environment." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "The paper references a replication package but does not include step-by-step reproduction instructions in the paper itself. No README contents, commands, or scripts to replicate experiments are described." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "All results are reported as point estimates only (BLEU 7.71 vs 11.26, CodeBLEU 0.36 vs 0.44, EM 408 vs 445). No confidence intervals, error bars, or ± notation anywhere." 
42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper claims improvements (46% BLEU, 22% CodeBLEU) based solely on comparing two numbers. No statistical significance tests (t-test, bootstrap, etc.) are applied to any comparison." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "The paper reports percentage improvements with baseline context: '46% improvement in BLEU score' (7.71→11.26), '22% improvement in CodeBLEU' (0.36→0.44), and absolute differences for scoring criteria (clarity 6.89→8.96, conciseness 7.71→8.05)." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "The subset of 20,000 comments used for downstream experiments is not justified. No power analysis or explanation of why 20,000 was chosen from the 170,718 available samples." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "All results appear to be single-run numbers. No standard deviations, variance across seeds, or spread measures are reported for any experiment." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "The original (uncurated) dataset serves as the baseline for all comparisons. Models trained on original vs curated data are compared directly under identical conditions." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "DeepSeek-Coder-6.7B-Instruct (2024) and Llama-3.1-70B (2024) are contemporary models. The baseline comparison is between dataset versions rather than competing methods, making recency of competing approaches less relevant." 
74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": false, 78 "justification": "The curation pipeline has two components — filtering irrelevant comments and reformulating comments — but no ablation study separates their individual contributions to the improvements." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Multiple metrics are used: BLEU for comment generation (RQ3); CodeBLEU and Exact Match for code refinement (RQ4); clarity, conciseness, and relevance scores for dataset quality (RQ1/RQ2)." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": true, 88 "justification": "A sanity check involving two authors manually evaluating 100 review comments was conducted (Section III-D). Cohen's kappa was computed between human and LLM judgments across all categories and criteria, validating the LLM-as-judge outputs." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "Section V-A states: 'We further split each subset into 75% for training and 25% for evaluation.' The evaluation set is separate from training data." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Table V provides scoring criteria broken down by type (refactoring, bugfix, testing, etc.), nature (descriptive, prescriptive, clarification), and civility subcategories. Table I shows distribution across 9 programming languages." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Table XI shows an example where the model trained on original data generates an incorrect review comment. Examples of filtered low-quality comments are shown in Section IV-A. The modest conciseness improvement is attributed to LLM verbosity." 
104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "The paper acknowledges the conciseness improvement was modest (only +0.34 vs +2.07 for clarity), attributed to 'the inherently verbose nature of LLMs, which tend to produce more elaborate text by default.' The Other/Nature category also increased unexpectedly from 0.01% to 9.90%." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Abstract claims of '46% improvement in BLEU score' (7.71→11.26 = 46%) and '22% improvement in CodeBLEU' (0.36→0.44 = 22%) match Tables X and XII. Claims about clarity and conciseness improvements match Table VII." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "The paper claims curation 'leads to enhanced model performance.' The experimental design is a controlled comparison: same model, same hyperparameters, same paired samples, with only the dataset version varying. This single-variable manipulation is adequate for the causal claim." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The paper tests only one downstream model (DeepSeek-Coder-6.7B-Instruct) and one curation LLM (Llama-3.1-70B) but claims generalized improvements for 'LLMs' broadly. The abstract says 'enhanced model performance' and conclusions refer to 'the effectiveness of LLMs for code review tasks' without bounding to the specific model tested." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "The threats-to-validity section (Section VI) discusses data noise and LLM-as-judge reliability but does not consider alternative explanations for the results. For example, curated comments could be easier to learn due to reduced variance (formulaic phrasing) rather than higher quality. No robustness checks are performed against this."
131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": false, 135 "justification": "The paper measures BLEU and frames it as 'accuracy' of comment generation. BLEU measures n-gram overlap with reference text, not actual comment quality or usefulness. This gap between the proxy metric and the claimed outcome (better comments) is not acknowledged." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": true, 142 "justification": "Specific model versions are stated: 'Llama-3.1-70B-Instruct' and 'DeepSeek-Coder-6.7B-Instruct' (with citation to the DeepSeek-Coder paper). These include version numbers and size specifications." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": true, 147 "justification": "Prompt excerpts are provided in Tables IV and VI. The paper states 'The full prompt is available in the replication package' with a GitHub URL. While only excerpts appear in the paper, the replication package contains the complete prompts." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": false, 152 "justification": "Fine-tuning hyperparameters are reported (batch size 4, 5 epochs, LoRA r=16, α=32, dropout=0.05) but inference settings for Llama-3.1-70B (temperature, top-p, max tokens for evaluation/reformulation of 176K samples) and DeepSeek inference (temperature for code refinement) are not stated." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "No agentic scaffolding is used. The approach is direct LLM prompting and fine-tuning without agent loops, tool use, or scaffolding." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "The filtering step is documented with specific criteria: relevance threshold of 4, resulting in 5,895 samples removed (176,613→170,718). 
The reformulation process using Llama-3.1-70B is described with the prompt guidelines. The 20,000-sample subset selection for downstream experiments is described." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section VI 'Threats to Validity' is a dedicated section discussing potential limitations of the work." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": true, 174 "justification": "The threats section discusses specific issues: (1) non-English or misspelled words in review comments, mitigated by BPE tokenization, and (2) LLM-as-judge reliability, validated by sanity check on 100 samples with Cohen's kappa scores. These are specific to this study's design." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": false, 179 "justification": "The paper does not explicitly state what the results do NOT show. No mention of limitations regarding the single downstream model tested, the single curation LLM, or specific settings/populations excluded from the claims." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": true, 186 "justification": "Both the original dataset (from Li et al., publicly available) and the curated CuRev dataset are released on Zenodo (https://zenodo.org/records/14812107). The Data Availability section confirms: 'We publicly release all the code, models, data, and results.'" 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "The paper describes the source dataset (Li et al., largest publicly available code review dataset), its composition (176,613 samples, 9 programming languages), and content (code diffs + review comments from pull requests). Table I shows the distribution across languages." 
192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants in the study. The data source is a standard publicly available benchmark dataset (code reviews from repositories). The two-author sanity check is a validation step, not a human subjects study." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "The pipeline is documented: original dataset (176,613) → LLM evaluation → relevance filtering at threshold 4 (removing 5,895 samples) → curated dataset (170,718) → reformulation with Llama-3.1-70B → re-evaluation. For downstream tasks: 20,000 paired samples → 75/25 train/test split." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding information, acknowledgments section, or grant numbers appear anywhere in the paper." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliations are clearly listed: Université de Montréal (Sghaier, Sahraoui) and Singapore Management University (Weyssow). No commercial product is being evaluated, so no conflict exists." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding information is disclosed, so independence cannot be assessed. This defaults to NO since the absence of disclosure is not the same as absence of conflict." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests statement or financial interest disclosure appears in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "Neither Llama-3.1-70B nor DeepSeek-Coder-6.7B-Instruct training data cutoff dates are stated. The models could have been trained on the Li et al. 
code review dataset." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "No discussion of whether the code review dataset (publicly available since 2022) appeared in the pre-training data of DeepSeek or Llama models. This is especially concerning for the code refinement task (RQ4) where DeepSeek is used directly for inference." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "The Li et al. code review dataset was published in 2022, before both Llama-3.1 and DeepSeek-Coder were trained. The models could have seen the test examples during pre-training. This contamination risk is not discussed." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants study. The two-author sanity check on 100 samples is an inter-rater reliability validation, not a human subjects study." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants study. The work analyzes publicly available code review data." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants study." 
278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": false, 284 "justification": "No inference cost, API cost, tokens consumed, or wall-clock time reported for running Llama-3.1-70B on 176,613 samples or for DeepSeek inference on 5,000 test samples." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "Hardware is mentioned (four NVIDIA RTX A5000 GPUs, 24GB each) but total training time, GPU hours, and compute budget for the full pipeline (evaluation + curation + fine-tuning + inference) are not quantified." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "No mention of multiple random seeds or seed sensitivity analysis. All results appear to be from a single run." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "The number of experimental runs is never stated. Results are presented without indicating whether they are from single or multiple runs." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "LoRA settings (r=16, α=32, dropout=0.05) are reported but no search budget, search method, or number of configurations tried is described." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": false, 311 "justification": "The selected LoRA configuration and other hyperparameters are presented without explanation of how they were chosen or whether any selection process was used." 312 }, 313 "multiple_comparison_correction": { 314 "applies": false, 315 "answer": false, 316 "justification": "Only a small number of comparisons are made (original vs curated on two tasks), and no statistical tests are applied in the first place." 
317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The authors create the curation pipeline and evaluate it themselves. They do not acknowledge the bias of evaluating their own system. Furthermore, the same LLM (Llama-3.1-70B) that performs the curation also performs the re-evaluation, creating circular evaluation." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": false, 326 "justification": "No performance curves across compute levels. The curation pipeline requires running Llama-3.1-70B on 176K samples, which is substantial compute, but no comparison of cost vs benefit is provided." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "The paper does not discuss whether BLEU and CodeBLEU actually measure comment quality and code refinement quality. BLEU has known weaknesses for evaluating natural language generation quality, but this is not acknowledged." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "No scaffolding is involved; the approach uses direct prompting and fine-tuning." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "No discussion of whether the code review dataset (published 2022) was included in the training data of Llama-3.1 or DeepSeek-Coder models, both trained after 2022." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether the evaluation setup leaks information. For code refinement (RQ4), providing the review comment could leak information about the expected code change." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of whether train and test examples are independent. 
Samples from the same repository or same pull request could share structural patterns, inflating performance." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No leakage detection or prevention methods (canary strings, membership inference, n-gram overlap analysis, decontamination) are applied." 359 } 360 } 361 }, 362 "scan_version": 3, 363 "active_modules": [ 364 "experimental_rigor", 365 "data_leakage" 366 ], 367 "claims": [ 368 { 369 "claim": "85.9% of code review samples are related to refactoring and bugfix, with 62.6% being prescriptive comments.", 370 "evidence": "Section III-E reports categorization results: Refactoring 80.07%, Bugfix 18.60% (multi-labeled), Prescriptive 62.6%. Figure 3 visualizes distributions.", 371 "supported": "strong" 372 }, 373 { 374 "claim": "The curation pipeline improves clarity from 6.89 to 8.96 and conciseness from 7.71 to 8.05 on average.", 375 "evidence": "Table VII shows scoring criteria evolution across all categories. Average clarity improved by +2.07 and conciseness by +0.34. Figure 5 shows the curated distribution.", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "Fine-tuning on curated data yields a 46% improvement in BLEU score for comment generation (7.71→11.26).", 380 "evidence": "Table X compares BLEU scores for DeepSeek-Coder-6.7B-Instruct trained on original (7.71) vs curated (11.26) datasets. Section V-A describes the experimental setup.", 381 "supported": "weak" 382 }, 383 { 384 "claim": "Curated comments lead to a 22% improvement in CodeBLEU (0.36→0.44) and higher Exact Match (408→445) for code refinement.", 385 "evidence": "Table XII reports CodeBLEU and EM scores for original vs curated review comments used as input to DeepSeek-Coder-6.7B-Instruct.
Section V-B describes the setup.", 386 "supported": "weak" 387 }, 388 { 389 "claim": "LLM-as-a-Judge (Llama-3.1-70B) achieves near-perfect to substantial agreement with human evaluators on the evaluation framework.", 390 "evidence": "Section III-D reports Cohen's kappa: civility 1.0, type 0.88, nature 0.82, relevance 0.85, conciseness 0.76, clarity 0.64, based on 100 manually evaluated samples.", 391 "supported": "moderate" 392 }, 393 { 394 "claim": "The curation pipeline eliminates all uncivil comments (98.77%→100% civil) and shifts comments toward prescriptive nature (62.6%→90.2%).", 395 "evidence": "Table VIII shows curated dataset category statistics: 100% civil, 90.20% prescriptive. These are compared to the original dataset figures in Section III-E.", 396 "supported": "moderate" 397 } 398 ], 399 "methodology_tags": [ 400 "benchmark-eval" 401 ], 402 "key_findings": "This paper proposes CuRev, an LLM-driven curation pipeline for code review datasets that filters irrelevant comments and reformulates remaining ones for clarity, conciseness, and civility using Llama-3.1-70B. Applied to the largest public code review dataset (176K samples), the pipeline improved clarity scores from 6.89 to 8.96 and achieved 100% civil comments. Fine-tuning DeepSeek-Coder-6.7B-Instruct on curated data yielded BLEU 11.26 vs 7.71 for comment generation, and supplying curated rather than original comments to the same model for code refinement yielded CodeBLEU 0.44 vs 0.36, though these improvements lack statistical significance testing and are based on a single-model evaluation.", 403 "red_flags": [ 404 { 405 "flag": "Circular evaluation — same LLM curates and judges", 406 "detail": "Llama-3.1-70B is used both to reformulate review comments AND to re-evaluate the curated dataset using the same scoring criteria. The LLM is effectively judging its own output, likely inflating improvement metrics. The paper acknowledges this model is used for both tasks but does not discuss the circularity."
407 }, 408 { 409 "flag": "No statistical significance testing", 410 "detail": "All comparative claims (46% BLEU improvement, 22% CodeBLEU improvement) are based on comparing two point estimates from apparently single runs. No significance tests, confidence intervals, or variance measures are provided. The differences could be within normal random variation." 411 }, 412 { 413 "flag": "Very low absolute BLEU scores", 414 "detail": "BLEU scores of 7.71 and 11.26 are extremely low in absolute terms, suggesting neither model is generating high-quality comments. A 46% relative improvement from a very low baseline may not reflect meaningful practical improvement." 415 }, 416 { 417 "flag": "Single downstream model tested", 418 "detail": "Only DeepSeek-Coder-6.7B-Instruct is used for downstream evaluation. Claims about curation improving 'LLM performance' generally are not supported by testing on a single model. The effect could be model-specific." 419 }, 420 { 421 "flag": "No ablation of pipeline components", 422 "detail": "The curation pipeline has two distinct steps — filtering and reformulation — but their individual contributions are never separated. The filtering step removes 5,895 samples while reformulation changes all remaining 170,718. It's unclear which step drives the downstream improvements." 423 }, 424 { 425 "flag": "Tiny human validation sample", 426 "detail": "The LLM-as-judge sanity check uses only 100 samples out of 176,613 (0.057%), and clarity agreement was only 'substantial' (kappa=0.64), the lowest among all categories. This is a thin basis for trusting LLM-generated scores across the entire dataset." 427 } 428 ], 429 "cited_papers": [ 430 { 431 "title": "Automating code review activities by large-scale pre-training", 432 "authors": ["Z. Li", "S. Lu", "D. Guo"], 433 "year": 2022, 434 "relevance": "Introduced the largest public code review dataset (176K samples) used as the base dataset in this study; core work on LLM pre-training for code review automation."
435 }, 436 { 437 "title": "Towards automating code review activities", 438 "authors": ["R. Tufano", "L. Pascarella", "M. Tufano", "D. Poshyvanyk", "G. Bavota"], 439 "year": 2021, 440 "relevance": "Early work on using T5 transformers for automated review comment generation, establishing the comment generation task evaluated in this paper." 441 }, 442 { 443 "title": "Improving the learning of code review successive tasks with cross-task knowledge distillation", 444 "authors": ["O. Ben Sghaier", "H. Sahraoui"], 445 "year": 2024, 446 "relevance": "Prior work by the same first author on cross-task knowledge distillation for code review, directly building toward this curation approach." 447 }, 448 { 449 "title": "Llama-reviewer: Advancing code review automation with large language models through parameter-efficient fine-tuning", 450 "authors": ["J. Lu", "L. Yu", "X. Li", "L. Yang", "C. Zuo"], 451 "year": 2023, 452 "relevance": "Demonstrates LoRA-based fine-tuning of LLMs for code review, the same technique used in this paper's downstream experiments." 453 }, 454 { 455 "title": "DeepSeek-coder: When the large language model meets programming", 456 "authors": ["D. Guo", "Q. Zhu", "D. Yang"], 457 "year": 2024, 458 "arxiv_id": "2401.14196", 459 "relevance": "The code LLM fine-tuned and used for inference in this paper's downstream evaluation experiments." 460 }, 461 { 462 "title": "LoRA: Low-rank adaptation of large language models", 463 "authors": ["E. J. Hu", "Y. Shen", "P. Wallis", "Z. Allen-Zhu"], 464 "year": 2021, 465 "arxiv_id": "2106.09685", 466 "relevance": "Parameter-efficient fine-tuning technique used in this paper to train DeepSeek on original and curated datasets." 467 }, 468 { 469 "title": "Judging LLM-as-a-judge with MT-bench and chatbot arena", 470 "authors": ["L. Zheng", "W.-L. Chiang", "Y. Sheng"], 471 "year": 2023, 472 "relevance": "Foundational work on LLM-as-judge methodology that this paper relies on for automated evaluation of code review comments." 
473 }, 474 { 475 "title": "CodeUltraFeedback: An LLM-as-a-judge dataset for aligning large language models to coding preferences", 476 "authors": ["M. Weyssow", "A. Kamanda", "H. Sahraoui"], 477 "year": 2024, 478 "arxiv_id": "2403.09032", 479 "relevance": "LLM-as-judge approach for code-related tasks by co-authors; directly informed the evaluation methodology used in this paper." 480 }, 481 { 482 "title": "Large language models for software engineering: A systematic literature review", 483 "authors": ["X. Hou", "Y. Zhao", "Y. Liu"], 484 "year": 2023, 485 "relevance": "Comprehensive survey of LLMs for software engineering tasks including code review, providing context for this work." 486 }, 487 { 488 "title": "Code review automation: strengths and weaknesses of the state of the art", 489 "authors": ["R. Tufano", "O. Dabić", "A. Mastropaolo", "M. Ciniselli", "G. Bavota"], 490 "year": 2024, 491 "relevance": "Analysis of code review automation approaches including categorization of review types, which informed this paper's evaluation framework." 492 }, 493 { 494 "title": "Exploring parameter-efficient fine-tuning techniques for code generation with large language models", 495 "authors": ["M. Weyssow", "X. Zhou", "K. Kim", "D. Lo", "H. Sahraoui"], 496 "year": 2023, 497 "arxiv_id": "2308.10462", 498 "relevance": "Study of parameter-efficient fine-tuning for code generation by co-authors, informing the LoRA-based approach used here." 499 }, 500 { 501 "title": "RepairLLaMA: Efficient representations and fine-tuned adapters for program repair", 502 "authors": ["A. Silva", "S. Fang", "M. Monperrus"], 503 "year": 2023, 504 "arxiv_id": "2312.15698", 505 "relevance": "LoRA-based fine-tuning for automated program repair, showing parameter-efficient fine-tuning effectiveness for code-related tasks." 
506 } 507 ], 508 "engagement_factors": { 509 "practical_relevance": { 510 "score": 2, 511 "justification": "The curation pipeline could be applied to other noisy code review datasets, and the curated dataset is publicly released for direct use." 512 }, 513 "surprise_contrarian": { 514 "score": 0, 515 "justification": "The finding that cleaner training data improves model performance confirms conventional wisdom rather than challenging it." 516 }, 517 "fear_safety": { 518 "score": 0, 519 "justification": "No AI safety, security, or risk concerns are raised by this work." 520 }, 521 "drama_conflict": { 522 "score": 0, 523 "justification": "No controversy or confrontational claims; straightforward dataset curation paper." 524 }, 525 "demo_ability": { 526 "score": 2, 527 "justification": "Code is released on GitHub and data on Zenodo; a practitioner could replicate the pipeline, though there is no live demo or pip-installable tool." 528 }, 529 "brand_recognition": { 530 "score": 0, 531 "justification": "University researchers from Université de Montréal and Singapore Management University; no famous lab or product association." 532 } 533 } 534 }