scan.json (30990B)
1 { 2 "paper": { 3 "title": "RepairLLaMA: Efficient Representations and Fine-Tuned Adapters for Program Repair", 4 "authors": ["André Silva", "Sen Fang", "Martin Monperrus"], 5 "year": 2023, 6 "venue": "IEEE Transactions on Software Engineering", 7 "arxiv_id": "2312.15698", 8 "doi": "10.1109/TSE.2025.3581062" 9 }, 10 "scan_version": 3, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "RepairLLaMA demonstrates that parameter-efficient fine-tuning (LoRA) with repair-specific code representations incorporating fault localization signals outperforms both full fine-tuning and zero-shot GPT-3.5/GPT-4 on three Java program repair benchmarks. The best representation (IR4xOR2) combines buggy code as comments with an infilling mask, correctly fixing 144 Defects4J v2, 109 HumanEval-Java, and 20 GitBug-Java bugs. LoRA adapters with only 4M parameters (1600x smaller than the base LLM) outperform full-parameter fine-tuning on CodeLLaMA-7B, suggesting regularization benefits. Representations without fault localization perform substantially worse, confirming that task-specific input signals are essential for fine-tuning effectiveness.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The paper states: 'we publish our source code, models, and artifacts at https://github.com/ASSERT-KTH/repairllama' (Section I, contributions). A working URL is provided." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "All three evaluation benchmarks (Defects4J, HumanEval-Java, GitBug-Java) are publicly available. The fine-tuning dataset Megadiff is also public. The paper additionally references the GitHub repository for supplementary data." 
25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "The paper mentions hardware (4xA100 40GB GPUs for training, single A100 for inference, Ubuntu 22.04.3 LTS) and the HuggingFace transformers library, but does not provide a requirements.txt, Dockerfile, or list of library versions sufficient to recreate the environment." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "The paper describes methodology in detail (hyperparameters, data processing, inference setup) but does not provide step-by-step reproduction instructions with commands. A GitHub repository is provided but the paper itself does not contain a 'Reproducing Results' section." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "Tables II, III, and IV report only point estimates (e.g., '144 bugs fixed') with no confidence intervals or error bars." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": true, 46 "justification": "McNemar tests are used for pairwise comparisons across all RQs, with exact p-values reported (e.g., 'p ≤ 1.27e−04', 'p ≈ 0.545', 'p ≤ 4.10e−08'). Full p-values are provided as supplemental material [41]." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "Absolute effect sizes are reported throughout: e.g., 'plausibly repairs 49 (195 vs. 146) bugs more and correctly repairs 46 (144 vs. 98) more' (Section IV-B). Both absolute and relative differences are given with baseline context." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "The sample sizes are dictated by the benchmarks (488 Defects4J, 162 HumanEval-Java, 90 GitBug-Java bugs) with no justification for whether these are adequate or any power analysis." 
57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "All results are from single training runs. No standard deviation, variance across seeds, or spread measures are reported. While beam search inference is deterministic, training variance from different seeds is not explored." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "Multiple baselines: non-fine-tuned CodeLLaMA-7B (two prompt variants), full-parameter fine-tuning, deepseek-coder-6.7b, GPT-3.5, GPT-4, Jiang et al.'s fine-tuned Incoder-6B, and RAP-Gen (Tables II-IV)." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "Baselines include GPT-3.5 and GPT-4 (Dec 2023 API versions), deepseek-coder-6.7b (2024), RAP-Gen (2023), and Jiang et al. (2023), all contemporary at time of experimentation." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "RQ1 systematically ablates code representations (6 input/output pairs, Table II), isolating the effect of fault localization signals and buggy code preservation. RQ2 ablates the fine-tuning method (LoRA vs full-parameter, Table III)." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Four patch assessment metrics are used: plausible patches, exact match, AST match, and semantic match (Section III-C)." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": true, 88 "justification": "Semantic match assessment involves manual expert evaluation: 'the two first authors independently label all plausible but not AST/Exact match patches in a first round. For the patches the two first authors disagree upon, the third author breaks the tie' (Section III-C)." 
89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "Three evaluation benchmarks (Defects4J, HumanEval-Java, GitBug-Java) are all separate from the fine-tuning dataset (Megadiff). The authors verified no identical overlap exists and excluded 3 partially overlapping samples (Section V-D)." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Results are broken down per benchmark (3 benchmarks), per code representation (6 pairs in Table II), per fine-tuning method (Table III), and per model (Table IV). Multi-location bug results are also separately discussed." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "The paper discusses catastrophic forgetting on HumanEval-Java where fine-tuned IR3xOR2 performs worse than the baseline (Section IV-A). The paper also discusses why representations without fault localization fail to perform (rows 3-5 of Table II)." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "Several negative results: IR1xOR1/OR3/OR4 representations 'perform considerably worse than both the baseline and other code representations' (Section IV-A). Fine-tuned IR3xOR2 underperforms baseline on HumanEval-Java. LoRA underperforms full fine-tuning for deepseek-coder on Defects4J and GitBug-Java." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The abstract claims 144 Defects4J, 109 HumanEval-Java, and 20 GitBug-Java bugs fixed, 'outperforming all baselines.' Tables II-IV confirm all numbers and the outperformance claims, with statistical significance tests supporting them." 
116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "Causal claims (e.g., 'code representations with repair signals allow the LLM to synthesize patches more effectively') are supported by controlled ablation: 6 code representations evaluated with identical hyperparameters, isolating the representation variable. LoRA vs FFT comparisons also use controlled single-variable manipulation." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The title claims 'Program Repair' generally, but all experiments are limited to Java. While the threats to validity acknowledge 'the focus on a single programming language' and 'the focus on a single LLM,' the title and framing suggest broader applicability than demonstrated." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": true, 130 "justification": "The paper discusses catastrophic forgetting as an alternative explanation for HumanEval-Java results (Section IV-A), overfitting as the reason full fine-tuning underperforms (Section IV-B), and fine-tuning dataset distribution mismatch between Megadiff and HumanEval-Java." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "The paper explicitly distinguishes between plausible patches (pass all tests — the proxy) and correct patches (exact/AST/semantic match with developer patches — closer to ground truth), reporting both separately and discussing why test-passing alone is insufficient." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": true, 142 "justification": "Specific model versions: 'CodeLlama-7b', 'gpt-3.5-turbo-0613', 'gpt-4-0613', 'deepseek-coder-6.7b-base', 'gpt-3.5-turbo-0125' with training cutoff dates (Section III-B)." 
143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": true, 147 "justification": "Figure 4 shows the exact prompt used for GPT-3.5 and GPT-4. Figures 2 and 3 show the exact input/output code representations used for fine-tuned models. All information sent to models is reproducible from these figures." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": true, 152 "justification": "Comprehensive hyperparameters: learning rate 5e-4 with cosine decay, max length 1024, epochs 2, batch size 16/GPU, Adam_W optimizer, LoRA rank 8, alpha 16, dropout 0.05, q_proj and v_proj layers, beam size 10. Full fine-tuning uses learning rate 2e-5 (Section III-B)." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "No agentic scaffolding is used. RepairLLaMA is a straightforward fine-tuning and single-pass inference approach with no iterative repair, tool use, or feedback loops." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "The Megadiff processing pipeline is described: extract function pairs → eliminate non-single-function changes → remove duplicates via textual comparison → compute code representations → filter by total length < 1024 tokens → resulting in 30,000–50,000 pairs (Section III-B)." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section V-D 'Threats to Validity' provides substantive discussion of both internal threats (data leakage, fine-tuning overlap) and external threats (single language, single LLM)." 
170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": true, 174 "justification": "Specific threats discussed: 3 Megadiff samples overlap with Defects4J (excluded); pre-training data may include test benchmarks (mitigated with GitBug-Java and HumanEval-Java); single language (Java); single base model (mitigated with deepseek-coder comparison); catastrophic forgetting on HumanEval-Java." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": true, 179 "justification": "The paper explicitly scopes to: Java bugs, functional bugs with failing test cases, single-function bugs, intra-procedural fixes (Section II-B). External threats state 'our results may not generalize to other languages' and 'the focus on a single LLM' (Section V-D)." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": true, 186 "justification": "Source code, models, and artifacts are published at the GitHub repository. All three benchmarks (Defects4J, HumanEval-Java, GitBug-Java) and the fine-tuning dataset (Megadiff) are publicly available. Statistical test results are provided as supplemental material [41]." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "The Megadiff dataset origin and processing are described (Section III-B). The three benchmarks are cited with their collection methodology papers. The specific subsets used (e.g., 488 single-function Defects4J bugs from 835 total) are explained." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. Data sources are standard public benchmarks (Defects4J, HumanEval-Java, GitBug-Java) and an existing dataset (Megadiff)." 
197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "The pipeline from Megadiff to fine-tuning data is documented with filtering stages: extract function pairs → remove non-single-function changes → deduplicate → compute representations → length filter (< 1024 tokens) → 30K–50K pairs (Section III-B). Benchmark subset selection is also documented." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": true, 208 "justification": "Section VIII acknowledges supercomputing resources from Berzelius/National Supercomputer Centre at Linköping University, the Knut and Alice Wallenberg Foundation, and WASP (Wallenberg AI, Autonomous Systems and Software Program)." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliations are clearly stated: KTH Royal Institute of Technology (Silva, Monperrus) and NC State University (Fang). These are academic institutions with no commercial stake in the evaluated models." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": true, 218 "justification": "The funders, the Wallenberg Foundation and WASP, are academic/governmental research funding bodies with no commercial interest in the performance of CodeLLaMA, GPT-3.5, or GPT-4." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial interests statement is present in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": true, 230 "justification": "Training data cutoffs are explicitly stated: 'CodeLlama-7b (September 2022), gpt-4-0613 (September 2021), gpt-3.5-turbo-0613 (September 2021)' (Section III-B)." 
231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": true, 235 "justification": "Extensive discussion: Megadiff vs Defects4J overlap was checked, 3 overlapping samples found and excluded (Section V-D). GitBug-Java contains only post-cutoff bugs. HumanEval-Java and GitBug-Java were constructed after Megadiff. Pre-training contamination is also discussed." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": true, 240 "justification": "The paper acknowledges that Defects4J bugs may be in LLM pre-training data and mitigates this by evaluating on GitBug-Java (2023 bugs, all after model training cutoffs) and HumanEval-Java. They also cite Ramos et al. [45] on LLM benchmark memorization (Section V-D)." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study. The manual patch assessment by authors is not a human subjects study." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants. This is a benchmark evaluation study with automated and expert-assessed patch evaluation." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 
278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": false, 284 "justification": "No inference cost, latency, or time per bug is reported. The paper mentions inference uses 'a single A100 40GB GPU' but provides no wall-clock time or cost figures." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "Hardware is stated (4xA100 40GB for training, single A100 for inference) but total GPU hours, training time, or API costs for GPT experiments are not quantified." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "No mention of multiple random seeds or seed sensitivity analysis. All results appear to be from a single training run per configuration." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "The number of experimental runs is not stated. Results are presented without indicating whether they come from one or multiple training runs." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "Hyperparameters are stated (learning rate 5e-4, LoRA rank 8, etc.) but no description of how these were selected, whether any search was conducted, or how many configurations were tried." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": true, 311 "justification": "The best code representation (IR4xOR2) is selected based on systematic evaluation of all 6 representation pairs with results fully reported in Table II. Selection is transparent and based on evaluation benchmark performance." 
312 }, 313 "multiple_comparison_correction": { 314 "applies": true, 315 "answer": false, 316 "justification": "McNemar tests are performed for multiple pairwise comparisons across representations and models, but no correction for multiple comparisons (Bonferroni, Holm, etc.) is applied or mentioned." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The authors compare RepairLLaMA against their own implementations of baselines and prompt designs for GPT models without acknowledging author-evaluation bias. Comparison with RAP-Gen uses the original authors' reported numbers, which is fairer." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": true, 326 "justification": "RQ2 explicitly compares LoRA (4M parameters, about 0.06% of the full model) vs full-parameter fine-tuning (7B parameters), showing LoRA achieves better performance with ~1600x fewer parameters. The paper also notes RepairLLaMA generates 10 patches vs RAP-Gen's 100." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "The paper does not discuss whether Defects4J, HumanEval-Java, or GitBug-Java actually measure real-world program repair capability. While it notes distribution differences between benchmarks (HumanEval-Java vs Megadiff), it does not question construct validity." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "No scaffolding is used. RepairLLaMA is a direct fine-tuning approach with single-pass inference, no iterative agent or tool-based scaffold." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": true, 343 "justification": "Training cutoffs are stated for all models, and GitBug-Java 'exclusively contains bugs from after the training data cutoff date of all models used in our experiments' (Section III-B). 
HumanEval-Java is also used to mitigate this concern." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": true, 348 "justification": "The paper explicitly discusses fault localization assumptions, proposing a more realistic region-based FL instead of perfect multi-line FL assumed by prior work. They argue this prevents information leakage from unrealistic FL assumptions (Section V-B)." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": true, 353 "justification": "The authors 'meticulously compare the samples in our fine-tuning dataset, Megadiff, with those in Defects4J' and found 3 overlapping samples which were excluded. HumanEval-Java and GitBug-Java were constructed after Megadiff (Section V-D)." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": true, 358 "justification": "Textual comparison of Megadiff against Defects4J was performed as a concrete detection method, identifying 3 samples where 'patch includes a function also found in Megadiff samples' (Section V-D). Temporal separation was used for GitBug-Java." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "RepairLLaMA with IR4xOR2 correctly fixes 144 Defects4J v2, 109 HumanEval-Java, and 20 GitBug-Java bugs (semantic match), outperforming all baselines.", 365 "evidence": "Tables II, III, and IV report these numbers. Statistical significance tests confirm differences with p ≤ 1.27e−04 for most comparisons (Section IV-A, supplemental material [41]).", 366 "supported": "strong" 367 }, 368 { 369 "claim": "Code representations with fault localization signals are essential for effective fine-tuning for program repair.", 370 "evidence": "Table II rows 3-5 (no FL) perform considerably worse than rows 6-8 (with FL). IR1xOR1 fixes 45 Defects4J bugs vs IR4xOR2's 144. 
The difference is statistically significant (Section IV-A).", 371 "supported": "strong" 372 }, 373 { 374 "claim": "Parameter-efficient fine-tuning (LoRA) outperforms full-parameter fine-tuning for CodeLLaMA-7B on program repair.", 375 "evidence": "Table III: LoRA fixes 144 vs FFT's 98 Defects4J bugs (semantic match), 109 vs 100 HumanEval-Java, 20 vs 13 GitBug-Java. Statistical significance p ≤ 1.29e−03 (Section IV-B).", 376 "supported": "strong" 377 }, 378 { 379 "claim": "The benefit of parameter-efficient fine-tuning varies across base models.", 380 "evidence": "For deepseek-coder-6.7b, full fine-tuning outperforms LoRA on Defects4J (138 vs 128) and GitBug-Java (19 vs 13), while LoRA outperforms on HumanEval-Java (124 vs 119). Table III, Section IV-B.", 381 "supported": "strong" 382 }, 383 { 384 "claim": "RepairLLaMA outperforms GPT-3.5 and GPT-4 on Defects4J and GitBug-Java.", 385 "evidence": "Table IV: RepairLLaMA fixes 144 vs GPT-4's 72 (semantic match) on Defects4J, and 20 vs 10 on GitBug-Java. Statistical significance p ≤ 4.10e−08 (Section IV-C).", 386 "supported": "strong" 387 }, 388 { 389 "claim": "RepairLLaMA's region-based fault localization enables repair of multi-location bugs.", 390 "evidence": "51 multi-location bugs correctly repaired. Examples in Figures 5 (Math-86, two distant edit locations) and 6 (STRONGEST_EXTENSION, multiple modifications). Section IV-A.", 391 "supported": "moderate" 392 }, 393 { 394 "claim": "LoRA acts as a regularizer preventing overfitting during fine-tuning.", 395 "evidence": "The hypothesis is stated in Section IV-B citing Fu et al. [21], but no direct overfitting analysis (e.g., training vs validation loss curves) is provided. The claim is inferred from performance differences between LoRA and FFT.", 396 "supported": "moderate" 397 } 398 ], 399 "red_flags": [ 400 { 401 "flag": "No variance across training runs", 402 "detail": "All results appear to be from single training runs per configuration. 
Without seed sensitivity analysis, it is impossible to assess whether the reported performance differences are stable or artifacts of a particular training run." 403 }, 404 { 405 "flag": "No multiple comparison correction", 406 "detail": "McNemar tests are run for many pairwise comparisons across 6+ models without applying Bonferroni or similar corrections, inflating the risk of false positives in significance claims." 407 }, 408 { 409 "flag": "Missing compute budget", 410 "detail": "Total GPU hours and training time per configuration are not reported, making it difficult to assess the true computational cost of the approach despite claims about efficiency." 411 } 412 ], 413 "cited_papers": [ 414 { 415 "title": "Impact of code language models on automated program repair", 416 "authors": ["N. Jiang", "K. Liu", "T. Lutellier", "L. Tan"], 417 "year": 2023, 418 "relevance": "Studies fine-tuning LLMs for program repair with code representations, a direct predecessor to RepairLLaMA's approach." 419 }, 420 { 421 "title": "RAP-Gen: Retrieval-augmented patch generation with CodeT5 for automatic program repair", 422 "authors": ["W. Wang", "Y. Wang", "S. Joty", "S. C. Hoi"], 423 "year": 2023, 424 "relevance": "State-of-the-art retrieval-augmented full-parameter fine-tuning baseline for program repair that RepairLLaMA outperforms." 425 }, 426 { 427 "title": "LoRA: Low-rank adaptation of large language models", 428 "authors": ["E. J. Hu", "P. Wallis", "Z. Allen-Zhu", "Y. Li", "S. Wang", "L. Wang", "W. Chen"], 429 "year": 2021, 430 "relevance": "The parameter-efficient fine-tuning technique at the core of RepairLLaMA's approach." 431 }, 432 { 433 "title": "Code LLaMA: Open foundation models for code", 434 "authors": ["B. Roziere", "J. Gehring", "F. Gloeckle"], 435 "year": 2023, 436 "arxiv_id": "2308.12950", 437 "relevance": "The base LLM used by RepairLLaMA, demonstrating code-specific pre-training with infilling support." 
438 }, 439 { 440 "title": "Keep the conversation going: Fixing 162 out of 337 bugs for $0.42 each using ChatGPT", 441 "authors": ["C. S. Xia", "L. Zhang"], 442 "year": 2023, 443 "arxiv_id": "2304.00385", 444 "relevance": "Demonstrates conversational/agent-based LLM program repair, contrasted with RepairLLaMA's fine-tuning approach." 445 }, 446 { 447 "title": "SWE-agent: Agent-computer interfaces enable automated software engineering", 448 "authors": ["J. Yang", "C. E. Jimenez", "A. Wettig", "K. Lieret"], 449 "year": 2024, 450 "arxiv_id": "2405.15793", 451 "relevance": "Agent-based approach to program repair using LLMs with tool access, representing the alternative paradigm to fine-tuning." 452 }, 453 { 454 "title": "An empirical study on fine-tuning large language models of code for automated program repair", 455 "authors": ["K. Huang", "X. Meng", "J. Zhang", "Y. Liu"], 456 "year": 2023, 457 "relevance": "Studies code representations and evaluation metrics for fine-tuning LLMs for repair, but assumes unrealistic perfect multi-line fault localization." 458 }, 459 { 460 "title": "DeepSeek-Coder: When the large language model meets programming", 461 "authors": ["D. Guo", "Q. Zhu", "D. Yang"], 462 "year": 2024, 463 "arxiv_id": "2401.14196", 464 "relevance": "Alternative code LLM used as a second base model to evaluate RepairLLaMA's generalizability across architectures." 465 }, 466 { 467 "title": "Less training, more repairing please: revisiting automated program repair via zero-shot learning", 468 "authors": ["C. S. Xia", "L. Zhang"], 469 "year": 2022, 470 "relevance": "Explores zero-shot LLM-based program repair, showing the baseline capability without fine-tuning." 471 }, 472 { 473 "title": "On the effectiveness of parameter-efficient fine-tuning", 474 "authors": ["Z. Fu", "H. Yang", "A. M.-C. So", "W. Lam"], 475 "year": 2023, 476 "relevance": "Theoretical foundation for why PEFT helps prevent overfitting, cited to support RepairLLaMA's regularization claims." 
477 }, 478 { 479 "title": "Are large language models memorizing bug benchmarks?", 480 "authors": ["D. Ramos", "C. Mamede", "K. Jain", "P. Canelas"], 481 "year": 2024, 482 "arxiv_id": "2411.13323", 483 "relevance": "Directly addresses benchmark contamination risk in LLM program repair evaluation, a key validity concern." 484 }, 485 { 486 "title": "AutoCodeRover: Autonomous program improvement", 487 "authors": ["Y. Zhang", "H. Ruan", "Z. Fan", "A. Roychoudhury"], 488 "year": 2024, 489 "arxiv_id": "2404.05427", 490 "relevance": "Agent-based autonomous program repair approach representing the alternative paradigm to RepairLLaMA's fine-tuning." 491 } 492 ], 493 "engagement_factors": { 494 "practical_relevance": { 495 "score": 2, 496 "justification": "Released repair adapter can be used by practitioners for Java bug fixing with CodeLLaMA, though limited to single-function bugs." 497 }, 498 "surprise_contrarian": { 499 "score": 1, 500 "justification": "Showing a 7B fine-tuned model outperforms GPT-4 is somewhat surprising but consistent with the broader fine-tuning literature." 501 }, 502 "fear_safety": { 503 "score": 0, 504 "justification": "No AI safety or security concerns raised; focused on improving automated program repair." 505 }, 506 "drama_conflict": { 507 "score": 0, 508 "justification": "No controversy; straightforward empirical comparison of fine-tuning approaches." 509 }, 510 "demo_ability": { 511 "score": 2, 512 "justification": "Code, models, and artifacts are published on GitHub, allowing replication and use of the repair adapter." 513 }, 514 "brand_recognition": { 515 "score": 1, 516 "justification": "Monperrus lab at KTH is well-known in the program repair community but not a household name in broader AI discourse." 517 } 518 } 519 }