scan.json (30874B)
1 { 2 "paper": { 3 "title": "Accelerating Automatic Program Repair with Dual Retrieval-Augmented Fine-Tuning and Patch Generation on Large Language Models", 4 "authors": [ 5 "Hanyang Guo", 6 "Xiaoheng Xie", 7 "Hong-Ning Dai", 8 "Peng Di", 9 "Yu Zhang", 10 "Bishenghui Tao", 11 "Zibin Zheng" 12 ], 13 "year": 2025, 14 "venue": "arXiv preprint", 15 "arxiv_id": "2507.10103" 16 }, 17 "checklist": { 18 "artifacts": { 19 "code_released": { 20 "applies": true, 21 "answer": false, 22 "justification": "Appendix H provides a link to https://anonymous.4open.science/r/SelRepair-5F1D/. However, this is an anonymous review platform link (anonymous.4open.science), which is ephemeral and used during peer review — not a permanent repository or archive. The schema requires 'a working URL or archive.' Anonymous review links are temporary and may be taken down after the review process. Under strict interpretation, this does not constitute a reliable, permanent code release. The paper is also an arXiv preprint, suggesting it is still under review." 23 }, 24 "data_released": { 25 "applies": true, 26 "answer": true, 27 "justification": "The paper uses two public datasets (Tufano dataset from Tufano et al. 2019, and VulRepair dataset from Fu et al. 2022) which are publicly available benchmarks. The enterprise dataset is not released, but the primary benchmarks are public." 28 }, 29 "environment_specified": { 30 "applies": true, 31 "answer": true, 32 "justification": "Appendix E.3 states 'All experiments are conducted on a server configured with 4 GPUs of NVIDIA GeForce RTX 3090' and specifies the base model (StarCoder2-7B), optimizer (Adam), learning rate (5e-5), context window sizes, and fine-tuning epochs. Sufficient environment detail is provided." 
33 }, 34 "reproduction_instructions": { 35 "applies": true, 36 "answer": false, 37 "justification": "While code is available via anonymous link and hyperparameters are reported, no step-by-step reproduction instructions (e.g., README commands to run experiments) are explicitly provided in the paper text itself." 38 } 39 }, 40 "statistical_methodology": { 41 "confidence_intervals_or_error_bars": { 42 "applies": true, 43 "answer": false, 44 "justification": "No confidence intervals or error bars are reported anywhere in the paper. All results in Tables 1-5 are point estimates only (e.g., '26.29% EM')." 45 }, 46 "significance_tests": { 47 "applies": true, 48 "answer": false, 49 "justification": "No statistical significance tests are used. The paper makes comparative claims (e.g., 'SelRepair achieves new SoTA performance... outperforming other SoTA LLMs') by comparing point estimates without any p-values, t-tests, or other significance tests." 50 }, 51 "effect_sizes_reported": { 52 "applies": true, 53 "answer": true, 54 "justification": "The paper reports percentage improvements with full baseline context throughout. For example, Table 1 shows SelRepair at 26.29% EM vs RAP-Gen at 24.80% EM, and the text reports percentage improvements like '6.01%' improvement over RAP-Gen. The schema explicitly states: 'A paper that says 12% improvement over baseline (from 45% to 57%) provides enough context for YES.' Both absolute scores and relative improvements are consistently provided across all comparisons in Tables 1-3, giving readers sufficient context to assess the magnitude of effects. Formal effect sizes like Cohen's d are not required by the schema." 55 }, 56 "sample_size_justified": { 57 "applies": true, 58 "answer": false, 59 "justification": "The test sets (5,735 samples for Tufano Subset 1, 6,447 for Subset 2, 821 for VulRepair, 200 for enterprise dataset) are not explicitly justified as to why these sizes are sufficient for the claims made. 
The test set sizes come directly from public dataset splits without discussion of adequacy." 60 }, 61 "variance_reported": { 62 "applies": true, 63 "answer": false, 64 "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. All results appear to be single-run point estimates. There is no mention of multiple runs or seed variation." 65 } 66 }, 67 "evaluation_design": { 68 "baselines_included": { 69 "applies": true, 70 "answer": true, 71 "justification": "Multiple baselines are included: GPT-3.5, GPT-4o, DeepSeek-R1-Distill, RAP-Gen, and several variants of the proposed method (SelRepairLlama, SelRepairT5, SelRepairLoRA). Comparisons are shown in Tables 1 and 5." 72 }, 73 "baselines_contemporary": { 74 "applies": true, 75 "answer": true, 76 "justification": "Baselines include recent state-of-the-art systems: GPT-4o, DeepSeek-R1-Distill (2025), RAP-Gen (ESEC/FSE 2023). The baselines are contemporary and include recent general-purpose LLMs as well as specialized APR approaches." 77 }, 78 "ablation_study": { 79 "applies": true, 80 "answer": true, 81 "justification": "A thorough ablation study is presented in Table 2 (Section 4.3, RQ2), testing five configurations: without RAG and fine-tuning, without fine-tuning, without RAG, without SR (semantic retriever), and without SSDR (structure/dependency retriever). All components are shown to contribute." 82 }, 83 "multiple_metrics": { 84 "applies": true, 85 "answer": true, 86 "justification": "Three evaluation metrics are used: Exact Match (EM), BLEU-4, and CodeBLEU. These measure exact correctness, n-gram similarity, and code-specific quality respectively." 87 }, 88 "human_evaluation": { 89 "applies": true, 90 "answer": false, 91 "justification": "Human evaluation is relevant to APR — automated metrics like EM only capture exact string matches and miss semantically correct alternative fixes. BLEU and CodeBLEU measure surface similarity, not semantic correctness. 
Human evaluation of patch quality would strengthen claims about repair effectiveness. The schema says applies=false only 'if human evaluation is clearly irrelevant to the claims.' For code repair, it is not clearly irrelevant — the paper itself acknowledges in the threats section that EM and BLEU may not fully reflect repair correctness. The paper could reasonably be expected to include human evaluation." 92 }, 93 "held_out_test_set": { 94 "applies": true, 95 "answer": true, 96 "justification": "The paper explicitly describes train/validation/test splits: 'we split 80% of the dataset as a training set, 10% as a validation set, and 10% as a test set' (Appendix E.1). Separate held-out test sets are used for final evaluation." 97 }, 98 "per_category_breakdown": { 99 "applies": true, 100 "answer": true, 101 "justification": "Results are broken down by dataset and code length (Tufano Subset 1 for <50 tokens, Tufano Subset 2 for 50-100 tokens, and VulRepair for C/C++). Table 3 further breaks down by RAG threshold settings, showing performance variation across configurations." 102 }, 103 "failure_cases_discussed": { 104 "applies": true, 105 "answer": true, 106 "justification": "Section F.2 provides a case study discussing failure modes of other approaches (SelRepairT5 and SelRepairLoRA generate same code as buggy, SelRepairLlama misunderstands the method, GPT-3.5/4o make invalid modifications). Limitations note that the method fails on cross-method/cross-module bugs." 107 }, 108 "negative_results_reported": { 109 "applies": true, 110 "answer": true, 111 "justification": "The ablation study (Table 2) explicitly shows that 'w/o RAG & Ft' and 'w/o Ft' achieve 0.00 EM, and SelRepairLlama performs poorly (5.96 EM on Subset 1). Table 3 shows that certain threshold settings hurt performance. The paper also notes performance close to (but not exceeding) RAP-Gen on Defects4J V2.0." 
112 } 113 }, 114 "claims_and_evidence": { 115 "abstract_claims_supported": { 116 "applies": true, 117 "answer": true, 118 "justification": "The abstract claims '26.29% and 17.64% in terms of exact match (EM) on different datasets while reducing inference time by at least 6.42%' are directly supported by Table 1 (EM scores) and Table 3 (inference time reductions of 6.42%, 13.77%, and 9.95%)." 119 }, 120 "causal_claims_justified": { 121 "applies": true, 122 "answer": true, 123 "justification": "The paper uses ablation studies (Table 2) to make causal claims about which components contribute to performance. Each module (SR, SSDR, fine-tuning) is systematically removed and the resulting performance drop is measured, constituting controlled single-variable manipulation adequate for component-level causal inference." 124 }, 125 "generalization_bounded": { 126 "applies": true, 127 "answer": false, 128 "justification": "The title and abstract use broad language ('Accelerating Automatic Program Repair') but the evaluation is limited to Java and C/C++ datasets at the method level. The paper acknowledges in the threats to validity that 'SelRepair showed promise in repairing Java programs, its effectiveness with other languages like Python or JavaScript is untested,' but the abstract does not adequately bound this scope." 129 }, 130 "alternative_explanations_discussed": { 131 "applies": true, 132 "answer": false, 133 "justification": "The threats to validity section (Appendix G) discusses generic limitations (external validity about language coverage, construct validity about metrics) but does not seriously consider alternative explanations for why SelRepair outperforms baselines. For example, the dataset used to fine-tune SelRepair may overlap more with the test set than RAP-Gen's training data, which is not discussed." 
134 } 135 }, 136 "setup_transparency": { 137 "model_versions_specified": { 138 "applies": true, 139 "answer": false, 140 "justification": "The base model 'StarCoder2-7B' is specified. However, 'GPT-3.5' and 'GPT-4o' are used without version numbers, snapshot dates, or API version identifiers, which significantly affects reproducibility given that model behavior changes across versions." 141 }, 142 "prompts_provided": { 143 "applies": true, 144 "answer": false, 145 "justification": "Section 3.2 shows the code-only prompt format as a template: '[BUG] RBC1 [FIX] RFC1 ... [BUG] BC [FIX]' where RBC1, RFC1, BC are placeholders for actual code. Figure 5 shows the GPT-3.5/GPT-4o prompt template with placeholder sections ('Target Buggy Code', 'Retrieved Bug-fix Pairs'). The schema explicitly states: 'A prompt TEMPLATE with placeholders (e.g., [Task Description]) does NOT count unless the actual fill values are also provided — the reader must be able to reconstruct every prompt sent to the model.' While Figure 6 shows one concrete example, the reader cannot reconstruct the full set of prompts used across thousands of test samples. The prompt structure is shown but not the actual prompts." 146 }, 147 "hyperparameters_reported": { 148 "applies": true, 149 "answer": true, 150 "justification": "Appendix E.3 reports: fine-tuning epochs (3 for large LLM, 50 for CodeT5), context window sizes (512, 1024, 1500 tokens for different datasets), learning rate (5e-5), optimizer (Adam), RAG thresholds (0.9 for Subset 1, 0.8 for Subset 2 and VulRepair), and hardware (4x NVIDIA RTX 3090)." 151 }, 152 "scaffolding_described": { 153 "applies": true, 154 "answer": true, 155 "justification": "The dual RAG architecture is described in detail in Section 3.1 including the hybrid retriever algorithm (Algorithm 1 in Appendix C), AST traversal procedure (Algorithm 2 in Appendix D), the RAG selection gate mechanism with threshold, and how retrieved pairs are incorporated into the fine-tuning prompt." 
156 }, 157 "data_preprocessing_documented": { 158 "applies": true, 159 "answer": true, 160 "justification": "Appendix E.1 describes dataset construction: random sampling of 1,000 samples for RAG codebase, 80/10/10 train/validation/test split for the remaining samples, and filtering of 'invalid samples, such as samples that were null' from VulRepair. Dataset sizes are reported in Table 4." 161 } 162 }, 163 "limitations_and_scope": { 164 "limitations_section_present": { 165 "applies": true, 166 "answer": true, 167 "justification": "The paper has two limitations sections: a 'Limitations' section at the end of the main paper (after the conclusion) and a dedicated 'Threats to Validity' appendix (Appendix G) covering internal, external, and construct validity." 168 }, 169 "threats_to_validity_specific": { 170 "applies": true, 171 "answer": true, 172 "justification": "The threats to validity section names specific threats: (1) errors in bug-fix dataset and overfitting risk during fine-tuning; (2) coarse-grained threshold settings; (3) untested generalization to languages other than Java and C/C++; (4) reliance on EM and BLEU-4 metrics that may not fully reflect repair correctness. These are moderately specific to this paper." 173 }, 174 "scope_boundaries_stated": { 175 "applies": true, 176 "answer": true, 177 "justification": "The limitations section states the method is 'limited by datasets focused on individual methods' and 'effectiveness with other languages like Python or JavaScript is untested.' The paper explicitly identifies what was not tested (cross-method/cross-module bugs, multiple programming languages beyond Java and C/C++)." 
178 } 179 }, 180 "data_integrity": { 181 "raw_data_available": { 182 "applies": true, 183 "answer": false, 184 "justification": "While the Tufano and VulRepair benchmarks are publicly available as original datasets, the paper's specific processed versions (exact random splits, RAG codebase selection of 1,000/2,000 samples) are not verifiably released. The anonymous repository link (anonymous.4open.science) is ephemeral. The enterprise dataset of 200 samples is explicitly not released ('we intend to open-source this benchmark in the future'). For independent verification of the reported results, one needs the exact data splits and enterprise data used — the original public benchmarks alone are insufficient since random sampling was used to create splits." 185 }, 186 "data_collection_described": { 187 "applies": true, 188 "answer": true, 189 "justification": "Appendix E.1 describes how each dataset was constructed: the Tufano dataset 'is collected from fix commit records from GitHub' with known splits by code length; VulRepair 'consists of bug-fix pairs combined by CVE-Fixes and BigVul.' The enterprise dataset 'consists of 200 semantic bug-fix pairs caused by enterprise developers in real development scenarios.'" 190 }, 191 "recruitment_methods_described": { 192 "applies": false, 193 "answer": false, 194 "justification": "There are no human participants in this study. The data source is a standard public benchmark (Tufano dataset from GitHub commit history) and an enterprise software dataset. NA for this criterion." 195 }, 196 "data_pipeline_documented": { 197 "applies": true, 198 "answer": true, 199 "justification": "The data pipeline is documented: random sampling of RAG codebase, 80/10/10 splits, filtering of null/invalid samples from VulRepair. Table 4 reports the final dataset sizes for each split, allowing verification that the pipeline is consistent." 
200 } 201 }, 202 "conflicts_of_interest": { 203 "funding_disclosed": { 204 "applies": true, 205 "answer": false, 206 "justification": "No acknowledgments or funding disclosure section is present in the paper. It is unclear whether this work was funded by Ant Group (which employs three authors) or by academic grants. The absence of a funding statement is treated as NO." 207 }, 208 "affiliations_disclosed": { 209 "applies": true, 210 "answer": true, 211 "justification": "Author affiliations are listed on the title page: Xiaoheng Xie, Peng Di, and Yu Zhang are from Ant Group (a major tech company). This affiliation is disclosed, even though the paper does not explicitly discuss any potential conflict of interest this may represent." 212 }, 213 "funder_independent_of_outcome": { 214 "applies": true, 215 "answer": false, 216 "justification": "With three authors from Ant Group and the evaluation including an enterprise dataset 'from a software enterprise,' there is a plausible conflict of interest. The enterprise dataset may be from Ant Group itself. However, since funding is not disclosed at all, independence cannot be verified, and NO is the appropriate answer." 217 }, 218 "financial_interests_declared": { 219 "applies": true, 220 "answer": false, 221 "justification": "No competing interests statement or disclosure of financial interests (patents, equity) is present in the paper. The absence of such a statement means this criterion is not satisfied." 222 } 223 }, 224 "contamination": { 225 "training_cutoff_stated": { 226 "applies": true, 227 "answer": false, 228 "justification": "The training data cutoff for GPT-3.5 and GPT-4o is not stated. For StarCoder2-7B (the base model for SelRepair), the training cutoff is not mentioned in the paper. The Tufano dataset was published in 2019 and is widely used, making contamination risk real for both GPT models and StarCoder2." 
229 }, 230 "train_test_overlap_discussed": { 231 "applies": true, 232 "answer": false, 233 "justification": "The paper does not discuss potential train-test overlap for GPT-3.5, GPT-4o, or StarCoder2-7B with the Tufano benchmark dataset. Section F.3 does note that RAP-Gen's training data 'constructed by the same projects as Defects4J... may cause data leaks,' but this concern is not applied to SelRepair or the GPT baselines." 234 }, 235 "benchmark_contamination_addressed": { 236 "applies": true, 237 "answer": false, 238 "justification": "The Tufano dataset (from 2019) is a widely used benchmark that was almost certainly included in the training data of GPT-3.5, GPT-4o, and StarCoder2-7B (all trained on large code corpora). The paper does not address this contamination risk." 239 } 240 }, 241 "human_studies": { 242 "pre_registered": { 243 "applies": false, 244 "answer": false, 245 "justification": "No human participants are involved in this study. All evaluation uses automated metrics on code datasets." 246 }, 247 "irb_or_ethics_approval": { 248 "applies": false, 249 "answer": false, 250 "justification": "No human participants are involved. IRB approval is not applicable." 251 }, 252 "demographics_reported": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants are involved. Demographics are not applicable." 256 }, 257 "inclusion_exclusion_criteria": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants are involved. Inclusion/exclusion criteria for participants are not applicable." 261 }, 262 "randomization_described": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants are involved. Randomization of participants is not applicable." 266 }, 267 "blinding_described": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants are involved. Blinding is not applicable." 
271 }, 272 "attrition_reported": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants are involved. Attrition is not applicable." 276 } 277 }, 278 "cost_and_practicality": { 279 "inference_cost_reported": { 280 "applies": true, 281 "answer": true, 282 "justification": "Table 3 reports inference time reductions (e.g., '6.42% less than no threshold setting') and average input token lengths for different configurations. This constitutes a practical cost/efficiency analysis, though monetary API costs are not reported." 283 }, 284 "compute_budget_stated": { 285 "applies": true, 286 "answer": false, 287 "justification": "While the hardware is stated (4x NVIDIA RTX 3090), total GPU hours, training time, or total compute budget are not reported. The paper does not quantify how much compute was required for the fine-tuning process." 288 } 289 } 290 }, 291 "claims": [ 292 { 293 "claim": "SelRepair achieves state-of-the-art performance of 26.29% EM on Tufano Subset 1 and 17.64% EM on Tufano Subset 2, outperforming all compared approaches.", 294 "evidence": "Table 1 in Section 4.2 shows SelRepair achieving 26.29 EM and 17.64 EM on the two Tufano subsets, compared to RAP-Gen (24.80, 15.84), GPT-3.5 (2.58, 1.72), and GPT-4o (0.17, 0.00).", 295 "supported": "strong" 296 }, 297 { 298 "claim": "The RAG selection gate reduces inference time by at least 6.42% while maintaining or improving APR performance.", 299 "evidence": "Table 3 (Section 4.4) shows that with threshold 0.9, inference time decreases by 6.42%, 29.68%, and 15.96% compared to no threshold in the three datasets, while the EM score equals or exceeds the no-threshold baseline.", 300 "supported": "strong" 301 }, 302 { 303 "claim": "Both semantic retrieval (SR) and structural/dependency retrieval (SSDR) contribute to APR performance.", 304 "evidence": "Ablation study in Table 2 (Section 4.3) shows SelRepair outperforms 'w/o SR' by 2.82% and 'w/o SSDR' by 18.00% on Tufano Subset 1 in 
terms of EM. All ablated versions perform worse than the full SelRepair.", 305 "supported": "strong" 306 }, 307 { 308 "claim": "Full-parameter fine-tuning contributes more to APR performance than PEFT (LoRA) fine-tuning.", 309 "evidence": "Table 1 shows SelRepairLoRA achieves 22.62 EM vs. SelRepair's 26.29 EM on Tufano Subset 1. Table 2 shows that the 'w/o Ft' (without fine-tuning) configuration achieves 0.00 EM, confirming fine-tuning is essential.", 310 "supported": "moderate" 311 }, 312 { 313 "claim": "SelRepair generalizes to real-world enterprise scenarios, generating 59 correct patches from 200 enterprise bug-fix pairs.", 314 "evidence": "Figure 2 (Section 4.5) shows SelRepair generates 59 correct patches on the enterprise dataset, compared to RAP-Gen's 1 correct patch. However, the enterprise dataset is not public and evaluation details are limited.", 315 "supported": "moderate" 316 } 317 ], 318 "methodology_tags": [ 319 "benchmark-eval" 320 ], 321 "key_findings": "SelRepair, a novel automated program repair framework combining full-parameter fine-tuning of StarCoder2-7B with a dual RAG module (semantic and syntactic/structural retrievers), achieves state-of-the-art performance on Java and C/C++ APR benchmarks with 26.29% and 17.64% exact match on Tufano subsets. The RAG selection gate mechanism controls prompt length and reduces inference time by 6.42-29.68% while maintaining or improving repair accuracy. Ablation studies confirm that both retrieval components and fine-tuning are necessary, with the system substantially outperforming both prompt-engineering baselines (GPT-3.5, GPT-4o) and smaller fine-tuned models (RAP-Gen). An enterprise evaluation with 200 real-world bug-fix pairs shows 59 correct patches, demonstrating practical applicability.", 322 "red_flags": [ 323 { 324 "flag": "No statistical uncertainty quantification", 325 "detail": "All results in Tables 1-5 are single-run point estimates with no confidence intervals, error bars, or variance reported across multiple seeds. 
Claims of SOTA superiority (e.g., SelRepair 26.29% vs. RAP-Gen 24.80% on Tufano Subset 1 — a 1.49 percentage point difference) are made without testing whether the differences are statistically significant." 326 }, 327 { 328 "flag": "Benchmark contamination not addressed", 329 "detail": "The Tufano dataset (published 2019) is widely used in code LLM training corpora. Both StarCoder2-7B (the base model for SelRepair) and GPT-3.5/GPT-4o were trained on large code datasets that almost certainly include this benchmark. The paper does not discuss this contamination risk: it acknowledges that RAP-Gen has data leakage concerns on Defects4J but does not apply the same scrutiny to SelRepair." 330 }, 331 { 332 "flag": "GPT baseline versions unspecified", 333 "detail": "GPT-3.5 and GPT-4o are used as baselines without specifying API version or snapshot date. These models have different versions with varying capabilities, and results may not be reproducible as the referenced model version cannot be determined." 334 }, 335 { 336 "flag": "Undisclosed funding and potential conflict of interest", 337 "detail": "Three of the seven authors are affiliated with Ant Group. The enterprise dataset used in RQ4 may be from Ant Group itself, creating a potential evaluation bias. No funding disclosure or competing interests statement is present in the paper." 338 }, 339 { 340 "flag": "Enterprise dataset not publicly available", 341 "detail": "The enterprise dataset of 200 bug-fix pairs used in RQ4 is not publicly released, making those results unverifiable. The paper states 'we intend to open-source this benchmark in the future' but this is a promise, not a release." 342 }, 343 { 344 "flag": "Overly broad title relative to scope", 345 "detail": "The paper is titled 'Accelerating Automatic Program Repair' generally, but evaluation is restricted to Java and C/C++ at the method level only. 
The limitations section acknowledges this but the abstract does not sufficiently bound the generalizability claim." 346 } 347 ], 348 "cited_papers": [ 349 { 350 "title": "RAP-Gen: Retrieval-augmented patch generation with CodeT5 for automatic program repair", 351 "authors": [ 352 "Weishi Wang", 353 "Yue Wang", 354 "Shafiq Joty", 355 "Steven C.H. Hoi" 356 ], 357 "year": 2023, 358 "relevance": "The primary competing baseline in this paper; a RAG-based APR approach using CodeT5 and semantic similarity retrieval that SelRepair improves upon." 359 }, 360 { 361 "title": "Code Llama: Open foundation models for code", 362 "authors": [ 363 "Baptiste Rozière", 364 "Jonas Gehring", 365 "Fabian Gloeckle" 366 ], 367 "year": 2024, 368 "arxiv_id": "2308.12950", 369 "relevance": "Open-source code LLM used as the base model for one SelRepair variant (SelRepairLlama), relevant to code generation and LLM capability evaluation." 370 }, 371 { 372 "title": "StarCoder 2 and the Stack v2: The next generation", 373 "authors": [ 374 "Anton Lozhkov", 375 "Raymond Li", 376 "Loubna Ben Allal" 377 ], 378 "year": 2024, 379 "arxiv_id": "2402.19173", 380 "relevance": "StarCoder2-7B is the base model used for SelRepair's full-parameter fine-tuning; relevant to code LLM evaluation in the survey." 381 }, 382 { 383 "title": "UniXcoder: Unified cross-modal pre-training for code representation", 384 "authors": [ 385 "Daya Guo", 386 "Shuai Lu", 387 "Nan Duan", 388 "Yanlin Wang", 389 "Ming Zhou", 390 "Jian Yin" 391 ], 392 "year": 2022, 393 "relevance": "The code embedding model used in SelRepair's retrieval module; directly relevant to LLM-based code tooling." 
394 }, 395 { 396 "title": "An empirical study on fine-tuning large language models of code for automated program repair", 397 "authors": [ 398 "Kai Huang", 399 "Xiangxin Meng", 400 "Jian Zhang", 401 "Yang Liu", 402 "Wenjie Wang", 403 "Shuhao Li", 404 "Yuqing Zhang" 405 ], 406 "year": 2023, 407 "relevance": "Empirical study on fine-tuning LLMs for APR that informs the methodology of this paper; directly relevant to the survey's focus on LLM evaluation in software engineering." 408 }, 409 { 410 "title": "VulRepair: a T5-based automated software vulnerability repair", 411 "authors": [ 412 "Michael Fu", 413 "Chakkrit Tantithamthavorn", 414 "Trung Le", 415 "Van Nguyen", 416 "Dinh Phung" 417 ], 418 "year": 2022, 419 "relevance": "Provides the C/C++ vulnerability repair dataset used in this paper's evaluation; relevant as a benchmark paper for LLM-based program repair." 420 }, 421 { 422 "title": "Agentless: Demystifying LLM-based software engineering agents", 423 "authors": [ 424 "Chunqiu Steven Xia", 425 "Yinlin Deng", 426 "Soren Dunn", 427 "Lingming Zhang" 428 ], 429 "year": 2024, 430 "arxiv_id": "2407.01489", 431 "relevance": "A prominent LLM-based software engineering agent paper that provides context for the agentic approach to program repair." 432 }, 433 { 434 "title": "Impact of code language models on automated program repair", 435 "authors": [ 436 "Nan Jiang", 437 "Kevin Liu", 438 "Thibaud Lutellier", 439 "Lin Tan" 440 ], 441 "year": 2023, 442 "relevance": "Empirical evaluation of code LLMs for automated program repair, directly relevant to the survey's focus on LLM-based SE tool evaluation." 
443 }, 444 { 445 "title": "RepairLlama: Efficient representations and fine-tuned adapters for program repair", 446 "authors": [ 447 "André Silva", 448 "Sen Fang", 449 "Martin Monperrus" 450 ], 451 "year": 2024, 452 "arxiv_id": "2312.15698", 453 "relevance": "PEFT-based approach for LLM program repair that this paper compares against and distinguishes from via full-parameter fine-tuning." 454 }, 455 { 456 "title": "Defects4J: A database of existing faults to enable controlled testing studies for Java programs", 457 "authors": [ 458 "René Just", 459 "Darioush Jalali", 460 "Michael D. Ernst" 461 ], 462 "year": 2014, 463 "relevance": "Standard APR benchmark used for supplementary evaluation (Appendix F.3); widely used in the SE community for evaluating program repair tools." 464 }, 465 { 466 "title": "Retrieval-based prompt selection for code-related few-shot learning", 467 "authors": [ 468 "Noor Nashid", 469 "Mifta Sintaha", 470 "Ali Mesbah" 471 ], 472 "year": 2023, 473 "relevance": "Prior RAG-based approach for code tasks that this paper builds upon and improves for APR with dual retrieval." 474 }, 475 { 476 "title": "Towards an understanding of large language models in software engineering tasks", 477 "authors": [ 478 "Zibin Zheng", 479 "Kaiwen Ning", 480 "Jiachi Chen", 481 "Yanlin Wang", 482 "Wenqing Chen", 483 "Lianghong Guo", 484 "Weicheng Wang" 485 ], 486 "year": 2023, 487 "arxiv_id": "2308.11396", 488 "relevance": "Survey of LLMs applied to software engineering tasks; provides broader context for evaluating LLM capability in code-related tasks." 489 } 490 ] 491 }