scan.json (32172B)
1 { 2 "paper": { 3 "title": "How Far Can We Go with Practical Function-Level Program Repair?", 4 "authors": [ 5 "Jiahong Xiang", 6 "Xiaoyang Xu", 7 "Fanchu Kong", 8 "Mingyuan Wu", 9 "Zizheng Zhan", 10 "Haotian Zhang", 11 "Yuqun Zhang" 12 ], 13 "year": 2024, 14 "venue": "arXiv.org", 15 "arxiv_id": "2404.12833", 16 "doi": "10.48550/arXiv.2404.12833" 17 }, 18 "scan_version": 3, 19 "active_modules": ["experimental_rigor", "data_leakage"], 20 "methodology_tags": ["benchmark-eval"], 21 "key_findings": "LLMs with zero-shot learning are already powerful function-level APR techniques, while few-shot learning produces disparate and sometimes negative results (from +10% to -49.7% across models). Auxiliary repair-relevant information (trigger tests, error messages, comments) significantly improves repair performance, nearly matching costly statement-level fault location. SRepair, a dual-LLM framework using Chain of Thought for repair suggestions, correctly fixes 300 single-function and 32 multi-function bugs in Defects4J, outperforming prior APR techniques by at least 85%.", 22 "checklist": { 23 "artifacts": { 24 "code_released": { 25 "applies": true, 26 "answer": true, 27 "justification": "The paper states 'The data and code are available at GitHub [1]' referencing https://github.com/GhabiX/SRepair. A concrete URL is provided." 28 }, 29 "data_released": { 30 "applies": true, 31 "answer": true, 32 "justification": "The evaluation uses publicly available Defects4J (v1.2 and v2.0) and QuixBugs datasets. The GitHub repository also provides supplementary data." 33 }, 34 "environment_specified": { 35 "applies": true, 36 "answer": false, 37 "justification": "Hardware and OS are specified (128-core AMD EPYC 7H12, 512 GiB RAM, 8x NVIDIA A100 80GB, Ubuntu 20.04.6 LTS) but no further software environment details (requirements.txt, dependency versions, Dockerfile) are provided in the paper. 
The paper defers to GitHub for 'LLM configuration details' but does not describe a reproducible software environment." 38 }, 39 "reproduction_instructions": { 40 "applies": true, 41 "answer": false, 42 "justification": "The paper references GitHub for code and data but does not include step-by-step reproduction instructions. The experimental pipeline (patch generation, validation) is described at a high level but not as executable steps." 43 } 44 }, 45 "statistical_methodology": { 46 "confidence_intervals_or_error_bars": { 47 "applies": true, 48 "answer": false, 49 "justification": "All results are reported as point estimates (number of plausible/correct fixes). No confidence intervals, error bars, or uncertainty measures are provided in any tables or figures." 50 }, 51 "significance_tests": { 52 "applies": true, 53 "answer": false, 54 "justification": "The paper makes numerous comparative claims (e.g., 'surpassing all previous APR techniques by at least 85%') based solely on comparing raw counts without any statistical significance tests." 55 }, 56 "effect_sizes_reported": { 57 "applies": true, 58 "answer": true, 59 "justification": "The paper reports percentage improvements with baseline context, e.g., '85% more than ChatRepair' (from 162 to 300), '1.59× more than Repilot,' and specific percentage changes for each setup comparison (e.g., '26.7% improvement' for trigger tests)." 60 }, 61 "sample_size_justified": { 62 "applies": true, 63 "answer": false, 64 "justification": "No justification is given for the sample size of 200 patches per bug (beyond 'following prior works') or for the choice of 522 bugs. No power analysis or sample size reasoning is provided." 65 }, 66 "variance_reported": { 67 "applies": true, 68 "answer": false, 69 "justification": "No variance, standard deviation, or spread measures are reported. Results are single-run numbers. The 200 samples per bug use stochastic sampling (temperature=0.8) but no variance across runs is reported." 
70 } 71 }, 72 "evaluation_design": { 73 "baselines_included": { 74 "applies": true, 75 "answer": true, 76 "justification": "Four recent SOTA LLM-based APR techniques are used as baselines: AlphaRepair, Repilot, FitRepair, and ChatRepair. Additionally, GPT-3.5-Turbo and Magicoder with PI(ALL) serve as ablation baselines." 77 }, 78 "baselines_contemporary": { 79 "applies": true, 80 "answer": true, 81 "justification": "All baselines are recent: AlphaRepair (2022), Repilot (2023), FitRepair (2023), ChatRepair (2023). These represent the state of the art in LLM-based APR at the time of writing." 82 }, 83 "ablation_study": { 84 "applies": true, 85 "answer": true, 86 "justification": "The paper includes ablation variants: SRepair2M (dual-LLM without CoT), SRepair2M+FL (with fault location), and separate evaluations of GPT-3.5-Turbo and Magicoder components. The study section also systematically varies K-shot settings and auxiliary information types." 87 }, 88 "multiple_metrics": { 89 "applies": true, 90 "answer": true, 91 "justification": "Two primary metrics are used: plausible fixes (patches passing all tests) and correct fixes (manually verified semantic correctness). Patch distribution (plausible/test-failure/uncompilable ratios) is also reported." 92 }, 93 "human_evaluation": { 94 "applies": true, 95 "answer": true, 96 "justification": "Three authors cross-validated the plausible patches of SRepair500 for semantic equivalency to determine correct patches, as stated in Section 5.2.3." 97 }, 98 "held_out_test_set": { 99 "applies": true, 100 "answer": true, 101 "justification": "The Defects4J test suite (trigger tests and relevant tests) serves as the evaluation set. Since models are used in zero-shot mode without any tuning on these bugs, the test data is not used for any selection decisions." 102 }, 103 "per_category_breakdown": { 104 "applies": true, 105 "answer": true, 106 "justification": "Table 7 provides per-project breakdowns for all 17 Defects4J projects. 
Results are shown separately for D4J 1.2 and D4J 2.0. Tables 3-5 provide per-model breakdowns." 107 }, 108 "failure_cases_discussed": { 109 "applies": true, 110 "answer": true, 111 "justification": "Multiple failure cases are discussed: Closure-66 (bug report misleads LLM, Figure 6b), Closure-112 (fault location information misleads LLM, Figure 7), and CodeLlama generating Python code for Java bugs (Section 4.2)." 112 }, 113 "negative_results_reported": { 114 "applies": true, 115 "answer": true, 116 "justification": "Several negative results are reported: few-shot learning causes up to 49.7% decline (Magicoder in K2(CE,PE)), fault location can mislead and prevent 14 previously fixable bugs from being fixed, and bug reports cause 19 bugs to become unfixable." 117 } 118 }, 119 "claims_and_evidence": { 120 "abstract_claims_supported": { 121 "applies": true, 122 "answer": true, 123 "justification": "Abstract claims are supported: 300 correct single-function fixes confirmed in Table 7, 85% improvement over ChatRepair (300 vs 162), 32 multi-function fixes shown in Figure 11, and the study findings about zero-shot and auxiliary information are demonstrated in Tables 3-5." 124 }, 125 "causal_claims_justified": { 126 "applies": true, 127 "answer": true, 128 "justification": "Causal claims about few-shot learning and auxiliary information effects are supported by controlled experiments varying one factor at a time across the same models and bugs. The ablation variants (SRepair2M, SRepair2M+FL, SRepair200) also use controlled single-variable manipulation." 129 }, 130 "generalization_bounded": { 131 "applies": true, 132 "answer": false, 133 "justification": "The title 'How Far Can We Go with Practical Function-Level Program Repair?' and findings (e.g., 'LLMs with zero-shot learning are already powerful function-level APR techniques') are stated broadly, but results are primarily from Java bugs in Defects4J. QuixBugs adds Python but is only 40 trivial bugs. 
The generalization to other languages and real-world bug distributions is not bounded." 134 }, 135 "alternative_explanations_discussed": { 136 "applies": true, 137 "answer": false, 138 "justification": "The threats to validity section addresses data leakage and manual validation but does not discuss alternative explanations for the core findings. For example, whether SRepair's improvement comes primarily from additional context tokens rather than the CoT mechanism, or whether the dual-LLM architecture benefits from two independent sampling chances, is not discussed." 139 }, 140 "proxy_outcome_distinction": { 141 "applies": true, 142 "answer": true, 143 "justification": "The paper distinguishes between plausible patches (pass all tests, the proxy) and correct patches (manually verified semantic equivalence, the desired outcome). Section 5.2.3 explicitly notes the manual inspection process, acknowledging the gap between test-passing and correctness." 144 } 145 }, 146 "setup_transparency": { 147 "model_versions_specified": { 148 "applies": true, 149 "answer": true, 150 "justification": "Specific model versions are stated: 'code-davinci-edit-001' for Codex-edit, 'gpt-3.5-turbo-1106' for GPT-3.5-Turbo, CodeLlama-Instruct 7B/13B/34B, and 'MagicoderS-CL' for Magicoder. These are specific enough to identify the exact models." 151 }, 152 "prompts_provided": { 153 "applies": true, 154 "answer": false, 155 "justification": "Figure 3 shows a prompt template with placeholders ('{Buggy code and fixed code pair examples}', '{Auxiliary repair-relevant Information}'). Figure 9 shows the CoT prompt structure. However, the full prompt text with actual fill values is not reproduced. The paper defers configuration details to the GitHub page." 156 }, 157 "hyperparameters_reported": { 158 "applies": true, 159 "answer": true, 160 "justification": "Key hyperparameters are stated in Section 3.3: 'nucleus sampling with top p = 0.9, temperature = 0.8 and 200 samples per bug.' 
SRepair generates 5 patched functions per repair suggestion. A 5-hour time limit per bug is also specified." 161 }, 162 "scaffolding_described": { 163 "applies": true, 164 "answer": true, 165 "justification": "The SRepair dual-LLM framework is described in detail in Section 5.1 with Figure 8: GPT-3.5-Turbo as the repair suggestion model using CoT, Magicoder as the patch generation model, with the workflow of analyzing buggy code → identifying root cause → generating repair suggestions → generating patched functions." 166 }, 167 "data_preprocessing_documented": { 168 "applies": true, 169 "answer": true, 170 "justification": "Section 3.1.2 describes benchmark construction: 522 single-function bugs extracted from Defects4J 1.2 and 2.0, including 276 single-hunk and 158 single-line bugs. Section 3.3.3 details collection of bug reports from official issue links, extraction of trigger tests, error messages, and comments." 171 } 172 }, 173 "limitations_and_scope": { 174 "limitations_section_present": { 175 "applies": true, 176 "answer": true, 177 "justification": "Section 6 'Threats to Validity' provides a dedicated section with three subsections: threats to internal validity, external validity, and construct validity." 178 }, 179 "threats_to_validity_specific": { 180 "applies": true, 181 "answer": true, 182 "justification": "Specific threats are discussed: three authors cross-validated patches (internal), only 7.4‰ of patches match developer patches for data leakage (internal), dual-LLM mitigates trigger test manipulation (internal), Defects4J may not generalize (external), and evaluation extended to QuixBugs and multi-function bugs to mitigate (external)." 183 }, 184 "scope_boundaries_stated": { 185 "applies": true, 186 "answer": false, 187 "justification": "The external validity discussion notes that 'evaluation datasets used may not well generalize' but does not explicitly state what the results do NOT show. 
No specific exclusions are listed (e.g., no mention that results may not apply to languages other than Java, industrial codebases, or bugs beyond those in curated benchmarks)." 188 } 189 }, 190 "data_integrity": { 191 "raw_data_available": { 192 "applies": true, 193 "answer": true, 194 "justification": "Defects4J is publicly available. The paper states 'The data and code are available at GitHub' with a URL. The underlying bug data, test suites, and project code are all accessible." 195 }, 196 "data_collection_described": { 197 "applies": true, 198 "answer": true, 199 "justification": "Section 3.1.2 describes collecting 522 single-function bugs from Defects4J 1.2 and 2.0. Section 3.3.3 describes how bug reports were collected from official issue links, trigger tests extracted by building projects, and error messages captured from JUnit. Table 1 provides full statistics." 200 }, 201 "recruitment_methods_described": { 202 "applies": false, 203 "answer": false, 204 "justification": "No human participants. The data source is Defects4J, a standard benchmark." 205 }, 206 "data_pipeline_documented": { 207 "applies": true, 208 "answer": true, 209 "justification": "The pipeline is documented: bugs selected from Defects4J → auxiliary info collected (bug reports, trigger tests, error messages, comments) → prompts constructed → patches generated via nucleus sampling → patches compiled and tested → plausible patches manually inspected for correctness. Counts are provided at key stages (522 bugs, 200 samples per bug, >10M total patches)." 210 } 211 }, 212 "conflicts_of_interest": { 213 "funding_disclosed": { 214 "applies": true, 215 "answer": false, 216 "justification": "No funding source or acknowledgments section is present in the paper. The absence of any funding disclosure leaves it unclear whether the work was funded." 
217 }, 218 "affiliations_disclosed": { 219 "applies": true, 220 "answer": true, 221 "justification": "Author affiliations are clearly listed: Southern University of Science and Technology (academic) and Kwai Inc. (industry). Both affiliations are displayed prominently with the author names." 222 }, 223 "funder_independent_of_outcome": { 224 "applies": true, 225 "answer": false, 226 "justification": "Since funding is not disclosed, independence of the funder cannot be assessed. Two authors are affiliated with Kwai Inc. but the paper does not evaluate Kwai products." 227 }, 228 "financial_interests_declared": { 229 "applies": true, 230 "answer": false, 231 "justification": "No competing interests or financial interests statement is present in the paper." 232 } 233 }, 234 "contamination": { 235 "training_cutoff_stated": { 236 "applies": true, 237 "answer": false, 238 "justification": "The paper does not state the training data cutoff dates for any of the six models used (Codex-edit, GPT-3.5-Turbo, CodeLlama 7B/13B/34B, Magicoder)." 239 }, 240 "train_test_overlap_discussed": { 241 "applies": true, 242 "answer": true, 243 "justification": "Section 6 discusses 'potential for data leakage if the developer patches were included in the original training data' and reports that only 7.4‰ of plausible patches in the study and 1.5‰ from SRepair500 are identical to developer patches." 244 }, 245 "benchmark_contamination_addressed": { 246 "applies": true, 247 "answer": true, 248 "justification": "The paper addresses contamination by checking the identity between generated plausible patches and developer patches (7.4‰ and 1.5‰ overlap rates). However, it does not discuss the temporal availability of Defects4J (published 2014) relative to model training, making the analysis incomplete but present." 249 } 250 }, 251 "human_studies": { 252 "pre_registered": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants in this study. 
It is a benchmark evaluation of LLM-based program repair." 256 }, 257 "irb_or_ethics_approval": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants in this study." 261 }, 262 "demographics_reported": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants in this study." 266 }, 267 "inclusion_exclusion_criteria": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants in this study." 271 }, 272 "randomization_described": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants in this study." 276 }, 277 "blinding_described": { 278 "applies": false, 279 "answer": false, 280 "justification": "No human participants in this study." 281 }, 282 "attrition_reported": { 283 "applies": false, 284 "answer": false, 285 "justification": "No human participants in this study." 286 } 287 }, 288 "cost_and_practicality": { 289 "inference_cost_reported": { 290 "applies": true, 291 "answer": true, 292 "justification": "The paper reports 'repairing 300 single-function bugs with SRepair costs only $8.6, averaging $0.029 per correct fix.' The study section also reports '>10 million patches generated and validated, consuming more than 8,000 GPU and 100,000 CPU hours.'" 293 }, 294 "compute_budget_stated": { 295 "applies": true, 296 "answer": true, 297 "justification": "Total compute is stated: 'more than 8,000 GPU and 100,000 CPU hours,' hardware specified (128-core AMD EPYC 7H12, 8x NVIDIA A100 80GB), and the 5-hour per-bug time limit for SRepair." 298 } 299 }, 300 "experimental_rigor": { 301 "seed_sensitivity_reported": { 302 "applies": true, 303 "answer": false, 304 "justification": "Results are not reported across multiple random seeds. The 200 samples per bug use stochastic sampling (temperature=0.8) but no seed sensitivity analysis is provided and no variance across independent runs is reported." 
305 }, 306 "number_of_runs_stated": { 307 "applies": true, 308 "answer": true, 309 "justification": "The number of samples is explicitly stated: '200 samples per bug' for the study and SRepair200, and SRepair500 generates 500 samples. SRepair generates '5 patched functions' per repair suggestion." 310 }, 311 "hyperparameter_search_budget": { 312 "applies": true, 313 "answer": false, 314 "justification": "Hyperparameters (top_p=0.9, temperature=0.8) are stated as 'following prior works' but no search budget or justification for these specific values is provided." 315 }, 316 "best_config_selection_justified": { 317 "applies": true, 318 "answer": true, 319 "justification": "The selection of GPT-3.5-Turbo for repair suggestions and Magicoder for patch generation is justified based on empirical results from the study section (Section 4.2). All K-shot and auxiliary information configurations are reported, not just the best." 320 }, 321 "multiple_comparison_correction": { 322 "applies": false, 323 "answer": false, 324 "justification": "No statistical significance tests are performed at all, so multiple comparison correction is not applicable." 325 }, 326 "self_comparison_bias_addressed": { 327 "applies": true, 328 "answer": false, 329 "justification": "The authors compare SRepair against published results of other techniques (AlphaRepair, Repilot, FitRepair, ChatRepair) without acknowledging self-evaluation bias or the possibility that their implementation/evaluation environment might favor their own approach." 330 }, 331 "compute_budget_vs_performance": { 332 "applies": true, 333 "answer": false, 334 "justification": "While SRepair200 and SRepair500 results are shown separately, there is no systematic analysis of performance as a function of compute budget. The baselines (AlphaRepair, Repilot, etc.) may use different compute budgets but this is not discussed as a confound." 
335 }, 336 "benchmark_construct_validity": { 337 "applies": true, 338 "answer": false, 339 "justification": "The paper does not discuss whether Defects4J bugs are representative of real-world bug distributions, or whether the plausible-patch metric (passing all tests) adequately captures repair quality beyond the correct/plausible distinction." 340 }, 341 "scaffold_confound_addressed": { 342 "applies": true, 343 "answer": false, 344 "justification": "Baselines use fundamentally different architectures (AlphaRepair: cloze-style, Repilot: LSP-guided, ChatRepair: conversational, SRepair: dual-LLM CoT). The comparison attributes performance differences to the approach without isolating scaffold from model effects." 345 } 346 }, 347 "data_leakage": { 348 "temporal_leakage_addressed": { 349 "applies": true, 350 "answer": true, 351 "justification": "The paper discusses 'potential for data leakage if the developer patches were included in the original training data' and checks patch identity rates (7.4‰ and 1.5‰). While the temporal relationship between Defects4J (2014) and model training is not explicitly stated, the core concern is addressed." 352 }, 353 "feature_leakage_addressed": { 354 "applies": true, 355 "answer": true, 356 "justification": "Section 6 discusses that 'the LLMs might have recognized the trigger tests and manipulated them to pass all tests, creating seemingly plausible patches.' The dual-LLM mechanism is described as mitigating this: the repair suggestion model only sees auxiliary info, keeping the patch generation model isolated from trigger test data." 357 }, 358 "non_independence_addressed": { 359 "applies": true, 360 "answer": false, 361 "justification": "The paper does not discuss whether the source code of Defects4J projects (Apache Commons, Closure Compiler, etc.) appears in the training data of the models, which would create non-independence between training and test data." 
362 }, 363 "leakage_detection_method": { 364 "applies": true, 365 "answer": true, 366 "justification": "A concrete detection method is applied: exact match analysis between generated plausible patches and developer patches, finding only 7.4‰ identity rate across the study and 1.5‰ for SRepair500." 367 } 368 } 369 }, 370 "claims": [ 371 { 372 "claim": "LLMs with zero-shot learning are already powerful function-level APR techniques, achieving 180 average plausible fixes and outperforming all few-shot setups by at least 10.4%.", 373 "evidence": "Table 3 shows K0(Basic) achieves 180 average plausible fixes vs. K1(CE) at 149, K1(PE) at 163, K2(CE,PE) at 149, and K2(PE,PE) at 160. Section 3.4.1.", 374 "supported": "strong" 375 }, 376 { 377 "claim": "Few-shot learning leads to disparate and even negative impacts on function-level repair, ranging from +10% to -49.7% across models.", 378 "evidence": "Table 3: CodeLlama 34B shows +10% (176 vs 160) in K1(CE), while Magicoder shows -49.7% (100 vs 199) in K2(CE,PE). Section 3.4.1.", 379 "supported": "strong" 380 }, 381 { 382 "claim": "Directly applying trigger tests and error messages significantly enhances repair performance by 26.7% and 26.1% respectively.", 383 "evidence": "Table 4 shows PI(TT) achieves 228 vs 180 for K0(Basic) (26.7% increase) and PI(EM) achieves 227 vs 180 (26.1% increase). Section 3.4.2.", 384 "supported": "strong" 385 }, 386 { 387 "claim": "SRepair correctly fixes 300 single-function bugs, surpassing ChatRepair by 85% and Repilot by 1.59x, without statement-level fault location.", 388 "evidence": "Table 7 shows SRepair500 achieves 300 correct fixes vs ChatRepair at 162. Section 5.2.5.", 389 "supported": "moderate" 390 }, 391 { 392 "claim": "SRepair successfully fixes 32 multi-function bugs, the first time achieved by any APR technique.", 393 "evidence": "Figure 11 shows SRepair500 achieves 53 plausible and 32 correct fixes for multi-function bugs. 
Figure 12 shows the JacksonDatabind-69 multi-function example. Section 5.2.5.", 394 "supported": "weak" 395 }, 396 { 397 "claim": "Auxiliary repair-relevant information can nearly replace costly statement-level fault location, with only a 7.1% gap.", 398 "evidence": "Table 5 shows PI(ALL) achieves 254 plausible fixes vs PI(ALL)+FL at 272, a 7.1% gap. Section 3.4.2.", 399 "supported": "moderate" 400 } 401 ], 402 "red_flags": [ 403 { 404 "flag": "No statistical significance tests", 405 "detail": "All comparative claims ('surpassing by 85%', 'significantly enhances') are based on comparing raw counts without any statistical significance tests. With stochastic sampling at temperature=0.8, results could vary across runs, but no variance is reported." 406 }, 407 { 408 "flag": "No variance across runs", 409 "detail": "Despite using stochastic sampling (temperature=0.8, top_p=0.9), all results are reported as single-run point estimates. No standard deviations, confidence intervals, or cross-run variance is provided. The stability of the reported 300 correct fixes is unknown." 410 }, 411 { 412 "flag": "'First ever' claim unverifiable", 413 "detail": "The claim that SRepair's multi-function bug repair is 'the first time achieved by any APR technique ever' is extremely strong and difficult to comprehensively verify against all prior work." 414 }, 415 { 416 "flag": "Defects4J contamination risk understated", 417 "detail": "Defects4J has been public since 2014 and is extensively discussed in SE literature. The contamination analysis only checks exact patch identity (7.4‰) but does not address whether models saw Defects4J project code or bug discussions during training. Training cutoff dates are not stated for any model." 418 }, 419 { 420 "flag": "Scaffold confound in baseline comparison", 421 "detail": "SRepair uses a dual-LLM CoT framework while baselines use fundamentally different architectures (cloze-style, conversational, LSP-guided). 
The 85% improvement claim does not isolate whether gains come from the methodology, the additional information, or the increased compute (500 samples with dual models)." 422 } 423 ], 424 "cited_papers": [ 425 { 426 "title": "Keep the Conversation Going: Fixing 162 out of 337 bugs for $0.42 each using ChatGPT", 427 "authors": ["Chunqiu Steven Xia", "Lingming Zhang"], 428 "year": 2023, 429 "arxiv_id": "2304.00385", 430 "relevance": "State-of-the-art conversational LLM-based APR technique (ChatRepair) and primary comparison baseline." 431 }, 432 { 433 "title": "Less training, more repairing please: revisiting automated program repair via zero-shot learning", 434 "authors": ["Chunqiu Steven Xia", "Lingming Zhang"], 435 "year": 2022, 436 "relevance": "AlphaRepair: zero-shot APR using CodeBERT, key baseline for cloze-style repair." 437 }, 438 { 439 "title": "Copiloting the copilots: Fusing large language models with completion engines for automated program repair", 440 "authors": ["Yuxiang Wei", "Chunqiu Steven Xia", "Lingming Zhang"], 441 "year": 2023, 442 "relevance": "Repilot: LLM+completion engine for compilable patches, key baseline for function-level APR." 443 }, 444 { 445 "title": "Revisiting the Plastic Surgery Hypothesis via Large Language Models", 446 "authors": ["Chunqiu Steven Xia", "Yifeng Ding", "Lingming Zhang"], 447 "year": 2023, 448 "arxiv_id": "2303.10494", 449 "relevance": "FitRepair: LLM-based APR combining fine-tuning with prompting strategies." 450 }, 451 { 452 "title": "Automated program repair in the era of large pre-trained language models", 453 "authors": ["Chunqiu Steven Xia", "Yuxiang Wei", "Lingming Zhang"], 454 "year": 2023, 455 "relevance": "Study of LLM-based APR with few-shot learning, establishing the function-level APR investigation that this paper builds on." 
456 }, 457 { 458 "title": "Automated repair of programs from large language models", 459 "authors": ["Zhiyu Fan", "Xiang Gao", "Martin Mirchev", "Abhik Roychoudhury", "Shin Hwei Tan"], 460 "year": 2023, 461 "relevance": "Study of repairing LLM-generated program errors, exploring APR in LLM code generation context." 462 }, 463 { 464 "title": "Inferfix: End-to-end program repair with llms", 465 "authors": ["Matthew Jin", "Syed Shahriar", "Michele Tufano", "Xin Shi", "Shuai Lu", "Neel Sundaresan", "Alexey Svyatkovskiy"], 466 "year": 2023, 467 "arxiv_id": "2303.07263", 468 "relevance": "End-to-end LLM-based APR technique using few-shot learning." 469 }, 470 { 471 "title": "Evaluating large language models trained on code", 472 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 473 "year": 2021, 474 "doi": "10.48550/arxiv.2107.03374", 475 "relevance": "Codex evaluation paper establishing LLM code generation capabilities and nucleus sampling methodology." 476 }, 477 { 478 "title": "An empirical study on fine-tuning large language models of code for automated program repair", 479 "authors": ["Kai Huang", "Xiangxin Meng", "Jian Zhang", "Yang Liu"], 480 "year": 2023, 481 "relevance": "Empirical study of fine-tuning LLMs for APR, directly relevant to understanding LLM repair capabilities." 482 }, 483 { 484 "title": "Rap-gen: Retrieval-augmented patch generation with codet5 for automatic program repair", 485 "authors": ["Weishi Wang", "Yue Wang", "Shafiq Joty", "Steven CH Hoi"], 486 "year": 2023, 487 "relevance": "Retrieval-augmented approach to LLM-based APR, combining retrieval with generation for patch generation." 488 }, 489 { 490 "title": "GAMMA: Revisiting template-based automated program repair via mask prediction", 491 "authors": ["Quanjun Zhang", "Chunrong Fang", "Tongke Zhang"], 492 "year": 2023, 493 "relevance": "Template-based APR using mask prediction, representing alternative approach to LLM-based repair." 
494 }, 495 { 496 "title": "Selfapr: Self-supervised program repair with test execution diagnostics", 497 "authors": ["He Ye", "Matias Martinez", "Xiapu Luo", "Tao Zhang", "Martin Monperrus"], 498 "year": 2022, 499 "relevance": "Self-supervised APR using test execution diagnostics, relevant to using test information for repair." 500 }, 501 { 502 "title": "Can OpenAI's codex fix bugs? an evaluation on QuixBugs", 503 "authors": ["Julian Aron Prenner", "Hlib Babii", "Romain Robbes"], 504 "year": 2022, 505 "relevance": "Early evaluation of Codex for APR on QuixBugs, establishing LLM repair evaluation methodology." 506 } 507 ], 508 "engagement_factors": { 509 "practical_relevance": { 510 "score": 2, 511 "justification": "SRepair is a usable APR technique with released code, but requires API access to GPT-3.5-Turbo and local GPU for Magicoder." 512 }, 513 "surprise_contrarian": { 514 "score": 1, 515 "justification": "The finding that zero-shot outperforms few-shot for function-level APR is mildly surprising but aligns with emerging understanding of LLM in-context learning." 516 }, 517 "fear_safety": { 518 "score": 0, 519 "justification": "No safety, security, or AI risk concerns raised; this is a constructive software engineering tool paper." 520 }, 521 "drama_conflict": { 522 "score": 0, 523 "justification": "No controversy or conflict; straightforward empirical evaluation and technique proposal." 524 }, 525 "demo_ability": { 526 "score": 2, 527 "justification": "Code released on GitHub with data, though reproducing requires API access and GPU resources." 528 }, 529 "brand_recognition": { 530 "score": 1, 531 "justification": "Uses GPT-3.5-Turbo (recognizable) but authors are from Southern University of Science and Technology and Kwai Inc., not top-tier AI labs." 532 } 533 } 534 }