scan.json (25476B)
1 { 2 "paper": { 3 "title": "Aligning the Objective of LLM-based Program Repair", 4 "authors": [ 5 "Junjielong Xu", 6 "Ying Fu", 7 "Shin Hwei Tan", 8 "Pinjia He" 9 ], 10 "year": 2024, 11 "venue": "arXiv", 12 "arxiv_id": "2404.08877", 13 "doi": null 14 }, 15 "checklist": { 16 "artifacts": { 17 "code_released": { 18 "applies": true, 19 "answer": true, 20 "justification": "The paper states 'Our source code and experimental results are publicly available at https://github.com/CUHK-Shenzhen-SE/D4C' in the Data Availability footnote at the end of Section VII." 21 }, 22 "data_released": { 23 "applies": true, 24 "answer": true, 25 "justification": "The paper uses publicly available benchmarks (Defects4J and DebugBench), both of which are established public benchmarks. The experimental results are also released at the GitHub repository." 26 }, 27 "environment_specified": { 28 "applies": true, 29 "answer": false, 30 "justification": "The paper mentions 'Python 3.9', 'Ubuntu 20.04.5 LTS', and '8xA100 NVIDIA GPU server' in Section IV-A, but does not provide a requirements.txt, Dockerfile, or detailed dependency specifications sufficient to recreate the environment." 31 }, 32 "reproduction_instructions": { 33 "applies": true, 34 "answer": false, 35 "justification": "The paper references the GitHub repository but does not include step-by-step reproduction instructions in the paper itself. The reader is directed to the repo without a description of how to replicate the main experiments." 36 } 37 }, 38 "statistical_methodology": { 39 "confidence_intervals_or_error_bars": { 40 "applies": true, 41 "answer": false, 42 "justification": "The paper reports only point estimates (number of correct patches) in Tables II-V. No confidence intervals or error bars are reported for any main results." 
43 }, 44 "significance_tests": { 45 "applies": true, 46 "answer": false, 47 "justification": "The paper claims D4C 'outperforms' baselines and 'surpasses the SOTA APR methods by 10%' but provides no statistical significance tests. Comparisons are made purely by comparing raw counts of correct patches." 48 }, 49 "effect_sizes_reported": { 50 "applies": true, 51 "answer": true, 52 "justification": "The paper provides baseline context for its improvements: '180 bugs correctly in Defects4J... surpasses the SOTA APR methods with perfect fault localization by 10%' (abstract), and Table II shows all baselines' counts alongside D4C's, giving the reader sufficient context to assess the magnitude (180 vs 162 for ChatRepair, a ~11% improvement)." 53 }, 54 "sample_size_justified": { 55 "applies": true, 56 "answer": false, 57 "justification": "The paper evaluates on 437 single-function bugs from Defects4J and 590 from DebugBench, but there is no justification for why these sample sizes are adequate for the claims made, nor any power analysis." 58 }, 59 "variance_reported": { 60 "applies": true, 61 "answer": false, 62 "justification": "The main results (Tables II-V) report single-run numbers with no variance across runs. The only standard deviation reported is for the position of the correct patch among 10 samples (std=1.74 in Sec. V-C) and plausible patches per bug (std=3.28 in Sec. V-D), but these are descriptive statistics about patch distributions, not variance across experimental runs." 63 } 64 }, 65 "evaluation_design": { 66 "baselines_included": { 67 "applies": true, 68 "answer": true, 69 "justification": "Table II compares D4C against five state-of-the-art LLM-based APR methods: AlphaRepair, Repilot, RAP-Gen, FitRepair, and ChatRepair." 
70 }, 71 "baselines_contemporary": { 72 "applies": true, 73 "answer": true, 74 "justification": "The baselines include ChatRepair (2023, GPT-4 based), RAP-Gen (2023), FitRepair (2023), and Repilot (2023), all of which are recent and represent the state of the art for LLM-based APR at the time of submission." 75 }, 76 "ablation_study": { 77 "applies": true, 78 "answer": true, 79 "justification": "RQ3 (Section IV-D, Table V) provides a thorough ablation study removing individual components: w/o Document, w/o Test, w/o Message, -Mask, and -Pure, quantifying each component's contribution." 80 }, 81 "multiple_metrics": { 82 "applies": true, 83 "answer": true, 84 "justification": "The paper uses multiple metrics: number of correct patches (manually verified), number of plausible patches (test-passing), number of verified patches (LeetCode unseen tests), perplexity for objective alignment validation, and patch sampling efficiency." 85 }, 86 "human_evaluation": { 87 "applies": true, 88 "answer": true, 89 "justification": "The paper includes manual validation of patch correctness: 'Two authors independently confirm the patch correctness. Any patches with disagreement were presented to the third author for review' (Section V-A, Internal Threats). This is human evaluation of the system's outputs." 90 }, 91 "held_out_test_set": { 92 "applies": true, 93 "answer": true, 94 "justification": "DebugBench uses LeetCode unseen test suites for verification that are separate from the test examples provided to the model. For Defects4J, the standard test suite is used for validation, and the paper explicitly addresses data leakage by using DebugBench for insight validation (RQ2)." 95 }, 96 "per_category_breakdown": { 97 "applies": true, 98 "answer": true, 99 "justification": "Results are broken down by Defects4J v1.2 vs v2.0 (Table II), by programming language (C++/Java/Python in Table IV and V for DebugBench), and by perfect vs statistical FL settings (Table III)." 
100 }, 101 "failure_cases_discussed": { 102 "applies": true, 103 "answer": true, 104 "justification": "Section V-B provides qualitative analysis of plausible-but-incorrect patches, including a specific example (Fig. 7, Jsoup-19) where the LLM generated a patch that passed tests but was semantically incorrect due to overfitting to the failed test case." 105 }, 106 "negative_results_reported": { 107 "applies": true, 108 "answer": true, 109 "justification": "The paper reports that D4C performs worse than ChatRepair on Defects4J v1.2 (84 vs 114 in Table II), and that setting temperature to 0 results in fewer patches. The '-Pure' and '-Mask' ablations show degraded performance. Also, the paper discusses that 30 out of 210 plausible patches were incorrect." 110 } 111 }, 112 "claims_and_evidence": { 113 "abstract_claims_supported": { 114 "applies": true, 115 "answer": true, 116 "justification": "The abstract claims '180 bugs correctly in Defects4J' (supported by Table II sum column), 'surpasses the SOTA APR methods with perfect fault localization by 10%' (180 vs 162 ChatRepair), and 'reduces the patch sampling number by 90%' (10 vs 100). All are supported by the results." 117 }, 118 "causal_claims_justified": { 119 "applies": true, 120 "answer": true, 121 "justification": "The paper makes causal claims about objective alignment improving performance. It supports this through controlled ablation experiments (RQ2, Table IV) comparing different input/output formats while controlling other variables, and validates via perplexity measurements on a white-box model. The ablation design constitutes adequate controlled single-variable manipulation." 122 }, 123 "generalization_bounded": { 124 "applies": true, 125 "answer": true, 126 "justification": "The paper states 'our results may not generalize beyond the studied settings and other programming languages beyond the supported ones in Defects4J and DebugBench' (Sec. V-A, External Threats). 
The title 'LLM-based Program Repair' is appropriately bounded to the evaluated domain." 127 }, 128 "alternative_explanations_discussed": { 129 "applies": true, 130 "answer": true, 131 "justification": "The paper explicitly addresses the alternative explanation that improvements could be due to data leakage rather than objective alignment, using DebugBench (leakage-free) to validate insights. Section V-A discusses that model effectiveness may vary across settings, and data leakage is discussed at length in Sections IV-A and IV-B." 132 } 133 }, 134 "setup_transparency": { 135 "model_versions_specified": { 136 "applies": true, 137 "answer": true, 138 "justification": "Section IV-A specifies 'gpt-4-0613' and 'mixtral-8x7b-instruct-v0.1' as the exact model versions used, including that they used 'fixed remote API checkpoints or fixed local model versions.'" 139 }, 140 "prompts_provided": { 141 "applies": true, 142 "answer": false, 143 "justification": "The paper shows the prompt structure in Fig. 4 and Fig. 5 with templates using placeholders like {BUGGY_DOCUMENT}, {FAILED_TEST}, {TEST_INFO}, {BUGGY_CODE}, and a system instruction ('As an debugger, you should refine the buggy program for bug report'). However, the complete actual prompt text with the fixed example pair is not provided in the paper. The fixed example is described but not fully shown." 144 }, 145 "hyperparameters_reported": { 146 "applies": true, 147 "answer": true, 148 "justification": "Section IV-A reports temperature=1.0, sampling number=10, and timeout threshold of 1 minute per patch. Section IV-D additionally explores temperature=0.0 and different sampling numbers (1, 3, 10)." 149 }, 150 "scaffolding_described": { 151 "applies": false, 152 "answer": false, 153 "justification": "D4C is a single-pass prompting framework without agentic scaffolding. It does not use multi-round dialogue, tool use, retry logic, or feedback mechanisms. It is a one-shot prompting approach." 
154 }, 155 "data_preprocessing_documented": { 156 "applies": true, 157 "answer": true, 158 "justification": "Section III-C describes the artifact extraction process (extracting documents, failed test cases, and error messages). Section IV-A explains how single-function bugs were separated from Defects4J (v1.2: 203, v2.0: 234) and how DebugBench logic bugs were selected (590 out of 4,253 total bugs). The filtering criteria are stated." 159 } 160 }, 161 "limitations_and_scope": { 162 "limitations_section_present": { 163 "applies": true, 164 "answer": true, 165 "justification": "Section V-A 'Threats to Validity' provides a dedicated subsection discussing external and internal threats with substantive discussion." 166 }, 167 "threats_to_validity_specific": { 168 "applies": true, 169 "answer": true, 170 "justification": "The threats are specific to this study: data leakage of Defects4J in LLM pre-training, incomplete test coverage of Defects4J not guaranteeing patch correctness, potential bias in manual patch validation, and that FLUCCS only supports Defects4J v1.2. These are concrete, study-specific threats." 171 }, 172 "scope_boundaries_stated": { 173 "applies": true, 174 "answer": true, 175 "justification": "The paper explicitly states: 'our results may not generalize beyond the studied settings and other programming languages beyond the supported ones in Defects4J and DebugBench' (Sec. V-A). It also clarifies that 'This paper is not aimed at proposing D4C as a new APR technique, but rather to introduce a new mindset or paradigm' (Sec. I)." 176 } 177 }, 178 "data_integrity": { 179 "raw_data_available": { 180 "applies": true, 181 "answer": true, 182 "justification": "The paper states 'we decide to release our experimental results for public verification' (Sec. V-A) and provides a GitHub repository. Both Defects4J and DebugBench are publicly available benchmarks." 
183 }, 184 "data_collection_described": { 185 "applies": true, 186 "answer": true, 187 "justification": "Section IV-A describes the data collection in detail: 437 single-function bugs from Defects4J (v1.2: 203, v2.0: 234), 590 logic bugs from DebugBench (200 C++, 194 Java, 196 Python), and explains the selection criteria (logic bugs, single-function bugs)." 188 }, 189 "recruitment_methods_described": { 190 "applies": false, 191 "answer": false, 192 "justification": "No human participants are involved. The data sources are standard public benchmarks (Defects4J and DebugBench)." 193 }, 194 "data_pipeline_documented": { 195 "applies": true, 196 "answer": true, 197 "justification": "The pipeline is documented: identify buggy function (via method-level FL), extract artifacts (documents, failed tests, error messages per Sec. III-C), construct prompt (Sec. III-D), generate patches (Sec. III-E), validate patches via test suite then manual verification (Sec. III-F). Each stage is described with counts." 198 } 199 }, 200 "conflicts_of_interest": { 201 "funding_disclosed": { 202 "applies": true, 203 "answer": true, 204 "justification": "Section VII (Acknowledgment) discloses funding: 'Guangdong Basic and Applied Basic Research Foundation (No. 2024A1515010145) and the Shenzhen Science and Technology Program (No. ZDSYS20230626091302006).'" 205 }, 206 "affiliations_disclosed": { 207 "applies": true, 208 "answer": true, 209 "justification": "Author affiliations are clearly listed: The Chinese University of Hong Kong, Shenzhen; Chongqing University; and Concordia University. None of the authors are affiliated with OpenAI or Mistral, the companies whose products are evaluated." 
210 }, 211 "funder_independent_of_outcome": { 212 "applies": true, 213 "answer": true, 214 "justification": "The funders are Chinese government research foundations (Guangdong Basic and Applied Basic Research Foundation, Shenzhen Science and Technology Program), which have no financial interest in the outcome of this APR research." 215 }, 216 "financial_interests_declared": { 217 "applies": true, 218 "answer": false, 219 "justification": "No competing interests or financial interests statement is present in the paper." 220 } 221 }, 222 "contamination": { 223 "training_cutoff_stated": { 224 "applies": true, 225 "answer": false, 226 "justification": "The paper does not state the training data cutoff date for either GPT-4 (gpt-4-0613) or Mixtral-MoE. While data leakage is discussed extensively, the actual training cutoff dates are never specified." 227 }, 228 "train_test_overlap_discussed": { 229 "applies": true, 230 "answer": true, 231 "justification": "The paper extensively discusses train/test overlap: it acknowledges Defects4J data may be in LLM training corpora, argues that bug-fix pairs are sparse in training data (Sec. IV-A, IV-B), and uses DebugBench specifically because it was 'designed to counter data leakage (by implanting bugs into source data with GPT-4)' (Sec. IV-A)." 232 }, 233 "benchmark_contamination_addressed": { 234 "applies": true, 235 "answer": true, 236 "justification": "The paper explicitly addresses this concern: Table I labels Defects4J as 'Yes' for data leakage risk and DebugBench as 'No'. The authors use DebugBench for insight validation (RQ2, RQ3) specifically to mitigate contamination: 'we selected DebugBench, a latest, leakage-free benchmark, for evaluation' (Sec. V-A)." 237 } 238 }, 239 "human_studies": { 240 "pre_registered": { 241 "applies": false, 242 "answer": false, 243 "justification": "No human participants are involved in this study. The manual validation of patches is done by the paper authors, not as a human subjects study." 
244 }, 245 "irb_or_ethics_approval": { 246 "applies": false, 247 "answer": false, 248 "justification": "No human participants are involved in this study." 249 }, 250 "demographics_reported": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants are involved in this study." 254 }, 255 "inclusion_exclusion_criteria": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants are involved in this study." 259 }, 260 "randomization_described": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants are involved in this study." 264 }, 265 "blinding_described": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants are involved in this study." 269 }, 270 "attrition_reported": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants are involved in this study." 274 } 275 }, 276 "cost_and_practicality": { 277 "inference_cost_reported": { 278 "applies": true, 279 "answer": true, 280 "justification": "Section V-C reports detailed cost information: average input prompt length (1,386.84 tokens), average output length (314.39 tokens), average cost per patch ($0.023), total cost for 10 patches per bug ($0.23 per bug on average), and reduced cost if stopping at first correct patch ($0.18 per bug)." 281 }, 282 "compute_budget_stated": { 283 "applies": true, 284 "answer": false, 285 "justification": "While API cost per patch is reported for GPT-4, no total GPU hours or compute budget is stated for the Mixtral-MoE experiments run on the 8xA100 server. The overall compute budget for the full experimental campaign is not quantified."
286 } 287 } 288 }, 289 "claims": [ 290 { 291 "claim": "D4C repairs 180 bugs correctly in Defects4J, surpassing SOTA APR methods with perfect fault localization by 10%.", 292 "evidence": "Table II shows D4C fixes 180 bugs total (84 on v1.2 + 96 on v2.0) vs ChatRepair's 162 (114 + 48), using only 10 samples per bug vs 100-5000 for baselines.", 293 "supported": "moderate" 294 }, 295 { 296 "claim": "Generating a complete refined function aligns better with the decoder-only LLM's training objective than generating fixed hunks.", 297 "evidence": "Table IV shows output perplexity for complete function (Func) is consistently lower than for hunks (Hunk): Report-Func output perplexity is 1.39 vs Report-Hunk at 8.50 on Mixtral-MoE. Report-Func also generates more verified patches across all languages and models.", 298 "supported": "strong" 299 }, 300 { 301 "claim": "Providing artifacts (documents, failed tests, error messages) enables LLMs to locate and repair buggy hunks without statement-level fault localization.", 302 "evidence": "Table IV shows Report-Func generates more verified patches than Mask-Func across all languages (e.g., 118 vs 99 C++, 125 vs 91 Java on Mixtral-MoE). 
Table V ablation shows removing any artifact component reduces performance.", 303 "supported": "strong" 304 }, 305 { 306 "claim": "D4C is less affected by inaccuracies in fault localization than infilling-style methods.", 307 "evidence": "Table III shows D4C drops only 4.8% from perfect to statistical FL (84 to 80), while AlphaRepair drops 30.8% and RAP-Gen drops 33.3%.", 308 "supported": "moderate" 309 }, 310 { 311 "claim": "D4C requires only 10 patch samples per bug, which is 90% fewer than the most efficient baseline.", 312 "evidence": "Table II shows D4C uses 10 samples while RAP-Gen uses 100, ChatRepair uses 100-200, and AlphaRepair/Repilot/FitRepair use 5000.", 313 "supported": "strong" 314 } 315 ], 316 "methodology_tags": [ 317 "benchmark-eval" 318 ], 319 "key_findings": "D4C demonstrates that aligning the output format of decoder-only LLMs to their training objective (complete function generation vs. hunk infilling) markedly improves automated program repair performance. On Defects4J, D4C fixes 180 bugs using only 10 samples per bug, outperforming the best baseline (ChatRepair, 162 bugs with 100-200 samples). Perplexity analysis on Mixtral-MoE validates that function completion has substantially lower perplexity than hunk generation, providing direct evidence for the objective alignment hypothesis. The approach also shows greater robustness to imperfect fault localization compared to infilling methods.", 320 "red_flags": [ 321 { 322 "flag": "No statistical significance testing", 323 "detail": "All comparisons are based on raw counts of correct patches without any significance tests. The difference between D4C (180) and ChatRepair (162) is presented as 'surpassing by 10%' but the statistical reliability of this difference is never tested." 324 }, 325 { 326 "flag": "Single-run results with no variance reporting", 327 "detail": "Despite using temperature=1.0 (stochastic decoding), the main results appear to be from a single run. 
No repeated trials with different random seeds are reported, making it impossible to assess result stability." 328 }, 329 { 330 "flag": "Uneven comparison conditions", 331 "detail": "D4C uses GPT-4 while most baselines use weaker models (CodeBERT, CodeT5, InCoder). Only ChatRepair also uses GPT-4. The 'surpasses SOTA by 10%' claim conflates model choice with method design, though the paper partially addresses this via controlled ablations in RQ2." 332 }, 333 { 334 "flag": "Baseline results from original papers without reproduction", 335 "detail": "The paper acknowledges 'we reuse their Defects4J results reported in the original paper' (Sec. IV-B) because some baselines are not open-sourced. Different experimental conditions (hardware, API versions, prompting strategies) across papers may affect comparability." 336 } 337 ], 338 "cited_papers": [ 339 { 340 "title": "Less training, more repairing please: revisiting automated program repair via zero-shot learning", 341 "authors": ["C. S. Xia", "L. Zhang"], 342 "year": 2022, 343 "relevance": "Key baseline (AlphaRepair) that pioneered infilling-style APR using LLMs, directly compared to D4C." 344 }, 345 { 346 "title": "Keep the conversation going: Fixing 162 out of 337 bugs for $0.42 each using chatgpt", 347 "authors": ["C. S. Xia", "L. Zhang"], 348 "year": 2023, 349 "arxiv_id": "2304.00385", 350 "relevance": "ChatRepair is the strongest baseline using GPT-4 with multi-round dialogue for APR, directly compared in Table II." 351 }, 352 { 353 "title": "Rap-gen: Retrieval-augmented patch generation with codet5 for automatic program repair", 354 "authors": ["W. Wang", "Y. Wang", "S. Joty", "S. C. Hoi"], 355 "year": 2023, 356 "arxiv_id": "2309.06057", 357 "relevance": "RAP-Gen is a retrieval-augmented LLM-based APR method compared as a baseline." 358 }, 359 { 360 "title": "Copiloting the copilots: Fusing large language models with completion engines for automated program repair", 361 "authors": ["Y. Wei", "C. S. Xia", "L. 
Zhang"], 362 "year": 2023, 363 "arxiv_id": "2309.00608", 364 "relevance": "Repilot is an LLM-based APR baseline that fuses LLMs with completion engines, directly evaluated." 365 }, 366 { 367 "title": "Teaching large language models to self-debug", 368 "authors": ["X. Chen", "M. Lin", "N. Schärli", "D. Zhou"], 369 "year": 2023, 370 "arxiv_id": "2304.05128", 371 "relevance": "Self-debugging approach that inspired D4C's concept of allowing LLMs to refine programs using execution feedback." 372 }, 373 { 374 "title": "Impact of code language models on automated program repair", 375 "authors": ["N. Jiang", "K. Liu", "T. Lutellier", "L. Tan"], 376 "year": 2023, 377 "arxiv_id": "2302.05020", 378 "relevance": "Demonstrates that data leakage is less severe for APR compared to other code tasks, a key argument for D4C's Defects4J evaluation." 379 }, 380 { 381 "title": "DebugBench: Evaluating debugging capability of large language models", 382 "authors": ["R. Tian", "Y. Ye", "Y. Qin", "X. Cong", "Y. Lin", "Z. Liu", "M. Sun"], 383 "year": 2024, 384 "arxiv_id": "2401.04621", 385 "relevance": "Key benchmark used for insight validation, designed to be leakage-free by implanting bugs with GPT-4." 386 }, 387 { 388 "title": "Automated repair of programs from large language models", 389 "authors": ["Z. Fan", "X. Gao", "M. Mirchev", "A. Roychoudhury", "S. H. Tan"], 390 "year": 2023, 391 "relevance": "LLM-based APR approach using flexible fault localization, which supports D4C's insight about not restricting repair to specific hunks." 392 }, 393 { 394 "title": "Revisiting the plastic surgery hypothesis via large language models", 395 "authors": ["C. S. Xia", "Y. Ding", "L. Zhang"], 396 "year": 2023, 397 "arxiv_id": "2303.10494", 398 "relevance": "FitRepair baseline that revisits APR with LLMs, directly compared to D4C." 399 }, 400 { 401 "title": "A unified debugging approach via LLM-based multi-agent synergy", 402 "authors": ["C. Lee", "C. S. Xia", "L. Yang", "J.-t. Huang", "Z. Zhu", "L. 
Zhang", "M. R. Lyu"], 403 "year": 2024, 404 "arxiv_id": "2404.17153", 405 "relevance": "Multi-agent LLM-based debugging approach that represents the expanding scope of LLM use in program repair." 406 }, 407 { 408 "title": "A survey of learning-based automated program repair", 409 "authors": ["Q. Zhang", "C. Fang", "Y. Ma", "W. Sun", "Z. Chen"], 410 "year": 2023, 411 "arxiv_id": "2301.03270", 412 "relevance": "Comprehensive survey of learning-based APR methods providing context for the field D4C operates in." 413 } 414 ] 415 }