scan-v5.json (27040B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Empirical Evaluation of Large Language Models in Automated Program Repair", 6 "authors": [ 7 "Jiajun Sun", 8 "Fengjie Li", 9 "Xinzhu Qi", 10 "Hongyu Zhang", 11 "Jiajun Jiang" 12 ], 13 "year": 2025, 14 "venue": "arXiv.org", 15 "arxiv_id": "2506.13186", 16 "doi": "10.48550/arXiv.2506.13186" 17 }, 18 "checklist": { 19 "claims_and_evidence": { 20 "abstract_claims_supported": { 21 "applies": true, 22 "answer": true, 23 "justification": "All major abstract claims — CodeLlama outperforming larger LLaMA, non-linear scaling, early-stage correct patches, prompt sensitivity — are directly supported by Tables IV–VI and Figure 4.", 24 "source": "haiku" 25 }, 26 "causal_claims_justified": { 27 "applies": true, 28 "answer": false, 29 "justification": "The claim that 'fine-tuning on code-related tasks significantly enhances repair capabilities' is based on comparing CodeLlama-7B vs LLaMA-2-13B, which differ in both fine-tuning and parameter count; no controlled experiment isolates the fine-tuning variable.", 30 "source": "haiku" 31 }, 32 "generalization_bounded": { 33 "applies": true, 34 "answer": false, 35 "justification": "Findings are stated broadly (e.g., 'Finding 6: Bugs of shorter length are more likely to be successfully repaired by LLMs') without consistently bounding claims to the four specific models and six datasets studied.", 36 "source": "haiku" 37 }, 38 "alternative_explanations_discussed": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper does not systematically consider alternatives; for example, the large performance gap between algorithmic and enterprise bugs could be due to training data contamination rather than bug complexity, but this is not explored.", 42 "source": "haiku" 43 }, 44 "proxy_outcome_distinction": { 45 "applies": true, 46 "answer": true, 47 "justification": "The paper clearly defines repair rate (correct patches / total bugs) and precision (correct patches / plausible patches) and uses these direct APR metrics consistently with its claims.", 48 "source": "haiku" 49 } 50 }, 51 "limitations_and_scope": { 52 "limitations_section_present": { 53 "applies": true, 54 "answer": true, 55 "justification": "Section V.B is a dedicated 'Limitation' section and Section V.C provides a 'Threats to Validity' section addressing both internal and external threats.", 56 "source": "haiku" 57 }, 58 "threats_to_validity_specific": { 59 "applies": true, 60 "answer": false, 61 "justification": "Threats are largely boilerplate; the external threat merely states 'generalizability remains an open question,' and the internal threat only notes manual patch verification without quantifying inter-rater agreement or disagreement rate.", 62 "source": "haiku" 63 }, 64 "scope_boundaries_stated": { 65 "applies": true, 66 "answer": true, 67 "justification": "The paper explicitly bounds its evaluation to four LLMs, six datasets, three languages, and single-function bugs, acknowledging that real-world bugs may be more complex and additional languages remain unexplored.", 68 "source": "haiku" 69 } 70 }, 71 "conflicts_of_interest": { 72 "funding_disclosed": { 73 "applies": true, 74 "answer": false, 75 "justification": "No acknowledgment or funding section is present anywhere in the paper; funding sources are entirely undisclosed.", 76 "source": "haiku" 77 }, 78 "affiliations_disclosed": { 79 "applies": true, 80 "answer": true, 81 "justification": "Author affiliations are clearly listed on the title page: Tianjin University, UESTC, and Chongqing University.", 82 "source": "haiku" 83 }, 84 "funder_independent_of_outcome": { 85 "applies": false, 86 "answer": false, 87 "justification": "No funding is disclosed, making independence assessment impossible.", 88 "source": "haiku" 89 }, 90 "financial_interests_declared": { 91 "applies": true, 92 "answer": false, 93 "justification": "No competing interests statement or financial disclosures of any kind appear in the paper.", 94 "source": "haiku" 95 } 96 }, 97 "scope_and_framing": { 98 "key_terms_defined": { 99 "applies": true, 100 "answer": true, 101 "justification": "APR is defined, and 'plausible patch,' 'correct patch,' 'repair rate,' and 'precision' are all formally defined in Section III.E.", 102 "source": "haiku" 103 }, 104 "intended_contribution_clear": { 105 "applies": true, 106 "answer": true, 107 "justification": "Five explicit contribution bullet points are listed at the end of the introduction, clearly stating the study scope, analysis dimensions, and practical implications.", 108 "source": "haiku" 109 }, 110 "engagement_with_prior_work": { 111 "applies": true, 112 "answer": true, 113 "justification": "Section II and the introduction explicitly compare this work to Xia et al. [37], Fan et al. [38], Xiang et al. [43], and others, articulating specific gaps (multi-language, modern large models, cost analysis) that this study addresses.", 114 "source": "haiku" 115 } 116 } 117 }, 118 "type_checklist": { 119 "empirical": { 120 "artifacts": { 121 "code_released": { 122 "applies": true, 123 "answer": false, 124 "justification": "The paper states artifacts are released 'at our homepage' but provides no URL; this is functionally unverifiable and equivalent to 'available upon request.'", 125 "source": "haiku" 126 }, 127 "data_released": { 128 "applies": true, 129 "answer": true, 130 "justification": "All six evaluation datasets (Defects4J, BugsCpp, IntroClass-C/Java, ConDefects-Java/Py) are publicly available standard benchmarks.", 131 "source": "haiku" 132 }, 133 "environment_specified": { 134 "applies": true, 135 "answer": false, 136 "justification": "Model versions are cited by name but no hardware specifications, Python version, framework dependencies, or environment files are provided.", 137 "source": "haiku" 138 }, 139 "reproduction_instructions": { 140 "applies": true, 141 "answer": false, 142 "justification": "No step-by-step reproduction instructions appear in the paper; the vague reference to 'our homepage' provides no actionable guidance.", 143 "source": "haiku" 144 } 145 }, 146 "statistical_methodology": { 147 "confidence_intervals_or_error_bars": { 148 "applies": true, 149 "answer": false, 150 "justification": "All results in Tables IV–VI are reported as point estimates (repair rate %, precision %) with no confidence intervals or error bars.", 151 "source": "haiku" 152 }, 153 "significance_tests": { 154 "applies": true, 155 "answer": false, 156 "justification": "No statistical significance tests are applied to any comparative claims; all differences between models and prompt conditions are reported as raw counts without p-values.", 157 "source": "haiku" 158 }, 159 "effect_sizes_reported": { 160 "applies": true, 161 "answer": true, 162 "justification": "The paper reports relative changes (e.g., '206.7% increase in repair count,' '22.9% lower RRate') with baseline values, providing effective effect size context.", 163 "source": "haiku" 164 }, 165 "sample_size_justified": { 166 "applies": true, 167 "answer": false, 168 "justification": "Datasets were adopted from existing benchmarks without any power analysis or justification for why specific subset sizes (255, 228, 106, 297, 563 bugs) were selected.", 169 "source": "haiku" 170 }, 171 "variance_reported": { 172 "applies": true, 173 "answer": false, 174 "justification": "No variance, standard deviation, or run-to-run variability is reported; LLM generation is stochastic but all results appear to be single-run point estimates.", 175 "source": "haiku" 176 } 177 }, 178 "evaluation_design": { 179 "baselines_included": { 180 "applies": true, 181 "answer": true, 182 "justification": "Four LLMs serve as mutual comparisons spanning general-purpose vs. code-specialized and 7B–33B parameter ranges, providing meaningful cross-model baselines.", 183 "source": "haiku" 184 }, 185 "baselines_contemporary": { 186 "applies": true, 187 "answer": true, 188 "justification": "All four evaluated models (CodeLlama-7B, LLaMA-2-13B, StarCoder-15.5B, DeepSeek-Coder-33B-instruct) are from 2023–2024 and are widely used in current research.", 189 "source": "haiku" 190 }, 191 "ablation_study": { 192 "applies": true, 193 "answer": true, 194 "justification": "RQ4 systematically ablates prompt components across all four models: zero-shot vs. one-shot vs. analysis-augmented prompts on two datasets.", 195 "source": "haiku" 196 }, 197 "multiple_metrics": { 198 "applies": true, 199 "answer": true, 200 "justification": "The evaluation uses repair rate, precision (correct/plausible), complementarity (unique bugs per model), and patch ranking position analysis.", 201 "source": "haiku" 202 }, 203 "human_evaluation": { 204 "applies": true, 205 "answer": true, 206 "justification": "All plausible patches are manually inspected by the first two authors to verify semantic equivalence to developer patches, as described in Section III.E.", 207 "source": "haiku" 208 }, 209 "held_out_test_set": { 210 "applies": true, 211 "answer": true, 212 "justification": "Established bug benchmarks serve as evaluation sets; the evaluated models were not trained specifically on these benchmarks (perfect fault localization is provided to isolate patch generation capability).", 213 "source": "haiku" 214 }, 215 "per_category_breakdown": { 216 "applies": true, 217 "answer": true, 218 "justification": "Results are broken down by dataset, programming language (Java vs. C/C++ vs. Python), bug type (enterprise vs. algorithmic), and prompt strategy across all tables.", 219 "source": "haiku" 220 }, 221 "failure_cases_discussed": { 222 "applies": true, 223 "answer": true, 224 "justification": "Figure 7 shows a concrete failure case where incorrect LLM-generated bug analysis misleads DeepSeek-Coder; Section IV-A analyzes BugsCpp failures attributing them to long bug functions.", 225 "source": "haiku" 226 }, 227 "negative_results_reported": { 228 "applies": true, 229 "answer": true, 230 "justification": "The paper reports that bug analysis hurts DeepSeek-Coder by 46.6%, that all LLMs perform poorly on BugsCpp (avg 3.5% RRate), and that LLaMA consistently underperforms across all settings.", 231 "source": "haiku" 232 } 233 }, 234 "setup_transparency": { 235 "model_versions_specified": { 236 "applies": true, 237 "answer": true, 238 "justification": "Table II specifies exact model identifiers: CodeLlama-7B, LLaMA-2-13B, StarCoder-15.5B, DeepSeek-Coder-33B-instruct, with references to original papers.", 239 "source": "haiku" 240 }, 241 "prompts_provided": { 242 "applies": true, 243 "answer": true, 244 "justification": "Figure 1 shows the full prompt template structure with actual example code, guidance text, and all four prompt variants are described in detail with their components.", 245 "source": "haiku" 246 }, 247 "hyperparameters_reported": { 248 "applies": true, 249 "answer": false, 250 "justification": "No sampling hyperparameters (temperature, top-p, repetition penalty, beam search settings) are reported for any of the four models.", 251 "source": "haiku" 252 }, 253 "scaffolding_described": { 254 "applies": false, 255 "answer": false, 256 "justification": "This is direct LLM inference for patch generation; no agentic scaffolding is used.", 257 "source": "haiku" 258 }, 259 "data_preprocessing_documented": { 260 "applies": true, 261 "answer": true, 262 "justification": "Section III.B documents selection criteria: single-function bugs only, specific subset sizes, and random sampling of one submission per assignment for ConDefects to reduce overhead.", 263 "source": "haiku" 264 } 265 }, 266 "data_integrity": { 267 "raw_data_available": { 268 "applies": true, 269 "answer": false, 270 "justification": "Generated patches are claimed to be available 'at our homepage' without a URL; while input benchmarks are public, the 600K+ generated patches are not verifiably accessible.", 271 "source": "haiku" 272 }, 273 "data_collection_described": { 274 "applies": true, 275 "answer": true, 276 "justification": "Section III.B describes dataset selection criteria, subset sizes, random sampling methodology, and rationale for each dataset included in the study.", 277 "source": "haiku" 278 }, 279 "recruitment_methods_described": { 280 "applies": false, 281 "answer": false, 282 "justification": "Standard public benchmarks are used; no human participant recruitment is involved.", 283 "source": "haiku" 284 }, 285 "data_pipeline_documented": { 286 "applies": true, 287 "answer": true, 288 "justification": "Section III.E documents the full pipeline: patch generation (200 or 30 per bug) → deduplication → test suite validation → manual inspection for semantic equivalence.", 289 "source": "haiku" 290 } 291 }, 292 "contamination": { 293 "training_cutoff_stated": { 294 "applies": true, 295 "answer": false, 296 "justification": "No training data cutoff dates are stated for any of the four evaluated models, despite the explicit concern about benchmark data appearing in training corpora.", 297 "source": "haiku" 298 }, 299 "train_test_overlap_discussed": { 300 "applies": true, 301 "answer": true, 302 "justification": "Section V.A explicitly discusses data leakage as a 'critical concern,' acknowledging that benchmark code may exist in training corpora, though the mitigation strategy (model diversity, dataset diversity) is weak.", 303 "source": "haiku" 304 }, 305 "benchmark_contamination_addressed": { 306 "applies": true, 307 "answer": true, 308 "justification": "ConDefects [57] was specifically designed to address LLM data leakage concerns for fault localization and program repair, and the paper explicitly cites this as part of their contamination mitigation.", 309 "source": "haiku" 310 } 311 }, 312 "human_studies": { 313 "pre_registered": { 314 "applies": false, 315 "answer": false, 316 "justification": "No human participants; benchmark evaluation study only.", 317 "source": "haiku" 318 }, 319 "irb_or_ethics_approval": { 320 "applies": false, 321 "answer": false, 322 "justification": "No human participants.", 323 "source": "haiku" 324 }, 325 "demographics_reported": { 326 "applies": false, 327 "answer": false, 328 "justification": "No human participants.", 329 "source": "haiku" 330 }, 331 "inclusion_exclusion_criteria": { 332 "applies": false, 333 "answer": false, 334 "justification": "No human participants.", 335 "source": "haiku" 336 }, 337 "randomization_described": { 338 "applies": false, 339 "answer": false, 340 "justification": "No human participants.", 341 "source": "haiku" 342 }, 343 "blinding_described": { 344 "applies": false, 345 "answer": false, 346 "justification": "No human participants.", 347 "source": "haiku" 348 }, 349 "attrition_reported": { 350 "applies": false, 351 "answer": false, 352 "justification": "No human participants.", 353 "source": "haiku" 354 } 355 }, 356 "cost_and_practicality": { 357 "inference_cost_reported": { 358 "applies": true, 359 "answer": false, 360 "justification": "While cost-effectiveness is discussed qualitatively (diminishing returns beyond 30 patches, smaller models with complementary value), no actual GPU hours, latency, or dollar costs are quantified.", 361 "source": "haiku" 362 }, 363 "compute_budget_stated": { 364 "applies": true, 365 "answer": false, 366 "justification": "Total computational budget (GPU type, hours, hardware configuration) is not stated anywhere in the paper.", 367 "source": "haiku" 368 } 369 } 370 } 371 }, 372 "claims": [ 373 { 374 "claim": "Fine-tuned CodeLlama-7B consistently outperforms general-purpose LLaMA-2-13B despite having fewer parameters", 375 "evidence": "Table IV: CodeLlama fixes 40/34 bugs on Defects4J v1.2/v2.0 vs LLaMA's 19/18; pattern holds across all 4 algorithmic datasets in Table V", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "LLMs perform significantly better on algorithmic assignment bugs than enterprise-grade project bugs", 380 "evidence": "DeepSeek achieves 45.45% repair rate on IntroClass-C (Table V) vs 5.66% on BugsCpp; average RRate on Defects4J is 15.1% vs 3.5% on BugsCpp", 381 "supported": "strong" 382 }, 383 { 384 "claim": "Correct patches predominantly emerge in early generations; 30 patches achieves comparable effectiveness to 200", 385 "evidence": "Figure 4: 95.77% of StarCoder's correct patches on IntroClass-Java within first 30 generations; most LLMs have at most 1 correct patch beyond rank 30 on Defects4J", 386 "supported": "strong" 387 }, 388 { 389 "claim": "In-context repair examples substantially improve LLM repair performance over zero-shot", 390 "evidence": "Table VI: average RRate on ConDefects-Java drops from 11.5% (one-shot) to 8.9% (zero-shot); LLaMA drops 85.7% with zero-shot", 391 "supported": "strong" 392 }, 393 { 394 "claim": "Bug analysis prompts improve weaker models but degrade stronger models", 395 "evidence": "Table VI: DeepSeek-Coder drops from 127 to 63 correct repairs on ConDefects-Java (-46.6%) with analysis; LLaMA increases from 1 to 32 (+3100%)", 396 "supported": "strong" 397 }, 398 { 399 "claim": "Shorter bugs are significantly more likely to be successfully repaired by LLMs", 400 "evidence": "Figure 5: median length of successfully repaired bugs is consistently lower than unrepaired bugs across all 6 datasets; significant drop observed for functions exceeding 100 lines", 401 "supported": "strong" 402 }, 403 { 404 "claim": "All four LLMs exhibit complementary repair capabilities, each producing unique fixes unattainable by others", 405 "evidence": "Figure 3: even LLaMA (weakest model) contributes 1 unique repair on Defects4J v2.0; CodeLlama fixes 9 unique bugs unmatched by any other model", 406 "supported": "strong" 407 } 408 ], 409 "methodology_tags": [ 410 "benchmark-eval", 411 "empirical" 412 ], 413 "key_findings": "Four open-source LLMs spanning 7B–33B parameters were evaluated on 2,309 bugs across six benchmarks in three programming languages. Code-specialized fine-tuned models substantially outperform general-purpose models even at smaller parameter counts (CodeLlama-7B > LLaMA-2-13B), and doubling parameter count yields sublinear gains. LLMs achieve 3–8× higher repair rates on algorithmic assignment bugs vs. enterprise project bugs, likely driven by shorter function lengths and simpler bug patterns. A key practical finding is that 95%+ of correct patches emerge in the first 30 generations, enabling significant cost reduction without meaningful accuracy loss. Prompt design has large and heterogeneous effects: in-context examples universally improve performance, while bug analysis helps weak models (+3100% for LLaMA) but hurts strong ones (-46.6% for DeepSeek-Coder) due to sensitivity to inaccurate diagnostic content.", 414 "red_flags": [ 415 { 416 "flag": "No statistical significance testing", 417 "detail": "All comparative claims between models and prompt conditions are made without significance tests; observed differences could reflect noise given stochastic LLM outputs." 418 }, 419 { 420 "flag": "No variance across runs", 421 "detail": "LLM patch generation is stochastic but no run-to-run variance or confidence intervals are reported; all results appear to be single experimental runs." 422 }, 423 { 424 "flag": "Confounded fine-tuning causal claim", 425 "detail": "The claim that fine-tuning improves APR compares CodeLlama-7B vs LLaMA-2-13B, which differ in both fine-tuning and architecture/parameter count; the effect of fine-tuning alone is not isolated." 426 }, 427 { 428 "flag": "Sampling hyperparameters undisclosed", 429 "detail": "Temperature, top-p, and repetition penalty are not reported for any model, making exact replication impossible and preventing assessment of how generation settings affect results." 430 }, 431 { 432 "flag": "No comparison to non-LLM APR baselines", 433 "detail": "The paper does not compare to traditional APR methods (GenProg, TBar) or recent LLM-based methods (ChatRepair, ThinkRepair) mentioned in related work, preventing contextualization of absolute performance." 434 }, 435 { 436 "flag": "Unverifiable reproducibility claim", 437 "detail": "Artifacts are claimed released 'at our homepage' with no URL provided; the claim cannot be verified and is functionally equivalent to 'available upon request.'" 438 } 439 ], 440 "cited_papers": [ 441 { 442 "title": "Automated program repair in the era of large pre-trained language models", 443 "relevance": "First major study applying large LLMs to APR on Defects4J/ManyBugs/QuixBugs; directly compared to and identified as gap this paper addresses" 444 }, 445 { 446 "title": "Defects4J: A database of existing faults to enable controlled testing studies for Java programs", 447 "relevance": "Primary evaluation benchmark; used for both RQ1 enterprise-grade bug evaluation and patch ranking analysis" 448 }, 449 { 450 "title": "ConDefects: A new dataset to address the data leakage concern for LLM-based fault localization and program repair", 451 "relevance": "Key benchmark specifically designed to mitigate LLM contamination; central to the study's validity argument for data leakage mitigation" 452 }, 453 { 454 "title": "DeepSeek-Coder: When the large language model meets programming", 455 "relevance": "Best-performing evaluated model; represents state-of-the-art code-specialized open-source LLM at time of study" 456 }, 457 { 458 "title": "Keep the conversation going: Fixing 162 out of 337 bugs for $0.42 each using ChatGPT", 459 "relevance": "ChatRepair — representative recent LLM-based APR approach that motivated the study but focuses only on Defects4J" 460 }, 461 { 462 "title": "How far can we go with practical function-level program repair?", 463 "relevance": "Recent LLM APR study with Java-only evaluation; identified as gap motivating multi-language coverage" 464 }, 465 { 466 "title": "An empirical study on fine-tuning large language models of code for automated program repair", 467 "relevance": "Closely related ASE 2023 study on fine-tuning smaller LLMs for APR; directly compared in related work" 468 }, 469 { 470 "title": "Code llama: Open foundation models for code", 471 "relevance": "One of the four evaluated models; fine-tuned from LLaMA on code tasks, enabling the fine-tuning comparison" 472 } 473 ], 474 "engagement_factors": { 475 "practical_relevance": { 476 "score": 3, 477 "justification": "Directly actionable guidance: generate 30 patches not 200, use code-specialized models, combine models for complementary coverage, include in-context examples." 478 }, 479 "surprise_contrarian": { 480 "score": 2, 481 "justification": "Counterintuitive finding that bug analysis hurts stronger models (DeepSeek drops 46.6%) and that smaller 7B model produces unique fixes unavailable from 33B model challenges scale-is-everything assumptions." 482 }, 483 "fear_safety": { 484 "score": 0, 485 "justification": "No AI safety or risk concerns raised; this is a capability evaluation for software maintenance." 486 }, 487 "drama_conflict": { 488 "score": 0, 489 "justification": "Straightforward empirical comparison with no controversy or competing claims from other groups." 490 }, 491 "demo_ability": { 492 "score": 2, 493 "justification": "Uses publicly available open-source models (DeepSeek-Coder, CodeLlama) and public benchmarks; anyone with GPU access can replicate the core experiments." 494 }, 495 "brand_recognition": { 496 "score": 1, 497 "justification": "DeepSeek-Coder has moderate recognition; work is from Chinese universities without major lab branding." 498 } 499 }, 500 "hn_data": { 501 "threads": [ 502 { 503 "hn_id": "44507887", 504 "title": "Empirical Evaluation of Large Language Models in Automated Program Repair", 505 "points": 5, 506 "comments": 0, 507 "url": "https://news.ycombinator.com/item?id=44507887" 508 }, 509 { 510 "hn_id": "40876136", 511 "title": "LLMMatDesign – Gen AI for Materials", 512 "points": 4, 513 "comments": 0, 514 "url": "https://news.ycombinator.com/item?id=40876136" 515 }, 516 { 517 "hn_id": "44663723", 518 "title": "Prompt Injection 2.0: Hybrid AI Threats – Paper and Open Source Testing Toolkit", 519 "points": 3, 520 "comments": 1, 521 "url": "https://news.ycombinator.com/item?id=44663723" 522 }, 523 { 524 "hn_id": "43293373", 525 "title": "RingFormer: Rethinking Recurrent Transformer with Adaptive Level Signals", 526 "points": 3, 527 "comments": 0, 528 "url": "https://news.ycombinator.com/item?id=43293373" 529 }, 530 { 531 "hn_id": "44943311", 532 "title": "NaN-propagation: a novel method for sparsity detection in black-box computationa", 533 "points": 3, 534 "comments": 0, 535 "url": "https://news.ycombinator.com/item?id=44943311" 536 }, 537 { 538 "hn_id": "44962664", 539 "title": "Chain-of-Agents", 540 "points": 2, 541 "comments": 0, 542 "url": "https://news.ycombinator.com/item?id=44962664" 543 }, 544 { 545 "hn_id": "43914672", 546 "title": "Questions to Fall in Love with ChatGPT: An Experimental Study", 547 "points": 2, 548 "comments": 0, 549 "url": "https://news.ycombinator.com/item?id=43914672" 550 } 551 ], 552 "top_points": 5, 553 "total_points": 22, 554 "total_comments": 1 555 } 556 }