scan-v4.json (36812B)
1 { 2 "scan_version": 4, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Exploring Generalizable Automated Program Repair with Large Language Models", 6 "authors": [ 7 "Viola Campos", 8 "Ridwan Shariffdeen", 9 "Adrian Ulges", 10 "Yannic Noller" 11 ], 12 "year": 2025, 13 "venue": "arXiv", 14 "arxiv_id": "2506.03283", 15 "doi": null 16 }, 17 "checklist": { 18 "claims_and_evidence": { 19 "abstract_claims_supported": { 20 "applies": true, 21 "answer": true, 22 "justification": "The abstract's three key claims are all supported: (1) 'Different LLMs tend to perform best for different languages' — Table 2 shows 4 different top models across 4 benchmarks. (2) 'Combining models by pooling repairs adds value' — Table 5 and Figure 3 show ensemble gains. (3) 'significant drops in accuracy from imperfect FL' — Table 6 shows 7–17% pass@1 drops.", 23 "source": "opus" 24 }, 25 "causal_claims_justified": { 26 "applies": true, 27 "answer": true, 28 "justification": "Causal claims like 'including information about the failing test case significantly boosted the repair performance' are supported by controlled comparisons where only one prompt component changes while all else remains equal (same models, same bugs, same evaluation). This is a valid single-variable manipulation design.", 29 "source": "opus" 30 }, 31 "generalization_bounded": { 32 "applies": true, 33 "answer": true, 34 "justification": "Section 5 explicitly states: 'Our broad selection of models and benchmarks reduces the threat to external validity; however, we cannot claim generality beyond our experiments.' The paper appropriately scopes findings to the tested models, languages, and benchmarks.", 35 "source": "opus" 36 }, 37 "alternative_explanations_discussed": { 38 "applies": true, 39 "answer": true, 40 "justification": "Section 5 discusses several alternative explanations: data leakage (with specific leak ratios from Zhou et al.), output variability of LLMs, the distinction between plausibility and correctness (test overfitting), prompt optimization effects, and repair time differences between models.", 41 "source": "opus" 42 }, 43 "proxy_outcome_distinction": { 44 "applies": true, 45 "answer": true, 46 "justification": "Section 3.4 explicitly distinguishes between plausible patches (passing tests) and correct patches (satisfying intended requirements): 'A patch may pass all tests but still fail to implement the intended functionality.' They acknowledge using plausibility as a proxy and discuss test overfitting at length.", 47 "source": "opus" 48 } 49 }, 50 "limitations_and_scope": { 51 "limitations_section_present": { 52 "applies": true, 53 "answer": true, 54 "justification": "Section 5 'Discussion & Threats to Validity' provides substantial discussion of limitations across multiple paragraphs covering data leakage, LLM variability, prompt optimization, repair time, and plausibility vs. correctness.", 55 "source": "opus" 56 }, 57 "threats_to_validity_specific": { 58 "applies": true, 59 "answer": true, 60 "justification": "Section 5 discusses specific threats: Defects4J has a leaked ratio of 0.41% and BugsInPy 11.0% (citing Zhou et al.); plausibility vs. correctness as a construct validity concern (citing Petke et al.); DeepSeek R1's longer inference time creating unfair comparison; prompt optimization was intentionally avoided to prevent model-specific biases.", 61 "source": "opus" 62 }, 63 "scope_boundaries_stated": { 64 "applies": true, 65 "answer": true, 66 "justification": "The paper explicitly states scope boundaries: single-function fixes only (excluding multi-function), zero-shot prompts only (excluding iterative/agentic workflows), no prompt optimization was performed, repair time was not analyzed rigorously, and 'we cannot claim generality beyond our experiments' (Section 5).", 67 "source": "opus" 68 } 69 }, 70 "conflicts_of_interest": { 71 "funding_disclosed": { 72 "applies": true, 73 "answer": false, 74 "justification": "No funding sources, grants, or sponsors are mentioned anywhere in the paper. There is no acknowledgments section listing funding.", 75 "source": "opus" 76 }, 77 "affiliations_disclosed": { 78 "applies": true, 79 "answer": true, 80 "justification": "Author affiliations are clearly listed: RheinMain University of Applied Sciences, SonarSource (Singapore), and Ruhr University Bochum. Section 8 provides a disclaimer regarding SonarSource.", 81 "source": "opus" 82 }, 83 "funder_independent_of_outcome": { 84 "applies": true, 85 "answer": false, 86 "justification": "No funding is disclosed, making independence impossible to verify. One author is affiliated with SonarSource, a company in the code quality space, which could have a commercial interest in APR outcomes. Section 8 provides a disclaimer but no formal independence statement.", 87 "source": "opus" 88 }, 89 "financial_interests_declared": { 90 "applies": true, 91 "answer": false, 92 "justification": "No formal competing interests or financial interests statement is provided. Section 8 has a disclaimer about SonarSource but does not constitute a standard financial interests declaration covering patents, equity, or other commercial interests.", 93 "source": "opus" 94 } 95 }, 96 "scope_and_framing": { 97 "key_terms_defined": { 98 "applies": true, 99 "answer": true, 100 "justification": "Key terms are defined: 'open models' (publicly available weights, not necessarily open-source licensed), 'closed models' (proprietary), 'plausible' vs 'correct' patches, and fault localization granularity levels.", 101 "source": "haiku" 102 }, 103 "intended_contribution_clear": { 104 "applies": true, 105 "answer": true, 106 "justification": "Three explicit bulleted contributions are stated in the introduction: comprehensive multi-LLM evaluation, practitioner take-aways (model selection, FL bottleneck), and researcher insights, with open-sourced results.", 107 "source": "haiku" 108 }, 109 "engagement_with_prior_work": { 110 "applies": true, 111 "answer": true, 112 "justification": "Section 2 explicitly compares with and extends Xia et al. (2023), Ouyang et al. (2024), and others, identifying specific gaps: more models, more languages, and automated FL comparison.", 113 "source": "haiku" 114 } 115 } 116 }, 117 "type_checklist": { 118 "empirical": { 119 "artifacts": { 120 "code_released": { 121 "applies": true, 122 "answer": false, 123 "justification": "Section 7 states 'Upon acceptance, we will update the artifact with a reproduction package including the scripts for prompting the LLMs as well for the data analysis.' This is a promise of future release, not a current release. Only results and generated patches are currently available on figshare.", 124 "source": "opus" 125 }, 126 "data_released": { 127 "applies": true, 128 "answer": true, 129 "justification": "Section 7 states results and generated patches are 'openly available in our supplemental material' on figshare. Additionally, all four benchmarks (Defects4J, BugsInPy, BugsJS, BugsPHP) are publicly available datasets.", 130 "source": "opus" 131 }, 132 "environment_specified": { 133 "applies": true, 134 "answer": false, 135 "justification": "No requirements.txt, Dockerfile, conda environment, or detailed environment setup section is provided. The paper mentions model names and temperature but not the computational environment for running experiments.", 136 "source": "opus" 137 }, 138 "reproduction_instructions": { 139 "applies": true, 140 "answer": false, 141 "justification": "No step-by-step reproduction instructions are provided. Scripts are promised for future release upon acceptance (Section 7).", 142 "source": "opus" 143 } 144 }, 145 "statistical_methodology": { 146 "confidence_intervals_or_error_bars": { 147 "applies": true, 148 "answer": false, 149 "justification": "Tables 2, 4, 6, and 7 report pass@k as point estimates without confidence intervals or error bars. No ± notation or CI notation appears anywhere in the results.", 150 "source": "opus" 151 }, 152 "significance_tests": { 153 "applies": true, 154 "answer": true, 155 "justification": "Section 3.4 states: 'we apply the Wilcoxon signed-rank test at a significance level of α = 0.05.' Results tables use bold for best and underline for results not significantly different from the best.", 156 "source": "opus" 157 }, 158 "effect_sizes_reported": { 159 "applies": true, 160 "answer": true, 161 "justification": "Effect sizes are reported as absolute percentage differences with baselines for context. For example, Table 4 shows base vs. test prompt differences (e.g., 19.02% → 45.33% for Claude 3.7 on Java), and the text reports specific improvements like '+47% pass@1' and '-16.20%' for FL comparisons.", 162 "source": "opus" 163 }, 164 "sample_size_justified": { 165 "applies": true, 166 "answer": true, 167 "justification": "The number of generations per bug (n=15) is justified by citing prior work: 'Based on the standard deviation analysis of pass@1 for LLM-based APR from [27], we use n = 15 as a reasonable, yet manageable number of generations.' The 100-bug subsets are justified through stratified sampling preserving complexity distributions (Section 4.1.2).", 168 "source": "opus" 169 }, 170 "variance_reported": { 171 "applies": true, 172 "answer": false, 173 "justification": "Results are reported as pass@k point estimates computed from 15 generations (3 runs × 5 samples). No standard deviation, IQR, or variance across the 3 independent runs is reported. The reader cannot assess result stability.", 174 "source": "opus" 175 } 176 }, 177 "evaluation_design": { 178 "baselines_included": { 179 "applies": true, 180 "answer": true, 181 "justification": "13 LLMs are compared against each other, including both open and closed models. The base prompt serves as a baseline condition, with test and line-level localization prompts as variations.", 182 "source": "opus" 183 }, 184 "baselines_contemporary": { 185 "applies": true, 186 "answer": true, 187 "justification": "Models include recent frontier models: Claude 3.7 Sonnet, OpenAI o3-mini, DeepSeek R1, Gemini 2.0 Flash, and Qwen 2.5 Coder. Model selection was based on recent code-focused leaderboards (Section 3.2).", 188 "source": "opus" 189 }, 190 "ablation_study": { 191 "applies": true, 192 "answer": true, 193 "justification": "The study systematically varies prompt components: base (code only), test (adding test info), and line-level localization (adding fix location hints). This functions as an ablation of prompt ingredients, with Tables 4 and 6 showing the impact of each component.", 194 "source": "opus" 195 }, 196 "multiple_metrics": { 197 "applies": true, 198 "answer": true, 199 "justification": "Two metrics are reported throughout: pass@1 (success with one candidate) and pass@5 (success among five candidates), following the Chen et al. formulation.", 200 "source": "opus" 201 }, 202 "human_evaluation": { 203 "applies": true, 204 "answer": false, 205 "justification": "Section 3.4 states 'a manual review of patches is infeasible at scale.' Evaluation is entirely automated using test-suite plausibility. No human review of patch quality was conducted.", 206 "source": "opus" 207 }, 208 "held_out_test_set": { 209 "applies": true, 210 "answer": false, 211 "justification": "Section 3.5 mentions preliminary experiments to select the prompt format: 'In preliminary experiments, we evaluated different methods for integrating such line-level localization information into APR prompts.' It is unclear whether these preliminary experiments used separate data from the reported results, creating potential data snooping.", 212 "source": "opus" 213 }, 214 "per_category_breakdown": { 215 "applies": true, 216 "answer": true, 217 "justification": "Results are broken down by programming language (4 benchmarks), prompt type (base/test/LL), patch complexity (single-line/single-hunk/multi-hunk in Table 7), and model type (open vs. closed). Figure 2 shows per-model unique fix contributions via Venn diagrams.", 218 "source": "opus" 219 }, 220 "failure_cases_discussed": { 221 "applies": true, 222 "answer": true, 223 "justification": "Figure 1 analyzes Python indentation errors as a systematic failure mode. Section 4.2.2 discusses cases where automated FL fails to identify the correct function (72/100 bugs). The paper also discusses cases where plausible patches are produced despite incorrect localization (attributed to test overfitting).", 224 "source": "opus" 225 }, 226 "negative_results_reported": { 227 "applies": true, 228 "answer": true, 229 "justification": "Several negative results are reported: line-level localization hurts accuracy for 4/6 LLMs on PHP (Section 4.2.1); automated FL causes dramatic accuracy drops (Table 6); no single model generalizes across languages; Python performance is surprisingly poor due to indentation issues.", 230 "source": "opus" 231 } 232 }, 233 "setup_transparency": { 234 "model_versions_specified": { 235 "applies": true, 236 "answer": true, 237 "justification": "All 13 models are specified with identifiable names and sizes: Claude 3.7 Sonnet, Claude 3.5 Haiku, Gemini 1.5 Pro, Gemini 2.0 Flash, GPT-4o (Nov 11 2024 version), o3-mini (Jan 31 2025 version), CodeLlama 13B/70B, DeepSeek Coder 33B, DeepSeek R1 distilled to Llama 70B, Qwen 2.5 Coder 33B, Llama 3.3 70B, CodeGemma 7B. Version dates are given for continuously-updated OpenAI models.", 238 "source": "opus" 239 }, 240 "prompts_provided": { 241 "applies": true, 242 "answer": true, 243 "justification": "Full prompt templates are provided in Listings 1–4, showing system messages and user prompts with placeholders for benchmark code. The fill values (buggy functions, test cases, error messages) come from the public benchmarks, making prompts fully reconstructible. Supplemental material is also referenced.", 244 "source": "opus" 245 }, 246 "hyperparameters_reported": { 247 "applies": true, 248 "answer": true, 249 "justification": "Section 3.4 states: 'each respective model's standard setting with a temperature of 1.0.' Temperature is the key sampling hyperparameter. Other settings are stated to use defaults.", 250 "source": "opus" 251 }, 252 "scaffolding_described": { 253 "applies": false, 254 "answer": false, 255 "justification": "No agentic scaffolding is used. Section 3.5 states: 'the prompts are not iterative, meaning that each model processes a sample/query without follow-up interactions.' All experiments use single-prompt zero-shot setups.", 256 "source": "opus" 257 }, 258 "data_preprocessing_documented": { 259 "applies": true, 260 "answer": true, 261 "justification": "The filtering pipeline is documented: benchmarks are filtered for reproducibility, single-function bugs, and single-hunk bugs (Table 1 shows counts at each stage). For the 100-bug subsets, stratified sampling by complexity level is described (Section 4.1.2, Table 3), ensuring all repositories are represented.", 262 "source": "opus" 263 } 264 }, 265 "data_integrity": { 266 "raw_data_available": { 267 "applies": true, 268 "answer": true, 269 "justification": "Section 7 states: 'The data that support the findings of this study, including all our results and the generated patches, are openly available in our supplemental material: https://figshare.com/s/947fd7030f10a67a1c9f.' This includes raw patch outputs.", 270 "source": "opus" 271 }, 272 "data_collection_described": { 273 "applies": true, 274 "answer": true, 275 "justification": "Section 3.3 describes benchmark selection criteria (diverse languages, real bugs, executable with test suites, human ground-truth patches, sufficient reproducible bugs). Patch generation is described: 15 patches per bug via 3 independent runs of 5 generations each at temperature 1.0.", 276 "source": "opus" 277 }, 278 "recruitment_methods_described": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants. Data sources are standard public benchmarks (Defects4J, BugsInPy, BugsJS, BugsPHP).", 282 "source": "opus" 283 }, 284 "data_pipeline_documented": { 285 "applies": true, 286 "answer": true, 287 "justification": "Table 1 shows the filtering pipeline with counts at each stage (all bugs → single-file → single-function → single-hunk → single-line). Section 4.1.2 describes stratified sampling for 100-bug subsets with the distribution shown in Table 3. Reproducibility checks are described per benchmark.", 288 "source": "opus" 289 } 290 }, 291 "contamination": { 292 "training_cutoff_stated": { 293 "applies": true, 294 "answer": false, 295 "justification": "Model version dates are given for GPT-4o (Nov 11 2024) and o3-mini (Jan 31 2025), but these are release dates, not training data cutoff dates. Training cutoff dates are not stated for any of the 13 models evaluated.", 296 "source": "opus" 297 }, 298 "train_test_overlap_discussed": { 299 "applies": true, 300 "answer": true, 301 "justification": "Section 5 explicitly discusses overlap: 'we have to assume that some of the benchmark data may have been included in the training corpora of some of the LLMs.' They cite Zhou et al. showing Defects4J has 0.41% leaked ratio and BugsInPy 11.0%, and Ramos et al. on memorization in open-source models.", 302 "source": "opus" 303 }, 304 "benchmark_contamination_addressed": { 305 "applies": true, 306 "answer": true, 307 "justification": "Section 5 addresses contamination by citing Zhou et al.'s LessLeak-Bench analysis of leaked ratios and Ramos et al.'s study of memorization. They note 'BugsInPy obviously has some data leakage issues' yet was the most challenging benchmark, providing an empirical counterpoint to contamination concerns.", 308 "source": "opus" 309 } 310 }, 311 "human_studies": { 312 "pre_registered": { 313 "applies": false, 314 "answer": false, 315 "justification": "No human participants. This is a benchmark evaluation of LLMs on code repair tasks.", 316 "source": "opus" 317 }, 318 "irb_or_ethics_approval": { 319 "applies": false, 320 "answer": false, 321 "justification": "No human participants. The study evaluates LLM performance on public code benchmarks.", 322 "source": "opus" 323 }, 324 "demographics_reported": { 325 "applies": false, 326 "answer": false, 327 "justification": "No human participants in this study.", 328 "source": "opus" 329 }, 330 "inclusion_exclusion_criteria": { 331 "applies": false, 332 "answer": false, 333 "justification": "No human participants in this study.", 334 "source": "opus" 335 }, 336 "randomization_described": { 337 "applies": false, 338 "answer": false, 339 "justification": "No human participants in this study.", 340 "source": "opus" 341 }, 342 "blinding_described": { 343 "applies": false, 344 "answer": false, 345 "justification": "No human participants in this study.", 346 "source": "opus" 347 }, 348 "attrition_reported": { 349 "applies": false, 350 "answer": false, 351 "justification": "No human participants in this study.", 352 "source": "opus" 353 } 354 }, 355 "cost_and_practicality": { 356 "inference_cost_reported": { 357 "applies": true, 358 "answer": false, 359 "justification": "No inference costs, API costs, or per-example latencies are reported despite generating approximately 195,000 patches. Section 5 mentions timing differences between models ('spanning between seconds and several minutes') but provides no specific cost data.", 360 "source": "opus" 361 }, 362 "compute_budget_stated": { 363 "applies": true, 364 "answer": false, 365 "justification": "No total computational budget is stated. No GPU hours, total API spend, or hardware specifications are reported. Section 5 acknowledges: 'Our study methodology did not limit the time for model inference or patch validation.'", 366 "source": "opus" 367 } 368 }, 369 "experimental_rigor": { 370 "seed_sensitivity_reported": { 371 "applies": true, 372 "answer": false, 373 "justification": "Three independent runs of 5 generations each are performed, but results are aggregated into a single pass@k estimate. No breakdown by run or analysis of how results vary across the 3 independent runs is provided.", 374 "source": "opus" 375 }, 376 "number_of_runs_stated": { 377 "applies": true, 378 "answer": true, 379 "justification": "Section 3.4 states: 'We distributed the generation across three independent runs, each generating five candidate patches using each respective model's standard setting with a temperature of 1.0.' Total n=15 per bug.", 380 "source": "opus" 381 }, 382 "hyperparameter_search_budget": { 383 "applies": true, 384 "answer": false, 385 "justification": "Section 3.5 mentions 'preliminary experiments' to evaluate prompt integration methods for line-level localization, but does not report how many configurations were tried or the compute spent on this search.", 386 "source": "opus" 387 }, 388 "best_config_selection_justified": { 389 "applies": true, 390 "answer": false, 391 "justification": "The LL prompt format was selected from preliminary experiments ('The most effective strategy was found to be a simple comment \"TODO: Fix here:\"') but the paper does not report all configurations tried, what data was used for selection, or the selection criterion beyond effectiveness.", 392 "source": "opus" 393 }, 394 "multiple_comparison_correction": { 395 "applies": true, 396 "answer": false, 397 "justification": "Wilcoxon signed-rank tests are applied across 13 models × 4 languages (52+ comparisons per table) at α = 0.05 with no mention of Bonferroni, Holm, or other multiple comparison correction.", 398 "source": "opus" 399 }, 400 "self_comparison_bias_addressed": { 401 "applies": true, 402 "answer": true, 403 "justification": "The authors do not propose their own APR system; they evaluate 13 third-party LLMs using standardized prompts and public benchmarks. No self-comparison bias exists since they are not comparing their own tool against baselines.", 404 "source": "opus" 405 }, 406 "compute_budget_vs_performance": { 407 "applies": true, 408 "answer": false, 409 "justification": "Section 5 acknowledges 'Different resource demands across the models, in particular with respect to timing, can lead to unfair comparison' but does not report performance as a function of compute budget. DeepSeek R1 is noted as slower but no quantification is provided.", 410 "source": "opus" 411 }, 412 "benchmark_construct_validity": { 413 "applies": true, 414 "answer": true, 415 "justification": "Section 3.4 discusses construct validity at length: plausibility vs. correctness of patches, test overfitting as a proxy issue, and comparison of metrics (plausibility, TCE, SYE). They cite Petke et al. who found overfitting 'may be less problematic than previously assumed.'", 416 "source": "opus" 417 }, 418 "scaffold_confound_addressed": { 419 "applies": false, 420 "answer": false, 421 "justification": "No scaffolding is used. All models are evaluated using the same zero-shot prompt templates, eliminating scaffold confounds by design.", 422 "source": "opus" 423 } 424 }, 425 "data_leakage": { 426 "temporal_leakage_addressed": { 427 "applies": true, 428 "answer": true, 429 "justification": "Section 5 discusses temporal leakage: 'Related works, e.g., RepairBench used a dataset with more recent data, whose time period is now also included in the cut-off dates of the latest models.' They cite Zhou et al. on leaked ratios of the specific benchmarks used.", 430 "source": "opus" 431 }, 432 "feature_leakage_addressed": { 433 "applies": true, 434 "answer": false, 435 "justification": "No discussion of whether the evaluation setup leaks answer information through context. Different prompt variants provide different amounts of information (base vs test vs LL) but this is studied as a variable, not as a leakage concern.", 436 "source": "opus" 437 }, 438 "non_independence_addressed": { 439 "applies": true, 440 "answer": false, 441 "justification": "No discussion of whether bugs from the same projects within benchmarks create non-independence. Defects4J contains bugs from 17 projects and BugsInPy from 17 projects, but potential correlations within projects are not addressed.", 442 "source": "opus" 443 }, 444 "leakage_detection_method": { 445 "applies": true, 446 "answer": false, 447 "justification": "No concrete leakage detection method is applied by the authors. They cite external analyses (Zhou et al.'s leaked ratios, Ramos et al.'s memorization study) but do not apply their own canary strings, membership inference, or decontamination techniques.", 448 "source": "opus" 449 } 450 } 451 } 452 }, 453 "claims": [ 454 { 455 "claim": "No single LLM outperforms all others across Java, JavaScript, Python, and PHP for automated program repair", 456 "evidence": "Table 2 shows four different models achieving best pass@1 per language: Claude 3.7 Sonnet (Java), Claude 3.5 Haiku (JS), DeepSeek R1 dist. (PHP), Gemini 2.0 Flash (Python)", 457 "supported": "strong" 458 }, 459 { 460 "claim": "Including failing test case information in the prompt substantially improves APR performance across all models and languages", 461 "evidence": "Table 4 shows improvements up to +47% pass@1 (DeepSeek R1 on Python); improvements are consistent across all 6 models and 4 languages tested", 462 "supported": "strong" 463 }, 464 { 465 "claim": "Automated fault localization causes dramatic performance collapse compared to perfect fault localization", 466 "evidence": "Table 6 shows pass@1 drops from 15-27% (perfect FL) to 0.31-3.64% (automated FL) on Defects4J; FLACOCO correctly identified the buggy function in only 28/100 cases", 467 "supported": "strong" 468 }, 469 { 470 "claim": "Combining two models in an ensemble improves pass@5 over the best single model", 471 "evidence": "Table 5 shows improvements in 14 of 16 language-prompt combinations; e.g., JavaScript pass@5 improves from 68.0% (o3-mini alone) to 71.68% (o3-mini + DeepSeek R1)", 472 "supported": "strong" 473 }, 474 { 475 "claim": "Open-source models are catching up to closed models for APR performance", 476 "evidence": "Figure 4 shows DeepSeek R1 (dist.) reaching or exceeding closed model performance in pass@5 averaged over all benchmarks", 477 "supported": "moderate" 478 }, 479 { 480 "claim": "Line-level localization adds less value than test information and can degrade performance", 481 "evidence": "Table 4 shows LL consistently underperforms test-prompt improvements; PHP performance degrades for 4/6 models with line-level localization", 482 "supported": "strong" 483 }, 484 { 485 "claim": "Python APR performance is poor largely due to indentation errors in generated patches", 486 "evidence": "Figure 1 shows high rates of indentation errors across most models; Gemini models with fewer indentation errors achieve comparatively better Python performance", 487 "supported": "moderate" 488 } 489 ], 490 "methodology_tags": [ 491 "benchmark-eval" 492 ], 493 "key_findings": "No single LLM generalizes best across Java, JavaScript, Python, and PHP for automated program repair, with different models leading on different languages. Including failing test case information dramatically improves performance (up to +47% pass@1), while automated fault localization causes catastrophic performance drops (from 15-27% to under 4% pass@1 on Defects4J), exposing a critical gap between idealized research evaluations and realistic deployment. Model ensembles consistently outperform individual models in 14/16 settings, and open-source models (particularly DeepSeek R1) are approaching closed-model performance levels.", 494 "red_flags": [ 495 { 496 "flag": "Reproduction scripts unavailable", 497 "detail": "Section 7 promises reproduction scripts 'upon acceptance' but currently only raw results on figshare are available; the paper cannot yet be fully reproduced from the released artifact." 498 }, 499 { 500 "flag": "Plausibility as correctness proxy without manual verification", 501 "detail": "Test-passing (plausibility) is used as the primary metric throughout; the paper acknowledges this is a construct validity threat but performs no manual patch correctness verification." 502 }, 503 { 504 "flag": "No variance reported across runs", 505 "detail": "Three independent runs of five patches each are generated but variance or std dev of pass@k estimates across runs is not reported, obscuring run-to-run variability." 506 }, 507 { 508 "flag": "Automated FL tested on Java only", 509 "detail": "FLACOCO supports only Java, so the critical finding about automated FL performance collapse is validated on only one of four languages, limiting generalizability of this key result." 510 }, 511 { 512 "flag": "No confidence intervals on pass@k estimates", 513 "detail": "Pass@k point estimates are reported without confidence intervals; statistical uncertainty beyond what the Wilcoxon test captures is not quantified." 514 }, 515 { 516 "flag": "No inference costs reported", 517 "detail": "~195,000 patches were generated across 13 models including expensive closed-source APIs (Claude 3.7, o3-mini, GPT-4o), but no cost figures are provided, limiting practitioner budget planning." 518 } 519 ], 520 "cited_papers": [ 521 { 522 "title": "Automated Program Repair in the Era of Large Pre-trained Language Models", 523 "relevance": "Direct predecessor empirical study by Xia et al. evaluating 9 LLMs on APR across multiple benchmarks and languages; this paper explicitly extends and updates those findings" 524 }, 525 { 526 "title": "RepairBench: Leaderboard of Frontier Models for Program Repair", 527 "relevance": "Contemporary benchmark and prompt design reference; the test prompt template is directly adopted from RepairBench" 528 }, 529 { 530 "title": "Evaluating Large Language Models Trained on Code", 531 "relevance": "Source of the pass@k metric (Chen et al. 2021) used as primary evaluation criterion throughout the paper" 532 }, 533 { 534 "title": "Defects4J: a database of existing faults to enable controlled testing studies for Java programs", 535 "relevance": "Primary benchmark used; most prior APR work uses Defects4J enabling direct comparison" 536 }, 537 { 538 "title": "Benchmarking Automated Program Repair: An Extensive Study on Both Real-World and Artificial Bugs", 539 "relevance": "Prior comprehensive APR benchmark study by Ouyang et al. that this work extends to more recent LLMs" 540 }, 541 { 542 "title": "The Fact Selection Problem in LLM-Based Program Repair", 543 "relevance": "Prior work on test diagnostic feedback for LLM-based repair; this paper confirms and extends those findings across more models and languages" 544 }, 545 { 546 "title": "LessLeak-Bench: A First Investigation of Data Leakage in LLMs Across 83 Software Engineering Benchmarks", 547 "relevance": "Provides benchmark-specific data leakage rates (Defects4J 0.41%, BugsInPy 11.0%) used to assess contamination threat" 548 }, 549 { 550 "title": "Breaking the Silence: the Threats of Using LLMs in Software Engineering", 551 "relevance": "Framework for addressing LLM evaluation threats including non-determinism and data leakage that this paper follows" 552 } 553 ], 554 "engagement_factors": { 555 "practical_relevance": { 556 "score": 2, 557 "justification": "APR practitioners can use the model comparison insights and model committee strategy, but no tool or code is currently released." 558 }, 559 "surprise_contrarian": { 560 "score": 1, 561 "justification": "The finding that no single LLM dominates across languages is somewhat expected; the dramatic impact of automated FL is noteworthy but not shocking." 562 }, 563 "fear_safety": { 564 "score": 0, 565 "justification": "No security or safety concerns raised; the paper focuses on bug-fixing capabilities of LLMs." 566 }, 567 "drama_conflict": { 568 "score": 0, 569 "justification": "No controversy or conflict; a straightforward empirical comparison study." 570 }, 571 "demo_ability": { 572 "score": 0, 573 "justification": "No demo, tool, or released code; scripts are promised upon acceptance but not yet available." 574 }, 575 "brand_recognition": { 576 "score": 1, 577 "justification": "From academic labs (RheinMain, Ruhr University) and SonarSource; evaluates well-known models (GPT-4o, Claude, Gemini) but the paper itself is not from a major AI lab." 578 } 579 }, 580 "hn_data": { 581 "threads": [ 582 { 583 "hn_id": "44750462", 584 "title": "Nonogram: Complexity of Inference and Phase Transition Behavior", 585 "points": 16, 586 "comments": 2, 587 "url": "https://news.ycombinator.com/item?id=44750462" 588 }, 589 { 590 "hn_id": "44815351", 591 "title": "The possibility of a giant impact on Venus", 592 "points": 5, 593 "comments": 0, 594 "url": "https://news.ycombinator.com/item?id=44815351" 595 }, 596 { 597 "hn_id": "31662569", 598 "title": "NeMF: Neural Motion Fields for Kinematic Animation", 599 "points": 4, 600 "comments": 0, 601 "url": "https://news.ycombinator.com/item?id=31662569" 602 }, 603 { 604 "hn_id": "46021186", 605 "title": "User Location Disclosure Amplifies Regional Divisions on Chinese Social Media", 606 "points": 3, 607 "comments": 0, 608 "url": "https://news.ycombinator.com/item?id=46021186" 609 }, 610 { 611 "hn_id": "27450354", 612 "title": "Tabular Data: Deep Learning Is Not All You Need", 613 "points": 3, 614 "comments": 0, 615 "url": "https://news.ycombinator.com/item?id=27450354" 616 }, 617 { 618 "hn_id": "47690469", 619 "title": "Frontier AI models are the most cost-efficient", 620 "points": 2, 621 "comments": 0, 622 "url": "https://news.ycombinator.com/item?id=47690469" 623 }, 624 { 625 "hn_id": "44003454", 626 "title": "Twist: Teleoperated Whole-Body Imitation System", 627 "points": 2, 628 "comments": 0, 629 "url": "https://news.ycombinator.com/item?id=44003454" 630 }, 631 { 632 "hn_id": "43692092", 633 "title": "Semantic Commit: Helping Users Update Intent Specifications for AI Memory", 634 "points": 2, 635 "comments": 0, 636 "url": "https://news.ycombinator.com/item?id=43692092" 637 }, 638 { 639 "hn_id": "32176051", 640 "title": "Nezha: Deployable and High-Performance Consensus Using Synchronized Clocks", 641 "points": 2, 642 "comments": 0, 643 "url": "https://news.ycombinator.com/item?id=32176051" 644 }, 645 { 646 "hn_id": "45293628", 647 "title": "A Trustworthiness-Based Metaphysics of Artificial Intelligence Systems", 648 "points": 1, 649 "comments": 0, 650 "url": "https://news.ycombinator.com/item?id=45293628" 651 } 652 ], 653 "top_points": 16, 654 "total_points": 40, 655 "total_comments": 2 656 } 657 }