scan.json (26850B)
1 { 2 "paper": { 3 "title": "Automated Repair of C Programs Using Large Language Models", 4 "authors": ["Mahdi Farzandway", "Fatemeh Ghassemi"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2509.01947" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": true, 14 "justification": "Section 4.5 states 'The implementation of our method is accessible on GitHub' and references a GitHub repository, though the specific URL is given only implicitly through the paper's context. The claim is made in the implementation details section." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The paper uses the Codeflaws dataset (reference [27]), which is a publicly available benchmark. Section 4.4 describes the dataset in detail. Since they used a standard public benchmark without modification, this counts as YES." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "The paper mentions using LangChain, LangGraph, GCC with specific flags, and the Suresoft-GLaDOS/SBFL library, but does not provide a requirements.txt, Dockerfile, or detailed environment specification with library versions. No version numbers are given for any dependencies." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "While the methodology is described at a high level (Section 3), there are no step-by-step reproduction instructions, no README with commands, and no scripts to replicate the main experiments. A researcher would have to reconstruct the pipeline from the prose description." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "All results are reported as point estimates (e.g., '44.93% repair accuracy'). No confidence intervals, error bars, or uncertainty measures are provided in Tables 1, 2, or 3." 
37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper claims improvements over baselines (e.g., '3.61% absolute improvement over GPT-4 with CoT') but provides no statistical significance tests such as p-values, t-tests, or bootstrap tests to support these claims." 42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "The paper reports absolute improvement with baseline context: '44.93% repair accuracy—representing a 3.61% absolute improvement over strong state-of-the-art APR baselines such as GPT-4 with CoT' (41.32%). Table 1 provides baselines and the proposed method's scores, allowing readers to compute relative differences." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "The evaluation uses 3,902 bugs from Codeflaws but there is no justification for why this sample size is adequate, no power analysis, and no discussion of whether the sample is representative of real-world bugs." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. It is unclear whether results represent a single run or are averaged across multiple runs. No seed variation or multiple-run results are mentioned." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "Table 1 compares against GPT-4 with CoT (41.32%), SPR (20.06%), CoCoNuT (18.34%), and Angelix (15.14%). Multiple baselines from both classical APR and LLM-based approaches are included." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": false, 68 "justification": "The classical baselines SPR (2015), Angelix (2016), and CoCoNuT (2020) are quite old. The only contemporary baseline is GPT-4 with CoT. 
More recent LLM-based APR tools like CigaR (2024), ContrastRepair (2024), and D4C (2024) are discussed in related work but not compared against on the same benchmark. The paper acknowledges not comparing against Agentless and AutoCodeRover due to different task settings." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": true, 73 "justification": "The five evaluation scenarios (Section 4.2) function as an ablation study: Scenario 1 (no feedback), Scenario 2 (test cases only), Scenario 3 (test cases + SBFL), Scenario 4 (CoT + test cases), Scenario 5 (CoT + SBFL + test cases). Table 1 shows progressive improvement as components are added." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "Section 4.3 defines three metrics: Repair Accuracy, Time-to-Repair, and Partial Repair Rate. Results are reported for all three in Tables 1, 2, and 3." 79 }, 80 "human_evaluation": { 81 "applies": true, 82 "answer": false, 83 "justification": "No human evaluation of the generated patches is performed. Evaluation is entirely automated (pass/fail on test suites). Human evaluation could have assessed patch quality, readability, or semantic correctness beyond test suite passing." 84 }, 85 "held_out_test_set": { 86 "applies": true, 87 "answer": false, 88 "justification": "The paper uses the full Codeflaws dataset of 3,902 bugs. There is no mention of train/dev/test splits. Since the LLMs are not fine-tuned on the dataset, a held-out set is less critical, but model selection and hyperparameter tuning (e.g., choosing 4 iterations, Ochiai > 0.5 threshold) are performed on the same dataset used for evaluation." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": false, 93 "justification": "Results are reported only as aggregate accuracy across all 3,902 bugs. No breakdown by bug type, defect category, or difficulty level is provided, despite the Codeflaws dataset containing categorization information." 
94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "Section 5.2 'Failure Analysis' identifies four categories of failure: non-compilable patches, semantic errors, timeouts, and non-terminating executions. This provides substantive analysis of where the approach breaks down." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "The paper reports that approximately 55% of bugs remain unresolved (Section 5.2), and Table 3 shows 48.98% 'No Improvement' cases for the best model. The diminishing returns beyond 4 iterations (Fig 5) is also a negative finding." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "The abstract claims 44.93% repair accuracy and 3.61% absolute improvement over GPT-4 with CoT. These are supported by Table 1 (Llama 3.1 405b Scenario 5 = 44.93%, GPT-4 CoT = 41.32%, difference = 3.61%)." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": true, 115 "justification": "The ablation-style scenarios (1-5) provide a controlled progression of components. Each scenario adds one element (test feedback, SBFL, CoT), isolating contributions. This controlled single-variable manipulation across scenarios supports the causal claims about which components improve performance." 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": false, 120 "justification": "The title claims 'Automated Repair of C Programs' broadly, but the evaluation is limited to competitive programming bugs from Codeflaws. Section 7 briefly mentions expanding 'beyond C programs from programming competitions to enterprise-level software systems,' but the title and abstract do not bound the claims to competitive programming."
121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": false, 125 "justification": "The paper does not discuss alternative explanations for the observed improvements. For example, the improvement over GPT-4 with CoT could be partly due to the iterative loop (4 iterations vs. single query for GPT-4), different model characteristics, or the specific nature of Codeflaws bugs. Section 7 discusses limitations but not alternative explanations for the results." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": false, 132 "justification": "The paper specifies 'Llama 3.1 405b', 'Llama 3.2 90b', and 'Llama 3.1 70b' but does not provide snapshot dates or specific model version identifiers. For the GPT-4 baseline, only 'GPT-4' is mentioned with a reference to the GPT-4 technical report. These are marketing names without version specifics (e.g., no 'meta-llama/Meta-Llama-3.1-405B-Instruct' or similar precise identifier)." 133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": false, 137 "justification": "Figures 4a and 4b show the 'structure' of system and user prompts, but these are schematic diagrams of prompt architecture, not the actual prompt text. The full prompt text used in experiments is not provided in the paper or appendix." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": false, 142 "justification": "No temperature, top-p, max tokens, or other LLM API parameters are reported. The paper mentions using OpenRouter platform for API access but does not state any sampling parameters used for the Llama models or the GPT-4 baseline." 
143 }, 144 "scaffolding_described": { 145 "applies": true, 146 "answer": true, 147 "justification": "Section 3 and 4.5 describe the agentic scaffolding in detail: the iterative refinement loop, the memory mechanism for prior attempts, the directed graph-based workflow, the two-tier prompting mechanism, SBFL integration, and the termination conditions. Figure 2 illustrates the workflow." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": true, 152 "justification": "Section 4.4 describes the Codeflaws dataset structure (faulty implementation, test cases, expected outputs). Section 4.5 describes how code is compiled with GCC flags, how coverage data is collected, and how SBFL scores are computed with the Ochiai formula filtering for scores > 0.5." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": true, 159 "justification": "Section 7 'Limitations and Future Work' provides a dedicated section discussing limitations including the requirement for executable code, line-level vs. statement-level granularity, dependency on high-quality test cases, and time-related errors." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": true, 164 "justification": "Section 7 discusses specific threats: 'the current implementation requires executable code to run test cases,' 'our spectral analysis operates at line-level granularity, where statement-level analysis could potentially improve precision,' and 'the evaluation assumes the availability of high-quality test cases, which may not always be available in practice.' These are specific to this study." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": false, 169 "justification": "While Section 7 mentions future plans to 'expand beyond C programs from programming competitions to enterprise-level software systems,' the paper does not explicitly state what the results do NOT show. 
The limitations section focuses on what could be improved, not on what claims cannot be made from the current evidence." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": false, 176 "justification": "Only aggregate results are presented in tables. The per-bug results, individual patch outputs, and detailed repair logs are not made available for independent verification." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "Section 4.4 describes the Codeflaws dataset: 3,902 unique code samples from public programming competitions, each consisting of a faulty implementation, test cases with pass/fail results, and expected correct outputs." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants are involved. The evaluation is entirely computational using the Codeflaws benchmark dataset." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section 3 and 4.5 document the pipeline: code execution with GCC coverage flags, test case execution with 2-minute timeout, coverage data collection via gcov, SBFL computation using Ochiai formula, prompt construction, and iterative refinement. The pipeline from data collection to analysis is described." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": false, 198 "justification": "No acknowledgments section, no funding disclosure, and no mention of grants or sponsors anywhere in the paper." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Author affiliations are clearly listed: both authors are from University of Tehran. The paper evaluates open-source Llama models, so there is no direct product-affiliation conflict." 
204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding is disclosed at all. The absence of a funding disclosure means we cannot assess whether a funder influenced the outcome. The paper uses OpenRouter's free API access for Llama models, which could be considered a form of in-kind support but is not disclosed." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests statement or financial interest declaration is present in the paper." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": true, 219 "answer": false, 220 "justification": "The paper does not state the training data cutoff dates for any of the Llama models (3.1 405b, 3.2 90b, 3.1 70b) or GPT-4. This is relevant because Codeflaws bugs come from public programming competitions and could appear in training data." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": true, 224 "answer": false, 225 "justification": "No discussion of whether Codeflaws bugs or their solutions appeared in the training data of Llama or GPT-4. The Codeflaws dataset (2017) predates all models used, making contamination a real concern that is not addressed." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": true, 229 "answer": false, 230 "justification": "The Codeflaws benchmark was published in 2017, well before the training cutoffs of Llama 3.1/3.2 and GPT-4. Solutions to these competitive programming problems are widely available online. The paper does not acknowledge or address this contamination risk." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants are involved in this study. It is a purely computational benchmark evaluation." 
238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants are involved in this study." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants are involved in this study." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants are involved in this study." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants are involved in this study." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants are involved in this study." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants are involved in this study." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": true, 274 "justification": "Table 2 reports average response times per bug for each model across all scenarios. For example, Llama 3.1 405b takes 80.3 seconds in Scenario 5. While API dollar costs are not reported, the paper notes OpenRouter provided free API access, and time-to-repair is a meaningful practical cost metric." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "No total computational budget is stated. The paper does not report total GPU hours, total API calls, total tokens consumed, or total wall-clock time for running all 3,902 bugs across all scenarios and models." 280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "The approach achieves 44.93% repair accuracy on the Codeflaws benchmark, representing a 3.61% absolute improvement over GPT-4 with CoT.", 286 "evidence": "Table 1 shows Llama 3.1 405b Scenario 5 achieves 44.93%, while GPT-4 (CoT) achieves 41.32%. 
The difference is 3.61 percentage points.", 287 "supported": "moderate" 288 }, 289 { 290 "claim": "Four iterations represent the optimal configuration for the iterative refinement process.", 291 "evidence": "Section 5 and Figure 5 show convergence analysis: 'While we extended our testing to nine iterations, the performance gains beyond the fourth iteration proved negligible.'", 292 "supported": "moderate" 293 }, 294 { 295 "claim": "Integration of SBFL with CoT reasoning shows synergistic improvement over either component alone.", 296 "evidence": "Table 1 shows progressive improvement across scenarios: Scenario 2 (test cases only) < Scenario 3 (+ SBFL) < Scenario 4 (CoT, no SBFL) < Scenario 5 (CoT + SBFL). For Llama 3.1 405b: 36.80% → 41.13% → 42.92% → 44.93%.", 297 "supported": "moderate" 298 }, 299 { 300 "claim": "Even lower-performing models achieve competitive results with the full methodology.", 301 "evidence": "Table 1 shows Llama 3.1 70b in Scenario 5 (38.83%) approaches baseline GPT-4 CoT (41.32%). However, 'competitive' is loosely defined and the gap is still 2.49 percentage points.", 302 "supported": "weak" 303 }, 304 { 305 "claim": "The system achieves a median repair time of 80 seconds per defect, which is a substantial improvement over existing techniques.", 306 "evidence": "Table 2 shows Llama 3.1 405b Scenario 5 averages 80.3 seconds. The paper claims existing techniques require '30 seconds to 4 minutes' but the 80s figure falls within this range, and the high time is partly attributed to API availability issues.", 307 "supported": "weak" 308 } 309 ], 310 "methodology_tags": ["benchmark-eval"], 311 "key_findings": "The paper presents an automated program repair framework that combines spectrum-based fault localization (SBFL) with Chain-of-Thought prompting in an iterative loop for C bug repair. 
Evaluated on 3,902 Codeflaws bugs, the approach achieves 44.93% repair accuracy using Llama 3.1 405b, a 3.61 percentage point improvement over GPT-4 with CoT (41.32%). The ablation across five scenarios shows each component (test feedback, SBFL, iterative CoT) contributes incrementally to performance, with four iterations found to be optimal before diminishing returns.", 312 "red_flags": [ 313 { 314 "flag": "Benchmark contamination risk", 315 "detail": "The Codeflaws dataset (2017) is derived from public programming competitions. All models used (Llama 3.1/3.2, GPT-4) were trained well after 2017 and likely saw solutions to these competitive programming problems during training. The paper does not acknowledge or address this contamination risk, which could inflate all reported results including baselines." 316 }, 317 { 318 "flag": "No statistical significance testing", 319 "detail": "The claimed 3.61% improvement over GPT-4 with CoT is reported without any significance test, confidence interval, or variance estimate. Without knowing the variance across runs, it is impossible to determine whether this difference is meaningful or within random fluctuation." 320 }, 321 { 322 "flag": "Unfair baseline comparison", 323 "detail": "GPT-4 with CoT is given a single query with test cases and asked to reason and repair, while the proposed method uses 4 iterative queries with SBFL, memory of prior attempts, and runtime feedback. The comparison is between a single-turn approach and a multi-turn agentic approach, making it unclear whether the improvement comes from the methodology or simply from more compute (4x more LLM calls)." 324 }, 325 { 326 "flag": "No variance or reproducibility information", 327 "detail": "LLM outputs are stochastic, yet the paper reports single-number results with no indication of whether experiments were repeated, what temperature settings were used, or how reproducible the results are." 
328 }, 329 { 330 "flag": "Missing hyperparameters", 331 "detail": "No LLM API parameters (temperature, top-p, max tokens) are reported for any model. These significantly affect output and are essential for reproducibility." 332 }, 333 { 334 "flag": "Overstated conclusion", 335 "detail": "The conclusion states the approach 'achieves a substantial 44.93% absolute accuracy improvement in bug localization and correction for C programs over baseline LLM methods.' This is misleading — the improvement is 3.61 percentage points over GPT-4 with CoT, not a 44.93% improvement. The 44.93% figure is the method's absolute repair accuracy; the same model without feedback (Scenario 1) reaches 29.01%." 336 } 337 ], 338 "cited_papers": [ 339 { 340 "title": "Evaluating large language models trained on code", 341 "authors": ["Mark Chen", "Jerry Tworek"], 342 "year": 2021, 343 "arxiv_id": "2107.03374", 344 "relevance": "Foundational work on LLM code generation capabilities (Codex), directly relevant to evaluating LLM-based programming tools." 345 }, 346 { 347 "title": "AutoCodeRover: Autonomous program improvement", 348 "authors": ["Yuntong Zhang", "Haifeng Ruan", "Zhiyu Fan", "Abhik Roychoudhury"], 349 "year": 2024, 350 "relevance": "State-of-the-art agentic program repair tool evaluated on SWE-bench, representative of modern AI-based software engineering agents." 351 }, 352 { 353 "title": "Keep the conversation going: Fixing 162 out of 337 bugs for $0.42 each using chatgpt", 354 "authors": ["Chunqiu Steven Xia", "Lingming Zhang"], 355 "year": 2023, 356 "arxiv_id": "2304.00385", 357 "relevance": "Conversational automated program repair using LLMs with cost analysis, directly relevant to LLM-based repair methodology and cost reporting."
358 }, 359 { 360 "title": "ContrastRepair: Enhancing conversation-based automated program repair via contrastive test case pairs", 361 "authors": ["Jiaolong Kong", "Mingfei Cheng"], 362 "year": 2024, 363 "arxiv_id": "2403.01971", 364 "relevance": "Recent conversation-based LLM program repair approach that uses contrastive test case pairs as feedback, closely related methodology." 365 }, 366 { 367 "title": "Agentless: Demystifying LLM-based software engineering agents", 368 "authors": ["Chunqiu Steven Xia", "Yinlin Deng", "Soren Dunn", "Lingming Zhang"], 369 "year": 2024, 370 "relevance": "Key work on LLM-based software engineering comparing agentic vs. agentless approaches, directly relevant to survey scope." 371 }, 372 { 373 "title": "SWE-bench: Can language models resolve real-world GitHub issues?", 374 "authors": ["Carlos E Jimenez", "John Yang"], 375 "year": 2023, 376 "arxiv_id": "2310.06770", 377 "relevance": "Major benchmark for evaluating LLM-based software engineering agents on real-world GitHub issues." 378 }, 379 { 380 "title": "Automated repair of programs from large language models", 381 "authors": ["Zhiyu Fan", "Xiang Gao", "Martin Mirchev", "Abhik Roychoudhury", "Shin Hwei Tan"], 382 "year": 2023, 383 "relevance": "Empirical study of LLM-generated code repair at ICSE 2023, directly relevant to understanding LLM repair capabilities." 384 }, 385 { 386 "title": "CigaR: Cost-efficient program repair with LLMs", 387 "authors": ["David Hidvegi", "Khashayar Etemadi", "Sofia Bobadilla", "Martin Monperrus"], 388 "year": 2024, 389 "arxiv_id": "2402.06598", 390 "relevance": "Cost-efficient LLM-based program repair, relevant to evaluating practical cost considerations in AI-based repair tools."
391 }, 392 { 393 "title": "Automatic programming: Large language models and beyond", 394 "authors": ["Michael R Lyu", "Baishakhi Ray", "Abhik Roychoudhury"], 395 "year": 2024, 396 "arxiv_id": "2405.02213", 397 "relevance": "Survey of LLM capabilities for automatic programming, relevant to understanding the broader landscape of LLM-based code generation." 398 }, 399 { 400 "title": "Aligning LLMs for FL-free program repair", 401 "authors": ["Junjielong Xu", "Ying Fu", "Shin Hwei Tan", "Pinjia He"], 402 "year": 2024, 403 "arxiv_id": "2404.08877", 404 "relevance": "Alternative approach to LLM-based program repair that aligns model output with repair objectives without fault localization." 405 }, 406 { 407 "title": "Copiloting the copilots: Fusing large language models with completion engines for automated program repair", 408 "authors": ["Yuxiang Wei", "Chunqiu Steven Xia", "Lingming Zhang"], 409 "year": 2023, 410 "relevance": "Hybrid approach combining LLMs with completion engines for program repair, relevant to understanding integration of traditional and AI-based repair methods." 411 }, 412 { 413 "title": "Fully autonomous programming with large language models", 414 "authors": ["Vadim Liventsev", "Anastasiia Grishina"], 415 "year": 2023, 416 "relevance": "SEIDR system integrating transformers with search-based methods for autonomous programming, relevant to agentic AI for code generation." 417 } 418 ] 419 }