scan.json (25149B)
1 { 2 "paper": { 3 "title": "Keep the Conversation Going: Fixing 162 out of 337 bugs for $0.42 each using ChatGPT", 4 "authors": ["Chunqiu Steven Xia", "Lingming Zhang"], 5 "year": 2023, 6 "venue": "arXiv / ISSTA (submitted)", 7 "arxiv_id": "2304.00385", 8 "doi": null 9 }, 10 "checklist": { 11 "artifacts": { 12 "code_released": { 13 "applies": true, 14 "answer": false, 15 "justification": "No repository URL or code archive is provided in the paper. The paper describes the implementation in Python accessing the ChatGPT API but does not release the source code." 16 }, 17 "data_released": { 18 "applies": true, 19 "answer": true, 20 "justification": "The paper uses publicly available benchmarks: Defects4j (v1.2 and v2.0) and QuixBugs, both well-known public datasets with established download procedures." 21 }, 22 "environment_specified": { 23 "applies": true, 24 "answer": true, 25 "justification": "Section 4.1 specifies the evaluation environment: '8-core workstation with Intel i7 10700KF Comet Lake CPU @3.80GHz and 64GB RAM, running Ubuntu 20.04.3 LTS and OpenJDK Java 64-Bit Server version 1.8.0_312.' The model used is also specified as gpt-3.5-turbo-0301." 26 }, 27 "reproduction_instructions": { 28 "applies": true, 29 "answer": false, 30 "justification": "No step-by-step reproduction instructions, README, or scripts are provided. While the approach is described algorithmically (Algorithm 1), there are no concrete commands or reproduction guide." 31 } 32 }, 33 "statistical_methodology": { 34 "confidence_intervals_or_error_bars": { 35 "applies": true, 36 "answer": false, 37 "justification": "All results are reported as point estimates (number of bugs fixed). No confidence intervals or error bars are provided despite the stochastic nature of LLM sampling." 38 }, 39 "significance_tests": { 40 "applies": true, 41 "answer": false, 42 "justification": "The paper claims ChatRepair outperforms baselines (e.g., '15 and 17 more than prior best baseline') but no statistical significance tests are performed. Comparisons are based solely on raw counts of bugs fixed." 43 }, 44 "effect_sizes_reported": { 45 "applies": true, 46 "answer": true, 47 "justification": "The paper reports improvements in absolute terms with context: '114 and 48 correct bug fixes (15 and 17 more than prior best baseline) on Defects4j 1.2 and 2.0 respectively.' The ablation study also reports specific numeric improvements (e.g., plausible patch generation adds 4, 7, 2 correct fixes in the three settings)." 48 }, 49 "sample_size_justified": { 50 "applies": true, 51 "answer": false, 52 "justification": "No justification is given for the sample sizes. The benchmarks used (391 bugs in D4J 1.2, 82 single-line in D4J 2.0, 40 in QuixBugs) are inherited from prior work without discussion of whether they are sufficient for the claims made." 53 }, 54 "variance_reported": { 55 "applies": true, 56 "answer": false, 57 "justification": "Results are reported from what appears to be a single run. No standard deviation, variance, or multi-run results are reported despite the stochastic nature of LLM sampling with temperature=1." 58 } 59 }, 60 "evaluation_design": { 61 "baselines_included": { 62 "applies": true, 63 "answer": true, 64 "justification": "The paper compares against 21 prior APR tools including 8 learning-based/LLM-based baselines (SelfAPR, AlphaRepair, RewardRepair, Recoder, CURE, CoCoNuT, DLFix, SequenceR) and 12 traditional APR tools plus a BaseChatGPT ablation." 65 }, 66 "baselines_contemporary": { 67 "applies": true, 68 "answer": true, 69 "justification": "Baselines include contemporary tools: AlphaRepair (2022), SelfAPR (2022), RewardRepair (2022), CodexRepair (2023), all published within 1-2 years of this work. These represent the state of the art at the time." 70 }, 71 "ablation_study": { 72 "applies": true, 73 "answer": true, 74 "justification": "Section 5.3 (RQ3) provides a comprehensive ablation study examining: (1) initial prompt variations (Table 4), (2) feedback response variations (Table 5), and (3) maximum conversation length (Figure 7). Each component's contribution is measured." 75 }, 76 "multiple_metrics": { 77 "applies": true, 78 "answer": true, 79 "justification": "The paper uses multiple metrics: plausible patches, correct patches, number of tries (queries to ChatGPT), and dollar cost. Section 4.3 describes all four metrics." 80 }, 81 "human_evaluation": { 82 "applies": true, 83 "answer": true, 84 "justification": "Section 4.3 states: 'We follow common practice in APR and manually determine the semantic equivalency to compute correct patches.' This manual inspection of patches by the authors constitutes human evaluation of the system's outputs." 85 }, 86 "held_out_test_set": { 87 "applies": true, 88 "answer": true, 89 "justification": "Defects4j 2.0 (438 new bugs from 9 additional projects) serves as a separate evaluation set from D4J 1.2. QuixBugs provides a third independent benchmark. The ablation study uses the 80 single-line D4J 1.2 bugs separately." 90 }, 91 "per_category_breakdown": { 92 "applies": true, 93 "answer": true, 94 "justification": "Table 1 provides per-project breakdowns (Chart, Closure, Lang, Math, Mockito, Time) for both D4J 1.2 and 2.0. Table 3 breaks down by repair scenario (single-line, single-hunk, single-function). The Venn diagram (Figure 3) shows overlap with other tools." 95 }, 96 "failure_cases_discussed": { 97 "applies": true, 98 "answer": false, 99 "justification": "The paper primarily shows success cases (Figures 4, 5, 6). While some discussion of when ChatRepair fails is implicit (e.g., only 114/391 bugs fixed on D4J 1.2), no explicit failure analysis or qualitative examples of failures are provided." 100 }, 101 "negative_results_reported": { 102 "applies": true, 103 "answer": true, 104 "justification": "The paper reports that BaseChatGPT 'performs even slightly worse than CodexRepair on the real-world benchmark of Defects4j 1.2' (Section 5.2), and the ablation shows that some configurations perform worse (e.g., BasePrompt achieves fewer fixes than the full approach)." 105 } 106 }, 107 "claims_and_evidence": { 108 "abstract_claims_supported": { 109 "applies": true, 110 "answer": true, 111 "justification": "The abstract claims '114 and 48 correct fixes on Defects4j 1.2 and 2.0 respectively' and '$0.42 each' — these are directly supported by Table 1 and the cost analysis in Section 5.1. The claim of 'new state-of-the-art' is supported by the comparison tables." 112 }, 113 "causal_claims_justified": { 114 "applies": true, 115 "answer": true, 116 "justification": "The paper makes causal claims about components improving performance (e.g., test failure information helps, conversation helps). The ablation study in Section 5.3 systematically varies individual components while controlling others (Tables 4, 5, Figure 7), providing adequate single-variable manipulation evidence." 117 }, 118 "generalization_bounded": { 119 "applies": true, 120 "answer": false, 121 "justification": "The title and abstract present broad claims ('conversation-driven APR approach') but the evaluation is limited to Java bugs on Defects4j and QuixBugs using a single LLM (gpt-3.5-turbo-0301). While the abstract says 'our approach is general,' the generalization to other languages, models, or bug types is not bounded." 122 }, 123 "alternative_explanations_discussed": { 124 "applies": true, 125 "answer": true, 126 "justification": "Section 6 (Threats to Validity) discusses the alternative explanation that performance may come from data leakage (training data memorization). The paper addresses this by showing 36% of patches match developer fixes, and ChatRepair still outperforms BaseChatGPT using the same model (ruling out pure memorization)." 127 } 128 }, 129 "setup_transparency": { 130 "model_versions_specified": { 131 "applies": true, 132 "answer": true, 133 "justification": "Section 4.1 states: 'We use the gpt-3.5-turbo-0301 model of the ChatGPT family which is the current latest model available to us.' This is a specific model version with a date identifier." 134 }, 135 "prompts_provided": { 136 "applies": true, 137 "answer": true, 138 "justification": "Figure 2 provides the actual prompt templates with system message ('You are an Automated Program Repair Tool'), the initial prompt structure with infill notation, feedback messages, and the plausible patch generation instruction ('Please generate an alternative fix line'). The prompts are shown with actual fill values in the examples." 139 }, 140 "hyperparameters_reported": { 141 "applies": true, 142 "answer": true, 143 "justification": "Section 4.1 reports: sampling temperature of 1, maximum repair attempts of 200 (single-line/hunk) and 100 (single-function), 1 few-shot example, maximum conversation length of 3, and 5-hour end-to-end timeout." 144 }, 145 "scaffolding_described": { 146 "applies": true, 147 "answer": true, 148 "justification": "The conversational scaffolding is described in detail in Section 3 with Algorithm 1: the feedback loop (compile & test → construct feedback → re-query), plausible patch generation (Section 3.3), conversation length management, and retry logic are all documented." 149 }, 150 "data_preprocessing_documented": { 151 "applies": true, 152 "answer": true, 153 "justification": "Section 4.2 documents how bugs are categorized from Defects4j (e.g., 'we categorize Defects4j 1.2 into single-function (255 bugs), single-hunk (154 bugs) and single-line (80 bugs)' with subset relationships explained). Section 3.1 describes how prompts are constructed from buggy code." 154 } 155 }, 156 "limitations_and_scope": { 157 "limitations_section_present": { 158 "applies": true, 159 "answer": true, 160 "justification": "Section 6 'Threats to Validity' discusses both internal threats (manual validation correctness, data leakage) and external threats (generalizability to other datasets)." 161 }, 162 "threats_to_validity_specific": { 163 "applies": true, 164 "answer": true, 165 "justification": "Section 6 provides specific threats: (1) manual validation bias with the mitigation of careful examination, (2) data leakage quantified at 36% of patches matching developer fixes, and (3) generalizability limited to Defects4j and QuixBugs with mitigation of using multiple datasets." 166 }, 167 "scope_boundaries_stated": { 168 "applies": true, 169 "answer": false, 170 "justification": "While the threats section mentions dataset limitations, the paper does not explicitly state what the results do NOT show. There is no statement bounding the scope to Java, to the specific ChatGPT version used, or to the specific types of bugs tested. The title and claims suggest broad applicability without explicit boundaries." 171 } 172 }, 173 "data_integrity": { 174 "raw_data_available": { 175 "applies": true, 176 "answer": false, 177 "justification": "The raw patches generated, conversation logs, and per-bug cost breakdowns are not made available. Only aggregate results are reported in the tables." 178 }, 179 "data_collection_described": { 180 "applies": true, 181 "answer": true, 182 "justification": "The data collection process is well described: Defects4j and QuixBugs are established benchmarks with known provenance. Section 4.2 describes the bug selection process and categorization. The evaluation procedure (compile and test each generated patch) is described in detail." 183 }, 184 "recruitment_methods_described": { 185 "applies": false, 186 "answer": false, 187 "justification": "No human participants are involved. The study uses standard public benchmarks (Defects4j, QuixBugs)." 188 }, 189 "data_pipeline_documented": { 190 "applies": true, 191 "answer": true, 192 "justification": "The data pipeline is documented: buggy code → prompt construction (Section 3.1) → ChatGPT query → patch extraction → compile and test validation → feedback loop (Algorithm 1). The categorization of bugs into repair scenarios is explained in Section 4.2." 193 } 194 }, 195 "conflicts_of_interest": { 196 "funding_disclosed": { 197 "applies": true, 198 "answer": false, 199 "justification": "No funding source or acknowledgments section is present in the paper. The authors are from UIUC, an academic institution, but no grants or sponsorships are mentioned." 200 }, 201 "affiliations_disclosed": { 202 "applies": true, 203 "answer": true, 204 "justification": "Author affiliations are clearly stated: both authors are from the University of Illinois Urbana-Champaign. The paper evaluates OpenAI's ChatGPT but the authors have no disclosed affiliation with OpenAI." 205 }, 206 "funder_independent_of_outcome": { 207 "applies": true, 208 "answer": false, 209 "justification": "No funding is disclosed at all. Without knowing who funded the research, independence cannot be confirmed." 210 }, 211 "financial_interests_declared": { 212 "applies": true, 213 "answer": false, 214 "justification": "No competing interests statement or financial disclosure is present in the paper." 215 } 216 }, 217 "contamination": { 218 "training_cutoff_stated": { 219 "applies": true, 220 "answer": false, 221 "justification": "The paper uses gpt-3.5-turbo-0301 to evaluate on Defects4j and QuixBugs benchmarks, but does not state the model's training data cutoff date. Section 6 acknowledges 'ChatGPT is a proprietary model and can only be accessed through API, we do not have access to the exact training data used.'" 222 }, 223 "train_test_overlap_discussed": { 224 "applies": true, 225 "answer": true, 226 "justification": "Section 6 explicitly discusses this: 'Another threat to validity comes from the data leakage of reference developer patches being part of the original training data of ChatGPT.' The paper quantifies that 36% of correct patches match developer fixes and shows ChatRepair still outperforms BaseChatGPT, arguing the improvement is not from memorization." 227 }, 228 "benchmark_contamination_addressed": { 229 "applies": true, 230 "answer": true, 231 "justification": "Section 6 directly addresses this by analyzing what percentage of correct patches are identical to developer fixes (36%), showing ChatRepair fixes 12 unique bugs no prior tool can fix even after removing memorized patches, and comparing against BaseChatGPT to isolate the contribution of the method vs. memorization." 232 } 233 }, 234 "human_studies": { 235 "pre_registered": { 236 "applies": false, 237 "answer": false, 238 "justification": "No human participants are involved in this study. It is a benchmark evaluation of an automated tool." 239 }, 240 "irb_or_ethics_approval": { 241 "applies": false, 242 "answer": false, 243 "justification": "No human participants are involved in this study." 244 }, 245 "demographics_reported": { 246 "applies": false, 247 "answer": false, 248 "justification": "No human participants are involved in this study." 249 }, 250 "inclusion_exclusion_criteria": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants are involved in this study." 254 }, 255 "randomization_described": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants are involved in this study." 259 }, 260 "blinding_described": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants are involved in this study." 264 }, 265 "attrition_reported": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants are involved in this study." 269 } 270 }, 271 "cost_and_practicality": { 272 "inference_cost_reported": { 273 "applies": true, 274 "answer": true, 275 "justification": "Cost is a central metric. The paper reports $0.42 per bug on average, and the ablation study (Tables 4, 5) reports average dollar cost per configuration. Section 4.3 explains the cost model: '$0.002 per every 1000 tokens processed or generated.'" 276 }, 277 "compute_budget_stated": { 278 "applies": true, 279 "answer": true, 280 "justification": "Section 4.1 states the hardware (Intel i7, 64GB RAM), the 5-hour timeout per bug, and the maximum number of API queries (200 for single-line/hunk, 100 for single-function). The total cost per bug is reported ($0.42)." 281 } 282 } 283 }, 284 "claims": [ 285 { 286 "claim": "ChatRepair achieves 114 and 48 correct bug fixes on Defects4j 1.2 and 2.0 respectively, surpassing all prior APR tools by 15 and 17 fixes.", 287 "evidence": "Table 1 shows ChatRepair at 114 on D4J 1.2 (vs. CodexRepair 99, next best) and 48 on D4J 2.0 (vs. AlphaRepair 36, next best). Section 5.1 discusses these results.", 288 "supported": "strong" 289 }, 290 { 291 "claim": "ChatRepair can fix 162 out of 337 bugs for $0.42 each using ChatGPT.", 292 "evidence": "Section 5.1 reports the combined count and cost. The cost model is described in Section 4.3 based on the ChatGPT API pricing at the time.", 293 "supported": "moderate" 294 }, 295 { 296 "claim": "ChatRepair fixes 18 unique bugs on Defects4j 1.2 that no prior approach can fix.", 297 "evidence": "Figure 3 (Venn diagram) shows 18 bugs uniquely fixed by ChatRepair. Examples are discussed in Figures 4 and 5.", 298 "supported": "strong" 299 }, 300 { 301 "claim": "Test failure information and conversational feedback each contribute to ChatRepair's improvement over baseline ChatGPT.", 302 "evidence": "Section 5.3 (RQ3) ablation study: Table 4 shows BasePrompt (55 plausible) vs. full prompt with test info (64 plausible). Table 5 shows BaseFeedback (58) vs. Dynamic (64). Figure 7 shows conversation length=1 (no conversation) performs worst.", 303 "supported": "strong" 304 }, 305 { 306 "claim": "Plausible patch generation adds on average 9.4, 16.6, and 5.5 plausible patches in single-line, single-hunk, and single-function scenarios.", 307 "evidence": "Section 5.2 reports these numbers and states the plausible patch generation step improves correct fixes by 4, 7, 2 in the respective scenarios on D4J 1.2.", 308 "supported": "moderate" 309 }, 310 { 311 "claim": "ChatRepair correctly fixes all 40 bugs in QuixBugs-Python and all 40 in QuixBugs-Java.", 312 "evidence": "Table 2 shows 40/40 for both Python and Java QuixBugs datasets.", 313 "supported": "strong" 314 } 315 ], 316 "methodology_tags": ["benchmark-eval"], 317 "key_findings": "ChatRepair introduces a conversation-driven approach to automated program repair that feeds test failure information and prior patch attempts back to ChatGPT (gpt-3.5-turbo-0301) in an iterative loop. On Defects4j 1.2 and 2.0, it achieves 114 and 48 correct fixes respectively, surpassing the prior state-of-the-art by 15 and 17 fixes. The ablation study shows that both test failure information in the initial prompt and conversational feedback with dynamic error messages contribute to improved repair performance. The approach achieves these results at a cost of $0.42 per bug on average.", 318 "red_flags": [ 319 { 320 "flag": "No variance or multi-run reporting", 321 "detail": "LLM sampling with temperature=1 is inherently stochastic, yet the paper appears to report single-run results with no standard deviation, confidence intervals, or multiple-run analysis. Bug counts could vary meaningfully across runs." 322 }, 323 { 324 "flag": "No statistical significance tests", 325 "detail": "Claims of superiority over baselines are based solely on raw bug-count comparisons (e.g., 114 vs. 99) without any significance testing, despite the stochastic nature of the approach." 326 }, 327 { 328 "flag": "Missing failure analysis", 329 "detail": "The paper fixes 114 out of 391 bugs on D4J 1.2 (29%) but provides no analysis of why the remaining 277 bugs were not fixed. Only success cases are qualitatively analyzed (Figures 4, 5, 6)." 330 }, 331 { 332 "flag": "Potential benchmark contamination", 333 "detail": "Defects4j bugs and fixes have been publicly available since 2014. The paper acknowledges that 36% of correct patches match developer fixes exactly, raising contamination concerns. While addressed in threats to validity, this percentage is notable." 334 }, 335 { 336 "flag": "Baseline fairness concerns", 337 "detail": "Most baseline results are taken from prior publications that used different models and potentially different configurations. Only BaseChatGPT and CodexRepair provide a controlled comparison using the same or similar LLMs under comparable conditions." 338 } 339 ], 340 "cited_papers": [ 341 { 342 "title": "Evaluating large language models trained on code", 343 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 344 "year": 2021, 345 "arxiv_id": "2107.03374", 346 "relevance": "Foundational Codex paper; one of the baselines (CodexRepair) in this study uses Codex for code generation." 347 }, 348 { 349 "title": "Automated Program Repair in the Era of Large Pre-trained Language Models", 350 "authors": ["Chunqiu Steven Xia", "Yuxiang Wei", "Lingming Zhang"], 351 "year": 2023, 352 "relevance": "Prior study by same authors extensively evaluating LLM-based APR across multiple models and architectures; provides the CodexRepair baseline." 353 }, 354 { 355 "title": "Less Training, More Repairing Please: Revisiting Automated Program Repair via Zero-Shot Learning", 356 "authors": ["Chunqiu Steven Xia", "Lingming Zhang"], 357 "year": 2022, 358 "relevance": "Introduces AlphaRepair (cloze-style APR using CodeBERT), a key baseline and state-of-the-art LLM-based APR tool evaluated in this paper." 359 }, 360 { 361 "title": "Language models are few-shot learners", 362 "authors": ["Tom Brown", "Benjamin Mann", "Nick Ryder"], 363 "year": 2020, 364 "relevance": "GPT-3 paper establishing the few-shot prompting paradigm used as foundation for LLM-based program repair approaches." 365 }, 366 { 367 "title": "SelfAPR: Self-supervised Program Repair with Test Execution Diagnostics", 368 "authors": ["He Ye", "Matias Martinez", "Xiapu Luo", "Tao Zhang", "Martin Monperrus"], 369 "year": 2022, 370 "relevance": "NMT-based APR baseline that also uses test execution information, enabling comparison with ChatRepair's prompting-based approach." 371 }, 372 { 373 "title": "Neural Program Repair with Execution-based Backpropagation", 374 "authors": ["He Ye", "Matias Martinez", "Martin Monperrus"], 375 "year": 2022, 376 "relevance": "RewardRepair baseline that uses test execution rewards for training NMT repair models." 377 }, 378 { 379 "title": "A Syntax-Guided Edit Decoder for Neural Program Repair", 380 "authors": ["Qihao Zhu", "Zeyu Sun", "Yuan-an Xiao"], 381 "year": 2021, 382 "relevance": "Recoder: syntax-guided neural program repair baseline compared in the evaluation." 383 }, 384 { 385 "title": "CURE: Code-Aware Neural Machine Translation for Automatic Program Repair", 386 "authors": ["Nan Jiang", "Thibaud Lutellier", "Lin Tan"], 387 "year": 2021, 388 "relevance": "NMT-based APR baseline using code-aware translation for automated program repair." 389 }, 390 { 391 "title": "Can OpenAI's Codex Fix Bugs?: An evaluation on QuixBugs", 392 "authors": ["Julian Aron Prenner", "Hlib Babii", "Romain Robbes"], 393 "year": 2022, 394 "relevance": "Early evaluation of LLM (Codex) for program repair on QuixBugs benchmark." 395 }, 396 { 397 "title": "Is the cure worse than the disease? overfitting in automated program repair", 398 "authors": ["Edward K Smith", "Earl T Barr", "Claire Le Goues", "Yuriy Brun"], 399 "year": 2015, 400 "relevance": "Foundational work on overfitting in APR — the plausible-but-incorrect patch problem that ChatRepair's plausible patch generation step aims to mitigate." 401 }, 402 { 403 "title": "InCoder: A Generative Model for Code Infilling and Synthesis", 404 "authors": ["Daniel Fried", "Armen Aghajanyan", "Jessy Lin"], 405 "year": 2022, 406 "arxiv_id": "2204.05999", 407 "relevance": "Code infilling model used as a baseline LLM in prior APR studies compared with ChatRepair." 408 }, 409 { 410 "title": "Training language models to follow instructions with human feedback", 411 "authors": ["Long Ouyang", "Jeffrey Wu", "Xu Jiang"], 412 "year": 2022, 413 "relevance": "InstructGPT/RLHF paper foundational to ChatGPT's instruction-following capability leveraged by ChatRepair." 414 } 415 ] 416 }