scan-v5.json (24214B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Automated Repair of Programs from Large Language Models", 6 "authors": [ 7 "Zhiyu Fan", 8 "Xiang Gao", 9 "Martin Mirchev", 10 "Abhik Roychoudhury", 11 "Shin Hwei Tan" 12 ], 13 "year": 2022, 14 "venue": "arXiv", 15 "arxiv_id": "2205.10583", 16 "doi": "10.48550/arXiv.2205.10583" 17 }, 18 "checklist": { 19 "claims_and_evidence": { 20 "abstract_claims_supported": { 21 "applies": true, 22 "answer": true, 23 "justification": "All abstract claims are substantiated: defect overlap with human code is shown by Table II manual analysis of 335 solutions, Codex-e parity/superiority over TBar/Recoder is shown by Tables III and V.", 24 "source": "haiku" 25 }, 26 "causal_claims_justified": { 27 "applies": true, 28 "answer": false, 29 "justification": "The paper makes comparative claims ('Codex-estm produces the best results') but relies on raw counts with no statistical tests; differences between tools (e.g., 16 vs 9 vs 11 correct patches) are never tested for significance.", 30 "source": "haiku" 31 }, 32 "generalization_bounded": { 33 "applies": true, 34 "answer": true, 35 "justification": "Section VII explicitly acknowledges results may not generalize beyond Java, the studied configurations, or beyond Codex; hedged language ('may have potential') is used throughout the abstract.", 36 "source": "haiku" 37 }, 38 "alternative_explanations_discussed": { 39 "applies": true, 40 "answer": true, 41 "justification": "The paper discusses why Codex-e outperforms (larger training data, flexible fault localization) and why TBar vs Recoder differ (search space vs learned patterns), providing multiple mechanistic explanations for observed outcomes.", 42 "source": "haiku" 43 }, 44 "proxy_outcome_distinction": { 45 "applies": true, 46 "answer": true, 47 "justification": "The paper explicitly distinguishes 'plausible patches' (pass public tests) from 'correct patches' (pass LeetCode private held-out tests), making the 
measurement hierarchy clear throughout.", 48 "source": "haiku" 49 } 50 }, 51 "limitations_and_scope": { 52 "limitations_section_present": { 53 "applies": true, 54 "answer": true, 55 "justification": "Section VII 'Threats to Validity' exists and covers both external and internal threats with specific discussion.", 56 "source": "haiku" 57 }, 58 "threats_to_validity_specific": { 59 "applies": true, 60 "answer": true, 61 "justification": "Specific threats named: Java-only, single LLM evaluated, Codex-e algorithm undocumented (black-box), annotator disagreements in defect labeling (14 initial disagreements), and automated script bugs.", 62 "source": "haiku" 63 }, 64 "scope_boundaries_stated": { 65 "applies": true, 66 "answer": true, 67 "justification": "Scope explicitly bounded to easy/medium LeetCode Java tasks, Codex model, contests after Jun 2021; seven tasks requiring customized data structures explicitly excluded.", 68 "source": "haiku" 69 } 70 }, 71 "conflicts_of_interest": { 72 "funding_disclosed": { 73 "applies": true, 74 "answer": false, 75 "justification": "No funding acknowledgment appears anywhere in the paper.", 76 "source": "haiku" 77 }, 78 "affiliations_disclosed": { 79 "applies": true, 80 "answer": true, 81 "justification": "All five author affiliations are listed on the title page (NUS, Beihang University, SUSTech).", 82 "source": "haiku" 83 }, 84 "funder_independent_of_outcome": { 85 "applies": false, 86 "answer": false, 87 "justification": "No funding disclosed, so independence cannot be assessed.", 88 "source": "haiku" 89 }, 90 "financial_interests_declared": { 91 "applies": true, 92 "answer": false, 93 "justification": "No competing interests or financial interests statement appears in the paper.", 94 "source": "haiku" 95 } 96 }, 97 "scope_and_framing": { 98 "key_terms_defined": { 99 "applies": true, 100 "answer": true, 101 "justification": "APR is defined (Section I), 'plausible' vs 'correct' patches are explicitly defined (Section IV), Codex-e 
modes (Codex-ebug, Codex-eline, Codex-estm) are precisely defined in Section V.", 102 "source": "haiku" 103 }, 104 "intended_contribution_clear": { 105 "applies": true, 106 "answer": true, 107 "justification": "The contributions section lists three explicit contributions: systematic study of APR on LLM code, first evaluation of Codex edit mode as APR tool, and the LMDefects dataset.", 108 "source": "haiku" 109 }, 110 "engagement_with_prior_work": { 111 "applies": true, 112 "answer": true, 113 "justification": "Section VIII explicitly positions this work relative to Nguyen et al. (Copilot evaluation on 33 tasks vs this paper's 113), and situates Codex-e evaluation as a first-of-its-kind study in the APR and code LLM literature.", 114 "source": "haiku" 115 } 116 } 117 }, 118 "type_checklist": { 119 "empirical": { 120 "artifacts": { 121 "code_released": { 122 "applies": true, 123 "answer": false, 124 "justification": "Section VII states 'we will make our scripts available upon acceptance' — a promise of future release, not actual release at time of publication.", 125 "source": "haiku" 126 }, 127 "data_released": { 128 "applies": true, 129 "answer": true, 130 "justification": "Section VII states 'We also release our dataset and classification result for public verification' — LMDefects is claimed to be released with the paper.", 131 "source": "haiku" 132 }, 133 "environment_specified": { 134 "applies": true, 135 "answer": false, 136 "justification": "Only the OS and hardware are specified (Ubuntu 16.04, 64GB RAM, Intel Xeon, NVIDIA Titan V GPU); no software dependency list, requirements file, or Dockerfile is provided.", 137 "source": "haiku" 138 }, 139 "reproduction_instructions": { 140 "applies": true, 141 "answer": false, 142 "justification": "The workflow is described at a high level (Figure 1) but step-by-step reproduction instructions are absent; scripts are only promised upon acceptance.", 143 "source": "haiku" 144 } 145 }, 146 "statistical_methodology": { 147 
"confidence_intervals_or_error_bars": { 148 "applies": true, 149 "answer": false, 150 "justification": "All results are reported as absolute patch counts with no confidence intervals or error bars anywhere in the paper.", 151 "source": "haiku" 152 }, 153 "significance_tests": { 154 "applies": true, 155 "answer": false, 156 "justification": "No statistical significance tests are applied to comparative claims (e.g., Recoder fixes 8 tasks vs TBar's 6) despite multiple tool comparisons.", 157 "source": "haiku" 158 }, 159 "effect_sizes_reported": { 160 "applies": true, 161 "answer": false, 162 "justification": "Results are given as raw counts (correct patches, correctly fixed tasks); no effect sizes, Cohen's d, or normalized improvement metrics are reported.", 163 "source": "haiku" 164 }, 165 "sample_size_justified": { 166 "applies": true, 167 "answer": false, 168 "justification": "The 113-task LMDefects dataset size is not justified; no power analysis or sample size rationale is provided.", 169 "source": "haiku" 170 }, 171 "variance_reported": { 172 "applies": true, 173 "answer": false, 174 "justification": "Codex generates 50 candidates per task but variance in patch generation outcomes across runs is never reported; only point estimates of fix counts appear.", 175 "source": "haiku" 176 } 177 }, 178 "evaluation_design": { 179 "baselines_included": { 180 "applies": true, 181 "answer": true, 182 "justification": "TBar (pattern-based) and Recoder (learning-based) are both used as baselines against which Codex-e is evaluated.", 183 "source": "haiku" 184 }, 185 "baselines_contemporary": { 186 "applies": true, 187 "answer": true, 188 "justification": "TBar (ISSTA 2019) and Recoder (FSE 2021) are the best-performing open-source Java APR tools on Defects4J at the time of the study.", 189 "source": "haiku" 190 }, 191 "ablation_study": { 192 "applies": true, 193 "answer": true, 194 "justification": "Three Codex-e instruction variants (Codex-ebug, Codex-eline, Codex-estm) 
constitute an ablation of guidance level and specificity, with results in Table V.", 195 "source": "haiku" 196 }, 197 "multiple_metrics": { 198 "applies": true, 199 "answer": true, 200 "justification": "Both 'plausible patches' (pass public tests) and 'correct patches' (pass private tests) are used, plus per-defect-category breakdown.", 201 "source": "haiku" 202 }, 203 "human_evaluation": { 204 "applies": true, 205 "answer": true, 206 "justification": "Two authors independently constructed and cross-validated ground truth patches for all 335 incorrect solutions; 14 initial disagreements resolved by discussion.", 207 "source": "haiku" 208 }, 209 "held_out_test_set": { 210 "applies": true, 211 "answer": true, 212 "justification": "LeetCode's private test suite serves as a held-out test set, with patched solutions submitted to the LeetCode judge for final validation.", 213 "source": "haiku" 214 }, 215 "per_category_breakdown": { 216 "applies": true, 217 "answer": true, 218 "justification": "Tables IV and V break down correctly fixed solutions by all defect sub-categories (S-O, S-C, S-V, M-S, M-U, M-L, etc.) 
for each tool.", 219 "source": "haiku" 220 }, 221 "failure_cases_discussed": { 222 "applies": true, 223 "answer": true, 224 "justification": "Figure 5 shows a specific multi-hunk bug that all tools fail to fix, with explanation of why statistical fault localization breaks down on program-dependent bugs.", 225 "source": "haiku" 226 }, 227 "negative_results_reported": { 228 "applies": true, 229 "answer": true, 230 "justification": "The study's main finding is that existing APR tools fix very few bugs: TBar fixes 6/67 tasks, Recoder 8/67, with multi-hunk failures totaling 0/62 solutions for both tools.", 231 "source": "haiku" 232 } 233 }, 234 "setup_transparency": { 235 "model_versions_specified": { 236 "applies": true, 237 "answer": true, 238 "justification": "Exact model IDs specified: code-davinci-002 (Codex) and code-davinci-edit-001 (Codex-e), both stated to be trained on data up to Jun 2021.", 239 "source": "haiku" 240 }, 241 "prompts_provided": { 242 "applies": true, 243 "answer": true, 244 "justification": "Figure 2 shows a complete example prompt (function signature + Javadoc problem description); Codex-e instruction templates are specified verbatim ('Fix bug in the program', 'Fix line N', 'Fix s1').", 245 "source": "haiku" 246 }, 247 "hyperparameters_reported": { 248 "applies": true, 249 "answer": true, 250 "justification": "Temperature 0.8, max tokens 2048, stop sequences ('public', 'class', '//', 'System.out.print'), 50 candidates generated per task with top-5 selection — all reported.", 251 "source": "haiku" 252 }, 253 "scaffolding_described": { 254 "applies": true, 255 "answer": true, 256 "justification": "Figure 1 shows the complete workflow; fault localization integration with Codex-e is described in detail, including the 10 most suspicious statements × 5 edits = 50 attempts per solution.", 257 "source": "haiku" 258 }, 259 "data_preprocessing_documented": { 260 "applies": true, 261 "answer": true, 262 "justification": "Data collection procedure 
documented: LeetCode contests from 4 July 2021 to 6 April 2022, easy/medium only, exclusion of hard problems and 7 tasks requiring custom data structures, public tests manually converted to JUnit.", 263 "source": "haiku" 264 } 265 }, 266 "data_integrity": { 267 "raw_data_available": { 268 "applies": true, 269 "answer": true, 270 "justification": "LMDefects dataset and defect classification results are stated to be released for public verification (Section VII).", 271 "source": "haiku" 272 }, 273 "data_collection_described": { 274 "applies": true, 275 "answer": true, 276 "justification": "Collection procedure fully described: crawled 40 weekly + 20 biweekly LeetCode contests, 4 Jul 2021–6 Apr 2022, resulting in 60 easy + 53 medium tasks.", 277 "source": "haiku" 278 }, 279 "recruitment_methods_described": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants; data is collected from a public competitive programming platform.", 283 "source": "haiku" 284 }, 285 "data_pipeline_documented": { 286 "applies": true, 287 "answer": true, 288 "justification": "Full pipeline documented: Codex generation → public test validation → APR tool application → LeetCode private test submission, with all parameters specified.", 289 "source": "haiku" 290 } 291 }, 292 "contamination": { 293 "training_cutoff_stated": { 294 "applies": true, 295 "answer": true, 296 "justification": "Both Codex and Codex-e training data cutoff explicitly stated as June 2021.", 297 "source": "haiku" 298 }, 299 "train_test_overlap_discussed": { 300 "applies": true, 301 "answer": true, 302 "justification": "The study specifically designs around contamination by only using LeetCode contests released after June 2021 to ensure no overlap with Codex's training data.", 303 "source": "haiku" 304 }, 305 "benchmark_contamination_addressed": { 306 "applies": true, 307 "answer": true, 308 "justification": "Confirmed with Codex-e developers that Codex and Codex-e share the same training 
dataset; LMDefects tasks all postdate the Jun 2021 cutoff, explicitly preventing contamination.", 309 "source": "haiku" 310 } 311 }, 312 "human_studies": { 313 "pre_registered": { 314 "applies": false, 315 "answer": false, 316 "justification": "No human participants in the study.", 317 "source": "haiku" 318 }, 319 "irb_or_ethics_approval": { 320 "applies": false, 321 "answer": false, 322 "justification": "No human participants in the study.", 323 "source": "haiku" 324 }, 325 "demographics_reported": { 326 "applies": false, 327 "answer": false, 328 "justification": "No human participants in the study.", 329 "source": "haiku" 330 }, 331 "inclusion_exclusion_criteria": { 332 "applies": false, 333 "answer": false, 334 "justification": "No human participants in the study.", 335 "source": "haiku" 336 }, 337 "randomization_described": { 338 "applies": false, 339 "answer": false, 340 "justification": "No human participants in the study.", 341 "source": "haiku" 342 }, 343 "blinding_described": { 344 "applies": false, 345 "answer": false, 346 "justification": "No human participants in the study.", 347 "source": "haiku" 348 }, 349 "attrition_reported": { 350 "applies": false, 351 "answer": false, 352 "justification": "No human participants in the study.", 353 "source": "haiku" 354 } 355 }, 356 "cost_and_practicality": { 357 "inference_cost_reported": { 358 "applies": true, 359 "answer": false, 360 "justification": "No Codex API costs or inference latency are reported despite using a paid API to generate 50 candidates × 113 tasks = 5,650 Codex queries plus Codex-e queries.", 361 "source": "haiku" 362 }, 363 "compute_budget_stated": { 364 "applies": true, 365 "answer": false, 366 "justification": "Only the per-repair timeout (15 minutes) is stated; total computational budget for the full experiment is not reported.", 367 "source": "haiku" 368 } 369 } 370 } 371 }, 372 "claims": [ 373 { 374 "claim": "Auto-generated code from Codex shares common defect categories with 
human-written code, with similar mutation operators and multi-hunk fix patterns overlapping with Codeflaws.", 375 "evidence": "Manual analysis of 335 incorrect solutions classified by two annotators using the Codeflaws defect taxonomy; defect categories (S-O, S-V, M-U, etc.) directly overlap.", 376 "supported": "strong" 377 }, 378 { 379 "claim": "57% of Codex bugs are algorithm-related (misaligned algorithm) and 11% are syntax errors, making them largely inaccessible to existing pattern-based APR.", 380 "evidence": "Table II shows 191/335 solutions classified as 'Misaligned Algorithm' and 37/335 as syntax errors.", 381 "supported": "strong" 382 }, 383 { 384 "claim": "Existing APR tools (TBar, Recoder) are very limited at fixing Codex-generated bugs: TBar fixes 6/67 tasks, Recoder fixes 8/67 tasks.", 385 "evidence": "Table III reports correct patches per tool; neither tool fixes any multi-hunk bugs (Table IV shows 0 correct patches for M-S/M-U/M-L categories).", 386 "supported": "strong" 387 }, 388 { 389 "claim": "Codex edit mode with statement-level fault localization (Codex-estm) outperforms both TBar and Recoder, producing 16 correct patches vs 9 (TBar) and 11 (Recoder).", 390 "evidence": "Table V shows Codex-estm fixes 14 single-hunk and 2 multi-hunk solutions; Figure 7 Venn diagram shows TBar's patches are a subset of Codex-estm ∪ Recoder.", 391 "supported": "moderate" 392 }, 393 { 394 "claim": "Codex-e without any location guidance (Codex-ebug) performs nearly as well as with statement-level guidance (Codex-estm): 15 vs 16 correct patches.", 395 "evidence": "Table V comparison of Codex-ebug (8+3 easy/medium single-hunk, 2+2 multi-hunk) vs Codex-estm (10+4 single-hunk, 2+0 multi-hunk).", 396 "supported": "moderate" 397 }, 398 { 399 "claim": "Combining TBar and Codex-e patch ingredients covers required patches for 9 solutions versus 4-5 for individual tools; adding multiple Codex candidates (TBar+Codex) extends coverage to 12.", 400 "evidence": "Table VI shows 
patch ingredient coverage across S-HO, M-S, and M-U defect categories for TBar, Codex-e, TBar+Codex-e, and TBar+Codex combinations.", 401 "supported": "moderate" 402 } 403 ], 404 "methodology_tags": [ 405 "benchmark-eval", 406 "case-study", 407 "observational" 408 ], 409 "key_findings": "Codex-generated Java programs fail on LeetCode contests primarily due to algorithm misalignment (57%) and syntax errors (11%), with defect patterns closely overlapping human programmer mistakes. Existing pattern-based (TBar) and learning-based (Recoder) APR tools fix only 6-8 of 67 unsolved tasks because they cannot handle multi-hunk bugs or diverse patch ingredients. Codex edit mode with statement-level guidance (Codex-estm) modestly outperforms both APR tools (16 vs 9-11 correct patches) and uniquely produces flexible multi-location fixes; surprisingly, giving no location guidance (Codex-ebug) achieves nearly the same count (15 fixes) while fixing more multi-hunk bugs. Combining TBar's pattern space with Codex-e or multiple Codex candidates provides patch ingredients for more complex bugs than either approach alone.", 410 "red_flags": [ 411 { 412 "flag": "No statistical testing", 413 "detail": "All tool comparisons are raw patch counts (e.g., 16 vs 11 vs 9 correct patches) with no significance tests, confidence intervals, or effect sizes; differences could easily be within noise on a 67-task dataset." 414 }, 415 { 416 "flag": "Very small dataset", 417 "detail": "113 total tasks (67 unsolved) is extremely small for drawing conclusions about comparative tool effectiveness; many cells in Tables IV-V contain single-digit counts." 418 }, 419 { 420 "flag": "Code not released at publication", 421 "detail": "Scripts promised 'available upon acceptance' rather than released with the paper, making independent verification impossible at time of publication." 
422 }, 423 { 424 "flag": "Single-LLM, single-language scope", 425 "detail": "Study is limited to Codex on Java LeetCode problems; all conclusions about 'auto-generated code' behavior are bounded to this narrow configuration despite broad framing." 426 }, 427 { 428 "flag": "Codex-e is a black box", 429 "detail": "The underlying algorithm of Codex edit mode is undocumented; the paper cannot explain mechanism of improvement beyond speculation about training data size and flexible fault localization." 430 } 431 ], 432 "cited_papers": [ 433 { 434 "title": "Evaluating Large Language Models Trained on Code (Codex/HumanEval)", 435 "relevance": "Primary subject of study; provides pass@k metrics and APPS baseline results used for comparison" 436 }, 437 { 438 "title": "Competition-Level Code Generation with AlphaCode", 439 "relevance": "Contemporary LLM code generation system; provides comparative pass rates on competition tasks" 440 }, 441 { 442 "title": "Measuring Coding Challenge Competence with APPS", 443 "relevance": "Benchmark dataset for code generation evaluation; provides context for LeetCode-based LMDefects" 444 }, 445 { 446 "title": "TBar: Revisiting Template-based Automated Program Repair", 447 "relevance": "Primary APR baseline tool evaluated in the study; pattern-based Java repair" 448 }, 449 { 450 "title": "A Syntax-Guided Edit Decoder for Neural Program Repair (Recoder)", 451 "relevance": "Primary learning-based APR baseline tool; syntax-guided decoder approach" 452 }, 453 { 454 "title": "Codeflaws: A Programming Competition Benchmark for Evaluating Automated Program Repair Tools", 455 "relevance": "Defect taxonomy used to classify Codex bugs; shows overlap between human and LLM programming errors" 456 }, 457 { 458 "title": "Defects4J: A Database of Existing Faults to Enable Controlled Testing Studies", 459 "relevance": "Standard Java APR benchmark used to select TBar and Recoder as representative tools" 460 }, 461 { 462 "title": "An Empirical Evaluation of 
GitHub Copilot's Code Suggestions", 463 "relevance": "Most directly related prior work; evaluates Copilot on 33 LeetCode tasks vs this paper's 113" 464 } 465 ], 466 "engagement_factors": { 467 "practical_relevance": { 468 "score": 2, 469 "justification": "Directly relevant to developers using Codex/GitHub Copilot for code generation, offering concrete strategies (fault-localization-guided Codex-e) to improve output quality." 470 }, 471 "surprise_contrarian": { 472 "score": 2, 473 "justification": "Counterintuitive finding that Codex-e with no location guidance (Codex-ebug) nearly matches statement-level guidance, and that Codex-e outperforms dedicated APR tools built specifically for this task." 474 }, 475 "fear_safety": { 476 "score": 0, 477 "justification": "No AI safety or risk concerns raised; purely a software engineering effectiveness study." 478 }, 479 "drama_conflict": { 480 "score": 0, 481 "justification": "Standard academic tool comparison with no controversy or conflict angle." 482 }, 483 "demo_ability": { 484 "score": 1, 485 "justification": "LMDefects dataset is released enabling replication, but scripts are not yet available; Codex API access required and is no longer public." 486 }, 487 "brand_recognition": { 488 "score": 2, 489 "justification": "Codex/GitHub Copilot is a high-recognition product; Abhik Roychoudhury is a well-known APR researcher." 490 } 491 }, 492 "hn_data": { 493 "threads": [], 494 "top_points": 0, 495 "total_points": 0, 496 "total_comments": 0 497 } 498 }