scan-v5.json (25834B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Impact of Code Language Models on Automated Program Repair", 6 "authors": [ 7 "Nan Jiang", 8 "Kevin Liu", 9 "Thibaud Lutellier", 10 "Lin Tan" 11 ], 12 "year": 2023, 13 "venue": "International Conference on Software Engineering", 14 "arxiv_id": "2302.05020", 15 "doi": "10.1109/ICSE48619.2023.00125" 16 }, 17 "checklist": { 18 "claims_and_evidence": { 19 "abstract_claims_supported": { 20 "applies": true, 21 "answer": true, 22 "justification": "All abstract claims are traceable to tables: 72% improvement (Table II, InCoder-6B 105 vs KNOD 61), 31%-1267% fine-tuning gains (Table IV), and buggy-line degradation (Table III).", 23 "source": "haiku" 24 }, 25 "causal_claims_justified": { 26 "applies": true, 27 "answer": true, 28 "justification": "Causal claims (fine-tuning improves fixing; buggy lines hurt pre-trained CLMs) are tested via controlled ablations: same models evaluated with/without fine-tuning and with/without buggy lines provided.", 29 "source": "haiku" 30 }, 31 "generalization_bounded": { 32 "applies": true, 33 "answer": true, 34 "justification": "Paper explicitly states focus on Java single-hunk bugs and acknowledges HumanEval-Java contains 'mostly small programs'; implications section calls for larger benchmarks, bounding the scope.", 35 "source": "haiku" 36 }, 37 "alternative_explanations_discussed": { 38 "applies": true, 39 "answer": false, 40 "justification": "The paper attributes CodeT5's poor pre-tuning performance to its pre-training tasks and CLM confusion about buggy lines to lack of pre-training signal, but does not systematically consider alternative explanations (e.g., architecture differences, prompt format biases).", 41 "source": "haiku" 42 }, 43 "proxy_outcome_distinction": { 44 "applies": true, 45 "answer": true, 46 "justification": "The paper explicitly argues against BLEU scores as proxies and validates patches via developer-written test cases followed by manual 
semantic equivalence checking, directly measuring bug-fixing success.", 47 "source": "haiku" 48 } 49 }, 50 "limitations_and_scope": { 51 "limitations_section_present": { 52 "applies": true, 53 "answer": true, 54 "justification": "Section VIII 'Threats to Validity and Limitations' is a dedicated section covering data leakage, Codex exclusion, and patch correctness evaluation concerns.", 55 "source": "haiku" 56 }, 57 "threats_to_validity_specific": { 58 "applies": true, 59 "answer": true, 60 "justification": "Threats are specific: identifies that four Defects4J repositories appear in CodeSearchNet and the whole Defects4J repository is in BigQuery; discusses Codex's continuous model updates as a specific evaluation barrier; notes BLEU vs test-case divergence with concrete numbers (RewardRepair CodeBLEU 36.76 vs CodeT5 33.47 despite 19 fewer correct fixes).", 61 "source": "haiku" 62 }, 63 "scope_boundaries_stated": { 64 "applies": true, 65 "answer": true, 66 "justification": "Paper states it focuses on Java single-hunk bugs, excludes encoder-only models (CodeBERT, GraphCodeBERT) with justification, excludes Codex because it cannot be fine-tuned, and notes HumanEval-Java contains mostly small programs as a scope limitation.", 67 "source": "haiku" 68 } 69 }, 70 "conflicts_of_interest": { 71 "funding_disclosed": { 72 "applies": true, 73 "answer": true, 74 "justification": "Acknowledgment section explicitly states: 'This work is partially supported by a J.P. Morgan AI Faculty Research Award.'", 75 "source": "haiku" 76 }, 77 "affiliations_disclosed": { 78 "applies": true, 79 "answer": true, 80 "justification": "Author affiliations are disclosed on the title page: Purdue University, Lynbrook High School, and University of Alberta.", 81 "source": "haiku" 82 }, 83 "funder_independent_of_outcome": { 84 "applies": true, 85 "answer": true, 86 "justification": "J.P. Morgan is a financial institution with no direct stake in CLM vs. 
DL-based APR tool performance; the funder does not develop any of the evaluated models.", 87 "source": "haiku" 88 }, 89 "financial_interests_declared": { 90 "applies": true, 91 "answer": false, 92 "justification": "No competing interests statement, no declaration of patents, equity, or consulting relationships; only funding source is mentioned.", 93 "source": "haiku" 94 } 95 }, 96 "scope_and_framing": { 97 "key_terms_defined": { 98 "applies": true, 99 "answer": true, 100 "justification": "CLM is defined and categorized by architecture (encoder-only, decoder-only, encoder-decoder) in Section II; APR, fine-tuning, single-hunk bugs, plausible vs. correct patches are all explicitly defined.", 101 "source": "haiku" 102 }, 103 "intended_contribution_clear": { 104 "applies": true, 105 "answer": true, 106 "justification": "Section I.D enumerates five explicit contributions including the new HumanEval-Java benchmark, the CLM evaluation study, fine-tuning experiments, buggy-line analysis, and efficiency analysis.", 107 "source": "haiku" 108 }, 109 "engagement_with_prior_work": { 110 "applies": true, 111 "answer": true, 112 "justification": "Section IX explains how this work differs from prior CLM code refinement studies (no BLEU-only metrics, full project context, fine-tuning) and from prior APR techniques; differences are substantive rather than just listing references.", 113 "source": "haiku" 114 } 115 } 116 }, 117 "type_checklist": { 118 "empirical": { 119 "artifacts": { 120 "code_released": { 121 "applies": true, 122 "answer": true, 123 "justification": "Replication package at https://github.com/lin-tan/clm explicitly includes source code for reproduction, fine-tuned CLM models, HumanEval-Java benchmark, and generated patches.", 124 "source": "haiku" 125 }, 126 "data_released": { 127 "applies": true, 128 "answer": true, 129 "justification": "HumanEval-Java (new benchmark) is released in the replication package; existing benchmarks (Defects4J, QuixBugs) are publicly 
available; fine-tuning data comes from prior work [10] which is publicly available.", 130 "source": "haiku" 131 }, 132 "environment_specified": { 133 "applies": true, 134 "answer": false, 135 "justification": "The paper mentions GPU memory requirements and batch-size-of-1 hardware constraints but provides no requirements.txt, Dockerfile, or specific CUDA/library version specifications.", 136 "source": "haiku" 137 }, 138 "reproduction_instructions": { 139 "applies": true, 140 "answer": false, 141 "justification": "The replication package is referenced but no step-by-step reproduction instructions appear in the paper; a reader could not follow the paper alone to reproduce experiments.", 142 "source": "haiku" 143 } 144 }, 145 "statistical_methodology": { 146 "confidence_intervals_or_error_bars": { 147 "applies": true, 148 "answer": false, 149 "justification": "All results are reported as raw counts of correct fixes with no confidence intervals or error bars across any benchmark or model comparison.", 150 "source": "haiku" 151 }, 152 "significance_tests": { 153 "applies": true, 154 "answer": false, 155 "justification": "No statistical significance tests are used for any comparative claim; '72% more bugs' and '164% more bugs' are stated as plain comparisons without p-values or effect size tests.", 156 "source": "haiku" 157 }, 158 "effect_sizes_reported": { 159 "applies": true, 160 "answer": true, 161 "justification": "Percentage improvements are reported throughout (31%-1,267% for fine-tuning, 72% for best CLM vs. 
best APR tool), providing practical effect size information with baseline context.", 162 "source": "haiku" 163 }, 164 "sample_size_justified": { 165 "applies": true, 166 "answer": false, 167 "justification": "No power analysis or justification is given for the benchmark sizes (130, 108, 40, 164 bugs); existing benchmarks are used as-is without discussion of statistical adequacy.", 168 "source": "haiku" 169 }, 170 "variance_reported": { 171 "applies": true, 172 "answer": false, 173 "justification": "A single fixed random seed is used for fine-tuning runs; no variance across multiple runs or seeds is reported for any result.", 174 "source": "haiku" 175 } 176 }, 177 "evaluation_design": { 178 "baselines_included": { 179 "applies": true, 180 "answer": true, 181 "justification": "Four state-of-the-art DL-based APR techniques (CURE, RewardRepair, Recoder, KNOD) are used as baselines throughout Tables II and IV.", 182 "source": "haiku" 183 }, 184 "baselines_contemporary": { 185 "applies": true, 186 "answer": true, 187 "justification": "Baselines include KNOD (ICSE 2023, same venue), RewardRepair (ICSE 2022), and Recoder (FSE 2021); all are top-performing published systems at time of submission.", 188 "source": "haiku" 189 }, 190 "ablation_study": { 191 "applies": true, 192 "answer": true, 193 "justification": "Multiple ablations are conducted: CLMs with vs. without buggy lines (Table III), CLMs with vs. 
without fine-tuning (Tables II/IV), and fine-tuning with varying data sizes (Figure 9).", 194 "source": "haiku" 195 }, 196 "multiple_metrics": { 197 "applies": true, 198 "answer": true, 199 "justification": "Evaluation uses number of correct fixes, compilation rate (Figure 5), and size/time/memory efficiency (Figure 10), providing multiple complementary metrics.", 200 "source": "haiku" 201 }, 202 "human_evaluation": { 203 "applies": true, 204 "answer": true, 205 "justification": "Plausible patches are manually checked for semantic equivalence to developer patches, going beyond automated test-case passing to verify correctness.", 206 "source": "haiku" 207 }, 208 "held_out_test_set": { 209 "applies": true, 210 "answer": true, 211 "justification": "Benchmarks are test sets entirely separate from fine-tuning data; fine-tuning uses GitHub commits, and test benchmarks (Defects4J, QuixBugs, HumanEval-Java) are held out.", 212 "source": "haiku" 213 }, 214 "per_category_breakdown": { 215 "applies": true, 216 "answer": true, 217 "justification": "Results are broken down by all four benchmarks (Defects4J v1.2, v2.0, QuixBugs, HumanEval-Java) and by each individual CLM in Tables II, III, and IV.", 218 "source": "haiku" 219 }, 220 "failure_cases_discussed": { 221 "applies": true, 222 "answer": true, 223 "justification": "Figures 6, 7, and 8 provide concrete code-level examples of failure cases with analysis of why CLMs fail (e.g., insufficient context for Math-75, confusion from buggy lines for FLIP_CASE).", 224 "source": "haiku" 225 }, 226 "negative_results_reported": { 227 "applies": true, 228 "answer": true, 229 "justification": "Multiple negative results are explicitly reported: CodeT5 performs poorly without fine-tuning, all CLMs fix fewer bugs when buggy lines are provided, and too much fine-tuning data degrades performance for CodeT5 and CodeGen.", 230 "source": "haiku" 231 } 232 }, 233 "setup_transparency": { 234 "model_versions_specified": { 235 "applies": true, 236 
"answer": true, 237 "justification": "Exact model names with parameter counts are specified for all 10 CLMs (e.g., PLBART-base 140M, CodeT5-large 770M, CodeGen-6B, InCoder-6B) with data sources documented in Table I.", 238 "source": "haiku" 239 }, 240 "prompts_provided": { 241 "applies": true, 242 "answer": true, 243 "justification": "Figure 3 shows exact prompt templates for all four CLM architectures (PLBART, CodeT5, CodeGen, InCoder) both with and without buggy lines, using a real example from Defects4J.", 244 "source": "haiku" 245 }, 246 "hyperparameters_reported": { 247 "applies": true, 248 "answer": true, 249 "justification": "Fine-tuning hyperparameters are fully reported: batch size 1, Adam optimizer, learning rate 1e-5, one training epoch, fixed random seed, and 10 candidate patches per bug.", 250 "source": "haiku" 251 }, 252 "scaffolding_described": { 253 "applies": false, 254 "answer": false, 255 "justification": "No agentic scaffolding is used; CLMs are applied directly via prompts without tool use or multi-step orchestration.", 256 "source": "haiku" 257 }, 258 "data_preprocessing_documented": { 259 "applies": true, 260 "answer": true, 261 "justification": "Fine-tuning data preprocessing is documented: 143,666 total instances randomly split 90/10 into training (129,300) and validation (14,366); HumanEval-Java creation process (manual Python-to-Java conversion, bug injection) is described in detail.", 262 "source": "haiku" 263 } 264 }, 265 "data_integrity": { 266 "raw_data_available": { 267 "applies": true, 268 "answer": true, 269 "justification": "Replication package explicitly includes 'the generated patches for all four benchmarks by all CLMs,' making raw model outputs available for independent verification.", 270 "source": "haiku" 271 }, 272 "data_collection_described": { 273 "applies": true, 274 "answer": true, 275 "justification": "HumanEval-Java creation is described in detail (Section III-B); existing benchmark sources are cited; fine-tuning data 
source (GitHub commits from prior work [10]) is specified with provenance.", 276 "source": "haiku" 277 }, 278 "recruitment_methods_described": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants were recruited; the study uses existing code benchmarks and developer-written test cases.", 282 "source": "haiku" 283 }, 284 "data_pipeline_documented": { 285 "applies": true, 286 "answer": true, 287 "justification": "Full pipeline is documented: prompt construction → CLM generation of 10 candidate patches → test case execution for plausibility → manual semantic verification for correctness.", 288 "source": "haiku" 289 } 290 }, 291 "contamination": { 292 "training_cutoff_stated": { 293 "applies": true, 294 "answer": true, 295 "justification": "Training data sources and composition are documented for each CLM in Table I and text (e.g., BigQuery, CodeSearchNet, GitHub repositories); the contamination threat is treated as a primary concern with data source documentation.", 296 "source": "haiku" 297 }, 298 "train_test_overlap_discussed": { 299 "applies": true, 300 "answer": true, 301 "justification": "Extensively discussed: 'four repositories used by the Defects4J benchmark are also in CodeSearchNet, and the whole Defects4J repository is included by BigQuery'; this motivates the creation of HumanEval-Java.", 302 "source": "haiku" 303 }, 304 "benchmark_contamination_addressed": { 305 "applies": true, 306 "answer": true, 307 "justification": "A new benchmark (HumanEval-Java) is explicitly created to avoid contamination, and Codex is flagged as 'particularly susceptible to the data-leaking threat' due to continuous model updates.", 308 "source": "haiku" 309 } 310 }, 311 "human_studies": { 312 "pre_registered": { 313 "applies": false, 314 "answer": false, 315 "justification": "No human participants in the study.", 316 "source": "haiku" 317 }, 318 "irb_or_ethics_approval": { 319 "applies": false, 320 "answer": false, 321 "justification": "No 
human participants in the study.", 322 "source": "haiku" 323 }, 324 "demographics_reported": { 325 "applies": false, 326 "answer": false, 327 "justification": "No human participants in the study.", 328 "source": "haiku" 329 }, 330 "inclusion_exclusion_criteria": { 331 "applies": false, 332 "answer": false, 333 "justification": "No human participants in the study.", 334 "source": "haiku" 335 }, 336 "randomization_described": { 337 "applies": false, 338 "answer": false, 339 "justification": "No human participants in the study.", 340 "source": "haiku" 341 }, 342 "blinding_described": { 343 "applies": false, 344 "answer": false, 345 "justification": "No human participants in the study.", 346 "source": "haiku" 347 }, 348 "attrition_reported": { 349 "applies": false, 350 "answer": false, 351 "justification": "No human participants in the study.", 352 "source": "haiku" 353 } 354 }, 355 "cost_and_practicality": { 356 "inference_cost_reported": { 357 "applies": true, 358 "answer": true, 359 "justification": "Time efficiency (GPU seconds per correct fix) and memory efficiency (GPU GB required) are reported for all 10 CLMs in Figure 10(b) and 10(c), enabling practical cost comparison.", 360 "source": "haiku" 361 }, 362 "compute_budget_stated": { 363 "applies": true, 364 "answer": false, 365 "justification": "Hardware constraints are mentioned (batch size 1) and per-model memory is shown, but total GPU-hours for fine-tuning all 10 models or running all experiments is not reported.", 366 "source": "haiku" 367 } 368 } 369 } 370 }, 371 "claims": [ 372 { 373 "claim": "InCoder-6B without fine-tuning fixes 72% more bugs than the best DL-based APR technique (KNOD) across four benchmarks.", 374 "evidence": "Table II: InCoder-6B fixes 105 total bugs vs. 
KNOD's 61 across all four benchmarks.", 375 "supported": "strong" 376 }, 377 { 378 "claim": "Providing buggy lines to CLMs without fine-tuning consistently reduces their bug-fixing performance by 6%-78%.", 379 "evidence": "Table III shows all 10 CLMs fix fewer bugs when buggy lines are provided (reductions range from -6 for InCoder-6B to -36 for PLBART-base).", 380 "supported": "strong" 381 }, 382 { 383 "claim": "Fine-tuning with APR data improves CLMs' fixing capabilities by 31%-1,267%, with fine-tuned CLMs outperforming the best DL-based APR technique by 46%-164%.", 384 "evidence": "Table IV: InCoder-6B fine-tuned fixes 161 bugs vs. KNOD's 61 (164% more); PLBART-base gains 31% improvement.", 385 "supported": "strong" 386 }, 387 { 388 "claim": "Fine-tuned CLMs over-rely on buggy lines, missing bugs that pre-trained CLMs can fix without fine-tuning.", 389 "evidence": "Figure 8(b) shows SORT_NUMBERS fixed by pre-trained CLMs but all fine-tuned variants fail; Section V-B discusses the mechanism.", 390 "supported": "moderate" 391 }, 392 { 393 "claim": "CodeT5 and InCoder models have the best size efficiency; PLBART, CodeT5, and InCoder have better time and memory efficiency than CodeGen.", 394 "evidence": "Figure 10 shows CodeT5 and InCoder fix most bugs per parameter count; CodeGen requires 3.64-13.88s/fix vs. PLBART's 0.70-0.89s/fix.", 395 "supported": "strong" 396 }, 397 { 398 "claim": "Too much fine-tuning data reduces performance: CodeT5 and CodeGen peak at 10,000 instances; more data causes 8%-19% regression.", 399 "evidence": "Figure 9 shows peak at 10K for CodeT5-large (59 fixes) and CodeGen-6B, declining with full 129K dataset.", 400 "supported": "strong" 401 } 402 ], 403 "methodology_tags": [ 404 "benchmark-eval" 405 ], 406 "key_findings": "Pre-trained CLMs already outperform specialized DL-based APR techniques, with InCoder-6B fixing 72% more bugs than KNOD without any task-specific training. 
Counterintuitively, providing buggy lines to pre-trained CLMs reduces their performance, suggesting they are not pre-trained to use error information. Fine-tuning resolves this and yields 31%-1,267% improvement, enabling fine-tuned CLMs to fix 46%-164% more bugs than the best existing APR technique, but introduces over-reliance on buggy lines as a new failure mode. CodeT5 and InCoder architectures offer the best size efficiency, suggesting future APR work should focus on scaling these model families rather than training from scratch.", 407 "red_flags": [ 408 { 409 "flag": "No statistical significance testing", 410 "detail": "All comparative claims (72% more bugs, 164% more bugs) are made without p-values, confidence intervals, or any statistical significance tests, making it impossible to assess whether differences could be due to chance." 411 }, 412 { 413 "flag": "Single run, no variance reported", 414 "detail": "A single fixed random seed is used for all fine-tuning experiments; no variance across seeds or runs is reported, so reliability of the performance numbers is unknown." 415 }, 416 { 417 "flag": "Manual patch verification subjectivity", 418 "detail": "Patch correctness is determined by manual inspection for semantic equivalence; no inter-rater reliability or second reviewer is mentioned, introducing potential bias from authors evaluating their own approach." 419 }, 420 { 421 "flag": "Batch size constrained by hardware", 422 "detail": "Fine-tuning uses batch size of 1 'due to hardware constraints,' which is non-standard and may produce suboptimal fine-tuned models, making performance comparisons less reliable." 423 }, 424 { 425 "flag": "HumanEval-Java contamination assumption", 426 "detail": "The paper assumes HumanEval-Java is unseen by CLMs because it was manually created, but does not verify this empirically (e.g., by checking if HumanEval Python is in training corpora that CLMs might have learned Java analogs from)." 
427 } 428 ], 429 "cited_papers": [ 430 { 431 "title": "Defects4J: A Database of Existing Faults to Enable Controlled Testing Studies for Java Programs", 432 "relevance": "Primary evaluation benchmark providing Java bugs with developer-written test cases" 433 }, 434 { 435 "title": "KNOD: Domain Knowledge Distilled Tree Decoder for Automated Program Repair", 436 "relevance": "State-of-the-art DL-based APR baseline and key comparison target" 437 }, 438 { 439 "title": "Neural Program Repair with Execution-Based Backpropagation (RewardRepair)", 440 "relevance": "Contemporary DL-based APR baseline using execution feedback during training" 441 }, 442 { 443 "title": "A Syntax-Guided Edit Decoder for Neural Program Repair (Recoder)", 444 "relevance": "DL-based APR baseline with AST-level patch generation" 445 }, 446 { 447 "title": "CURE: Code-Aware Neural Machine Translation for Automatic Program Repair", 448 "relevance": "DL-based APR baseline and prior work by the same first and last authors" 449 }, 450 { 451 "title": "CodeT5: Identifier-Aware Unified Pre-Trained Encoder-Decoder Models for Code Understanding and Generation", 452 "relevance": "One of four CLM architectures evaluated, with multi-task pre-training design studied" 453 }, 454 { 455 "title": "A Conversational Paradigm for Program Synthesis (CodeGen)", 456 "relevance": "Decoder-only CLM architecture evaluated in the APR context" 457 }, 458 { 459 "title": "InCoder: A Generative Model for Code Infilling and Synthesis", 460 "relevance": "Best-performing CLM in the study, using masked span prediction for code infilling" 461 }, 462 { 463 "title": "Evaluating Large Language Models Trained on Code (Codex/HumanEval)", 464 "relevance": "Source of HumanEval benchmark used as basis for new HumanEval-Java benchmark; Codex discussed as excluded baseline" 465 }, 466 { 467 "title": "Trust Enhancement Issues in Program Repair", 468 "relevance": "Survey cited for developer willingness thresholds (93% review ≤10 patches, 63% 
expect <1 hour response) motivating the evaluation protocol" 469 } 470 ], 471 "engagement_factors": { 472 "practical_relevance": { 473 "score": 3, 474 "justification": "Directly actionable for software engineering practitioners — shows that fine-tuning off-the-shelf CLMs on APR data outperforms purpose-built APR tools, with efficiency analysis enabling informed deployment choices." 475 }, 476 "surprise_contrarian": { 477 "score": 2, 478 "justification": "Two counterintuitive findings: CLMs without fine-tuning already beat specialized APR tools, and providing more information (buggy lines) makes CLMs perform worse." 479 }, 480 "fear_safety": { 481 "score": 0, 482 "justification": "No AI safety or risk concerns raised; paper focuses on software reliability improvement." 483 }, 484 "drama_conflict": { 485 "score": 1, 486 "justification": "Implicitly challenges the direction of specialized DL-based APR research by showing general CLMs dominate, but framed constructively rather than controversially." 487 }, 488 "demo_ability": { 489 "score": 2, 490 "justification": "Replication package with fine-tuned models and HumanEval-Java benchmark is released, allowing practitioners to test the models directly." 491 }, 492 "brand_recognition": { 493 "score": 0, 494 "justification": "Purdue University and University of Alberta; no famous AI lab or high-profile product association." 495 } 496 }, 497 "hn_data": { 498 "threads": [], 499 "top_points": 0, 500 "total_points": 0, 501 "total_comments": 0 502 } 503 }