scan-v5.json (26780B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "A Deep Dive into Large Language Models for Automated Bug Localization and Repair", 6 "authors": [ 7 "Soneya Binta Hossain", 8 "Nan Jiang", 9 "Qiang Zhou", 10 "Xiaopeng Li", 11 "Wen-Hao Chiang", 12 "Yingjun Lyu", 13 "Hoan Nguyen", 14 "Omer Tripp" 15 ], 16 "year": 2024, 17 "venue": "Proc. ACM Softw. Eng.", 18 "arxiv_id": "2404.11595", 19 "doi": "10.1145/3660773" 20 }, 21 "checklist": { 22 "claims_and_evidence": { 23 "abstract_claims_supported": { 24 "applies": true, 25 "answer": true, 26 "justification": "The abstract claims state-of-the-art on CodeXGLUE and Defects4J; Table 1 and Table 3 confirm Toggle (PolyCoder-2.7B 25.07%) exceeds NSEdit (23.86%) and fixes more bugs in Top-10/30/50/100 on Defects4J than any compared method.", 27 "source": "haiku" 28 }, 29 "causal_claims_justified": { 30 "applies": true, 31 "answer": true, 32 "justification": "Causal claims (e.g., 'prompt 4 significantly improves bug fixing accuracy') are tested through controlled ablations in RQ3 using ground-truth bug locations to isolate prompt effects, and RQ5 enables/disables the adjustment module across 16 configurations.", 33 "source": "haiku" 34 }, 35 "generalization_bounded": { 36 "applies": true, 37 "answer": true, 38 "justification": "The Threats to Validity section acknowledges results may not generalize beyond the studied datasets, and the Defects4J generalizability test uses only 240 single-hunk Java bugs; findings are generally scoped to the specific benchmarks tested.", 39 "source": "haiku" 40 }, 41 "alternative_explanations_discussed": { 42 "applies": true, 43 "answer": false, 44 "justification": "The paper attributes improvements to 'inductive bias' from token-level localization and prompt design but does not discuss alternative explanations such as constrained-generation making the fine-tuning task easier or model pre-training data effects.", 45 "source": "haiku" 46 }, 47 "proxy_outcome_distinction": 
{ 48 "applies": true, 49 "answer": true, 50 "justification": "The paper explicitly defines exact match (EM) as its primary metric and distinguishes it from BLEU/CodeBLEU; for Defects4J, patch correctness is verified via test execution, and the paper does not conflate EM with real-world utility.", 51 "source": "haiku" 52 } 53 }, 54 "limitations_and_scope": { 55 "limitations_section_present": { 56 "applies": true, 57 "answer": true, 58 "justification": "Section 5 is explicitly titled 'THREATS TO VALIDITY' and spans a dedicated paragraph discussing generalization, tooling bugs, and metric validity.", 59 "source": "haiku" 60 }, 61 "threats_to_validity_specific": { 62 "applies": true, 63 "answer": false, 64 "justification": "The threats section mentions 'results may not generalize across other datasets' without specifying what properties would limit generalization, and the 'scripts might contain bugs' concern is boilerplate; no quantified or domain-specific threats are identified.", 65 "source": "haiku" 66 }, 67 "scope_boundaries_stated": { 68 "applies": true, 69 "answer": true, 70 "justification": "The paper explicitly restricts the Defects4J evaluation to 'single-hunk' bugs (240 bugs) and the fine-tuning models are bounded to specific parameter ranges (110M–2.7B); scope is stated within individual RQ setups.", 71 "source": "haiku" 72 } 73 }, 74 "conflicts_of_interest": { 75 "funding_disclosed": { 76 "applies": true, 77 "answer": false, 78 "justification": "There is no acknowledgment section or funding disclosure anywhere in the paper.", 79 "source": "haiku" 80 }, 81 "affiliations_disclosed": { 82 "applies": true, 83 "answer": true, 84 "justification": "Author affiliations (University of Virginia, Purdue University, Amazon Web Services) are explicitly listed in the author block.", 85 "source": "haiku" 86 }, 87 "funder_independent_of_outcome": { 88 "applies": false, 89 "answer": false, 90 "justification": "No funder is identified; five of eight authors are Amazon Web 
Services employees evaluating their own research framework, making funder independence moot but affiliation bias is a concern.", 91 "source": "haiku" 92 }, 93 "financial_interests_declared": { 94 "applies": true, 95 "answer": false, 96 "justification": "No competing interests or financial interests statement appears anywhere in the paper.", 97 "source": "haiku" 98 } 99 }, 100 "scope_and_framing": { 101 "key_terms_defined": { 102 "applies": true, 103 "answer": true, 104 "justification": "Key terms such as 'token-granulated bug localization,' 'exact match metric,' 'inductive bias,' 'shared prefix/suffix,' and 'single-hunk bugs' are defined or explained contextually with examples and figures.", 105 "source": "haiku" 106 }, 107 "intended_contribution_clear": { 108 "applies": true, 109 "answer": true, 110 "justification": "Section 1.4 explicitly lists four contributions: granularity shift to token-level, four novel prompt designs, adjustment module for tokenizer discrepancies, and comprehensive empirical study.", 111 "source": "haiku" 112 }, 113 "engagement_with_prior_work": { 114 "applies": true, 115 "answer": true, 116 "justification": "Section 4 (Related Work) positions Toggle against specific prior methods (NSEdit, CoText, CURE, KNOD, AlphaRepair, Recoder) with direct performance comparisons; the paper explains how its token-level approach differs from line-level methods in prior LLM-APR work.", 117 "source": "haiku" 118 } 119 } 120 }, 121 "type_checklist": { 122 "empirical": { 123 "artifacts": { 124 "code_released": { 125 "applies": true, 126 "answer": false, 127 "justification": "No Toggle source code release is mentioned anywhere in the paper; base model checkpoints are referenced via Hugging Face but the Toggle framework itself is not released.", 128 "source": "haiku" 129 }, 130 "data_released": { 131 "applies": true, 132 "answer": true, 133 "justification": "All datasets used (CodeXGLUE/Tufano, CodeReviewer, Defects4J, GitHub) are publicly available benchmarks 
referenced with citations.", 134 "source": "haiku" 135 }, 136 "environment_specified": { 137 "applies": true, 138 "answer": false, 139 "justification": "Only PyTorch and Hugging Face are mentioned; no version numbers, requirements file, or Dockerfile are provided.", 140 "source": "haiku" 141 }, 142 "reproduction_instructions": { 143 "applies": true, 144 "answer": false, 145 "justification": "No step-by-step reproduction instructions are provided; the paper describes the methodology but not how to replicate the experimental setup from scratch.", 146 "source": "haiku" 147 } 148 }, 149 "statistical_methodology": { 150 "confidence_intervals_or_error_bars": { 151 "applies": true, 152 "answer": false, 153 "justification": "No confidence intervals or error bars appear in any of the results tables (Tables 1–8); only point estimates are reported.", 154 "source": "haiku" 155 }, 156 "significance_tests": { 157 "applies": true, 158 "answer": false, 159 "justification": "No statistical significance tests are applied to any comparative claims, despite numerous comparisons between methods and prompts.", 160 "source": "haiku" 161 }, 162 "effect_sizes_reported": { 163 "applies": true, 164 "answer": true, 165 "justification": "Absolute performance values with baselines are consistently reported (e.g., 25.07% vs 23.86% on Tufano Small), providing enough context to assess effect magnitudes.", 166 "source": "haiku" 167 }, 168 "sample_size_justified": { 169 "applies": true, 170 "answer": false, 171 "justification": "The choice of 240 Defects4J single-hunk bugs and 210 patches per bug is not statistically justified; no power analysis is discussed.", 172 "source": "haiku" 173 }, 174 "variance_reported": { 175 "applies": true, 176 "answer": false, 177 "justification": "The paper states experiments were 'repeated several times to confirm consistency' but no variance, standard deviation, or spread across runs is reported in any table.", 178 "source": "haiku" 179 } 180 }, 181 
"evaluation_design": { 182 "baselines_included": { 183 "applies": true, 184 "answer": true, 185 "justification": "Baselines include NSEdit, CoText (Table 1) and CURE, RewardRepair, Recoder, KNOD, Tare, AlphaRepair, TENURE (Table 3).", 186 "source": "haiku" 187 }, 188 "baselines_contemporary": { 189 "applies": true, 190 "answer": true, 191 "justification": "Baselines include papers from 2021–2023 (NSEdit 2022, KNOD 2023, Tare 2023, AlphaRepair 2022), which are competitive and recent relative to the 2024 submission.", 192 "source": "haiku" 193 }, 194 "ablation_study": { 195 "applies": true, 196 "answer": true, 197 "justification": "RQ3 ablates across four prompt designs; RQ5 ablates the adjustment module enabled vs disabled across 4 models × 4 datasets; RQ4 ablates the effect of contextual information.", 198 "source": "haiku" 199 }, 200 "multiple_metrics": { 201 "applies": true, 202 "answer": true, 203 "justification": "The paper uses exact match (EM) for CodeXGLUE/CodeReviewer and Top-K (K=10,30,50,100,200) metrics for Defects4J, plus start/end token accuracy for localization.", 204 "source": "haiku" 205 }, 206 "human_evaluation": { 207 "applies": false, 208 "answer": false, 209 "justification": "Human evaluation of system outputs is not relevant to this task; patch correctness is verified via automated exact match and test execution on Defects4J.", 210 "source": "haiku" 211 }, 212 "held_out_test_set": { 213 "applies": true, 214 "answer": true, 215 "justification": "Datasets are explicitly split 80/10/10 into training, validation, and test sets; Defects4J is kept entirely held-out from fine-tuning.", 216 "source": "haiku" 217 }, 218 "per_category_breakdown": { 219 "applies": true, 220 "answer": true, 221 "justification": "Results are broken down by dataset (Tufano Small, Tufano Medium, CodeReviewer w/o comment, CodeReviewer w/ comment) and by model backbone for all major experiments.", 222 "source": "haiku" 223 }, 224 "failure_cases_discussed": { 225 "applies": 
true, 226 "answer": true, 227 "justification": "Figure 7 explicitly shows a failure case where correct bug location still produces incorrect fix, and RQ6 discusses conditions under which prompt 4 underperforms prompt 3.", 228 "source": "haiku" 229 }, 230 "negative_results_reported": { 231 "applies": true, 232 "answer": true, 233 "justification": "CodeGPT underperforms on multilingual datasets due to Java-only pretraining; prompt 4 underperforms prompt 3 on Tufano datasets; smaller models don't benefit as much from the adjustment module.", 234 "source": "haiku" 235 } 236 }, 237 "setup_transparency": { 238 "model_versions_specified": { 239 "applies": true, 240 "answer": true, 241 "justification": "Models are specified with parameter counts and sources: CodeGPT-110M, CodeParrot-110M, CodeGen-350M, CodeGen-2B, PolyCoder-400M, PolyCoder-2.7B, CodeT5-large (347M); Hugging Face references are provided.", 242 "source": "haiku" 243 }, 244 "prompts_provided": { 245 "applies": true, 246 "answer": true, 247 "justification": "All four prompts are illustrated in Figure 5 with concrete code examples showing the exact format including separator tokens and truncation strategy.", 248 "source": "haiku" 249 }, 250 "hyperparameters_reported": { 251 "applies": true, 252 "answer": false, 253 "justification": "No learning rates, batch sizes, number of epochs, or optimizer settings are reported for any of the fine-tuning experiments.", 254 "source": "haiku" 255 }, 256 "scaffolding_described": { 257 "applies": true, 258 "answer": true, 259 "justification": "Section 2.3 describes the Toggle framework architecture in detail including the localization model (CodeT5 encoder with attention-based prediction), four prompt designs, and adjustment module (CodeT5 encoder with FC layer).", 260 "source": "haiku" 261 }, 262 "data_preprocessing_documented": { 263 "applies": true, 264 "answer": true, 265 "justification": "The GitHub dataset preprocessing is documented (commit filtering by keywords, 
AST-based Defects4J deduplication); train/validation/test splits of 80/10/10 are stated; adjustment module training data collection procedure is described.", 266 "source": "haiku" 267 } 268 }, 269 "data_integrity": { 270 "raw_data_available": { 271 "applies": true, 272 "answer": true, 273 "justification": "All benchmarks used (CodeXGLUE/Tufano, CodeReviewer, Defects4J) are publicly available; citations and URLs are provided for access.", 274 "source": "haiku" 275 }, 276 "data_collection_described": { 277 "applies": true, 278 "answer": true, 279 "justification": "The GitHub dataset curation is described (commit message keywords, single-statement patches, AST-based Defects4J exclusion); the other datasets reference published papers describing their collection.", 280 "source": "haiku" 281 }, 282 "recruitment_methods_described": { 283 "applies": false, 284 "answer": false, 285 "justification": "No human participants; all data is from public code repositories and established benchmarks.", 286 "source": "haiku" 287 }, 288 "data_pipeline_documented": { 289 "applies": true, 290 "answer": true, 291 "justification": "The full pipeline from dataset splits through fine-tuning to patch generation and evaluation is described, including the 7-shift range used for adjustment module training data.", 292 "source": "haiku" 293 } 294 }, 295 "contamination": { 296 "training_cutoff_stated": { 297 "applies": true, 298 "answer": false, 299 "justification": "No pre-training cutoff dates are stated for any of the base LLM families (CodeGPT, CodeParrot, CodeGen, PolyCoder, CodeT5) despite their pre-training corpora potentially overlapping with public benchmarks.", 300 "source": "haiku" 301 }, 302 "train_test_overlap_discussed": { 303 "applies": true, 304 "answer": true, 305 "justification": "The paper explicitly excludes Defects4J samples from the GitHub fine-tuning dataset via AST comparison to prevent data leakage into the held-out generalization test.", 306 "source": "haiku" 307 }, 308 
"benchmark_contamination_addressed": { 309 "applies": true, 310 "answer": false, 311 "justification": "The pre-trained base models (CodeParrot, CodeGen, etc.) were trained on large code corpora that likely include CodeXGLUE and Defects4J data; this pre-training contamination risk is never discussed.", 312 "source": "haiku" 313 } 314 }, 315 "human_studies": { 316 "pre_registered": { 317 "applies": false, 318 "answer": false, 319 "justification": "No human participants.", 320 "source": "haiku" 321 }, 322 "irb_or_ethics_approval": { 323 "applies": false, 324 "answer": false, 325 "justification": "No human participants.", 326 "source": "haiku" 327 }, 328 "demographics_reported": { 329 "applies": false, 330 "answer": false, 331 "justification": "No human participants.", 332 "source": "haiku" 333 }, 334 "inclusion_exclusion_criteria": { 335 "applies": false, 336 "answer": false, 337 "justification": "No human participants.", 338 "source": "haiku" 339 }, 340 "randomization_described": { 341 "applies": false, 342 "answer": false, 343 "justification": "No human participants.", 344 "source": "haiku" 345 }, 346 "blinding_described": { 347 "applies": false, 348 "answer": false, 349 "justification": "No human participants.", 350 "source": "haiku" 351 }, 352 "attrition_reported": { 353 "applies": false, 354 "answer": false, 355 "justification": "No human participants.", 356 "source": "haiku" 357 } 358 }, 359 "cost_and_practicality": { 360 "inference_cost_reported": { 361 "applies": true, 362 "answer": false, 363 "justification": "The paper mentions 'resource-intensive nature of larger models' as justification for testing only smaller models in some RQs, but no actual inference cost, latency, or GPU-hours are reported.", 364 "source": "haiku" 365 }, 366 "compute_budget_stated": { 367 "applies": true, 368 "answer": false, 369 "justification": "No total computational budget (GPU hours, cloud costs, hardware used) is stated anywhere in the paper.", 370 "source": "haiku" 371 } 372 } 
373 } 374 }, 375 "claims": [ 376 { 377 "claim": "Toggle achieves new state-of-the-art on CodeXGLUE code refinement benchmark (Tufano Small 25.07%, Tufano Medium 16.19%)", 378 "evidence": "Table 1 shows PolyCoder-2.7B at 25.07% vs NSEdit's 23.86% on Tufano Small and 16.19% vs CoText's 15.36% on Tufano Medium", 379 "supported": "strong" 380 }, 381 { 382 "claim": "Toggle outperforms all compared APR methods on Defects4J in Top-10, Top-30, Top-50, and Top-100 metrics", 383 "evidence": "Table 3 shows Toggle fixes 41 bugs in Top-10 vs next-best 36 (Recoder), 58 vs 51, 64 vs 62, 74 vs 70 respectively", 384 "supported": "strong" 385 }, 386 { 387 "claim": "Larger LLMs yield better bug fixing accuracy after fine-tuning with Toggle prompts", 388 "evidence": "Table 1 consistently shows larger models outperform smaller ones (e.g., CodeGen-2B 24.73% vs CodeGen-350M 23.19% on Tufano Small)", 389 "supported": "strong" 390 }, 391 { 392 "claim": "Token-granulated prompts (3 and 4) significantly outperform standard prompting (prompt 1) for bug fixing", 393 "evidence": "Table 4 shows CodeGPT-110M improves from 16.07% (prompt 1) to 56.98% (prompt 4) on Tufano Small using ground-truth bug locations", 394 "supported": "strong" 395 }, 396 { 397 "claim": "Contextual information (buggy line numbers, code review comments) significantly improves bug localization accuracy", 398 "evidence": "Table 5 shows starting token accuracy for Tufano Small improves from 39.07% to 60.37% (+21.30 percentage points) with buggy line numbers", 399 "supported": "strong" 400 }, 401 { 402 "claim": "The adjustment module consistently improves bug fixing accuracy across all models and datasets", 403 "evidence": "Table 6 shows improvement in all 16 configurations, e.g., CodeParrot-110M on Tufano Small improves from 21.78% to 23.51%", 404 "supported": "moderate" 405 }, 406 { 407 "claim": "Prompt 4 outperforms prompt 3 only when both start and end token locations are highly accurate", 408 "evidence": "Table 8 shows prompt 3 superior on 
Tufano datasets but prompt 4 superior on CodeReviewer datasets where partial location accuracy is higher (65.76% vs 53.23%)", 409 "supported": "moderate" 410 } 411 ], 412 "methodology_tags": [ 413 "benchmark-eval" 414 ], 415 "key_findings": "Toggle introduces token-granulated bug localization and repair, demonstrating that preventing LLMs from regenerating non-buggy shared prefix/suffix significantly improves accuracy (prompt 1 to prompt 4: 16.07% to 56.98% for CodeGPT on Tufano Small). The system achieves state-of-the-art on CodeXGLUE code refinement and outperforms all compared methods on Defects4J in Top-10 through Top-100 metrics using only 110M parameter models and 210 generated patches. Contextual information (line numbers, code review comments) improves localization accuracy by 20-30 percentage points. The choice between prompts 3 and 4 with predicted locations is dataset-dependent, with prompt 4 winning when partial location accuracy is high and additional context is available.", 416 "red_flags": [ 417 { 418 "flag": "No statistical significance tests", 419 "detail": "All comparisons between Toggle and baselines, and between prompt configurations, are made without any statistical significance testing despite many tables of numerical comparisons." 420 }, 421 { 422 "flag": "No variance or confidence intervals", 423 "detail": "Despite claiming experiments were repeated multiple times, no standard deviation, confidence intervals, or error bars are reported for any results." 424 }, 425 { 426 "flag": "No hyperparameters reported", 427 "detail": "Learning rates, batch sizes, number of epochs, and optimizer configurations for all fine-tuning experiments are absent, making reproduction impossible." 428 }, 429 { 430 "flag": "No code release", 431 "detail": "The Toggle framework is not released; only the public base model checkpoints are referenced, preventing independent verification of results." 
432 }, 433 { 434 "flag": "Pre-training contamination unaddressed", 435 "detail": "Base LLMs (CodeGPT, CodeParrot, CodeGen, PolyCoder, CodeT5) were trained on large code corpora that likely include CodeXGLUE and Defects4J benchmarks; this contamination risk is never discussed." 436 }, 437 { 438 "flag": "Asymmetric patch count in Defects4J comparison", 439 "detail": "Toggle generates 210 patches per bug (Top-100 is primary comparison), while competing methods (Tare, AlphaRepair, TENURE) generate 500+ patches and only their Top-500+ results are reported, making Top-100 comparisons potentially favorable to Toggle." 440 }, 441 { 442 "flag": "No funding disclosure", 443 "detail": "Five of eight authors are Amazon Web Services employees; no funding source or competing interests are disclosed." 444 } 445 ], 446 "cited_papers": [ 447 { 448 "title": "CodeXGLUE: A Machine Learning Benchmark Dataset for Code Understanding and Generation", 449 "relevance": "Primary benchmark for evaluation; Toggle achieves state-of-the-art on its code refinement tasks" 450 }, 451 { 452 "title": "CodeReviewer: Pre-Training for Automating Code Review Activities", 453 "relevance": "Provides dataset and CodeT5 baseline for code review-guided bug fixing experiments" 454 }, 455 { 456 "title": "Defects4J: A Database of existing faults to enable controlled testing studies for Java programs", 457 "relevance": "Primary generalizability benchmark; the benchmark comprises 835 real-world Java bugs, of which only the 240 single-hunk bugs are used for out-of-distribution evaluation" 458 }, 459 { 460 "title": "CodeT5: Identifier-aware Unified Pre-trained Encoder-Decoder Models for Code Understanding and Generation", 461 "relevance": "Backbone model for bug localization; used as both a baseline and the encoder in Toggle's localization module" 462 }, 463 { 464 "title": "CURE: Code-Aware Neural Machine Translation for Automatic Program Repair", 465 "relevance": "Key APR baseline compared on Defects4J (18 bugs in Top-10 vs Toggle's 41)" 466 }, 467 { 468 "title": "KNOD: Domain 
Knowledge Distilled Tree Decoder for Automated Program Repair", 469 "relevance": "Strong APR baseline using tree-based decoding; compared on Defects4J across all Top-K metrics" 470 }, 471 { 472 "title": "Less Training, More Repairing Please: Revisiting Automated Program Repair via Zero-Shot Learning", 473 "relevance": "AlphaRepair baseline demonstrating LLMs used for APR without fine-tuning; contextualizes Toggle's fine-tuning approach" 474 }, 475 { 476 "title": "Impact of Code Language Models on Automated Program Repair", 477 "relevance": "Prior work on LLM-based APR that Toggle directly builds on and improves over" 478 }, 479 { 480 "title": "Fix Bugs with Transformer through a Neural-Symbolic Edit Grammar", 481 "relevance": "NSEdit — primary baseline for CodeXGLUE leaderboard comparison; Toggle surpasses it on all Tufano datasets" 482 }, 483 { 484 "title": "An empirical study on learning bug-fixing patches in the wild via neural machine translation", 485 "relevance": "Source of Tufano Small/Medium datasets used as primary fine-tuning and evaluation benchmarks" 486 } 487 ], 488 "engagement_factors": { 489 "practical_relevance": { 490 "score": 2, 491 "justification": "APR is directly useful to developers; Toggle is a concrete working system tested on real bug benchmarks, though it requires fine-tuning and infrastructure to deploy." 492 }, 493 "surprise_contrarian": { 494 "score": 1, 495 "justification": "Token-level vs line-level localization is a novel framing but the performance improvements are expected given the design rationale." 496 }, 497 "fear_safety": { 498 "score": 0, 499 "justification": "No AI risk or safety concerns raised; the paper is purely about automated software engineering." 500 }, 501 "drama_conflict": { 502 "score": 0, 503 "justification": "Standard benchmark competition paper with no controversy or conflict angle." 
504 }, 505 "demo_ability": { 506 "score": 1, 507 "justification": "The framework is described in detail but no public demo or code is released, limiting hands-on accessibility." 508 }, 509 "brand_recognition": { 510 "score": 1, 511 "justification": "Amazon Web Services affiliation for five authors adds some recognition, but this is not a top-name lab publication; published at FSE which is a respected venue." 512 } 513 }, 514 "hn_data": { 515 "threads": [ 516 { 517 "hn_id": "40205264", 518 "title": "Urban highways are barriers to social ties", 519 "points": 6, 520 "comments": 0, 521 "url": "https://news.ycombinator.com/item?id=40205264" 522 }, 523 { 524 "hn_id": "41103162", 525 "title": "Beyond Deepfake Images: Detecting AI-Generated Videos [pdf]", 526 "points": 3, 527 "comments": 0, 528 "url": "https://news.ycombinator.com/item?id=41103162" 529 }, 530 { 531 "hn_id": "40165320", 532 "title": "Generation of Low-Inclination, Neptune-Crossing TNOs by Planet Nine", 533 "points": 2, 534 "comments": 0, 535 "url": "https://news.ycombinator.com/item?id=40165320" 536 } 537 ], 538 "top_points": 6, 539 "total_points": 11, 540 "total_comments": 0 541 } 542 }