scan-v5.json (26409B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Enhancing Automated Program Repair via Faulty Token Localization and Quality-Aware Patch Refinement", 6 "authors": [ 7 "Jiaolong Kong", 8 "Xiaofei Xie", 9 "Yiheng Xiong", 10 "Yuekun Wang", 11 "Jian Wang" 12 ], 13 "year": 2025, 14 "venue": "arXiv.org", 15 "arxiv_id": "2511.18001", 16 "doi": "10.48550/arXiv.2511.18001" 17 }, 18 "checklist": { 19 "claims_and_evidence": { 20 "abstract_claims_supported": { 21 "applies": true, 22 "answer": true, 23 "justification": "The 88/139 correct-fix counts and 8.2%–34.9%/3.3%–16.1% improvement ranges are directly traceable to Table 4 and the Venn diagrams in Fig. 4; per-model baseline comparisons verify the claimed ranges.", 24 "source": "haiku" 25 }, 26 "causal_claims_justified": { 27 "applies": true, 28 "answer": true, 29 "justification": "Ablation studies (RQ3, Table 5) systematically remove each component and show performance drops of up to 20.6%, supporting the causal attribution of gains to the proposed modules.", 30 "source": "haiku" 31 }, 32 "generalization_bounded": { 33 "applies": true, 34 "answer": false, 35 "justification": "The abstract and conclusion claim 'state-of-the-art in automated program repair' without noting the restriction to single-hunk Java bugs and 7B–8B parameter models; the evaluation scope is stated in the setup but not bounded in the main conclusions.", 36 "source": "haiku" 37 }, 38 "alternative_explanations_discussed": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper attributes performance gains solely to token-level uncertainty without seriously considering whether gains could be explained by the increased effective sampling diversity introduced by the refinement loop.", 42 "source": "haiku" 43 }, 44 "proxy_outcome_distinction": { 45 "applies": true, 46 "answer": true, 47 "justification": "The paper explicitly distinguishes #Plausible (passes test suites) from #Correct (manually verified as 
semantically equivalent to ground truth), with three independent reviewers spending 10+ hours each.", 48 "source": "haiku" 49 } 50 }, 51 "limitations_and_scope": { 52 "limitations_section_present": { 53 "applies": true, 54 "answer": true, 55 "justification": "Section 7 'Threats to Validity' addresses manual verification bias and experimental reproducibility threats, constituting a dedicated section.", 56 "source": "haiku" 57 }, 58 "threats_to_validity_specific": { 59 "applies": true, 60 "answer": true, 61 "justification": "Threats are specific: manual verification is mitigated by three independent SE researchers each spending 10+ hours; reproducibility threat is attributed to floating-point non-determinism in LLM inference.", 62 "source": "haiku" 63 }, 64 "scope_boundaries_stated": { 65 "applies": true, 66 "answer": false, 67 "justification": "The paper does not explicitly state in conclusions or limitations that results are bounded to Java, single-hunk bugs, or small (7B–8B) open-source models; the single-hunk restriction appears only in the setup section.", 68 "source": "haiku" 69 } 70 }, 71 "conflicts_of_interest": { 72 "funding_disclosed": { 73 "applies": true, 74 "answer": false, 75 "justification": "No funding acknowledgment or grant information appears anywhere in the paper.", 76 "source": "haiku" 77 }, 78 "affiliations_disclosed": { 79 "applies": true, 80 "answer": true, 81 "justification": "All five authors list Singapore Management University as their affiliation in the paper header.", 82 "source": "haiku" 83 }, 84 "funder_independent_of_outcome": { 85 "applies": false, 86 "answer": false, 87 "justification": "No funding is disclosed, so funder independence cannot be assessed.", 88 "source": "haiku" 89 }, 90 "financial_interests_declared": { 91 "applies": true, 92 "answer": false, 93 "justification": "No competing interests or financial disclosure statement is present in the paper.", 94 "source": "haiku" 95 } 96 }, 97 "scope_and_framing": { 98 
"key_terms_defined": { 99 "applies": true, 100 "answer": true, 101 "justification": "'Plausible patch' and 'correct patch' are formally defined in Section 4.1.4; token-level uncertainty is formally defined via the probability-difference metric in Eq. 1; APR is explained through prior work context.", 102 "source": "haiku" 103 }, 104 "intended_contribution_clear": { 105 "applies": true, 106 "answer": true, 107 "justification": "Section 1 explicitly lists three bulleted contributions: first incorporation of internal reflection into LLM-based repair, the TokenRepair framework itself, and the comprehensive evaluation.", 108 "source": "haiku" 109 }, 110 "engagement_with_prior_work": { 111 "applies": true, 112 "answer": true, 113 "justification": "Section 6 situates TokenRepair relative to conversation-based (ChatRepair, ContrastRepair, CigaR, RepairAgent) and fine-tuning-based APR methods, and explains how this work differs by exploiting internal uncertainty signals.", 114 "source": "haiku" 115 } 116 } 117 }, 118 "type_checklist": { 119 "empirical": { 120 "artifacts": { 121 "code_released": { 122 "applies": true, 123 "answer": false, 124 "justification": "The paper states 'we have made our patches open-source for public evaluation' but provides no repository URL or link; patch outputs are not the same as source code.", 125 "source": "haiku" 126 }, 127 "data_released": { 128 "applies": true, 129 "answer": true, 130 "justification": "Both Defects4J 1.2 and HumanEval-Java are standard public benchmarks used unmodified and are publicly accessible.", 131 "source": "haiku" 132 }, 133 "environment_specified": { 134 "applies": true, 135 "answer": false, 136 "justification": "HuggingFace model links are provided but no requirements file, Docker container, or dependency list is included.", 137 "source": "haiku" 138 }, 139 "reproduction_instructions": { 140 "applies": true, 141 "answer": false, 142 "justification": "No step-by-step reproduction instructions are provided; the 
methodology is described algorithmically but the operational pipeline for running experiments is absent.", 143 "source": "haiku" 144 } 145 }, 146 "statistical_methodology": { 147 "confidence_intervals_or_error_bars": { 148 "applies": true, 149 "answer": false, 150 "justification": "All results in Table 4 and Table 5 are reported as single point estimates with no confidence intervals or error bars.", 151 "source": "haiku" 152 }, 153 "significance_tests": { 154 "applies": true, 155 "answer": false, 156 "justification": "No statistical significance tests are applied to comparative claims; differences in bug-fix counts are reported without hypothesis testing.", 157 "source": "haiku" 158 }, 159 "effect_sizes_reported": { 160 "applies": true, 161 "answer": true, 162 "justification": "Percentage improvement over the best baseline is explicitly reported (e.g., 8.2%–34.9% on Defects4J) providing effect sizes in context.", 163 "source": "haiku" 164 }, 165 "sample_size_justified": { 166 "applies": true, 167 "answer": false, 168 "justification": "The benchmarks are used as-is (154 and 163 bugs) with no sample size justification or power analysis.", 169 "source": "haiku" 170 }, 171 "variance_reported": { 172 "applies": true, 173 "answer": false, 174 "justification": "Results are single-run counts; no variance or standard deviation across repeated runs is reported.", 175 "source": "haiku" 176 } 177 }, 178 "evaluation_design": { 179 "baselines_included": { 180 "applies": true, 181 "answer": true, 182 "justification": "Three baselines are included: Base Sampling, CoT-Decoding, and ChatRepair, covering the main competing paradigms.", 183 "source": "haiku" 184 }, 185 "baselines_contemporary": { 186 "applies": true, 187 "answer": true, 188 "justification": "ChatRepair (2024), CoT-Decoding (2024), and Base Sampling (2025) are contemporary and directly competitive with the proposed approach.", 189 "source": "haiku" 190 }, 191 "ablation_study": { 192 "applies": true, 193 "answer": true, 
194 "justification": "Section 5.3 presents a full ablation with three variants (w/o Majority, w/o Localize, w/o Quality) evaluated on both benchmarks across all five models.", 195 "source": "haiku" 196 }, 197 "multiple_metrics": { 198 "applies": true, 199 "answer": true, 200 "justification": "Three metrics are used: #Plausible (test-passing patches), #Correct (manually verified), and #Gen (efficiency: patches per correct fix).", 201 "source": "haiku" 202 }, 203 "human_evaluation": { 204 "applies": true, 205 "answer": true, 206 "justification": "Three SE researchers independently manually verified plausible patches, each spending 10+ hours, with disagreements resolved by consensus (Section 7).", 207 "source": "haiku" 208 }, 209 "held_out_test_set": { 210 "applies": true, 211 "answer": true, 212 "justification": "Defects4J provides ground-truth tests separate from the development process; patches must pass predefined test suites not used in generation.", 213 "source": "haiku" 214 }, 215 "per_category_breakdown": { 216 "applies": true, 217 "answer": true, 218 "justification": "Results are broken down by model (5 models) and benchmark (2 datasets); Table 6 further breaks down by hyperparameter configuration per model.", 219 "source": "haiku" 220 }, 221 "failure_cases_discussed": { 222 "applies": true, 223 "answer": true, 224 "justification": "Section 5.2 discusses DeepSeek's marginal underperformance on HumanEval-Java and explains it via weaker localization accuracy; Section 5.4 discusses why m=9 consistently underperforms.", 225 "source": "haiku" 226 }, 227 "negative_results_reported": { 228 "applies": true, 229 "answer": true, 230 "justification": "The paper reports that TokenRepair slightly underperforms Base Sampling for DeepSeek on HumanEval-Java (98 vs 99) and that m=9 never achieves best performance across any configuration.", 231 "source": "haiku" 232 } 233 }, 234 "setup_transparency": { 235 "model_versions_specified": { 236 "applies": true, 237 "answer": true, 
238 "justification": "Exact model names (e.g., 'Qwen2.5-Coder-7B-Instruct', 'Llama-3.1-8B-Instruct') are provided with HuggingFace repository links in references [5,6,7,11,23,25].", 239 "source": "haiku" 240 }, 241 "prompts_provided": { 242 "applies": true, 243 "answer": false, 244 "justification": "The paper references ConstructPrompt as an algorithm step and describes inputs conceptually, but no actual prompt templates or examples are shown.", 245 "source": "haiku" 246 }, 247 "hyperparameters_reported": { 248 "applies": true, 249 "answer": true, 250 "justification": "Temperature (t=1), budget (50), TopK (3), decay factor α (0.5), n∈{2,5}, and m∈{3,6,9} are all explicitly reported in Section 4.1.1.", 251 "source": "haiku" 252 }, 253 "scaffolding_described": { 254 "applies": true, 255 "answer": true, 256 "justification": "Algorithm 1 provides a complete pseudocode description of the full TokenRepair pipeline including the BFS loop, quality filtering, and internal/external feedback phases.", 257 "source": "haiku" 258 }, 259 "data_preprocessing_documented": { 260 "applies": true, 261 "answer": true, 262 "justification": "Section 4.1.2 specifies the benchmark construction process: 154 single-hunk bugs from Defects4J 1.2 with buggy hunk location provided from ground truth, following prior work [19,36].", 263 "source": "haiku" 264 } 265 }, 266 "data_integrity": { 267 "raw_data_available": { 268 "applies": true, 269 "answer": false, 270 "justification": "Generated patches are claimed open-source but no URL is provided; the raw LLM outputs, uncertainty scores, and intermediate results are not publicly released.", 271 "source": "haiku" 272 }, 273 "data_collection_described": { 274 "applies": true, 275 "answer": true, 276 "justification": "Both benchmarks are established public datasets with documented origins; the subset selection criterion (single-hunk bugs) is explicitly stated.", 277 "source": "haiku" 278 }, 279 "recruitment_methods_described": { 280 "applies": false, 
281 "answer": false, 282 "justification": "Standard benchmarks are used; no participant recruitment is involved.", 283 "source": "haiku" 284 }, 285 "data_pipeline_documented": { 286 "applies": true, 287 "answer": true, 288 "justification": "Algorithm 1 documents the full pipeline from bug input through patch generation, evaluation, quality filtering, and output; the flow from benchmark loading to results is traceable.", 289 "source": "haiku" 290 } 291 }, 292 "contamination": { 293 "training_cutoff_stated": { 294 "applies": true, 295 "answer": false, 296 "justification": "Training data cutoffs are not stated for any of the five models (Qwen2.5-Coder, Llama-3.1, DeepSeek-Coder, CodeGemma) despite evaluating on public benchmarks.", 297 "source": "haiku" 298 }, 299 "train_test_overlap_discussed": { 300 "applies": true, 301 "answer": false, 302 "justification": "Defects4J and HumanEval-Java are widely published benchmarks likely present in LLM training corpora; the paper does not discuss this potential contamination.", 303 "source": "haiku" 304 }, 305 "benchmark_contamination_addressed": { 306 "applies": true, 307 "answer": false, 308 "justification": "No discussion of whether Defects4J bugs or HumanEval-Java solutions appeared in the training data of any of the five evaluated models.", 309 "source": "haiku" 310 } 311 }, 312 "human_studies": { 313 "pre_registered": { 314 "applies": false, 315 "answer": false, 316 "justification": "No human participants; NA.", 317 "source": "haiku" 318 }, 319 "irb_or_ethics_approval": { 320 "applies": false, 321 "answer": false, 322 "justification": "No human participants; NA.", 323 "source": "haiku" 324 }, 325 "demographics_reported": { 326 "applies": false, 327 "answer": false, 328 "justification": "No human participants; NA.", 329 "source": "haiku" 330 }, 331 "inclusion_exclusion_criteria": { 332 "applies": false, 333 "answer": false, 334 "justification": "No human participants; NA.", 335 "source": "haiku" 336 }, 337 
"randomization_described": { 338 "applies": false, 339 "answer": false, 340 "justification": "No human participants; NA.", 341 "source": "haiku" 342 }, 343 "blinding_described": { 344 "applies": false, 345 "answer": false, 346 "justification": "No human participants; NA.", 347 "source": "haiku" 348 }, 349 "attrition_reported": { 350 "applies": false, 351 "answer": false, 352 "justification": "No human participants; NA.", 353 "source": "haiku" 354 } 355 }, 356 "cost_and_practicality": { 357 "inference_cost_reported": { 358 "applies": true, 359 "answer": true, 360 "justification": "#Gen metric (average patches generated per correct fix) is reported in Table 4 as a computational cost proxy; lower values indicate higher efficiency.", 361 "source": "haiku" 362 }, 363 "compute_budget_stated": { 364 "applies": true, 365 "answer": false, 366 "justification": "A per-bug patch budget cap of 50 is stated, but total GPU hours, wall-clock time, or hardware specification for the full experimental suite is not reported.", 367 "source": "haiku" 368 } 369 } 370 } 371 }, 372 "claims": [ 373 { 374 "claim": "TokenRepair achieves 88 correct fixes on Defects4J 1.2 across all five models, a 7.3% improvement over the best baseline (ChatRepair at 82).", 375 "evidence": "Fig. 4a Venn diagram and Table 4 per-model results summed and verified against ChatRepair totals.", 376 "supported": "strong" 377 }, 378 { 379 "claim": "TokenRepair achieves 139 correct fixes on HumanEval-Java, a 6.1% improvement over ChatRepair (131).", 380 "evidence": "Fig. 
4b Venn diagram and Table 4 HumanEval-Java results.", 381 "supported": "strong" 382 }, 383 { 384 "claim": "Per-model improvements over the best baseline range from 8.2% to 34.9% on Defects4J 1.2.", 385 "evidence": "Table 4: Llama (53 vs 49 ChatRepair = 8.2%), CodeGemma (58 vs 43 ChatRepair = 34.9%).", 386 "supported": "strong" 387 }, 388 { 389 "claim": "Uncertainty-guided faulty token localization achieves average Top-3 accuracy of 0.589–0.695 across models and benchmarks.", 390 "evidence": "Table 1 reports Avg. column for α=0.5, TopK=3 across all five models on both benchmarks.", 391 "supported": "strong" 392 }, 393 { 394 "claim": "Majority voting for first-token identification is strongly correlated with actual first-token correctness (F1 scores 0.624–0.928).", 395 "evidence": "Table 2 reports precision, recall, and F1 for all models on both benchmarks.", 396 "supported": "strong" 397 }, 398 { 399 "claim": "Uncertainty decrease during iterative repair is predictive of successful patch trajectories, with plausible paths showing 55.8%–80.5% decreasing uncertainty transitions vs. 
balanced distributions for incorrect paths.", 400 "evidence": "Table 3 shows clear disparity between plausible and incorrect paths across all models and benchmarks.", 401 "supported": "moderate" 402 }, 403 { 404 "claim": "All three components (majority voting, uncertainty localization, quality filtering) independently contribute to performance, with localization being most critical (up to 20.6% drop on removal).", 405 "evidence": "Table 5 ablation study across both benchmarks and all five models.", 406 "supported": "moderate" 407 } 408 ], 409 "methodology_tags": [ 410 "benchmark-eval" 411 ], 412 "key_findings": "TokenRepair achieves new state-of-the-art automated program repair by combining token-level uncertainty-guided fault localization (Top-3 accuracy 0.589–0.695) with quality-aware patch filtering, correctly fixing 88 bugs on Defects4J 1.2 and 139 on HumanEval-Java using five 7B–8B open-source LLMs. Per-model improvements over the best baseline (ChatRepair) range from 8.2% to 34.9% on Defects4J and 3.3% to 16.1% on HumanEval-Java. Ablation confirms uncertainty-guided token localization is the dominant component (up to 20.6% performance drop on removal), while excessive refinement budget allocation (m=9) consistently underperforms due to localization accuracy bounds and model distribution bias. All results are bounded to single-hunk Java bugs; contamination of public benchmarks in LLM training data is unaddressed.", 413 "red_flags": [ 414 { 415 "flag": "No statistical significance tests", 416 "detail": "All comparative claims between TokenRepair and baselines are based on raw bug-fix counts with no hypothesis testing or confidence intervals, making it impossible to assess whether differences are statistically meaningful given the small benchmark sizes (154 and 163 bugs)." 
417 }, 418 { 419 "flag": "Benchmark contamination unaddressed", 420 "detail": "Defects4J and HumanEval-Java are widely published benchmarks likely present in the training corpora of all five evaluated models; training data cutoffs are not stated and overlap is not discussed." 421 }, 422 { 423 "flag": "Single-run results only", 424 "detail": "With temperature=1 and non-deterministic LLM inference, results are reported as single-run counts with no variance across multiple runs, making reported improvements potentially unstable." 425 }, 426 { 427 "flag": "Scope overclaim in abstract and conclusions", 428 "detail": "The paper claims 'state-of-the-art in automated program repair' without noting the restriction to single-hunk Java bugs with small open-source models; results may not transfer to multi-hunk bugs, non-Java languages, or larger proprietary models." 429 }, 430 { 431 "flag": "Prompts not disclosed", 432 "detail": "The ConstructPrompt function is referenced algorithmically but actual prompt templates are never shown, preventing verification of whether prompt design artifacts drive the improvements." 433 }, 434 { 435 "flag": "No code repository URL", 436 "detail": "The claim of open-source patch release has no accompanying URL, making independent verification or reproduction infeasible." 437 } 438 ], 439 "cited_papers": [ 440 { 441 "title": "Automated program repair via conversation: Fixing 162 out of 337 bugs for $0.42 each using ChatGPT", 442 "relevance": "Primary baseline (ChatRepair); TokenRepair directly extends and compares against this conversational APR paradigm." 443 }, 444 { 445 "title": "Chain-of-thought reasoning without prompting", 446 "relevance": "CoT-Decoding is a direct baseline and TokenRepair's token-guided CoT-Decoding is a core component adapted from this work." 
447 }, 448 { 449 "title": "Demystifying Memorization in LLM-Based Program Repair via a General Hypothesis Testing Framework", 450 "relevance": "Provides the Base Sampling baseline and benchmark construction methodology used by TokenRepair." 451 }, 452 { 453 "title": "Defects4J: A database of existing faults to enable controlled testing studies for Java programs", 454 "relevance": "Primary evaluation benchmark providing 154 single-hunk Java bugs." 455 }, 456 { 457 "title": "Impact of code language models on automated program repair", 458 "relevance": "Introduces HumanEval-Java benchmark used as the second evaluation dataset." 459 }, 460 { 461 "title": "Calibration and correctness of language models for code", 462 "relevance": "Establishes that token-level uncertainty correlates with code correctness, providing empirical foundation for TokenRepair's uncertainty-guided localization." 463 }, 464 { 465 "title": "Uncertainty-guided chain-of-thought for code generation with LLMs", 466 "relevance": "Shows first token uncertainty as proxy for generation quality; motivates TokenRepair's trace quality measurement component." 467 }, 468 { 469 "title": "ContrastRepair: Enhancing Conversation-Based Automated Program Repair via Contrastive Test Case Pairs", 470 "relevance": "Prior work by first and second authors; represents the conversational APR baseline class that TokenRepair extends." 471 }, 472 { 473 "title": "A survey of confidence estimation and calibration in large language models", 474 "relevance": "Provides the probability-difference uncertainty metric (Eq. 1) adopted by TokenRepair for token-level uncertainty computation." 475 }, 476 { 477 "title": "Self-consistency improves chain of thought reasoning in language models", 478 "relevance": "Motivates the majority voting strategy for first-token identification via self-consistency decoding principles." 
479 } 480 ], 481 "engagement_factors": { 482 "practical_relevance": { 483 "score": 2, 484 "justification": "APR tools directly address developer debugging time, though the restriction to single-hunk Java bugs with small open-source LLMs limits immediate practitioner applicability." 485 }, 486 "surprise_contrarian": { 487 "score": 1, 488 "justification": "Applying token-level uncertainty for fault localization in APR is a novel angle, but the finding that targeted refinement beats coarse-grained feedback is expected rather than surprising." 489 }, 490 "fear_safety": { 491 "score": 0, 492 "justification": "No AI safety or risk concerns; automated bug fixing is a constructive application." 493 }, 494 "drama_conflict": { 495 "score": 0, 496 "justification": "No controversy or adversarial framing; straightforward systems paper." 497 }, 498 "demo_ability": { 499 "score": 1, 500 "justification": "Uses public benchmarks (Defects4J, HumanEval-Java) that practitioners could re-run, but no live demo, public code repository, or tool release is provided." 501 }, 502 "brand_recognition": { 503 "score": 0, 504 "justification": "Singapore Management University is a reputable institution but not a top-tier AI lab; no famous models or products involved." 
505 } 506 }, 507 "hn_data": { 508 "threads": [ 509 { 510 "hn_id": "42889052", 511 "title": "Large language models think too fast to explore effectively", 512 "points": 118, 513 "comments": 41, 514 "url": "https://news.ycombinator.com/item?id=42889052" 515 }, 516 { 517 "hn_id": "46664297", 518 "title": "VaultGemma: A Differentially Private LLM", 519 "points": 3, 520 "comments": 0, 521 "url": "https://news.ycombinator.com/item?id=46664297" 522 }, 523 { 524 "hn_id": "42968402", 525 "title": "Fault Localization via Fine-Tuning LLMs with Mutation Generated Stack Traces", 526 "points": 3, 527 "comments": 0, 528 "url": "https://news.ycombinator.com/item?id=42968402" 529 }, 530 { 531 "hn_id": "46555313", 532 "title": "Name That Part: 3D Part Segmentation and Naming", 533 "points": 2, 534 "comments": 1, 535 "url": "https://news.ycombinator.com/item?id=46555313" 536 }, 537 { 538 "hn_id": "46838079", 539 "title": "VaultGemma: A Differentially Private LLM", 540 "points": 1, 541 "comments": 0, 542 "url": "https://news.ycombinator.com/item?id=46838079" 543 } 544 ], 545 "top_points": 118, 546 "total_points": 127, 547 "total_comments": 42 548 } 549 }