scan.json (30200B)
1 { 2 "paper": { 3 "title": "Enhancing Automated Program Repair via Faulty Token Localization and Quality-Aware Patch Refinement", 4 "authors": [ 5 "Jiaolong Kong", 6 "Xiaofei Xie", 7 "Yiheng Xiong", 8 "Yuekun Wang", 9 "Jian Wang" 10 ], 11 "year": 2025, 12 "venue": "arXiv.org", 13 "arxiv_id": "2511.18001", 14 "doi": "10.48550/arXiv.2511.18001" 15 }, 16 "scan_version": 3, 17 "active_modules": ["experimental_rigor", "data_leakage"], 18 "methodology_tags": ["benchmark-eval"], 19 "key_findings": "TokenRepair combines token-level uncertainty-based fault localization (internal reflection) with quality-aware patch refinement (external feedback) for automated program repair. Across five 7B-class LLMs, it achieves 88 correct fixes on Defects4J 1.2 and 139 on HumanEval-Java, outperforming the best baseline by 8.2%–34.9% on Defects4J and 3.3%–16.1% on HumanEval-Java. Ablation confirms uncertainty-guided localization is the most impactful component, with removal causing up to 20.6% performance degradation.", 20 "checklist": { 21 "artifacts": { 22 "code_released": { 23 "applies": true, 24 "answer": false, 25 "justification": "Section 7 states 'we have made our patches open-source for public evaluation' but provides no repository URL, download link, or archive reference anywhere in the paper." 26 }, 27 "data_released": { 28 "applies": true, 29 "answer": true, 30 "justification": "The paper uses two standard public benchmarks: Defects4J 1.2 (154 single-hunk bugs) and HumanEval-Java (163 bugs), both publicly available and unmodified." 31 }, 32 "environment_specified": { 33 "applies": true, 34 "answer": false, 35 "justification": "The paper lists model names with HuggingFace references and some hyperparameters (temperature=1, TopK=3, α=0.5) but provides no requirements.txt, Dockerfile, library versions, or environment setup instructions sufficient to recreate the environment." 
36 }, 37 "reproduction_instructions": { 38 "applies": true, 39 "answer": false, 40 "justification": "No step-by-step reproduction instructions, README, or runnable scripts are described or referenced in the paper." 41 } 42 }, 43 "statistical_methodology": { 44 "confidence_intervals_or_error_bars": { 45 "applies": true, 46 "answer": false, 47 "justification": "All results in Tables 1–6 are point estimates (counts of correct fixes, accuracy values, percentages). No confidence intervals or error bars are reported despite the stochastic nature of the method (temperature=1)." 48 }, 49 "significance_tests": { 50 "applies": true, 51 "answer": false, 52 "justification": "The paper claims TokenRepair 'outperforms' and 'surpasses' baselines based solely on comparing raw numbers (e.g., 63 vs 51 correct fixes). No statistical significance tests are performed." 53 }, 54 "effect_sizes_reported": { 55 "applies": true, 56 "answer": true, 57 "justification": "Percentage improvements with baseline context are reported throughout: '8.2% to 34.9% across all models on Defects4J 1.2' and '3.3% to 16.1% on HumanEval-Java,' with both baseline and improved counts given." 58 }, 59 "sample_size_justified": { 60 "applies": true, 61 "answer": false, 62 "justification": "The benchmarks contain 154 and 163 bugs respectively. No justification is given for why these sizes are sufficient for the claims made, nor any power analysis." 63 }, 64 "variance_reported": { 65 "applies": true, 66 "answer": false, 67 "justification": "Results appear to be from single experimental runs. No standard deviation, variance across seeds, or spread measures are reported, despite using stochastic sampling (temperature=1) where results can vary significantly across runs." 
68 } 69 }, 70 "evaluation_design": { 71 "baselines_included": { 72 "applies": true, 73 "answer": true, 74 "justification": "Three baselines are compared: Base Sampling, CoT-Decoding, and ChatRepair, each described in Section 4.1.3 with matched configurations." 75 }, 76 "baselines_contemporary": { 77 "applies": true, 78 "answer": true, 79 "justification": "ChatRepair (2024), CoT-Decoding (2024 NeurIPS), and Base Sampling (2025) are all recent and represent the state of the art in LLM-based APR." 80 }, 81 "ablation_study": { 82 "applies": true, 83 "answer": true, 84 "justification": "RQ3 (Section 5.3) presents a thorough ablation with three variants: w/o Majority, w/o Localize, and w/o Quality, measuring the contribution of each component across all models and both benchmarks (Table 5)." 85 }, 86 "multiple_metrics": { 87 "applies": true, 88 "answer": true, 89 "justification": "Three metrics are used: #Plausible (test-passing patches), #Correct (manually verified correct patches), and #Generate (average patches per correct fix, measuring efficiency)." 90 }, 91 "human_evaluation": { 92 "applies": true, 93 "answer": true, 94 "justification": "Section 7 describes manual patch verification: 'inviting three researchers in SE field to check respectively, each dedicating over 10 hours to manually validate the patches. They then discuss the patches where validation answers were inconsistent, ultimately reaching a consensus.'" 95 }, 96 "held_out_test_set": { 97 "applies": true, 98 "answer": false, 99 "justification": "Hyperparameters (α, TopK, n, m) are tuned via grid search on the same Defects4J and HumanEval-Java benchmarks used for final evaluation (Tables 1, 6). The best configuration per model in Table 4 matches the best from Table 6, indicating test-set hyperparameter optimization." 100 }, 101 "per_category_breakdown": { 102 "applies": true, 103 "answer": true, 104 "justification": "Results are broken down per model (5 models) and per benchmark (2 benchmarks) in Table 4. 
Venn diagrams (Fig. 4) show unique fixes per method. Table 6 provides per-configuration breakdowns." 105 }, 106 "failure_cases_discussed": { 107 "applies": true, 108 "answer": false, 109 "justification": "The paper briefly mentions DeepSeek's exception on HumanEval-Java attributed to low localization accuracy, but provides no systematic failure case analysis showing specific bugs TokenRepair fails on or qualitative examples of failures." 110 }, 111 "negative_results_reported": { 112 "applies": true, 113 "answer": true, 114 "justification": "RQ4 reports that m=9 'never achieves the best performance across any model or benchmark' and explains why. The DeepSeek exception on HumanEval-Java (98 vs 99 for Base Sampling) is explicitly reported." 115 } 116 }, 117 "claims_and_evidence": { 118 "abstract_claims_supported": { 119 "applies": true, 120 "answer": true, 121 "justification": "Abstract claims of 88 bugs on Defects4J 1.2 and 139 on HumanEval-Java match Venn diagram aggregates in Fig. 4. Improvement ranges of 8.2%–34.9% on Defects4J and 3.3%–16.1% on HumanEval-Java are verifiable from Table 4." 122 }, 123 "causal_claims_justified": { 124 "applies": true, 125 "answer": true, 126 "justification": "Causal claims ('X improves Y') are supported by controlled ablation studies (RQ3) that systematically remove individual components while holding others constant, demonstrating their contribution." 127 }, 128 "generalization_bounded": { 129 "applies": true, 130 "answer": false, 131 "justification": "The paper tests only Java benchmarks with five 7B-class open-source models on single-hunk bugs, but the title ('Enhancing Automated Program Repair') and abstract ('state-of-the-art repair performance') make unbounded claims without qualifying to these specific settings." 
132 }, 133 "alternative_explanations_discussed": { 134 "applies": true, 135 "answer": false, 136 "justification": "Section 7 discusses threats related to manual verification and LLM non-determinism but does not consider alternative explanations for why the method works (e.g., whether improvements stem from simply exploring more diverse patches rather than targeted token refinement)." 137 }, 138 "proxy_outcome_distinction": { 139 "applies": true, 140 "answer": true, 141 "justification": "The paper measures #Correct (manually verified semantic equivalence to ground truth) and #Plausible (passes test suite) and clearly defines both. Claims match the granularity of measurements without overclaiming broader outcomes." 142 } 143 }, 144 "setup_transparency": { 145 "model_versions_specified": { 146 "applies": true, 147 "answer": true, 148 "justification": "Specific model identifiers are given: Qwen2.5-Coder-7B-Instruct, Llama-3.1-8B-Instruct, DeepSeek-Coder-6.7b-Instruct, DeepSeek-Coder-7b-Instruct-V1.5, CodeGemma-7b-it, with corresponding HuggingFace URLs in references." 149 }, 150 "prompts_provided": { 151 "applies": true, 152 "answer": false, 153 "justification": "The paper describes prompt contents conceptually ('buggy code and failure information,' 'ConstructPrompt') but never provides actual prompt text used in experiments." 154 }, 155 "hyperparameters_reported": { 156 "applies": true, 157 "answer": true, 158 "justification": "Section 4.1.1 reports: temperature=1, TopK=3, α=0.5, n∈{2,5}, m∈{3,6,9}, budget=50 patches. RQ4 evaluates sensitivity to n and m." 159 }, 160 "scaffolding_described": { 161 "applies": true, 162 "answer": true, 163 "justification": "The full scaffolding is described in detail: Algorithm 1 gives the complete procedure, Section 3 details the internal reflection loop (uncertainty calculation, CoT-decoding), external feedback loop (quality measurement, patch filtering), and their interaction." 
164 }, 165 "data_preprocessing_documented": { 166 "applies": true, 167 "answer": true, 168 "justification": "Section 4.1.2 states: 'We evaluate TokenRepair under the single-hunk fix scenario, and adopt the benchmark construction process in prior research, where the location of the buggy hunk is provided based on the ground truth.' Targets 154 single-hunk bugs in Defects4J 1.2 and all 163 in HumanEval-Java." 169 } 170 }, 171 "limitations_and_scope": { 172 "limitations_section_present": { 173 "applies": true, 174 "answer": true, 175 "justification": "Section 7 'Threats to Validity' provides substantive discussion of manual verification challenges, experimental non-determinism, and floating-point precision issues." 176 }, 177 "threats_to_validity_specific": { 178 "applies": true, 179 "answer": true, 180 "justification": "Section 7 discusses study-specific threats: manual patch verification requiring three researchers with 10+ hours each, non-deterministic LLM inference, and floating-point precision accumulation in uncertainty calculations." 181 }, 182 "scope_boundaries_stated": { 183 "applies": true, 184 "answer": false, 185 "justification": "The paper does not explicitly state what the results do NOT show — e.g., no mention that results are limited to Java, single-hunk bugs, or 7B-class models, and no discussion of what settings or scenarios were not tested." 186 } 187 }, 188 "data_integrity": { 189 "raw_data_available": { 190 "applies": true, 191 "answer": false, 192 "justification": "Although patches are claimed to be open-sourced, no URL is provided. Raw experimental data (uncertainty scores, intermediate patches, per-bug outcomes) are not available." 193 }, 194 "data_collection_described": { 195 "applies": true, 196 "answer": true, 197 "justification": "Section 4.1.2 describes using Defects4J (154 single-hunk bugs from 17 projects) and HumanEval-Java (163 buggy-fixed code pairs), following established benchmark construction from prior research." 
198 }, 199 "recruitment_methods_described": { 200 "applies": false, 201 "answer": false, 202 "justification": "No human participants; data sources are standard public benchmarks (Defects4J and HumanEval-Java)." 203 }, 204 "data_pipeline_documented": { 205 "applies": true, 206 "answer": true, 207 "justification": "Algorithm 1 documents the full pipeline from buggy code input through patch generation, evaluation, refinement, quality filtering, and output. Section 4.1.4 defines how metrics are computed from patch outcomes." 208 } 209 }, 210 "conflicts_of_interest": { 211 "funding_disclosed": { 212 "applies": true, 213 "answer": false, 214 "justification": "No funding section, acknowledgments, or grant references appear anywhere in the paper." 215 }, 216 "affiliations_disclosed": { 217 "applies": true, 218 "answer": true, 219 "justification": "All five authors are listed as affiliated with Singapore Management University. The paper evaluates open-source models, not SMU products, so no product-affiliation conflict exists." 220 }, 221 "funder_independent_of_outcome": { 222 "applies": true, 223 "answer": false, 224 "justification": "Cannot assess funder independence since funding is not disclosed." 225 }, 226 "financial_interests_declared": { 227 "applies": true, 228 "answer": false, 229 "justification": "No competing interests or financial interests statement is present in the paper." 230 } 231 }, 232 "contamination": { 233 "training_cutoff_stated": { 234 "applies": true, 235 "answer": false, 236 "justification": "No training data cutoff dates are stated for any of the five models used (Qwen2.5-Coder, Llama-3.1, DeepSeek-Coder, CodeGemma)." 237 }, 238 "train_test_overlap_discussed": { 239 "applies": true, 240 "answer": false, 241 "justification": "No discussion of whether Defects4J or HumanEval-Java examples appeared in the training data of any model, despite all models being trained well after these benchmarks were published." 
242 }, 243 "benchmark_contamination_addressed": { 244 "applies": true, 245 "answer": false, 246 "justification": "Defects4J was published in 2014 and HumanEval in 2021, both long before the 2024 models were trained. No contamination analysis or discussion is provided." 247 } 248 }, 249 "human_studies": { 250 "pre_registered": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants. The manual patch verification is performed by researchers as evaluators, not as study subjects." 254 }, 255 "irb_or_ethics_approval": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants in the study." 259 }, 260 "demographics_reported": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants in the study." 264 }, 265 "inclusion_exclusion_criteria": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants in the study." 269 }, 270 "randomization_described": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants in the study." 274 }, 275 "blinding_described": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants in the study." 279 }, 280 "attrition_reported": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants in the study." 284 } 285 }, 286 "cost_and_practicality": { 287 "inference_cost_reported": { 288 "applies": true, 289 "answer": false, 290 "justification": "The paper reports #Generate (average patches per correct fix) as an efficiency proxy but does not report actual inference cost in tokens, dollars, GPU time, or wall-clock time." 291 }, 292 "compute_budget_stated": { 293 "applies": true, 294 "answer": false, 295 "justification": "Patch generation budget is capped at 50 per bug, but no GPU hours, total wall-clock time, or hardware specifications are reported for the experiments across 5 models × 2 benchmarks × multiple configurations." 
296 } 297 }, 298 "experimental_rigor": { 299 "seed_sensitivity_reported": { 300 "applies": true, 301 "answer": false, 302 "justification": "No mention of multiple random seeds or seed sensitivity analysis, despite using stochastic sampling at temperature=1 where results can vary substantially across runs." 303 }, 304 "number_of_runs_stated": { 305 "applies": true, 306 "answer": false, 307 "justification": "The paper does not state how many experimental runs produced the reported results. Results appear to be from single runs." 308 }, 309 "hyperparameter_search_budget": { 310 "applies": true, 311 "answer": true, 312 "justification": "RQ1 evaluates α∈{0.2,0.5,0.8} × TopK∈{1,2,3,4,5} and RQ4 evaluates n∈{2,5} × m∈{3,6,9}. All configurations and their results are enumerated in Tables 1 and 6." 313 }, 314 "best_config_selection_justified": { 315 "applies": true, 316 "answer": true, 317 "justification": "Selection of α=0.5 and TopK=3 is justified in RQ1 based on average accuracy analysis across models. For n and m, all configurations are reported in Table 6, and the rationale for selecting best configs is discussed." 318 }, 319 "multiple_comparison_correction": { 320 "applies": true, 321 "answer": false, 322 "justification": "The paper makes dozens of comparative claims across 5 models × 2 benchmarks × 4 methods without any statistical tests, let alone multiple comparison correction." 323 }, 324 "self_comparison_bias_addressed": { 325 "applies": true, 326 "answer": false, 327 "justification": "The authors re-implement baselines (ChatRepair, Base Sampling, CoT-Decoding) to match their experimental settings but do not acknowledge the systematic bias identified by Lucic et al. (2018) in author-implemented baselines." 328 }, 329 "compute_budget_vs_performance": { 330 "applies": true, 331 "answer": false, 332 "justification": "All methods are given the same patch budget (50), providing a fair comparison, but no performance-vs-compute curves are shown. 
The relationship between budget consumption and repair success is not analyzed." 333 }, 334 "benchmark_construct_validity": { 335 "applies": true, 336 "answer": false, 337 "justification": "No discussion of whether Defects4J or HumanEval-Java actually measure the APR capabilities the paper claims to evaluate. No analysis of construct validity or comparison with alternative benchmarks." 338 }, 339 "scaffold_confound_addressed": { 340 "applies": true, 341 "answer": true, 342 "justification": "The comparison is well-controlled: all four methods (Base Sampling, CoT-Decoding, ChatRepair, TokenRepair) are evaluated on the same five models with matched budgets. The repair strategy (scaffold) is the variable under test, properly isolated from the model variable." 343 } 344 }, 345 "data_leakage": { 346 "temporal_leakage_addressed": { 347 "applies": true, 348 "answer": false, 349 "justification": "Defects4J (2014) and HumanEval (2021) predate all five models (2024). Solutions and discussion of these benchmarks are widely available online. No temporal leakage analysis is performed." 350 }, 351 "feature_leakage_addressed": { 352 "applies": true, 353 "answer": false, 354 "justification": "The evaluation provides ground-truth buggy hunk location to the model, acknowledged as standard practice but not discussed as a form of feature leakage that inflates repair rates compared to real-world scenarios." 355 }, 356 "non_independence_addressed": { 357 "applies": true, 358 "answer": false, 359 "justification": "No discussion of whether training data for the five models includes Defects4J projects or HumanEval solutions, nor any analysis of non-independence between train and test data." 360 }, 361 "leakage_detection_method": { 362 "applies": true, 363 "answer": false, 364 "justification": "No leakage detection or prevention method is applied (no canary strings, membership inference, n-gram overlap analysis, or decontamination)." 
365 } 366 } 367 }, 368 "claims": [ 369 { 370 "claim": "TokenRepair achieves 88 correct fixes on Defects4J 1.2 and 139 on HumanEval-Java when aggregating across all five models.", 371 "evidence": "Venn diagrams in Fig. 4 show cumulative correct fixes across all five models: 88 on Defects4J and 139 on HumanEval-Java, compared to 82 and 131 for the next best (ChatRepair).", 372 "supported": "strong" 373 }, 374 { 375 "claim": "TokenRepair achieves improvements of 8.2%–34.9% over the best baseline per model on Defects4J 1.2.", 376 "evidence": "Table 4 shows per-model correct fixes: Qwen 51→63 (23.5%), Llama 49→53 (8.2%), DeepSeek 46→53 (15.2%), CodeGemma 43→58 (34.9%), DeepSeek-V1.5 43→54 (25.6%).", 377 "supported": "moderate" 378 }, 379 { 380 "claim": "Token-level uncertainty localization achieves average accuracy ranging from 0.589 to 0.695 for identifying faulty tokens.", 381 "evidence": "Table 1 presents Top-K accuracy across five models, two benchmarks, and three decay factors. Average accuracy at optimal α=0.5 ranges from 0.589 (DeepSeek on HumanEval-Java) to 0.695 (Llama and DeepSeek on Defects4J).", 382 "supported": "moderate" 383 }, 384 { 385 "claim": "Uncertainty-guided token localization is the most impactful component, with removal causing up to 20.6% performance degradation.", 386 "evidence": "Table 5 ablation: removing localization drops Qwen on Defects4J from 63 to 50 correct fixes (20.6%), the largest drop among all ablation variants across all models.", 387 "supported": "moderate" 388 }, 389 { 390 "claim": "Decreasing uncertainty across repair iterations correlates with correct fixes.", 391 "evidence": "Table 3 shows plausible paths have 54.1%–80.5% decreasing uncertainty transitions, while incorrect paths show near-equal distributions (40.6%–51.9% decreasing).", 392 "supported": "weak" 393 }, 394 { 395 "claim": "TokenRepair uniquely fixes 7 bugs on Defects4J and 2 bugs on HumanEval-Java that no baseline can fix.", 396 "evidence": "Venn diagrams in Fig. 
4 show 7 unique fixes on Defects4J and 2 on HumanEval-Java.", 397 "supported": "moderate" 398 } 399 ], 400 "red_flags": [ 401 { 402 "flag": "No variance or seed analysis for stochastic method", 403 "detail": "The method uses temperature=1 sampling, making results inherently non-deterministic. Yet all results are reported as single-run point estimates without any variance, standard deviation, or multiple-seed analysis. The paper itself acknowledges in Section 7 that 'the non-deterministic nature of LLM inference' is a threat, but does not mitigate it." 404 }, 405 { 406 "flag": "Hyperparameter optimization on test set", 407 "detail": "The best (n, m) configuration per model reported in Table 4 corresponds to the best result from the full grid search in Table 6, conducted on the same test benchmarks. No separate validation set is used for hyperparameter selection." 408 }, 409 { 410 "flag": "No statistical significance tests", 411 "detail": "All superiority claims are based on comparing raw counts (e.g., 63 vs 51 correct fixes). With stochastic sampling and small benchmark sizes (154 and 163 bugs), the observed differences may not be statistically significant." 412 }, 413 { 414 "flag": "Contamination risk unaddressed", 415 "detail": "All five models (2024) were trained well after Defects4J (2014) and HumanEval (2021) were published. Solutions to these benchmarks are widely available online. No contamination analysis is performed." 416 }, 417 { 418 "flag": "Author-reimplemented baselines", 419 "detail": "All baselines (Base Sampling, CoT-Decoding, ChatRepair) are re-implemented by the authors to match their experimental settings, introducing potential implementation bias not acknowledged in the paper. Separately, first author Kong is also the first author of ContrastRepair (cited as [18]), which appears only as one of the related works and is not among the compared baselines." 420 }, 421 { 422 "flag": "Limited model diversity", 423 "detail": "All five models are 7B-class open-source models. 
No larger models (70B+), proprietary models (GPT-4, Claude), or models with different architectures are tested, limiting generalizability of the findings." 424 } 425 ], 426 "cited_papers": [ 427 { 428 "title": "RepairAgent: An Autonomous, LLM-Based Agent for Program Repair", 429 "authors": ["Islem Bouzenia", "Premkumar Devanbu", "Michael Pradel"], 430 "year": 2024, 431 "arxiv_id": "2403.17134", 432 "relevance": "Autonomous LLM-based agent for program repair using dynamic prompting and planning — directly relevant to agentic APR approaches." 433 }, 434 { 435 "title": "ContrastRepair: Enhancing Conversation-Based Automated Program Repair via Contrastive Test Case Pairs", 436 "authors": ["Jiaolong Kong", "Xiaofei Xie", "Mingfei Cheng", "Shangqing Liu", "Xiaoning Du", "Qi Guo"], 437 "year": 2025, 438 "doi": "10.1145/3719345", 439 "relevance": "Conversation-based APR using contrastive test case pairs — pioneering work in multi-turn LLM repair." 440 }, 441 { 442 "title": "Demystifying Memorization in LLM-Based Program Repair via a General Hypothesis Testing Framework", 443 "authors": ["Jiaolong Kong", "Xiaofei Xie", "Shangqing Liu"], 444 "year": 2025, 445 "relevance": "Investigates memorization and contamination issues in LLM-based APR — directly relevant to benchmark contamination concerns." 446 }, 447 { 448 "title": "Impact of code language models on automated program repair", 449 "authors": ["Nan Jiang", "Kevin Liu", "Thibaud Lutellier", "Lin Tan"], 450 "year": 2023, 451 "relevance": "Evaluates code LLMs for APR and introduces HumanEval-Java benchmark used in this paper." 452 }, 453 { 454 "title": "Automated program repair in the era of large pre-trained language models", 455 "authors": ["Chunqiu Steven Xia", "Yuxiang Wei", "Lingming Zhang"], 456 "year": 2023, 457 "relevance": "Foundational study on LLM-based APR showing pre-trained models can surpass traditional APR techniques." 
458 }, 459 { 460 "title": "Less Training, More Repairing Please: Revisiting Automated Program Repair via Zero-Shot Learning", 461 "authors": ["Chunqiu Steven Xia", "Lingming Zhang"], 462 "year": 2022, 463 "doi": "10.1145/3540250.3549101", 464 "relevance": "Zero-shot LLM approach to APR demonstrating repair without task-specific fine-tuning." 465 }, 466 { 467 "title": "Automated program repair via conversation: Fixing 162 out of 337 bugs for $0.42 each using ChatGPT", 468 "authors": ["Chunqiu Steven Xia", "Lingming Zhang"], 469 "year": 2024, 470 "relevance": "ChatRepair — the primary baseline in this paper, pioneering conversation-based iterative APR with LLMs." 471 }, 472 { 473 "title": "Chain-of-thought reasoning without prompting", 474 "authors": ["Xuezhi Wang", "Denny Zhou"], 475 "year": 2024, 476 "relevance": "CoT-Decoding method used as a baseline — explores alternative decoding paths at inference time." 477 }, 478 { 479 "title": "Calibration and correctness of language models for code", 480 "authors": ["Claudio Spiess", "David Gros", "Kunal Suresh Pai", "Michael Pradel"], 481 "year": 2024, 482 "arxiv_id": "2402.02047", 483 "relevance": "Shows correlation between LLM uncertainty/entropy and code token correctness — foundational finding that TokenRepair builds upon." 484 }, 485 { 486 "title": "CigaR: Cost-efficient Program Repair with LLMs", 487 "authors": ["Dávid Hidvégi", "Khashayar Etemadi", "Sofia Bobadilla", "Martin Monperrus"], 488 "year": 2024, 489 "arxiv_id": "2402.06598", 490 "relevance": "Token-efficient LLM-based APR approach addressing computational cost of conversation-based repair." 491 }, 492 { 493 "title": "RepairLLaMA: Efficient Representations and Fine-Tuned Adapters for Program Repair", 494 "authors": ["André Silva", "Sen Fang", "Martin Monperrus"], 495 "year": 2023, 496 "arxiv_id": "2312.15698", 497 "relevance": "Fine-tuning approach for APR using LoRA adapters on CodeLlama — represents the fine-tuning paradigm in LLM-based repair." 
498 }, 499 { 500 "title": "Uncertainty-guided chain-of-thought for code generation with LLMs", 501 "authors": ["Yuqi Zhu", "Ge Li", "Xue Jiang", "Jia Li", "Hong Mei", "Zhi Jin", "Yihong Dong"], 502 "year": 2025, 503 "arxiv_id": "2503.15341", 504 "relevance": "Uses first-token uncertainty as a quality indicator for code generation — directly informs TokenRepair's trace quality measurement design." 505 }, 506 { 507 "title": "Can OpenAI's Codex fix bugs? An evaluation on QuixBugs", 508 "authors": ["Julian Aron Prenner", "Hlib Babii", "Romain Robbes"], 509 "year": 2022, 510 "relevance": "Early evaluation of LLM (Codex) for automated program repair on QuixBugs benchmark." 511 } 512 ], 513 "engagement_factors": { 514 "practical_relevance": { 515 "score": 2, 516 "justification": "Token-level uncertainty refinement for APR is a usable technique but requires access to model logits (token probabilities), limiting it to open-weight models." 517 }, 518 "surprise_contrarian": { 519 "score": 1, 520 "justification": "The internal reflection + external feedback framing is novel but the core idea (uncertainty-guided refinement) is an incremental extension of existing work." 521 }, 522 "fear_safety": { 523 "score": 0, 524 "justification": "No AI safety or security concerns raised — purely a program repair improvement." 525 }, 526 "drama_conflict": { 527 "score": 0, 528 "justification": "No controversy or provocative claims." 529 }, 530 "demo_ability": { 531 "score": 0, 532 "justification": "No code repository URL, demo, or installable tool is provided despite claiming patches are open-sourced." 533 }, 534 "brand_recognition": { 535 "score": 0, 536 "justification": "From Singapore Management University — not a high-profile AI lab. Uses smaller open-source models, not flagship products." 537 } 538 } 539 }