scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (31972B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Enhancing Automated Program Repair via Faulty Token Localization and Quality-Aware Patch Refinement",
      6     "authors": [
      7       "Jiaolong Kong",
      8       "Xiaofei Xie",
      9       "Yiheng Xiong",
     10       "Yuekun Wang",
     11       "Jian Wang"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2511.18001",
     16     "doi": "10.48550/arXiv.2511.18001"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Abstract claims of 88 bugs on Defects4J 1.2 and 139 on HumanEval-Java match Venn diagram aggregates in Fig. 4. Improvement ranges of 8.2%–34.9% on Defects4J and 3.3%–16.1% on HumanEval-Java are verifiable from Table 4.",
     24         "source": "opus"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Causal claims ('X improves Y') are supported by controlled ablation studies (RQ3) that systematically remove individual components while holding others constant, demonstrating their contribution.",
     30         "source": "opus"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper tests only Java benchmarks with five 7B-class open-source models on single-hunk bugs, but the title ('Enhancing Automated Program Repair') and abstract ('state-of-the-art repair performance') make unbounded claims without qualifying to these specific settings.",
     36         "source": "opus"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Section 7 discusses threats related to manual verification and LLM non-determinism but does not consider alternative explanations for why the method works (e.g., whether improvements stem from simply exploring more diverse patches rather than targeted token refinement).",
     42         "source": "opus"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper measures #Correct (manually verified semantic equivalence to ground truth) and #Plausible (passes test suite) and clearly defines both. Claims match the granularity of measurements without overclaiming broader outcomes.",
     48         "source": "opus"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 7 'Threats to Validity' provides substantive discussion of manual verification challenges, experimental non-determinism, and floating-point precision issues.",
     56         "source": "opus"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Section 7 discusses study-specific threats: manual patch verification requiring three researchers with 10+ hours each, non-deterministic LLM inference, and floating-point precision accumulation in uncertainty calculations.",
     62         "source": "opus"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper does not explicitly state what the results do NOT show — e.g., no mention that results are limited to Java, single-hunk bugs, or 7B-class models, and no discussion of what settings or scenarios were not tested.",
     68         "source": "opus"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding section, acknowledgments, or grant references appear anywhere in the paper.",
     76         "source": "opus"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All five authors are listed as affiliated with Singapore Management University. The paper evaluates open-source models, not SMU products, so no product-affiliation conflict exists.",
     82         "source": "opus"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "Cannot assess funder independence since funding is not disclosed.",
     88         "source": "opus"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interests statement is present in the paper.",
     94         "source": "opus"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "APR is introduced with context, 'internal reflection' and 'external feedback' are operationally defined, and 'plausible' vs 'correct' patches are precisely defined in the metrics section.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The introduction enumerates three explicit contributions: first use of token-level internal reflection for APR, the TokenRepair framework, and comprehensive benchmark evaluation.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 6 covers conversation-based and fine-tuning-based LLM APR and LLM uncertainty quantification, explicitly positioning TokenRepair's novelty relative to ChatRepair, CoT-Decoding, and uncertainty calibration work.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "Section 7 states 'we have made our patches open-source for public evaluation' but provides no repository URL, download link, or archive reference anywhere in the paper.",
    125           "source": "opus"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "The paper uses two standard public benchmarks: Defects4J 1.2 (154 single-hunk bugs) and HumanEval-Java (163 bugs), both publicly available and unmodified.",
    131           "source": "opus"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "The paper lists model names with HuggingFace references and some hyperparameters (temperature=1, TopK=3, α=0.5) but provides no requirements.txt, Dockerfile, library versions, or environment setup instructions sufficient to recreate the environment.",
    137           "source": "opus"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No step-by-step reproduction instructions, README, or runnable scripts are described or referenced in the paper.",
    143           "source": "opus"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "All results in Tables 1–6 are point estimates (counts of correct fixes, accuracy values, percentages). No confidence intervals or error bars are reported despite the stochastic nature of the method (temperature=1).",
    151           "source": "opus"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "The paper claims TokenRepair 'outperforms' and 'surpasses' baselines based solely on comparing raw numbers (e.g., 63 vs 51 correct fixes). No statistical significance tests are performed.",
    157           "source": "opus"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Percentage improvements with baseline context are reported throughout: '8.2% to 34.9% across all models on Defects4J 1.2' and '3.3% to 16.1% on HumanEval-Java,' with both baseline and improved counts given.",
    163           "source": "opus"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The benchmarks contain 154 and 163 bugs respectively. No justification is given for why these sizes are sufficient for the claims made, nor any power analysis.",
    169           "source": "opus"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "Results appear to be from single experimental runs. No standard deviation, variance across seeds, or spread measures are reported, despite using stochastic sampling (temperature=1) where results can vary significantly across runs.",
    175           "source": "opus"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Three baselines are compared: Base Sampling, CoT-Decoding, and ChatRepair, each described in Section 4.1.3 with matched configurations.",
    183           "source": "opus"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "ChatRepair (2024), CoT-Decoding (2024 NeurIPS), and Base Sampling (2025) are all recent and represent the state of the art in LLM-based APR.",
    189           "source": "opus"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "RQ3 (Section 5.3) presents a thorough ablation with three variants: w/o Majority, w/o Localize, and w/o Quality, measuring the contribution of each component across all models and both benchmarks (Table 5).",
    195           "source": "opus"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Three metrics are used: #Plausible (test-passing patches), #Correct (manually verified correct patches), and #Generate (average patches per correct fix, measuring efficiency).",
    201           "source": "opus"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": true,
    206           "justification": "Section 7 describes manual patch verification: 'inviting three researchers in SE field to check respectively, each dedicating over 10 hours to manually validate the patches. They then discuss the patches where validation answers were inconsistent, ultimately reaching a consensus.'",
    207           "source": "opus"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": false,
    212           "justification": "Hyperparameters (α, TopK, n, m) are tuned via grid search on the same Defects4J and HumanEval-Java benchmarks used for final evaluation (Tables 1, 6). The best configuration per model in Table 4 matches the best from Table 6, indicating test-set hyperparameter optimization.",
    213           "source": "opus"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Results are broken down per model (5 models) and per benchmark (2 benchmarks) in Table 4. Venn diagrams (Fig. 4) show unique fixes per method. Table 6 provides per-configuration breakdowns.",
    219           "source": "opus"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": false,
    224           "justification": "The paper briefly mentions DeepSeek's exception on HumanEval-Java attributed to low localization accuracy, but provides no systematic failure case analysis showing specific bugs TokenRepair fails on or qualitative examples of failures.",
    225           "source": "opus"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "RQ4 reports that m=9 'never achieves the best performance across any model or benchmark' and explains why. The DeepSeek exception on HumanEval-Java (98 vs 99 for Base Sampling) is explicitly reported.",
    231           "source": "opus"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "Specific model identifiers are given: Qwen2.5-Coder-7B-Instruct, Llama-3.1-8B-Instruct, DeepSeek-Coder-6.7b-Instruct, DeepSeek-Coder-7b-Instruct-V1.5, CodeGemma-7b-it, with corresponding HuggingFace URLs in references.",
    239           "source": "opus"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": false,
    244           "justification": "The paper describes prompt contents conceptually ('buggy code and failure information,' 'ConstructPrompt') but never provides actual prompt text used in experiments.",
    245           "source": "opus"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Section 4.1.1 reports: temperature=1, TopK=3, α=0.5, n∈{2,5}, m∈{3,6,9}, budget=50 patches. RQ4 evaluates sensitivity to n and m.",
    251           "source": "opus"
    252         },
    253         "scaffolding_described": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "The full scaffolding is described in detail: Algorithm 1 gives the complete procedure, Section 3 details the internal reflection loop (uncertainty calculation, CoT-decoding), external feedback loop (quality measurement, patch filtering), and their interaction.",
    257           "source": "opus"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Section 4.1.2 states: 'We evaluate TokenRepair under the single-hunk fix scenario, and adopt the benchmark construction process in prior research, where the location of the buggy hunk is provided based on the ground truth.' Targets 154 single-hunk bugs in Defects4J 1.2 and all 163 in HumanEval-Java.",
    263           "source": "opus"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "Although patches are claimed to be open-sourced, no URL is provided. Raw experimental data (uncertainty scores, intermediate patches, per-bug outcomes) are not available.",
    271           "source": "opus"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Section 4.1.2 describes using Defects4J (154 single-hunk bugs from 17 projects) and HumanEval-Java (163 buggy-fixed code pairs), following established benchmark construction from prior research.",
    277           "source": "opus"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants; data sources are standard public benchmarks (Defects4J and HumanEval-Java).",
    283           "source": "opus"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "Algorithm 1 documents the full pipeline from buggy code input through patch generation, evaluation, refinement, quality filtering, and output. Section 4.1.4 defines how metrics are computed from patch outcomes.",
    289           "source": "opus"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "No training data cutoff dates are stated for any of the five models used (Qwen2.5-Coder-7B-Instruct, Llama-3.1-8B-Instruct, DeepSeek-Coder-6.7b-Instruct, DeepSeek-Coder-7b-Instruct-V1.5, CodeGemma-7b-it).",
    297           "source": "opus"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "No discussion of whether Defects4J or HumanEval-Java examples appeared in the training data of any model, despite all models being trained well after these benchmarks were published.",
    303           "source": "opus"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "Defects4J was published in 2014 and HumanEval in 2021, both long before the 2024 models were trained. No contamination analysis or discussion is provided.",
    309           "source": "opus"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants. The manual patch verification is performed by researchers as evaluators, not as study subjects.",
    317           "source": "opus"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants in the study.",
    323           "source": "opus"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants in the study.",
    329           "source": "opus"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants in the study.",
    335           "source": "opus"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants in the study.",
    341           "source": "opus"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants in the study.",
    347           "source": "opus"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants in the study.",
    353           "source": "opus"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "The paper reports #Generate (average patches per correct fix) as an efficiency proxy but does not report actual inference cost in tokens, dollars, GPU time, or wall-clock time.",
    361           "source": "opus"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "Patch generation budget is capped at 50 per bug, but no GPU hours, total wall-clock time, or hardware specifications are reported for the experiments across 5 models × 2 benchmarks × multiple configurations.",
    367           "source": "opus"
    368         }
    369       },
    370       "experimental_rigor": {
    371         "seed_sensitivity_reported": {
    372           "applies": true,
    373           "answer": false,
    374           "justification": "No mention of multiple random seeds or seed sensitivity analysis, despite using stochastic sampling at temperature=1 where results can vary substantially across runs.",
    375           "source": "opus"
    376         },
    377         "number_of_runs_stated": {
    378           "applies": true,
    379           "answer": false,
    380           "justification": "The paper does not state how many experimental runs produced the reported results. Results appear to be from single runs.",
    381           "source": "opus"
    382         },
    383         "hyperparameter_search_budget": {
    384           "applies": true,
    385           "answer": true,
    386           "justification": "RQ1 evaluates α∈{0.2,0.5,0.8} × TopK∈{1,2,3,4,5} and RQ4 evaluates n∈{2,5} × m∈{3,6,9}. All configurations and their results are enumerated in Tables 1 and 6.",
    387           "source": "opus"
    388         },
    389         "best_config_selection_justified": {
    390           "applies": true,
    391           "answer": true,
    392           "justification": "Selection of α=0.5 and TopK=3 is justified in RQ1 based on average accuracy analysis across models. For n and m, all configurations are reported in Table 6, and the rationale for selecting best configs is discussed.",
    393           "source": "opus"
    394         },
    395         "multiple_comparison_correction": {
    396           "applies": true,
    397           "answer": false,
    398           "justification": "The paper makes dozens of comparative claims across 5 models × 2 benchmarks × 4 methods without any statistical tests, let alone multiple comparison correction.",
    399           "source": "opus"
    400         },
    401         "self_comparison_bias_addressed": {
    402           "applies": true,
    403           "answer": false,
    404           "justification": "The authors re-implement baselines (ChatRepair, Base Sampling, CoT-Decoding) to match their experimental settings but do not acknowledge the systematic bias identified by Lucic et al. (2018) in author-implemented baselines.",
    405           "source": "opus"
    406         },
    407         "compute_budget_vs_performance": {
    408           "applies": true,
    409           "answer": false,
    410           "justification": "All methods are given the same patch budget (50), providing a fair comparison, but no performance-vs-compute curves are shown. The relationship between budget consumption and repair success is not analyzed.",
    411           "source": "opus"
    412         },
    413         "benchmark_construct_validity": {
    414           "applies": true,
    415           "answer": false,
    416           "justification": "No discussion of whether Defects4J or HumanEval-Java actually measure the APR capabilities the paper claims to evaluate. No analysis of construct validity or comparison with alternative benchmarks.",
    417           "source": "opus"
    418         },
    419         "scaffold_confound_addressed": {
    420           "applies": true,
    421           "answer": true,
    422           "justification": "The comparison is well-controlled: all four methods (Base Sampling, CoT-Decoding, ChatRepair, TokenRepair) are evaluated on the same five models with matched budgets. The repair strategy (scaffold) is the variable under test, properly isolated from the model variable.",
    423           "source": "opus"
    424         }
    425       },
    426       "data_leakage": {
    427         "temporal_leakage_addressed": {
    428           "applies": true,
    429           "answer": false,
    430           "justification": "Defects4J (2014) and HumanEval (2021) predate all five models (2024). Solutions and discussion of these benchmarks are widely available online. No temporal leakage analysis is performed.",
    431           "source": "opus"
    432         },
    433         "feature_leakage_addressed": {
    434           "applies": true,
    435           "answer": false,
    436           "justification": "The evaluation provides ground-truth buggy hunk location to the model, acknowledged as standard practice but not discussed as a form of feature leakage that inflates repair rates compared to real-world scenarios.",
    437           "source": "opus"
    438         },
    439         "non_independence_addressed": {
    440           "applies": true,
    441           "answer": false,
    442           "justification": "No discussion of whether training data for the five models includes Defects4J projects or HumanEval solutions, nor any analysis of non-independence between train and test data.",
    443           "source": "opus"
    444         },
    445         "leakage_detection_method": {
    446           "applies": true,
    447           "answer": false,
    448           "justification": "No leakage detection or prevention method is applied (no canary strings, membership inference, n-gram overlap analysis, or decontamination).",
    449           "source": "opus"
    450         }
    451       }
    452     }
    453   },
    454   "claims": [
    455     {
    456       "claim": "TokenRepair achieves new state-of-the-art APR performance, fixing 88 bugs on Defects4J 1.2 and 139 on HumanEval-Java aggregated across 5 models.",
    457       "evidence": "Figure 4 Venn diagrams show 88 (D4J) and 139 (HumanEval-Java) correct fixes for TokenRepair vs 82 and 131 for ChatRepair.",
    458       "supported": "strong"
    459     },
    460     {
    461       "claim": "TokenRepair improves correct bug fixes by 8.2% to 34.9% over the best baseline on Defects4J 1.2 per model.",
    462       "evidence": "Table 4: Qwen 63 vs ChatRepair 51 (+23.5%), CodeGemma 58 vs ChatRepair 43 (+34.9%), Llama 53 vs ChatRepair 49 (+8.2%).",
    463       "supported": "strong"
    464     },
    465     {
    466       "claim": "Token-level uncertainty accurately localizes faulty tokens with average Top-3 accuracy ranging from 0.589 to 0.695 across models and benchmarks.",
    467       "evidence": "Table 1 reports Top-3 accuracy per model-decay configuration; the average column (Avg.) spans 0.589-0.695 at the optimal α=0.5 setting.",
    468       "supported": "strong"
    469     },
    470     {
    471       "claim": "Decreasing uncertainty transitions correlate with successful repair; plausible paths show 55.8%-80.5% decreasing uncertainty vs balanced ~50% in incorrect paths.",
    472       "evidence": "Table 3 reports Uncert.↓ proportions of 55.8%-80.5% for plausible paths vs 40.6%-51.9% for incorrect paths across all models and benchmarks.",
    473       "supported": "moderate"
    474     },
    475     {
    476       "claim": "Uncertainty-guided localization is the most critical component, with its removal degrading performance by up to 20.6% on Defects4J.",
    477       "evidence": "Table 5 (w/o Localize): Qwen drops from 63 to 50 correct fixes on D4J (-20.6%); all models show largest degradation from this ablation.",
    478       "supported": "strong"
    479     },
    480     {
    481       "claim": "Excessively large refinement breadth (m=9) consistently underperforms, due to fault localization inaccuracy wasting budget at incorrect positions.",
    482       "evidence": "Table 6 shows m=9 achieves the best result in zero model-benchmark combinations; the paper explains this via localization accuracy constraints.",
    483       "supported": "moderate"
    484     }
    485   ],
    486   "methodology_tags": [
    487     "benchmark-eval"
    488   ],
    489   "key_findings": "TokenRepair introduces token-level uncertainty analysis to identify suspicious tokens within LLM-generated patches, enabling targeted token-guided CoT refinement combined with uncertainty-based quality filtering of poor candidates. On Defects4J 1.2 and HumanEval-Java with five 7B-class code LLMs, it outperforms ChatRepair by 8.2%-34.9% and 3.3%-16.1% respectively. Ablation confirms uncertainty-guided localization is the dominant contributor (removing it reduces correct fixes by up to 20.6%). Critical gaps include absence of significance testing, no reported variance across runs despite acknowledged non-determinism, and complete omission of any benchmark contamination analysis — both benchmarks predate the training cutoffs of all evaluated models.",
    490   "red_flags": [
    491     {
    492       "flag": "No statistical significance tests",
    493       "detail": "All comparative claims on benchmarks of 154 and 163 bugs are reported as raw counts with no significance testing; without repeated runs, differences of 1-7 bugs cannot be distinguished from run-to-run noise."
    494     },
    495     {
    496       "flag": "No variance across runs",
    497       "detail": "LLM inference is non-deterministic (explicitly noted as a threat) but all results appear to be single-run numbers, reported without standard deviation, confidence intervals, or repeated trials."
    498     },
    499     {
    500       "flag": "Benchmark contamination unaddressed",
    501       "detail": "Defects4J (2014) and HumanEval-Java (2023; derived from HumanEval, 2021) predate training cutoffs of all 5 models; models may have memorized solutions, in which case the evaluation would partly measure recall rather than repair ability."
    502     },
    503     {
    504       "flag": "No code or tool release",
    505       "detail": "The paper claims patches are 'open-source' but provides no URL; the tool's source code is not released, making full reproduction impossible."
    506     },
    507     {
    508       "flag": "SOTA claim without scope bounds",
    509       "detail": "Claiming 'new state-of-the-art' based on single-hunk Java bugs with five small open-source 7B-class models, without testing closed models, larger models, or multi-hunk settings."
    510     },
    511     {
    512       "flag": "No prompts provided",
    513       "detail": "Exact prompts for patch generation and feedback-guided repair are not disclosed, preventing reproduction of the experimental setup."
    514     }
    515   ],
    516   "cited_papers": [
    517     {
    518       "title": "Automated program repair via conversation: Fixing 162 out of 337 bugs for $0.42 each using ChatGPT (ChatRepair)",
    519       "relevance": "Primary baseline; TokenRepair is directly positioned as improving upon this conversation-based APR approach."
    520     },
    521     {
    522       "title": "Chain-of-thought reasoning without prompting (CoT-Decoding)",
    523       "relevance": "Secondary baseline and technical component; TokenRepair adapts CoT-decoding for token-guided patch refinement."
    524     },
    525     {
    526       "title": "Defects4J: A database of existing faults to enable controlled testing studies for Java programs",
    527       "relevance": "Primary evaluation benchmark; 154 single-hunk bugs from D4J 1.2 are the main evaluation set."
    528     },
    529     {
    530       "title": "Impact of code language models on automated program repair (introduces HumanEval-Java)",
    531       "relevance": "Source of the HumanEval-Java benchmark used as the second evaluation dataset."
    532     },
    533     {
    534       "title": "Calibration and correctness of language models for code",
    535       "relevance": "Foundational prior work showing entropy-based uncertainty correlates with code token correctness, directly motivating TokenRepair's core mechanism."
    536     },
    537     {
    538       "title": "Uncertainty-guided chain-of-thought for code generation with LLMs",
    539       "relevance": "Establishes that first-token uncertainty proxies overall generation quality, directly motivating the trace quality measurement component."
    540     },
    541     {
    542       "title": "Demystifying Memorization in LLM-Based Program Repair via a General Hypothesis Testing Framework",
    543       "relevance": "By the same first author; provides the Base Sampling baseline methodology and is a closely related predecessor work."
    544     },
    545     {
    546       "title": "Self-consistency improves chain of thought reasoning in language models",
    547       "relevance": "Motivates the majority voting strategy used for first-token identification in TokenRepair."
    548     }
    549   ],
    550   "engagement_factors": {
    551     "practical_relevance": {
    552       "score": 2,
    553       "justification": "Token-level uncertainty refinement for APR is a usable technique but requires access to model logits (token probabilities), limiting it to open-weight models."
    554     },
    555     "surprise_contrarian": {
    556       "score": 1,
    557       "justification": "The internal reflection + external feedback framing is novel but the core idea (uncertainty-guided refinement) is an incremental extension of existing work."
    558     },
    559     "fear_safety": {
    560       "score": 0,
    561       "justification": "No AI safety or security concerns raised — purely a program repair improvement."
    562     },
    563     "drama_conflict": {
    564       "score": 0,
    565       "justification": "No controversy or provocative claims."
    566     },
    567     "demo_ability": {
    568       "score": 0,
    569       "justification": "No code repository URL, demo, or installable tool is provided despite claiming patches are open-sourced."
    570     },
    571     "brand_recognition": {
    572       "score": 0,
    573       "justification": "From Singapore Management University — not a high-profile AI lab. Uses smaller open-source models, not flagship products."
    574     }
    575   },
    576   "hn_data": {
    577     "threads": [
    578       {
    579         "hn_id": "42889052",
    580         "title": "Large language models think too fast to explore effectively",
    581         "points": 118,
    582         "comments": 41,
    583         "url": "https://news.ycombinator.com/item?id=42889052"
    584       },
    585       {
    586         "hn_id": "46664297",
    587         "title": "VaultGemma: A Differentially Private LLM",
    588         "points": 3,
    589         "comments": 0,
    590         "url": "https://news.ycombinator.com/item?id=46664297"
    591       },
    592       {
    593         "hn_id": "42968402",
    594         "title": "Fault Localization via Fine-Tuning LLMs with Mutation Generated Stack Traces",
    595         "points": 3,
    596         "comments": 0,
    597         "url": "https://news.ycombinator.com/item?id=42968402"
    598       },
    599       {
    600         "hn_id": "46555313",
    601         "title": "Name That Part: 3D Part Segmentation and Naming",
    602         "points": 2,
    603         "comments": 1,
    604         "url": "https://news.ycombinator.com/item?id=46555313"
    605       },
    606       {
    607         "hn_id": "46838079",
    608         "title": "VaultGemma: A Differentially Private LLM",
    609         "points": 1,
    610         "comments": 0,
    611         "url": "https://news.ycombinator.com/item?id=46838079"
    612       }
    613     ],
    614     "top_points": 118,
    615     "total_points": 127,
    616     "total_comments": 42
    617   }
    618 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs