scan.json (31920B)
1 { 2 "paper": { 3 "title": "Revisiting Unnaturalness for Automated Program Repair in the Era of Large Language Models", 4 "authors": [ 5 "Aidan Z.H. Yang", 6 "Sophia Kolak", 7 "Vincent J. Hellendoorn", 8 "Ruben Martins", 9 "Claire Le Goues" 10 ], 11 "year": 2024, 12 "venue": "International Conference on Software Engineering (ICSE)", 13 "arxiv_id": "2404.15236", 14 "doi": "10.1109/ICSE55347.2025.00089" 15 }, 16 "scan_version": 3, 17 "active_modules": ["experimental_rigor", "data_leakage"], 18 "methodology_tags": ["benchmark-eval"], 19 "key_findings": "LLM entropy is highly complementary with existing fault localization tools, improving SBFL Top-5 by up to 54% when used as a re-ranking signal. Entropy-delta, a measure of entropy change between buggy and patched code, reduces the number of patches that must be tested by a mean of 24 per bug for template-based repair (TBar). For patch correctness ranking, entropy-delta outperforms the state-of-the-art Shibboleth by 49% on Top-1 and classifies correct patches with 18% higher precision than Panther, all without requiring project-specific training data or test-suite execution.", 20 "checklist": { 21 "artifacts": { 22 "code_released": { 23 "applies": true, 24 "answer": true, 25 "justification": "The paper states 'Our data, tool, and results are available and will be released as open-source' with a Zenodo URL provided (https://zenodo.org/records/10851256). A concrete archive link is given, not just a promise." 26 }, 27 "data_released": { 28 "applies": true, 29 "answer": true, 30 "justification": "The evaluation uses publicly available Defects4J (V1.2 and V2.0) and curated datasets from Shibboleth and Panther replication packages. Their own replication package is at the Zenodo link." 31 }, 32 "environment_specified": { 33 "applies": true, 34 "answer": false, 35 "justification": "No requirements.txt, Dockerfile, or environment setup section is described. 
Model names and sizes are given (InCoder 6.7B, Starcoder 15.5B, Code-Llama2 7B) but no dependency or library versions are specified." 36 }, 37 "reproduction_instructions": { 38 "applies": true, 39 "answer": false, 40 "justification": "No step-by-step reproduction instructions are included in the paper. A Zenodo replication package is linked but the paper itself contains no 'Reproducing Results' section or commands to run." 41 } 42 }, 43 "statistical_methodology": { 44 "confidence_intervals_or_error_bars": { 45 "applies": true, 46 "answer": false, 47 "justification": "All results in Tables II, III, IV, and V are point estimates (counts or ratios). No confidence intervals, error bars, or ± notation is provided anywhere." 48 }, 49 "significance_tests": { 50 "applies": true, 51 "answer": false, 52 "justification": "The paper makes multiple comparative claims (e.g., '108% improvement', '49% improvement in Top-1') based solely on comparing raw numbers without any statistical significance tests (no p-values, t-tests, or bootstrap tests)." 53 }, 54 "effect_sizes_reported": { 55 "applies": true, 56 "answer": true, 57 "justification": "Percentage improvements are reported with baseline context throughout (e.g., 'Top-1 score of 50 (108% improvement)' over SBFL's 24; 'entropy-delta improves upon Shibboleth by 49% for Top-1' from 85 to 127). Raw counts and percentages are provided together." 58 }, 59 "sample_size_justified": { 60 "applies": true, 61 "answer": false, 62 "justification": "The sample sizes (395 bugs for FL, 72 for patch efficiency, 197 for patch correctness) are dictated by the Defects4J dataset and prior work, but no power analysis or justification for whether these are sufficient is provided." 63 }, 64 "variance_reported": { 65 "applies": true, 66 "answer": false, 67 "justification": "No variance, standard deviation, or spread measures are reported. Results appear to be from single runs. 
The LLM temperature is set to 0.5 (introducing stochasticity) but no multi-run analysis is presented." 68 } 69 }, 70 "evaluation_design": { 71 "baselines_included": { 72 "applies": true, 73 "answer": true, 74 "justification": "The paper compares against multiple baselines: SBFL (Ochiai), TransferFL, LLMAO for fault localization; vanilla TBar for patch generation; SBFL, Shibboleth for patch ranking; PATCH-SIM, Panther for patch classification." 75 }, 76 "baselines_contemporary": { 77 "applies": true, 78 "answer": true, 79 "justification": "Baselines include TransferFL (2022), LLMAO (2023), Shibboleth (2022), and Panther (2023), which are state-of-the-art at the time of writing. SBFL/Ochiai is older but included as the standard practice baseline." 80 }, 81 "ablation_study": { 82 "applies": true, 83 "answer": true, 84 "justification": "Table II reports 24 configurations testing the contribution of each component: three LLMs, three FL tools, two filter sizes, and entropy alone. This systematically isolates the effect of entropy re-ranking across different settings." 85 }, 86 "multiple_metrics": { 87 "applies": true, 88 "answer": true, 89 "justification": "Multiple metrics are used: Top-1/3/5 for fault localization, mean/median rank decrease for patch efficiency (Table III, Figure 4/5), and accuracy/precision/recall/F1 for patch classification (Table V)." 90 }, 91 "human_evaluation": { 92 "applies": true, 93 "answer": false, 94 "justification": "All evaluation is automated using test suite pass/fail and Top-N ranking metrics. No human evaluation of patch quality, readability, or maintainability is included." 95 }, 96 "held_out_test_set": { 97 "applies": true, 98 "answer": false, 99 "justification": "The entropy approach requires no training, so there is no tuning/dev set per se. However, the choices of filter size (6, 10) and temperature (0.5) are not validated on a held-out set. All configurations are reported, but no explicit dev/test separation is described." 
100 }, 101 "per_category_breakdown": { 102 "applies": true, 103 "answer": true, 104 "justification": "Results are broken down by Defects4J project (Chart, Closure, Lang, Math, Mockito, Time) in Tables III and IV, and Figures 4, 5, and 6." 105 }, 106 "failure_cases_discussed": { 107 "applies": true, 108 "answer": true, 109 "justification": "The paper discusses cases where entropy hurts: 'we observe an overall decrease in Top-N scores using either Code-Llama2 or Starcoder entropy with a 10-filter' for SBFL, and '6-filter InCoder-entropy with TransferFL yields a Top-1 score of 57, which is a 17% decrease.' Mockito's lack of improvement is also discussed." 110 }, 111 "negative_results_reported": { 112 "applies": true, 113 "answer": true, 114 "justification": "Several negative results are reported: 10-filter sometimes hurts SBFL performance; entropy re-ranking shows only marginal improvements over LLMAO; entropy alone is 'only somewhat useful' for FL; entropy-delta performs worse than Panther on the -Recall metric (62.4% vs 69.6%)." 115 } 116 }, 117 "claims_and_evidence": { 118 "abstract_claims_supported": { 119 "applies": true, 120 "answer": true, 121 "justification": "Abstract claims are supported: '50% Top-5 score improvement over SBFL' is verified in Table II (94→145 for 6-filter Llama2, ~54%). '49% improvement in Top-1' for patch ranking matches Table IV (85→127). 'Reduce running tests for 24 patches' matches Table III mean rank decrease." 122 }, 123 "causal_claims_justified": { 124 "applies": true, 125 "answer": true, 126 "justification": "Causal claims ('entropy can improve fault localization') are supported by controlled ablation experiments — the same FL tools are tested with and without entropy re-ranking, isolating entropy's contribution. The choice of TBar (non-ML) specifically avoids confounding learned patterns." 
127 }, 128 "generalization_bounded": { 129 "applies": true, 130 "answer": false, 131 "justification": "The title 'Revisiting Unnaturalness for Automated Program Repair' and abstract claim about 'Automated Program Repair' are framed broadly, but all results are on Java bugs from Defects4J only. The threats section acknowledges Defects4J dependence but the framing remains broad." 132 }, 133 "alternative_explanations_discussed": { 134 "applies": true, 135 "answer": true, 136 "justification": "Section VII discusses LLM selection bias as an alternative explanation (larger models might behave differently), Defects4J data leakage (entropy scores could be contaminated), and manual labeling reliability. The paper also discusses why InCoder outperforms larger models (bidirectional training objective)." 137 }, 138 "proxy_outcome_distinction": { 139 "applies": true, 140 "answer": true, 141 "justification": "The paper clearly distinguishes between its proxy measures (Top-N counts, entropy-delta values) and the actual outcomes they target (fault localization accuracy, repair efficiency, patch correctness). It explicitly discusses test-suite overfitting as a gap between test-passing and true correctness." 142 } 143 }, 144 "setup_transparency": { 145 "model_versions_specified": { 146 "applies": true, 147 "answer": true, 148 "justification": "Specific open-source models with parameter counts are identified: InCoder (6.7B), Starcoder (15.5B), Code-Llama2 (7B). These are fixed-weight downloadable models, not continuously-updated APIs, so the model name and size uniquely identify the weights." 149 }, 150 "prompts_provided": { 151 "applies": false, 152 "answer": false, 153 "justification": "The paper does not use prompting in the traditional sense. It computes entropy by masking code lines and using the model's log probabilities via infilling — described in Sections III-A and III-B with Figures 2 and 3. No prompt templates or instructions are sent to the models." 
154 }, 155 "hyperparameters_reported": { 156 "applies": true, 157 "answer": true, 158 "justification": "Section IV-A states 'we set the LLM temperature to 0.5' and Section III-A describes 'a sliding context window with 2048 tokens.' Model parameter counts are also reported." 159 }, 160 "scaffolding_described": { 161 "applies": false, 162 "answer": false, 163 "justification": "No agentic scaffolding is used. The approach is a straightforward pipeline: compute entropy scores, then re-rank FL or patch lists. No retry logic, memory, or tool use is involved." 164 }, 165 "data_preprocessing_documented": { 166 "applies": true, 167 "answer": true, 168 "justification": "Section IV-B describes dataset selection: which Defects4J versions are used for which RQ, which project subsets are included, and Table I shows the exact counts. The sliding context window masking approach for entropy computation is detailed in Section III-A." 169 } 170 }, 171 "limitations_and_scope": { 172 "limitations_section_present": { 173 "applies": true, 174 "answer": true, 175 "justification": "Section VII 'Threats' provides substantive discussion organized into external validity, internal validity, and construct validity subsections." 176 }, 177 "threats_to_validity_specific": { 178 "applies": true, 179 "answer": true, 180 "justification": "Specific threats are discussed: 'the potential selection bias of our three selected LLMs' (external), 'Much larger LLMs (> 20 billion parameters) might have a stronger ability' (external), 'Data leakage of Defects4J as training data for our selected LLMs is possible' (external), 'manual labeling of plausible patches' (internal)." 181 }, 182 "scope_boundaries_stated": { 183 "applies": true, 184 "answer": true, 185 "justification": "The paper explicitly bounds scope: limited to 3 LLMs with infill ability, Java-only Defects4J bugs, template-based repair (TBar), and 6 of 17 Defects4J V2.0 projects for patch correctness. 
They state they chose not to use ML-based APR 'because our goal is a controlled evaluation of entropy without learned patterns.'" 186 } 187 }, 188 "data_integrity": { 189 "raw_data_available": { 190 "applies": true, 191 "answer": true, 192 "justification": "Defects4J is publicly available with all source code and test suites. The Zenodo replication package contains their generated data. Prior patch datasets from Ghanbari et al. and Tian et al. are from published replication packages." 193 }, 194 "data_collection_described": { 195 "applies": true, 196 "answer": true, 197 "justification": "Section IV-B describes data collection: Defects4J V1.2 for FL and patch generation (395 bugs), Defects4J V2.0 for patch correctness. Table I shows exact per-project counts. Curated patch datasets from prior work are identified by source." 198 }, 199 "recruitment_methods_described": { 200 "applies": false, 201 "answer": false, 202 "justification": "No human participants. All data comes from the standard Defects4J benchmark and published patch datasets." 203 }, 204 "data_pipeline_documented": { 205 "applies": true, 206 "answer": true, 207 "justification": "The pipeline is documented: bugs selected from Defects4J → entropy computed via sliding window masking → FL scores combined with entropy → patches generated by TBar → entropy-delta computed for patches → ranking/classification evaluated. Table I shows exact subset counts for each RQ." 208 } 209 }, 210 "conflicts_of_interest": { 211 "funding_disclosed": { 212 "applies": true, 213 "answer": false, 214 "justification": "No acknowledgments section or funding disclosure is present in the paper text." 215 }, 216 "affiliations_disclosed": { 217 "applies": true, 218 "answer": true, 219 "justification": "All five authors are clearly identified as affiliated with Carnegie Mellon University. They do not evaluate their own commercial product." 
220 }, 221 "funder_independent_of_outcome": { 222 "applies": true, 223 "answer": false, 224 "justification": "No funding source is disclosed, so independence cannot be assessed." 225 }, 226 "financial_interests_declared": { 227 "applies": true, 228 "answer": false, 229 "justification": "No competing interests or financial disclosure statement is present in the paper." 230 } 231 }, 232 "contamination": { 233 "training_cutoff_stated": { 234 "applies": true, 235 "answer": false, 236 "justification": "The paper describes what data the models were trained on (e.g., InCoder on 'public open-source repositories on GitHub and GitLab') but does not state specific training data cutoff dates for any of the three models." 237 }, 238 "train_test_overlap_discussed": { 239 "applies": true, 240 "answer": true, 241 "justification": "Section VII explicitly states 'Data leakage of Defects4J as training data for our selected LLMs is possible' and describes two mitigation strategies: using entropy rather than direct generation, and applying entropy-delta on recently generated patches not available online. They also note GPT-4 'reports the git commit associated with the fix, implicating data leakage.'" 242 }, 243 "benchmark_contamination_addressed": { 244 "applies": true, 245 "answer": true, 246 "justification": "The paper acknowledges that Defects4J code may be in training data and mitigates this by (1) using entropy rather than direct LLM code generation, (2) testing on plausible patches 'collected in 2023 (past the LLM training data cutoff of all our chosen LLMs).' The GPT-4 data leakage footnote in Section II concretely demonstrates the contamination risk." 247 } 248 }, 249 "human_studies": { 250 "pre_registered": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants in this study. All evaluation is on automated benchmarks." 
254 }, 255 "irb_or_ethics_approval": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants in this study." 259 }, 260 "demographics_reported": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants in this study." 264 }, 265 "inclusion_exclusion_criteria": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants in this study." 269 }, 270 "randomization_described": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants in this study." 274 }, 275 "blinding_described": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants in this study." 279 }, 280 "attrition_reported": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants in this study." 284 } 285 }, 286 "cost_and_practicality": { 287 "inference_cost_reported": { 288 "applies": true, 289 "answer": false, 290 "justification": "No inference costs, wall-clock times, or API costs are reported. The approach requires computing entropy for every line in every file and for every generated patch, which could be expensive, but this is not quantified." 291 }, 292 "compute_budget_stated": { 293 "applies": true, 294 "answer": false, 295 "justification": "No GPU hours, hardware specifications, or total compute budget is reported despite using three LLMs (6.7B, 7B, 15.5B parameters) for entropy computation over hundreds of bugs." 296 } 297 }, 298 "experimental_rigor": { 299 "seed_sensitivity_reported": { 300 "applies": true, 301 "answer": false, 302 "justification": "LLM temperature is set to 0.5, introducing stochasticity in entropy computation, but no multi-seed or multi-run analysis is reported. All results appear to be from single runs." 303 }, 304 "number_of_runs_stated": { 305 "applies": true, 306 "answer": false, 307 "justification": "The number of experimental runs is never stated. 
Results are presented as single values without indicating whether they are from one run or averaged over multiple." 308 }, 309 "hyperparameter_search_budget": { 310 "applies": true, 311 "answer": false, 312 "justification": "Temperature (0.5) and context window size (2048) are reported but the paper does not describe how these were selected or what alternatives were tried. No hyperparameter search budget is stated." 313 }, 314 "best_config_selection_justified": { 315 "applies": true, 316 "answer": true, 317 "justification": "Table II reports all 24 FL configurations transparently (3 LLMs × 3 FL tools × 2 filter sizes + baselines). The reader can see every result, and no configurations appear hidden. Filter sizes are justified as 'two quantities near the Top-5 that can still significantly impact Top-5 scores.'" 318 }, 319 "multiple_comparison_correction": { 320 "applies": true, 321 "answer": false, 322 "justification": "The paper makes numerous comparisons across 24 FL configurations, multiple projects, and multiple metrics without any correction for multiple comparisons (no Bonferroni, Holm, or similar)." 323 }, 324 "self_comparison_bias_addressed": { 325 "applies": true, 326 "answer": false, 327 "justification": "The authors use prior tools' replication packages directly rather than re-implementing baselines, which partially mitigates this concern. However, they do not explicitly discuss or acknowledge author-evaluation bias." 328 }, 329 "compute_budget_vs_performance": { 330 "applies": true, 331 "answer": false, 332 "justification": "InCoder (6.7B) outperforms larger Starcoder (15.5B) and Code-Llama2 (7B), which is noted, but no systematic compute-vs-performance analysis is presented. The computational cost of entropy computation itself is not reported." 333 }, 334 "benchmark_construct_validity": { 335 "applies": true, 336 "answer": false, 337 "justification": "The paper does not discuss whether Defects4J is a valid proxy for real-world APR. 
It is used because prior tools are compatible with it, but its representativeness of real-world bugs is not questioned." 338 }, 339 "scaffold_confound_addressed": { 340 "applies": false, 341 "answer": false, 342 "justification": "No scaffolding is involved. The approach computes entropy scores and re-ranks existing FL/patch lists without agentic scaffolding." 343 } 344 }, 345 "data_leakage": { 346 "temporal_leakage_addressed": { 347 "applies": true, 348 "answer": true, 349 "justification": "The paper explicitly states that the 2,147 plausible patches used for RQ3 were 'collected in 2023 (past the LLM training data cutoff of all our chosen LLMs),' directly addressing temporal leakage for patch correctness. For FL and RQ2, the concern is acknowledged in threats to validity." 350 }, 351 "feature_leakage_addressed": { 352 "applies": true, 353 "answer": false, 354 "justification": "The paper argues that using entropy rather than direct generation reduces leakage risk, but does not analyze whether the entropy signal itself could be inflated by the model having seen Defects4J code during training. The models' familiarity with the codebase could systematically affect entropy values." 355 }, 356 "non_independence_addressed": { 357 "applies": true, 358 "answer": false, 359 "justification": "No discussion of whether training data and Defects4J test examples share structural similarities (e.g., same repositories, similar code patterns). The open-source training data for all three models likely includes Java repositories from which Defects4J bugs were drawn." 360 }, 361 "leakage_detection_method": { 362 "applies": true, 363 "answer": true, 364 "justification": "Two concrete methods are used: (1) temporal separation — patch correctness dataset collected in 2023 after all models' training cutoffs, and (2) anecdotal detection — demonstrating GPT-4 data leakage by showing it 'reports the git commit associated with the fix.' 
The approach also uses entropy rather than direct generation as a structural mitigation." 365 } 366 } 367 }, 368 "claims": [ 369 { 370 "claim": "Entropy re-ranking improves SBFL Top-5 fault localization by up to 54% (from 94 to 145 bugs)", 371 "evidence": "Table II shows SBFL alone achieves Top-5=94; 6-filter with Llama2 entropy achieves Top-5=145. InCoder entropy with 6-filter achieves Top-1=55 (129% over SBFL's 24). Section V RQ1.", 372 "supported": "strong" 373 }, 374 { 375 "claim": "Entropy-delta reduces the mean number of patches tested by 24 per bug for TBar", 376 "evidence": "Table III shows entropy-delta improves 60 of 72 patch rankings with mean rank decrease of 24. Figure 5 shows median patches tested decreases across most projects. Section V RQ2.", 377 "supported": "strong" 378 }, 379 { 380 "claim": "Entropy-delta outperforms Shibboleth by 49% on Top-1 for patch correctness ranking", 381 "evidence": "Table IV: entropy-delta Top-1=127 vs Shibboleth Top-1=85 across 1,290 patches on Defects4J V2.0. Top-2 also improves: 165 vs 130 (27% improvement). Section V RQ3.", 382 "supported": "strong" 383 }, 384 { 385 "claim": "Entropy-delta classifies correct patches with 18% higher precision than Panther", 386 "evidence": "Table V: entropy-delta precision=0.900 vs Panther precision=0.760 on 2,147 plausible patches. F1 also improves: 0.824 vs 0.750 (10% improvement). Section V RQ3.", 387 "supported": "strong" 388 }, 389 { 390 "claim": "InCoder (6.7B) is the most effective LLM for entropy-based fault localization despite being the smallest model", 391 "evidence": "Table II: InCoder consistently achieves the highest Top-N scores across configurations (e.g., entropy-alone Top-5=116 vs Starcoder 55 vs Llama2 41). Attributed to InCoder's bidirectional infilling training objective. 
Section V RQ1.", 392 "supported": "moderate" 393 }, 394 { 395 "claim": "LLMs can complement APR while minimizing test-suite overfitting and data leakage", 396 "evidence": "The entropy approach avoids direct code generation (reducing leakage risk) and does not require test execution for ranking (reducing overfitting). However, leakage mitigation is argued structurally rather than empirically verified. Sections I, VII, VIII.", 397 "supported": "moderate" 398 } 399 ], 400 "red_flags": [ 401 { 402 "flag": "No statistical significance testing", 403 "detail": "All comparative claims (e.g., '108% improvement', '49% improvement') are based on comparing raw counts without any significance tests. With the stochasticity introduced by temperature=0.5, results could vary across runs, but no uncertainty quantification is provided." 404 }, 405 { 406 "flag": "Single benchmark only", 407 "detail": "All experiments use only Defects4J (Java). The title and abstract frame contributions for 'Automated Program Repair' broadly, but generalization to other languages, bug types, or benchmarks is untested." 408 }, 409 { 410 "flag": "Contamination risk not fully mitigated", 411 "detail": "All three LLMs were trained on open-source code that likely includes Defects4J repositories. While entropy rather than direct generation is used, the entropy values themselves could be affected by the model's familiarity with these specific codebases. Training cutoff dates are not stated." 412 }, 413 { 414 "flag": "No compute cost reporting", 415 "detail": "Computing entropy for every line in every file across 395 bugs using three LLMs (6.7B-15.5B parameters) is computationally expensive, but no wall-clock time, GPU hours, or costs are reported. This matters for practical applicability." 416 } 417 ], 418 "cited_papers": [ 419 { 420 "title": "Tbar: Revisiting template-based automated program repair", 421 "authors": ["K. Liu", "A. Koyuncu", "D. Kim", "T. F. 
Bissyandé"], 422 "year": 2019, 423 "relevance": "Primary template-based APR tool used as the patch generation baseline in efficiency experiments." 424 }, 425 { 426 "title": "Automated program repair in the era of large pre-trained language models", 427 "authors": ["C. S. Xia", "Y. Wei", "L. Zhang"], 428 "year": 2023, 429 "relevance": "Directly applied LLMs for APR and found they can suggest multi-line fixes more accurately than prior APR tools." 430 }, 431 { 432 "title": "Large language models for test-free fault localization", 433 "authors": ["A. Z. Yang", "R. Martins", "C. Le Goues", "V. J. Hellendoorn"], 434 "year": 2023, 435 "arxiv_id": "2310.01726", 436 "relevance": "LLMAO: state-of-the-art LLM-based fault localization tool used as a baseline in this paper." 437 }, 438 { 439 "title": "Patch correctness assessment in automated program repair based on the impact of patches on production and test code", 440 "authors": ["A. Ghanbari", "A. Marcus"], 441 "year": 2022, 442 "relevance": "Shibboleth: state-of-the-art patch ranking tool and source of the 1,290 patch dataset used for evaluation." 443 }, 444 { 445 "title": "The best of both worlds: Combining learned embeddings with engineered features for accurate prediction of correct patches", 446 "authors": ["H. Tian", "K. Liu", "Y. Li"], 447 "year": 2023, 448 "relevance": "Panther: state-of-the-art patch classifier using BERT embeddings and source of the 2,147 patch dataset." 449 }, 450 { 451 "title": "Evaluating large language models trained on code", 452 "authors": ["M. Chen", "J. Tworek", "H. Jun"], 453 "year": 2021, 454 "arxiv_id": "2107.03374", 455 "relevance": "Codex paper establishing LLM code generation evaluation methodology." 456 }, 457 { 458 "title": "InCoder: A generative model for code infilling and synthesis", 459 "authors": ["D. Fried", "A. Aghajanyan", "J. 
Lin"], 460 "year": 2022, 461 "arxiv_id": "2204.05999", 462 "relevance": "Primary LLM used for entropy computation; its bidirectional infilling training proved most effective for repair tasks." 463 }, 464 { 465 "title": "Starcoder: may the source be with you!", 466 "authors": ["R. Li", "L. B. Allal", "Y. Zi"], 467 "year": 2023, 468 "arxiv_id": "2305.06161", 469 "relevance": "One of three LLMs evaluated for entropy-based program repair." 470 }, 471 { 472 "title": "On the naturalness of buggy code", 473 "authors": ["B. Ray", "V. Hellendoorn", "S. Godhane"], 474 "year": 2016, 475 "relevance": "Foundational work establishing that n-gram language models find buggy code more surprising — the hypothesis this paper revisits with modern LLMs." 476 }, 477 { 478 "title": "Leak, cheat, repeat: Data contamination and evaluation malpractices in closed-source LLMs", 479 "authors": ["S. Balloccu", "P. Schmidtová", "M. Lango", "O. Dušek"], 480 "year": 2024, 481 "arxiv_id": "2402.03927", 482 "relevance": "Directly relevant to data contamination concerns in LLM evaluation." 483 }, 484 { 485 "title": "A large-scale empirical review of patch correctness checking approaches", 486 "authors": ["J. Yang", "Y. Wang", "Y. Lou", "M. Wen", "L. Zhang"], 487 "year": 2023, 488 "relevance": "Found that naturalness-based techniques outperform static techniques for patch correctness, motivating this paper's entropy approach." 489 }, 490 { 491 "title": "Improving fault localization and program repair with deep semantic features and transferred knowledge", 492 "authors": ["X. Meng", "X. Wang", "H. Zhang"], 493 "year": 2022, 494 "relevance": "TransferFL: state-of-the-art transfer-learning-based fault localization tool used as baseline." 495 }, 496 { 497 "title": "On the efficiency of test suite based program repair: A systematic assessment of 16 automated repair systems for java programs", 498 "authors": ["K. Liu", "S. Wang", "A. 
Koyuncu"], 499 "year": 2020, 500 "relevance": "Established patch evaluation count as a hardware-independent efficiency metric for APR, used in this paper's RQ2." 501 }, 502 { 503 "title": "Is the cure worse than the disease? overfitting in automated program repair", 504 "authors": ["E. K. Smith", "E. T. Barr", "C. Le Goues", "Y. Brun"], 505 "year": 2015, 506 "doi": "10.1145/2786805.2786825", 507 "relevance": "Foundational work on test-suite overfitting in APR — the key problem that entropy-delta aims to mitigate." 508 } 509 ], 510 "engagement_factors": { 511 "practical_relevance": { 512 "score": 2, 513 "justification": "Entropy-delta could be integrated into existing APR tools to reduce test overhead and improve patch ranking, with a modified TBar released." 514 }, 515 "surprise_contrarian": { 516 "score": 1, 517 "justification": "The finding that the smallest model (InCoder 6.7B) outperforms larger ones for entropy-based FL is mildly surprising but consistent with prior work on infilling models." 518 }, 519 "fear_safety": { 520 "score": 0, 521 "justification": "No AI safety or security concerns raised; the work focuses on improving automated program repair." 522 }, 523 "drama_conflict": { 524 "score": 0, 525 "justification": "No controversy or provocative claims; the paper proposes a complementary technique rather than challenging existing approaches." 526 }, 527 "demo_ability": { 528 "score": 1, 529 "justification": "A Zenodo replication package is provided but requires Java/Defects4J setup and local LLM inference — not trivially runnable." 530 }, 531 "brand_recognition": { 532 "score": 1, 533 "justification": "Authors are at Carnegie Mellon (well-known in SE research) and published at ICSE, but no product branding or famous-lab association." 534 } 535 } 536 }