scan.json (32267B)
1 { 2 "paper": { 3 "title": "Reinforcement Learning for Mutation Operator Selection in Automated Program Repair", 4 "authors": [ 5 "Carol Hanna", 6 "Aymeric Blot", 7 "Justyna Petke" 8 ], 9 "year": 2023, 10 "venue": "International Conference on Automated Software Engineering", 11 "arxiv_id": "2306.05792", 12 "doi": "10.1007/s10515-025-00501-z" 13 }, 14 "scan_version": 3, 15 "active_modules": [ 16 "experimental_rigor", 17 "data_leakage" 18 ], 19 "methodology_tags": [ 20 "benchmark-eval" 21 ], 22 "key_findings": "The paper evaluates RL-based mutation operator selection for heuristic-based automated program repair across 30,080 independent repair attempts on 353 Defects4J bugs. Epsilon-greedy with average credit assignment was the most effective RL strategy, but did not significantly improve the number of bugs patched compared to baseline random selection. The RL approach generated more test-passing variants, suggesting the fitness function is too coarse (Boolean pass/fail) to effectively guide learning. Grouping 18 PAR templates into 7 semantically meaningful arms improved performance over using all 18 individually.", 23 "checklist": { 24 "artifacts": { 25 "code_released": { 26 "applies": true, 27 "answer": true, 28 "justification": "Section XI states 'All source code, supplementary materials, and instructions needed to replicate our results are publicly available at: https://anonymous.4open.science/r/mutationLearner.' A URL to a code repository is provided." 29 }, 30 "data_released": { 31 "applies": true, 32 "answer": true, 33 "justification": "The paper uses the publicly available Defects4J 2.0.0 benchmark and references the publicly available JaRFly replication package (ref [48]). The artifact link also contains supplementary materials." 34 }, 35 "environment_specified": { 36 "applies": true, 37 "answer": false, 38 "justification": "Section V-C describes hardware specifications of the compute cluster (RAM, SSD, CPU ranges) but does not provide software dependency specifications such as requirements.txt, Dockerfile, or specific library versions needed to recreate the environment." 39 }, 40 "reproduction_instructions": { 41 "applies": true, 42 "answer": true, 43 "justification": "Section XI provides a link to the artifact with 'instructions needed to replicate our results.' Section V-C describes search parameters, hyperparameters (Table II), and references JaRFly's replication scripts used for launching experiments." 44 } 45 }, 46 "statistical_methodology": { 47 "confidence_intervals_or_error_bars": { 48 "applies": true, 49 "answer": false, 50 "justification": "All tables (III-IX) report only point estimates for success rates, bug counts, and variant numbers. No confidence intervals, error bars, or ± notation are reported anywhere in the paper." 51 }, 52 "significance_tests": { 53 "applies": true, 54 "answer": false, 55 "justification": "The paper makes comparative claims (e.g., 'epsilon-greedy algorithm with average credit assignment is the best reinforcement learning-based mutation selection strategy') based solely on comparing raw percentages and counts. No statistical significance tests (p-values, t-tests, etc.) are reported." 56 }, 57 "effect_sizes_reported": { 58 "applies": true, 59 "answer": true, 60 "justification": "Effect sizes are reported as percentage differences with baseline context throughout. For example, Section VI-C: 'epsilon-greedy experiment 43 unique bugs were patched (2 more than the baseline) with a 4% higher success rate.' Tables provide both absolute numbers and rates enabling comparison." 61 }, 62 "sample_size_justified": { 63 "applies": true, 64 "answer": true, 65 "justification": "Section V-B justifies using 353 bugs (same set as JaRFly for direct comparison). Section V-C justifies 20 repeat runs 'to account for the heuristic nature of the underlying genetic algorithm.' They justify not extending to 835 bugs due to computational cost ('about 3.5 years of continuous computation')." 66 }, 67 "variance_reported": { 68 "applies": true, 69 "answer": false, 70 "justification": "While 20 independent repair attempts are conducted per bug, only aggregate success rates and median/average variant numbers are reported. No standard deviations, interquartile ranges, or other spread measures across the 20 runs are provided." 71 } 72 }, 73 "evaluation_design": { 74 "baselines_included": { 75 "applies": true, 76 "answer": true, 77 "justification": "The baseline is JaRFly's default uniform random selection strategy. Table V presents baseline results with both GenProg (3 operators) and GenProg+PAR (18 operators). All RL variants are compared against this baseline." 78 }, 79 "baselines_contemporary": { 80 "applies": true, 81 "answer": true, 82 "justification": "The baseline is the standard uniform selection approach used in JaRFly, which is described as a 'novel open-source framework for search-based APR' and represents the current standard practice in heuristic-based APR. The choice is justified in Section V-A." 83 }, 84 "ablation_study": { 85 "applies": true, 86 "answer": true, 87 "justification": "The paper systematically varies individual components: 4 selection strategies (PM, AP, epsilon-greedy, UCB), 2 credit assignment techniques, 2 reward types (raw vs relative fitness), 2 activation points, and 3 arm configurations (3, 7, 18). Table VII presents a comprehensive ablation of these factors." 88 }, 89 "multiple_metrics": { 90 "applies": true, 91 "answer": true, 92 "justification": "Multiple metrics are reported throughout: success rate (% of repair attempts producing patches), unique bugs patched, average variant number, median variant number, and patch quality scores (min, mean, median, max, % at 100% quality in Table IX)." 93 }, 94 "human_evaluation": { 95 "applies": true, 96 "answer": true, 97 "justification": "Section VI-E states 'after manual investigation of the patches, we found that only 2 of the 93 patches in the baseline were correct and corresponded to 1 unique bug. None of the patches generated using the RL strategy were correct.' Manual evaluation of patch correctness was performed." 98 }, 99 "held_out_test_set": { 100 "applies": true, 101 "answer": true, 102 "justification": "Section V-C describes using held-out evaluation test suites: 'The authors of JaRFly use two versions of the automated test generation tool EvoSuite to create held-out evaluation test suites.' These are separate from the test oracle used during repair." 103 }, 104 "per_category_breakdown": { 105 "applies": true, 106 "answer": true, 107 "justification": "Results are broken down by operator selection strategy (Tables IV, VI), credit assignment technique (Table IV), number of arms (Tables V, VII), reward type (Table VIII), and activation point (Table VII). Table I breaks down bugs by Defects4J project." 108 }, 109 "failure_cases_discussed": { 110 "applies": true, 111 "answer": true, 112 "justification": "The paper extensively discusses failures: the 'every mutation' activation point yielded 0% success rate (Table VII), RL with 18 arms didn't improve over baseline (Section VI-D), and none of the RL patches on the 289 previously-unpatched bugs were correct (Section VI-E). Section VII discusses three hypothesized reasons for failure." 113 }, 114 "negative_results_reported": { 115 "applies": true, 116 "answer": true, 117 "justification": "The paper is fundamentally a negative result: RL-based selection did not improve the number of bugs patched over random baseline. They report that every-mutation activation yielded 0% success, 40-generation experiments performed worse, and none of the RL patches on new bugs were correct." 118 } 119 }, 120 "claims_and_evidence": { 121 "abstract_claims_supported": { 122 "applies": true, 123 "answer": true, 124 "justification": "The abstract claims RL 'results in a higher number of test-passing variants, but does not exhibit a noticeable improvement in the number of bugs patched.' This is directly supported by results in Tables IV-IX. The abstract appropriately hedges with 'has not shown such improvements.'" 125 }, 126 "causal_claims_justified": { 127 "applies": true, 128 "answer": true, 129 "justification": "Causal claims such as 'epsilon-greedy is the most effective mutation operator selection strategy' are supported by controlled experiments where only the selection strategy varies while all other parameters are held constant (same benchmark, same tool, same search settings, same 20 repeated runs)." 130 }, 131 "generalization_bounded": { 132 "applies": true, 133 "answer": true, 134 "justification": "The abstract states evaluation is 'on 353 real-world bugs from the Defects4J benchmark.' Section IX explicitly acknowledges 'our methodology uses Java which may not generalise to other programming languages' and discusses external validity threats specific to their benchmark and language choices." 135 }, 136 "alternative_explanations_discussed": { 137 "applies": true, 138 "answer": true, 139 "justification": "Section VII discusses three alternative explanations for why RL didn't improve APR: (1) learning too slow within the budget, (2) fitness function too coarse (Boolean) to guide learning effectively, (3) edit type may not be sufficient information to steer toward correct patches." 140 }, 141 "proxy_outcome_distinction": { 142 "applies": true, 143 "answer": true, 144 "justification": "The paper measures bugs patched, success rate, and variant efficiency, which directly correspond to the claimed outcomes. They additionally assess patch quality using held-out test suites (Table IX), distinguishing between test-suite-adequate patches and correct patches via manual evaluation." 145 } 146 }, 147 "setup_transparency": { 148 "model_versions_specified": { 149 "applies": true, 150 "answer": false, 151 "justification": "The paper specifies Defects4J version 2.0.0 and EvoSuite versions (v1.0.3 and v1.0.6), but describes JaRFly only as 'the latest version' without providing a specific version number or commit hash. JaRFly is the central tool and its exact version matters for reproducibility." 152 }, 153 "prompts_provided": { 154 "applies": false, 155 "answer": false, 156 "justification": "The paper does not use any prompting or LLMs. The approach implements RL algorithms from scratch for mutation operator selection in genetic programming." 157 }, 158 "hyperparameters_reported": { 159 "applies": true, 160 "answer": true, 161 "justification": "Table II lists all hyperparameters: Pmin, Pmax, beta (0.8), epsilon (0.2), E (10). Section V-C reports search parameters: 10 generations, population size 40, 20 repair attempts. Section VI-A reports tuned learning rates for each strategy. Preliminary experiments test alpha values of 0.2, 0.4, 0.6, 0.8." 162 }, 163 "scaffolding_described": { 164 "applies": false, 165 "answer": false, 166 "justification": "No agentic scaffolding is used. The approach modifies the mutation operator selection within JaRFly's genetic programming search." 167 }, 168 "data_preprocessing_documented": { 169 "applies": true, 170 "answer": true, 171 "justification": "Section V-B documents the data pipeline: started with 395 bugs from JaRFly, excluded Mockito project, removed 4 deprecated bugs, resulting in 353 bugs. Table I provides per-project breakdown. The preliminary subset selection (5 bugs, first lexicographically from each project) is also documented." 172 } 173 }, 174 "limitations_and_scope": { 175 "limitations_section_present": { 176 "applies": true, 177 "answer": true, 178 "justification": "Section IX 'Threats to Validity' is a dedicated section covering external, internal, and construct validity threats with substantive discussion across multiple paragraphs." 179 }, 180 "threats_to_validity_specific": { 181 "applies": true, 182 "answer": true, 183 "justification": "Section IX identifies specific threats: Defects4J generalizability, Java language limitation, heterogeneous cluster hardware, JaRFly baseline reproduction discrepancy (41/49 vs 49 reported), exclusion of 3 PAR templates, and version mismatch issues." 184 }, 185 "scope_boundaries_stated": { 186 "applies": true, 187 "answer": true, 188 "justification": "Section IX states 'our methodology uses Java which may not generalise to other programming languages' and 'Our choices of benchmark and language enable direct comparison with state-of-the-art approaches.' They explicitly note the approach was tested only on Defects4J and with one APR tool." 189 } 190 }, 191 "data_integrity": { 192 "raw_data_available": { 193 "applies": true, 194 "answer": true, 195 "justification": "Section XI provides a link to 'All source code, supplementary materials, and instructions needed to replicate our results' at the anonymous artifact repository. The JaRFly replication package (ref [48]) with bug details and repair scripts is also publicly available." 196 }, 197 "data_collection_described": { 198 "applies": true, 199 "answer": true, 200 "justification": "Section V-B describes data collection: used Defects4J bugs from the JaRFly study, explains selection criteria (same bugs as JaRFly, excluding Mockito and deprecated bugs). Table I provides the complete breakdown by project with counts of active and patched bugs." 201 }, 202 "recruitment_methods_described": { 203 "applies": false, 204 "answer": false, 205 "justification": "No human participants. The data source is a standard public benchmark (Defects4J), a well-documented collection of real-world Java bugs." 206 }, 207 "data_pipeline_documented": { 208 "applies": true, 209 "answer": true, 210 "justification": "The pipeline is documented: 395 JaRFly bugs → exclude Mockito → remove 4 deprecated → 353 bugs (Table I). For RQ1-2: 49 GenProg-patched bugs. For RQ3: 64 GenProg+PAR bugs. For RQ4: 353 total bugs. Preliminary tuning used 5 bugs (first lexicographically per project)." 211 } 212 }, 213 "conflicts_of_interest": { 214 "funding_disclosed": { 215 "applies": true, 216 "answer": false, 217 "justification": "No funding source or acknowledgments section is present in the paper text. The research appears to be academic (UCL and Université de Rennes affiliations) but funding is not disclosed." 218 }, 219 "affiliations_disclosed": { 220 "applies": true, 221 "answer": true, 222 "justification": "Author affiliations are clearly listed: University College London and Université de Rennes. These are academic institutions with no obvious conflict regarding the tools or benchmarks evaluated." 223 }, 224 "funder_independent_of_outcome": { 225 "applies": true, 226 "answer": false, 227 "justification": "Since no funding is disclosed, independence cannot be assessed. Academic affiliations suggest independence, but absence of funding disclosure means this criterion is not met." 228 }, 229 "financial_interests_declared": { 230 "applies": true, 231 "answer": false, 232 "justification": "No competing interests statement or financial disclosure is present in the paper." 233 } 234 }, 235 "contamination": { 236 "training_cutoff_stated": { 237 "applies": false, 238 "answer": false, 239 "justification": "The paper does not evaluate a pre-trained model's capability on a benchmark. The RL algorithms learn online from scratch during each repair attempt. There is no pre-trained model with a training data cutoff to report." 240 }, 241 "train_test_overlap_discussed": { 242 "applies": false, 243 "answer": false, 244 "justification": "No pre-trained model is evaluated. The RL approach learns operator selection probabilities from scratch during the repair process for each bug. Train/test overlap in the pre-trained model sense does not apply." 245 }, 246 "benchmark_contamination_addressed": { 247 "applies": false, 248 "answer": false, 249 "justification": "No pre-trained model is used. The RL algorithms are implemented from scratch and learn during each repair session. Benchmark contamination through pre-training data is not applicable." 250 } 251 }, 252 "human_studies": { 253 "pre_registered": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants. The study is a benchmark evaluation of RL-based mutation operator selection on Defects4J software bugs." 257 }, 258 "irb_or_ethics_approval": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants. The study analyzes automated program repair on software bugs." 262 }, 263 "demographics_reported": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in this study." 267 }, 268 "inclusion_exclusion_criteria": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in this study." 272 }, 273 "randomization_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in this study." 277 }, 278 "blinding_described": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants in this study." 282 }, 283 "attrition_reported": { 284 "applies": false, 285 "answer": false, 286 "justification": "No human participants in this study." 287 } 288 }, 289 "cost_and_practicality": { 290 "inference_cost_reported": { 291 "applies": true, 292 "answer": true, 293 "justification": "Section VI reports 'Computation time for each repair attempt ranged, on average, from 45 minutes to 1.5 hours, up to 12 hours in longest runs.' The total budget is described as '30,080 independent repair attempts' amounting to 'about 3.5 years of continuous computation.'" 294 }, 295 "compute_budget_stated": { 296 "applies": true, 297 "answer": true, 298 "justification": "Section V-C describes the compute cluster (1028 nodes with varying specs). Section VI states the total of 30,080 repair attempts amounting to about 3.5 years of continuous computation or 'multiple months of active cluster usage.' Hardware details include RAM (16-375GB), SSD (80-780GB), Intel Xeon CPUs (4-48 cores)." 299 } 300 }, 301 "experimental_rigor": { 302 "seed_sensitivity_reported": { 303 "applies": true, 304 "answer": false, 305 "justification": "While 20 independent repair attempts are run per bug, results are only reported as aggregate success rates and median/average variant numbers. Section VI-C notes 'We observed variance even when the same seeds were used' but no systematic seed sensitivity analysis is presented." 306 }, 307 "number_of_runs_stated": { 308 "applies": true, 309 "answer": true, 310 "justification": "Section V-C clearly states 'For each bug, 20 repair attempts were launched independently, to account for the heuristic nature of the underlying genetic algorithm.' The preliminary study used 1600 total runs (Section V-C). Total: 30,080 repair attempts (Section VI)." 311 }, 312 "hyperparameter_search_budget": { 313 "applies": true, 314 "answer": true, 315 "justification": "Section V-C describes preliminary experiments: 4 learning rates × 4 operator selection strategies × 5 bugs × 20 repeats = 1600 repair runs for tuning. The 4 alpha values (0.2, 0.4, 0.6, 0.8) and selection criteria are clearly stated in Table III." 316 }, 317 "best_config_selection_justified": { 318 "applies": true, 319 "answer": true, 320 "justification": "Section VI-A states 'For each operator selection strategy, we chose the learning rate value that maximised the number of successful repair attempts.' Selection was on a separate subset of 5 bugs (~10% of the 49 patched bugs), preventing overfitting to the full evaluation set." 321 }, 322 "multiple_comparison_correction": { 323 "applies": true, 324 "answer": false, 325 "justification": "The paper compares 4 selection strategies × 2 credit assignments × 2 reward types × multiple arm configurations without applying any correction for multiple comparisons. No statistical significance tests are performed at all." 326 }, 327 "self_comparison_bias_addressed": { 328 "applies": true, 329 "answer": false, 330 "justification": "The authors implement RL variants within JaRFly and compare against JaRFly's own baseline. While they note baseline replication discrepancies (41/49 vs 49 in Section VI-C), they do not explicitly discuss the bias inherent in evaluating their own system against their own baseline implementation." 331 }, 332 "compute_budget_vs_performance": { 333 "applies": true, 334 "answer": true, 335 "justification": "All main experiments use matched compute budgets (10 generations × 40 population = 400 evaluations per attempt). The 40-generation variant (40 × 10 = 400) maintains the same total evaluations. Table VII explicitly compares different generation/population configurations at matched total compute." 336 }, 337 "benchmark_construct_validity": { 338 "applies": true, 339 "answer": false, 340 "justification": "The paper justifies using Defects4J as 'the most comprehensive and popular dataset for evaluating Java APR tools' (Section V-B) but does not discuss whether it adequately measures the capabilities the paper claims to evaluate, or whether the benchmark has known limitations." 341 }, 342 "scaffold_confound_addressed": { 343 "applies": false, 344 "answer": false, 345 "justification": "No agentic scaffolding is used. The approach modifies mutation operator selection within a single tool (JaRFly) with all other components held constant." 346 } 347 }, 348 "data_leakage": { 349 "temporal_leakage_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "The paper does not discuss whether the time between bug creation and the experiment could affect results, or whether the repair tool could benefit from temporal information leakage through the code corpus used for mutations." 353 }, 354 "feature_leakage_addressed": { 355 "applies": true, 356 "answer": false, 357 "justification": "The paper does not discuss whether the test oracle provides information that could constitute feature leakage, or whether the fitness function based on test passing leaks information about the correct fix." 358 }, 359 "non_independence_addressed": { 360 "applies": true, 361 "answer": false, 362 "justification": "The paper does not discuss potential non-independence among Defects4J bugs (e.g., bugs from the same project sharing structural properties) or how this could affect generalizability of results." 363 }, 364 "leakage_detection_method": { 365 "applies": true, 366 "answer": false, 367 "justification": "No concrete leakage detection or prevention method is applied. The paper does not use temporal splits, decontamination, or any other leakage mitigation technique." 368 } 369 } 370 }, 371 "claims": [ 372 { 373 "claim": "Average credit assignment is best suited for mutation operator selection in heuristic-based APR, indicating the search process is stationary.", 374 "evidence": "Table IV shows that in all experiments (PM, UCB, epsilon-greedy), the success rate with average credit assignment is higher than with exponential recency-weighted average. Only AP shows a marginal advantage for exponential on unique bugs patched (Section VI-B).", 375 "supported": "moderate" 376 }, 377 { 378 "claim": "Epsilon-greedy is the most effective mutation operator selection strategy, while probability matching is the most efficient.", 379 "evidence": "Table IV shows epsilon-greedy achieves 47.9% success rate and patches 43/49 bugs (highest). Table VI shows PM has the lowest median (50) and average (97) variant numbers among RL strategies for common bugs (Section VI-C).", 380 "supported": "moderate" 381 }, 382 { 383 "claim": "Increasing the number of mutation operators from 3 to 18 does not improve bug patching with RL-based selection.", 384 "evidence": "Section VI-D reports 18-arm epsilon-greedy achieved 35.2% success rate and 51 unique bugs vs baseline 35.6% and 52 bugs with 18 operators (Table V).", 385 "supported": "strong" 386 }, 387 { 388 "claim": "Grouping PAR templates into 7 semantic arms improves over 18 individual arms.", 389 "evidence": "Table VII shows 7 arms achieve 43% success rate vs 31% for 18 arms on the 5-bug sample. Table VIII shows 7-arm configuration patches 48-51 bugs on the 64-bug set.", 390 "supported": "moderate" 391 }, 392 { 393 "claim": "RL-based mutation operator selection generates more test-passing variants but does not significantly improve bugs patched compared to baseline.", 394 "evidence": "Tables IV-VIII show RL variants have comparable or slightly better success rates but similar unique bug counts. Section VI-E shows none of the RL patches on the 289 previously-unpatched bugs were correct vs 2 correct baseline patches.", 395 "supported": "strong" 396 }, 397 { 398 "claim": "The coarse Boolean fitness function limits the effectiveness of RL-guided operator selection.", 399 "evidence": "Section VII hypothesizes this based on observed results. Table VII shows 'every mutation' activation (more frequent updates) yields 0% success, suggesting the RL signal is insufficient. No direct experimental test of this hypothesis is provided.", 400 "supported": "weak" 401 } 402 ], 403 "red_flags": [ 404 { 405 "flag": "No statistical significance tests", 406 "detail": "All comparative claims (e.g., 'epsilon-greedy is the most effective') are based on comparing raw percentages and counts without any statistical significance tests. With 20 runs per bug, the observed differences (e.g., 43.9% vs 47.9% success rate) could be within normal variation." 407 }, 408 { 409 "flag": "Baseline reproduction failure", 410 "detail": "The authors could only reproduce 41/49 bugs from the JaRFly baseline despite using the same parameters and seeds. This 16% discrepancy casts doubt on the comparability of the baseline and experimental results, attributed to version differences and excluded PAR templates." 411 }, 412 { 413 "flag": "Preliminary tuning on only 5 bugs", 414 "detail": "Hyperparameter tuning (learning rate selection) was performed on only 5 bugs, which is ~10% of the 49 bugs used for RQ1-2 evaluation. This tiny tuning set may not be representative, and the first-lexicographically selection criterion is arbitrary." 415 }, 416 { 417 "flag": "No variance or uncertainty measures", 418 "detail": "Despite running 20 independent trials per bug, the paper never reports standard deviations, confidence intervals, or any measure of result stability across runs. This makes it impossible to assess whether observed differences are meaningful." 419 } 420 ], 421 "cited_papers": [ 422 { 423 "title": "Automated program repair", 424 "authors": ["C. Le Goues", "M. Pradel", "A. Roychoudhury"], 425 "year": 2019, 426 "relevance": "Comprehensive survey of APR approaches spanning constraint-based, learning-based, and heuristic-based techniques." 427 }, 428 { 429 "title": "GenProg: A generic method for automatic software repair", 430 "authors": ["C. Le Goues", "T. V. Nguyen", "S. Forrest", "W. Weimer"], 431 "year": 2012, 432 "relevance": "Foundational heuristic-based APR tool using genetic programming with statement-level mutations." 433 }, 434 { 435 "title": "Sapfix: Automated end-to-end repair at scale", 436 "authors": ["A. Marginean", "J. Bader", "S. Chandra", "M. Harman", "Y. Jia", "K. Mao", "A. Mols", "A. Scott"], 437 "year": 2019, 438 "relevance": "Industrial-scale automated program repair deployment at Meta, demonstrating real-world APR adoption." 439 }, 440 { 441 "title": "ARJA: Automated Repair of Java Programs via Multi-Objective Genetic Programming", 442 "authors": ["Y. Yuan", "W. Banzhaf"], 443 "year": 2018, 444 "relevance": "Multi-objective genetic programming approach to APR evaluated on Defects4J benchmark." 445 }, 446 { 447 "title": "Quality of Automated Program Repair on Real-World Defects", 448 "authors": ["M. Motwani", "M. Soto", "Y. Brun", "R. Just", "C. Le Goues"], 449 "year": 2022, 450 "relevance": "JaRFly framework used as the base tool in this study, with patch quality evaluation methodology on Defects4J." 451 }, 452 { 453 "title": "Automatic Patch Generation Learned from Human-Written Patches", 454 "authors": ["D. Kim", "J. Nam", "J. Song", "S. Kim"], 455 "year": 2013, 456 "relevance": "Introduces PAR templates for APR, providing 18 fix templates used as mutation operators in this study." 457 }, 458 { 459 "title": "An analysis of the automatic bug fixing performance of chatgpt", 460 "authors": ["D. Sobania", "M. Briesch", "C. Hanna", "J. Petke"], 461 "year": 2023, 462 "relevance": "Evaluates LLM-based (ChatGPT) automated program repair performance on benchmarks." 463 }, 464 { 465 "title": "Cure: Code-aware neural machine translation for automatic program repair", 466 "authors": ["N. Jiang", "T. Lutellier", "L. Tan"], 467 "year": 2021, 468 "relevance": "Neural machine translation approach to APR demonstrating learning-based repair techniques." 469 }, 470 { 471 "title": "Less training, more repairing please: Revisiting automated program repair via zero-shot learning", 472 "authors": ["C. S. Xia", "L. Zhang"], 473 "year": 2022, 474 "relevance": "Zero-shot LLM-based APR approach, demonstrating the shift toward language model-based repair." 475 }, 476 { 477 "title": "Automated program repair in the era of large pre-trained language models", 478 "authors": ["C. S. Xia", "Y. Wei", "L. Zhang"], 479 "year": 2023, 480 "relevance": "Evaluates large pre-trained language models for automated program repair." 481 }, 482 { 483 "title": "On the introduction of automatic program repair in bloomberg", 484 "authors": ["S. Kirbas", "E. Windels", "O. Mcbello"], 485 "year": 2020, 486 "relevance": "Industrial deployment of heuristic-based APR at Bloomberg, demonstrating real-world adoption." 487 }, 488 { 489 "title": "Is the cure worse than the disease? Overfitting in automated program repair", 490 "authors": ["E. K. Smith", "E. T. Barr", "C. Le Goues", "Y. Brun"], 491 "year": 2015, 492 "relevance": "Identifies the overfitting problem in APR where patches pass test suites but are incorrect." 493 } 494 ], 495 "engagement_factors": { 496 "practical_relevance": { 497 "score": 1, 498 "justification": "The approach is implemented in JaRFly but shows no significant improvement over random selection, limiting immediate practical utility." 499 }, 500 "surprise_contrarian": { 501 "score": 1, 502 "justification": "The negative result — that RL does not help APR despite helping other evolutionary algorithm domains — is mildly surprising but in a niche area." 503 }, 504 "fear_safety": { 505 "score": 0, 506 "justification": "No AI safety or security concerns are raised by this work on mutation operator selection." 507 }, 508 "drama_conflict": { 509 "score": 0, 510 "justification": "No controversy or conflict; the paper presents straightforward negative experimental results." 511 }, 512 "demo_ability": { 513 "score": 1, 514 "justification": "Source code is available but requires setting up JaRFly, Defects4J, and a Java environment — not easily demoed." 515 }, 516 "brand_recognition": { 517 "score": 0, 518 "justification": "Academic work from UCL and Université de Rennes; not a well-known lab or product for general audiences." 519 } 520 } 521 }