scan.json (31253B)
1 { 2 "paper": { 3 "title": "Rethinking Kernel Program Repair: Benchmarking and Enhancing LLMs with RGym", 4 "authors": [ 5 "Kareem Shehada", 6 "Yifan Wu", 7 "Wyatt D. Feng", 8 "Adithya Iyer", 9 "Gryphon Kumfert", 10 "Yangruibo Ding", 11 "Zhiyun Qian" 12 ], 13 "year": 2025, 14 "venue": "NeurIPS 2025 Workshop: Evaluating the Evolving LLM Lifecycle", 15 "arxiv_id": "2511.15757", 16 "doi": "10.48550/arXiv.2511.15757" 17 }, 18 "scan_version": 3, 19 "active_modules": ["experimental_rigor", "data_leakage"], 20 "methodology_tags": ["benchmark-eval"], 21 "key_findings": "RGym is a lightweight, platform-agnostic framework for evaluating LLM-based automated program repair on Linux kernel KASAN bugs, designed to run on local commodity hardware instead of requiring GCP. Using practical localization (bug-inducing commits and call stacks) instead of oracle guidance, the simple APR pipeline achieves 37.76% (GPT-4o with feedback) to 43.36% (GPT-5 Thinking) pass rates at under $0.20 per bug on 143 verified bugs. Manual verification of 31 patches reveals only 32% are plausibly correct while 45% are wrong despite passing the crash test, highlighting the unreliability of crash-based patch evaluation.", 22 "checklist": { 23 "artifacts": { 24 "code_released": { 25 "applies": true, 26 "answer": false, 27 "justification": "No repository URL, GitHub link, or code archive is provided anywhere in the paper. RGym is described but no release link is given." 28 }, 29 "data_released": { 30 "applies": true, 31 "answer": false, 32 "justification": "The dataset of 143 KASAN bugs is described as organized from Syzbot 'into an easily consumable format' but no download link or data archive is provided. While Syzbot itself is public, the curated 143-bug subset is not released." 
33 }, 34 "environment_specified": { 35 "applies": true, 36 "answer": false, 37 "justification": "Hardware is described (56 cores, 160GB RAM, 1TB SSD) and Docker/QEMU are mentioned, but no requirements.txt, Dockerfile, conda environment, or detailed dependency list is provided. Not enough detail to recreate the environment." 38 }, 39 "reproduction_instructions": { 40 "applies": true, 41 "answer": false, 42 "justification": "No step-by-step reproduction instructions, README, or reproduction scripts are provided. The paper describes the architecture at a high level but does not give commands or procedures to replicate experiments." 43 } 44 }, 45 "statistical_methodology": { 46 "confidence_intervals_or_error_bars": { 47 "applies": true, 48 "answer": false, 49 "justification": "All results in Table 1 are point estimates (e.g., '37.76%', '43.36%') with no confidence intervals, error bars, or uncertainty quantification." 50 }, 51 "significance_tests": { 52 "applies": true, 53 "answer": false, 54 "justification": "Claims like 'SimpleAgent+Feedback achieves 37.76%' vs 'kGym-oracle achieves 2.8%' are made by comparing raw numbers with no statistical significance tests (no p-values, t-tests, or bootstrap tests)." 55 }, 56 "effect_sizes_reported": { 57 "applies": true, 58 "answer": true, 59 "justification": "Improvements are reported with baseline context throughout: 'a 6.99% improvement over kGym's oracle-guided solution,' 'bad patches are reduced by 76%,' and costs are compared in absolute terms (e.g., '$0.18 vs $21.62 per bug')." 60 }, 61 "sample_size_justified": { 62 "applies": true, 63 "answer": false, 64 "justification": "The 143-bug dataset results from filtering 6,088 Syzbot bugs, but no justification is given for whether 143 is sufficient for the claims made. No power analysis or discussion of statistical adequacy." 65 }, 66 "variance_reported": { 67 "applies": true, 68 "answer": false, 69 "justification": "Results appear to be from single runs per configuration. 
No standard deviations, variance across seeds, or spread measures are reported in any table or figure." 70 } 71 }, 72 "evaluation_design": { 73 "baselines_included": { 74 "applies": true, 75 "answer": true, 76 "justification": "Multiple baselines are included: kGym-oracle with GPT-4-turbo and GPT-4o, kGym-oracle+functionwise, and SimpleAgent without BIC. CrashFixer is discussed as external comparison (Table 1)." 77 }, 78 "baselines_contemporary": { 79 "applies": true, 80 "answer": true, 81 "justification": "kGym (2024) and CrashFixer (2025) are recent, relevant baselines. The paper also tests with state-of-the-art models (GPT-5 Thinking, Claude Opus 4.1)." 82 }, 83 "ablation_study": { 84 "applies": true, 85 "answer": true, 86 "justification": "Table 1 systematically ablates components: function-wise patching, BIC localization, feedback/retries, and function exploration. Section 3.1 explicitly isolates each component's contribution." 87 }, 88 "multiple_metrics": { 89 "applies": true, 90 "answer": true, 91 "justification": "Three metrics are reported: pass rate, bad patch rate, and average cost per bug (Table 1). Compute hours are also provided (Table 3). Manual correctness evaluation adds plausible/helpful/wrong categories (Table 2)." 92 }, 93 "human_evaluation": { 94 "applies": true, 95 "answer": true, 96 "justification": "Table 2 reports manual verification of 31 patches, classifying them as plausibly correct (10), helpful (7), or wrong (14). Section 5.1 describes the methodology." 97 }, 98 "held_out_test_set": { 99 "applies": true, 100 "answer": false, 101 "justification": "All 143 bugs are used as the test set for all configurations. There is no explicit dev/test separation, and prompt designs may have been developed using the same bugs that are evaluated." 102 }, 103 "per_category_breakdown": { 104 "applies": true, 105 "answer": true, 106 "justification": "Table 4 breaks down reproducer job outputs (pass, trigger, racey, boot fail, other). 
Table 5 shows build job outputs. Figure 2 shows unique solves per configuration. The dataset targets three bug types (OOB, UAF, NPD)." 107 }, 108 "failure_cases_discussed": { 109 "applies": true, 110 "answer": true, 111 "justification": "Failure modes are discussed extensively: bad patches, compilation failures (Tables 4-5), non-deterministic reproducers (Section 3.1), and wrong patches that pass crash tests but don't fix the root cause (Table 2, Section 5.1)." 112 }, 113 "negative_results_reported": { 114 "applies": true, 115 "answer": true, 116 "justification": "Function Exploration Agent achieves only 15.38% pass rate despite being more expensive ($0.12/bug), and Claude Opus 4.1 costs 4.05x more than GPT-5 while performing 11.2% worse. These unfavorable results are reported honestly." 117 } 118 }, 119 "claims_and_evidence": { 120 "abstract_claims_supported": { 121 "applies": true, 122 "answer": true, 123 "justification": "Abstract claims of '43.36% pass rate with GPT-5 Thinking' and 'cost of under $0.20 per bug' are directly supported by Table 1 (43.36% pass rate, $0.18/bug). The claim of 'lightweight, platform-agnostic' is supported by the Docker/QEMU-based architecture." 124 }, 125 "causal_claims_justified": { 126 "applies": true, 127 "answer": true, 128 "justification": "Causal claims are made via controlled ablation: adding function-wise patching to kGym improves pass rate from 2.8% to 10.49%; adding BIC improves from 17.48% to 21.67%. Each ablation changes a single variable, supporting causal attribution." 129 }, 130 "generalization_bounded": { 131 "applies": true, 132 "answer": true, 133 "justification": "The paper focuses specifically on Linux kernel KASAN bugs (memory corruption: OOB, UAF, NPD) from Syzbot. The title specifies 'Kernel Program Repair' and claims are consistently scoped to this domain." 
134 }, 135 "alternative_explanations_discussed": { 136 "applies": true, 137 "answer": false, 138 "justification": "No alternative explanations are discussed for the observed improvements. For example, the paper doesn't consider whether GPT-5's improvement is due to having seen Syzbot fixes in training, or whether the BIC improvement is partly due to the BIC revealing the fix location rather than providing localization insight." 139 }, 140 "proxy_outcome_distinction": { 141 "applies": true, 142 "answer": true, 143 "justification": "The paper explicitly acknowledges the proxy gap in Section 5.1: of 31 passing patches manually verified, only 32% are plausibly correct and 45% are wrong. They note 'it is insufficient to simply rely on observing the absence of crashes to verify the correctness of patches.'" 144 } 145 }, 146 "setup_transparency": { 147 "model_versions_specified": { 148 "applies": true, 149 "answer": false, 150 "justification": "Models are listed as 'GPT-4o', 'GPT-4-turbo', 'GPT-5 Thinking', and 'Claude Opus 4.1' without snapshot dates or API version identifiers. No specific version strings like 'gpt-4o-2024-05-13' are provided." 151 }, 152 "prompts_provided": { 153 "applies": true, 154 "answer": false, 155 "justification": "Prompts are described in natural language ('bug-type specific instructions,' 'in-context learning examples for OOB, UAF, and NPD bugs') but the actual prompt text is never shown in the paper or appendix, and no repository link is provided." 156 }, 157 "hyperparameters_reported": { 158 "applies": true, 159 "answer": false, 160 "justification": "No temperature, top-p, max tokens, or other API parameters are reported for any of the LLM configurations tested." 
161 }, 162 "scaffolding_described": { 163 "applies": true, 164 "answer": true, 165 "justification": "The scaffolding is described with a workflow diagram (Figure 1), two agent types (Simple Agent and Function Exploration Agent), function-wise patching mechanism, retry logic with error summary feedback, and the RGym test loop. The code-viewing capability of the Function Exploration Agent is described." 166 }, 167 "data_preprocessing_documented": { 168 "applies": true, 169 "answer": true, 170 "justification": "The filtering pipeline is documented: 6,088 Syzbot bugs → filter for fix commits, reproducers, crash reports, kernel configs → filter to KASAN bugs → verify reproducibility at parent of fix commit → 143 bugs. Criteria at each stage are stated." 171 } 172 }, 173 "limitations_and_scope": { 174 "limitations_section_present": { 175 "applies": true, 176 "answer": false, 177 "justification": "There is no dedicated limitations or threats-to-validity section. The conclusion is brief (Section 4) and does not discuss limitations substantively." 178 }, 179 "threats_to_validity_specific": { 180 "applies": true, 181 "answer": false, 182 "justification": "No specific threats to validity are discussed anywhere in the paper. There is no discussion of threats from non-deterministic reproducers affecting results, potential contamination, or small sample size for manual evaluation." 183 }, 184 "scope_boundaries_stated": { 185 "applies": true, 186 "answer": false, 187 "justification": "The paper does not explicitly state what the results do NOT show. It does not discuss exclusion of non-KASAN bugs, non-Linux kernels, or the limited generalizability of results from 143 bugs to kernel APR broadly." 188 } 189 }, 190 "data_integrity": { 191 "raw_data_available": { 192 "applies": true, 193 "answer": false, 194 "justification": "No raw data (bug reports, patches generated, evaluation logs) is made available. Only aggregated results in tables are presented." 
195 }, 196 "data_collection_described": { 197 "applies": true, 198 "answer": true, 199 "justification": "Data collection is described: bugs sourced from Syzbot, filtered by availability of fix commits, reproducers, crash reports, and kernel configs, restricted to KASAN bugs, and verified for reproducibility. Source (Syzbot) and criteria are stated." 200 }, 201 "recruitment_methods_described": { 202 "applies": false, 203 "answer": false, 204 "justification": "No human participants. Data source is Syzbot, a public automated bug reporting system." 205 }, 206 "data_pipeline_documented": { 207 "applies": true, 208 "answer": false, 209 "justification": "While filtering criteria are described, intermediate counts at each stage are missing. The paper jumps from '6,088 Syzbot bugs' to '143 reproducible KASAN bugs' without stating how many bugs remained after each filter step." 210 } 211 }, 212 "conflicts_of_interest": { 213 "funding_disclosed": { 214 "applies": true, 215 "answer": false, 216 "justification": "No funding sources or acknowledgments section is present in the paper. No grants, sponsors, or funding agencies are mentioned." 217 }, 218 "affiliations_disclosed": { 219 "applies": true, 220 "answer": true, 221 "justification": "All author affiliations are clearly listed: University of California Riverside and Columbia University. The paper evaluates commercial LLMs (GPT, Claude) but the authors have no disclosed affiliation with those companies." 222 }, 223 "funder_independent_of_outcome": { 224 "applies": true, 225 "answer": false, 226 "justification": "No funding is disclosed, so independence of funders cannot be verified. The paper does not include any statement about funding or competing interests." 227 }, 228 "financial_interests_declared": { 229 "applies": true, 230 "answer": false, 231 "justification": "No competing interests or financial interests statement is present in the paper." 
232 } 233 }, 234 "contamination": { 235 "training_cutoff_stated": { 236 "applies": true, 237 "answer": false, 238 "justification": "No training data cutoff dates are stated for GPT-4o, GPT-4-turbo, GPT-5 Thinking, or Claude Opus 4.1. This is critical since Syzbot bugs and their fix commits are publicly available." 239 }, 240 "train_test_overlap_discussed": { 241 "applies": true, 242 "answer": false, 243 "justification": "No discussion of whether the LLMs may have seen Syzbot bug reports or their fix commits during training. Syzbot data is publicly indexed and could easily appear in training corpora." 244 }, 245 "benchmark_contamination_addressed": { 246 "applies": true, 247 "answer": false, 248 "justification": "Syzbot bugs and fix commits are publicly available online and predate the models' training. The paper does not address whether the models may have memorized the fixes, which would inflate pass rates." 249 } 250 }, 251 "human_studies": { 252 "pre_registered": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants in the study. This is a benchmark evaluation of LLM-based automated program repair." 256 }, 257 "irb_or_ethics_approval": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants. The study evaluates LLMs on automated kernel bug fixes from Syzbot." 261 }, 262 "demographics_reported": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants in the study." 266 }, 267 "inclusion_exclusion_criteria": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants in the study." 271 }, 272 "randomization_described": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants in the study." 276 }, 277 "blinding_described": { 278 "applies": false, 279 "answer": false, 280 "justification": "No human participants in the study." 
281 }, 282 "attrition_reported": { 283 "applies": false, 284 "answer": false, 285 "justification": "No human participants in the study." 286 } 287 }, 288 "cost_and_practicality": { 289 "inference_cost_reported": { 290 "applies": true, 291 "answer": true, 292 "justification": "Table 1 reports average cost per bug for every configuration, ranging from $0.05 (kGym+GPT-4o) to $0.73 (Claude Opus 4.1). CrashFixer's cost of $21.62/bug is cited for comparison." 293 }, 294 "compute_budget_stated": { 295 "applies": true, 296 "answer": true, 297 "justification": "Table 3 reports clock hours for each configuration. Hardware is specified (two 56-core machines, 160GB RAM, 1TB SSD). Section 5.2 describes the compute setup in detail." 298 } 299 }, 300 "experimental_rigor": { 301 "seed_sensitivity_reported": { 302 "applies": true, 303 "answer": false, 304 "justification": "Results appear to be from single runs per configuration. No random seed sensitivity analysis or multi-seed results are reported." 305 }, 306 "number_of_runs_stated": { 307 "applies": true, 308 "answer": false, 309 "justification": "The number of LLM generation runs per configuration is not explicitly stated. For feedback experiments, retry counts are clear (up to 3), but for non-feedback experiments the paper does not state whether each bug was attempted once or multiple times." 310 }, 311 "hyperparameter_search_budget": { 312 "applies": true, 313 "answer": false, 314 "justification": "No hyperparameter search is described. It is unclear how prompt designs, retry counts, or other configuration choices were selected." 315 }, 316 "best_config_selection_justified": { 317 "applies": true, 318 "answer": true, 319 "justification": "All configurations are reported in Table 1 with their full results, not just the best one. The ablation is systematic and transparent." 
320 }, 321 "multiple_comparison_correction": { 322 "applies": false, 323 "answer": false, 324 "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable." 325 }, 326 "self_comparison_bias_addressed": { 327 "applies": true, 328 "answer": false, 329 "justification": "The authors re-implement kGym's baseline and report lower numbers than originally published (1.4% vs 5.38%), but do not discuss bias from evaluating their own system against their own baseline reimplementation." 330 }, 331 "compute_budget_vs_performance": { 332 "applies": true, 333 "answer": true, 334 "justification": "Table 1 reports both pass rate and cost per bug for each configuration, and Section 3.2 explicitly discusses cost-effectiveness trade-offs. Table 3 provides compute hours." 335 }, 336 "benchmark_construct_validity": { 337 "applies": true, 338 "answer": true, 339 "justification": "Section 5.1 explicitly questions whether crash absence equals correctness, showing only 32% of passing patches are plausibly correct. The paper acknowledges 'it is insufficient to simply rely on observing the absence of crashes to verify the correctness of patches.'" 340 }, 341 "scaffold_confound_addressed": { 342 "applies": true, 343 "answer": true, 344 "justification": "When comparing GPT-4o, Claude Opus 4.1, and GPT-5 Thinking, the same SimpleAgent scaffold is used, isolating the model variable. Different scaffolds (SimpleAgent vs Function Exploration) are compared separately." 345 } 346 }, 347 "data_leakage": { 348 "temporal_leakage_addressed": { 349 "applies": true, 350 "answer": false, 351 "justification": "Syzbot bugs and their fix commits have been publicly available for years. The paper does not discuss whether models trained after these bugs were published could have memorized the fixes." 
352 }, 353 "feature_leakage_addressed": { 354 "applies": true, 355 "answer": false, 356 "justification": "The BIC (bug-inducing commit) is provided as input, which includes the actual code change that introduced the bug. While the paper argues BICs are obtainable via SymBisect, there is no discussion of whether providing the BIC leaks too much information about the fix." 357 }, 358 "non_independence_addressed": { 359 "applies": true, 360 "answer": false, 361 "justification": "No discussion of whether the 143 bugs are independent or whether they share structural similarities (e.g., same subsystems, similar bug patterns) that could bias results." 362 }, 363 "leakage_detection_method": { 364 "applies": true, 365 "answer": false, 366 "justification": "No leakage detection or prevention method is used. No canary strings, membership inference, or decontamination applied." 367 } 368 } 369 }, 370 "claims": [ 371 { 372 "claim": "SimpleAgent with feedback achieves 37.76% pass rate with GPT-4o at $0.17 per bug", 373 "evidence": "Table 1 shows SimpleAgent+Feedback with GPT-4o: 37.76% pass rate, 4.89% bad patch rate, $0.17 avg cost/bug. Section 3.1 details the retry breakdown: 34 bugs first attempt, 8 second, 12 third.", 374 "supported": "strong" 375 }, 376 { 377 "claim": "GPT-5 Thinking achieves 43.36% pass rate at $0.18 per bug, outperforming Claude Opus 4.1 (32.16% at $0.73)", 378 "evidence": "Table 1 directly reports these numbers. Section 3.3 discusses the comparison: Claude Opus 4.1 costs 4.05x more than GPT-5 Thinking while scoring 11.2 percentage points lower.", 379 "supported": "strong" 380 }, 381 { 382 "claim": "Function-wise patching reduces bad patches by 76%", 383 "evidence": "Table 1: kGym-oracle bad patch rate drops from 51.88% to 12.14% when function-wise patching is added.
Pass rate increases from 2.8% to 10.49%.", 384 "supported": "strong" 385 }, 386 { 387 "claim": "BIC-based localization improves pass rate by 4.19% over call stack alone", 388 "evidence": "Table 1: SimpleAgent-nobic achieves 17.48% vs SimpleAgent with BIC at 21.67%, a 4.19 percentage point improvement (Section 3.1).", 389 "supported": "strong" 390 }, 391 { 392 "claim": "Combined pass rate of all configurations reaches 68.53%", 393 "evidence": "Section 3.3 states this as the union of solved bugs across all configurations at an average cost of $1.33 per bug. Figure 2 shows unique solves per configuration.", 394 "supported": "moderate" 395 }, 396 { 397 "claim": "kGym's originally reported 5.38% pass rate drops to 1.4% with stricter multi-VM testing", 398 "evidence": "Section 3.1 explains the discrepancy: kGym uses a single VM for 10 minutes, but roughly one-third of bugs have non-deterministic reproducers (Table 4). Using 26 VMs reveals this.", 399 "supported": "moderate" 400 }, 401 { 402 "claim": "Only 32% of passing patches are plausibly correct, with 45% being wrong", 403 "evidence": "Table 2 reports manual verification of 31 patches: 10 plausibly correct (32.26%), 7 helpful (22.58%), 14 wrong (45.16%). Section 5.1 describes the verification methodology.", 404 "supported": "moderate" 405 }, 406 { 407 "claim": "Simple APR designs can achieve comparable results to CrashFixer at 120x lower cost", 408 "evidence": "Section 3.3 compares: SimpleAgent GPT-5 at $0.18/bug vs CrashFixer at $21.62/bug (120.11x more expensive). CrashFixer achieves 65.6% vs 43.36%, only 22.24 percentage points higher. However, datasets differ and CrashFixer uses oracle localization.", 409 "supported": "weak" 410 } 411 ], 412 "red_flags": [ 413 { 414 "flag": "No contamination analysis", 415 "detail": "Syzbot bugs and their fix commits are publicly available and indexed online. LLMs like GPT-5 and Claude Opus 4.1 may have seen these exact bugs and fixes during training.
The paper does not address this at all, which could significantly inflate pass rates — especially for GPT-5 Thinking's strong 43.36% result." 416 }, 417 { 418 "flag": "No code or data release", 419 "detail": "Neither RGym, the APR tool, the curated 143-bug dataset, nor the generated patches are released. This completely prevents independent verification or reproduction of results." 420 }, 421 { 422 "flag": "No statistical significance testing", 423 "detail": "All comparisons between configurations are based on raw pass rate differences (e.g., '4.19% improvement') without any significance tests. With only 143 bugs, many of these differences may not be statistically significant." 424 }, 425 { 426 "flag": "Small manual correctness sample", 427 "detail": "Only 31 of the passing patches were manually verified for correctness (Table 2). This is a small sample for drawing conclusions about the overall correctness distribution, especially when extrapolating to other configurations or models." 428 }, 429 { 430 "flag": "Indirect comparison with CrashFixer", 431 "detail": "The paper compares against CrashFixer's reported numbers on kBenchSyz, described only as 'similar enough' to their dataset. The datasets differ, CrashFixer uses oracle localization, and CrashFixer is closed source — making the comparison unreliable for strong claims about relative performance." 432 }, 433 { 434 "flag": "Single-run results without variance", 435 "detail": "All configurations appear to be evaluated once. Given that LLM outputs are stochastic and reproducers are non-deterministic (one-third of bugs per Table 4), results could vary significantly across runs." 436 } 437 ], 438 "cited_papers": [ 439 { 440 "title": "SWE-bench: Can language models resolve real-world github issues?", 441 "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik R. 
Narasimhan"], 442 "year": 2024, 443 "relevance": "Major LLM coding benchmark that the paper positions kernel APR as complementing, since SWE-bench focuses on userspace applications." 444 }, 445 { 446 "title": "RepairAgent: An Autonomous, LLM-Based Agent for Program Repair", 447 "authors": ["Islem Bouzenia", "Premkumar Devanbu", "Michael Pradel"], 448 "year": 2025, 449 "relevance": "Autonomous LLM-based program repair agent, directly relevant as a concurrent approach to automated bug fixing." 450 }, 451 { 452 "title": "SWE-agent: Agent-computer interfaces enable automated software engineering", 453 "authors": ["John Yang", "Carlos E. Jimenez", "Alexander Wettig", "Kilian Lieret", "Shunyu Yao", "Karthik R. Narasimhan", "Ofir Press"], 454 "year": 2024, 455 "relevance": "Foundational agent-computer interface framework for LLM-based software engineering, cited as part of the APR landscape." 456 }, 457 { 458 "title": "Kgym: A platform and dataset to benchmark large language models on linux kernel crash resolution", 459 "authors": ["Alex Mathai", "Chenxi Huang", "Petros Maniatis", "Aleksandr Nogikh", "Franjo Ivancic", "Junfeng Yang", "Baishakhi Ray"], 460 "year": 2024, 461 "arxiv_id": "2407.02680", 462 "relevance": "Direct predecessor: the platform and dataset that RGym improves upon for kernel APR evaluation." 463 }, 464 { 465 "title": "CrashFixer: A crash resolution agent for the linux kernel", 466 "authors": ["Alex Mathai", "Chenxi Huang", "Suwei Ma", "Jihwan Kim", "Hailie Mitchell", "Aleksandr Nogikh", "Petros Maniatis", "Franjo Ivančić", "Junfeng Yang", "Baishakhi Ray"], 467 "year": 2025, 468 "relevance": "Primary comparison point: complex debug-tree-based kernel APR achieving 65.6% pass rate at $21.62/bug using oracle localization." 
469 }, 470 { 471 "title": "OpenHands: An open platform for AI software developers as generalist agents", 472 "authors": ["Xingyao Wang", "Boxuan Li", "Yufan Song"], 473 "year": 2025, 474 "relevance": "Open platform for AI software engineering agents, part of the broader LLM-based development tool ecosystem." 475 }, 476 { 477 "title": "InferFix: End-to-end program repair with LLMs", 478 "authors": ["Matthew Jin", "Syed Shahriar", "Michele Tufano"], 479 "year": 2023, 480 "relevance": "End-to-end LLM-based program repair system, part of the APR literature this work builds upon." 481 }, 482 { 483 "title": "Automated program repair via conversation: Fixing 162 out of 337 bugs for $0.42 each using ChatGPT", 484 "authors": ["Chunqiu Steven Xia", "Lingming Zhang"], 485 "year": 2024, 486 "relevance": "Demonstrates cost-effective conversational APR with ChatGPT, directly relevant to the cost-efficiency claims in this paper." 487 }, 488 { 489 "title": "How far can we go with practical function-level program repair?", 490 "authors": ["Jiahong Xiang", "Xiaoyang Xu", "Fanchu Kong"], 491 "year": 2024, 492 "relevance": "Explores function-level program repair, directly relevant to the function-wise patching technique used in RGym." 493 }, 494 { 495 "title": "Aligning the Objective of LLM-Based Program Repair", 496 "authors": ["Junjielong Xu", "Ying Fu", "Shin Hwei Tan", "Pinjia He"], 497 "year": 2025, 498 "relevance": "Addresses alignment of LLM objectives for program repair, relevant to the broader APR methodology." 499 }, 500 { 501 "title": "A case study of LLM for automated vulnerability repair: Assessing impact of reasoning and patch validation feedback", 502 "authors": ["Ummay Kulsum", "Haotian Zhu", "Bowen Xu", "Marcelo d'Amorim"], 503 "year": 2024, 504 "relevance": "Studies feedback-based LLM vulnerability repair, directly relevant to the retry/feedback mechanism in this paper." 
505 }, 506 { 507 "title": "Boosting automated program repair with bug-inducing commits", 508 "authors": ["Ming Wen", "Yepang Liu", "Shing-Chi Cheung"], 509 "year": 2020, 510 "relevance": "Foundational work on using bug-inducing commits for APR localization, the key localization strategy adopted by RGym." 511 }, 512 { 513 "title": "SymBisect: Accurate bisection for Fuzzer-Exposed vulnerabilities", 514 "authors": ["Zheng Zhang", "Yu Hao", "Weiteng Chen"], 515 "year": 2024, 516 "relevance": "Automated bug-inducing commit identification tool achieving 75% accuracy on Syzbot bugs, key enabler for the realistic localization strategy." 517 } 518 ], 519 "engagement_factors": { 520 "practical_relevance": { 521 "score": 2, 522 "justification": "RGym is designed for local commodity hardware, making kernel APR evaluation more accessible, though the tool itself is not released." 523 }, 524 "surprise_contrarian": { 525 "score": 2, 526 "justification": "Challenges the assumption that kernel APR requires complex pipelines like CrashFixer, showing a simple approach achieves comparable results at 120x lower cost." 527 }, 528 "fear_safety": { 529 "score": 0, 530 "justification": "No AI safety or security concerns raised; the paper focuses on fixing kernel bugs, not attacking systems." 531 }, 532 "drama_conflict": { 533 "score": 1, 534 "justification": "Implicitly critiques kGym's GCP dependency and CrashFixer's complexity/cost, but the tone is measured and academic." 535 }, 536 "demo_ability": { 537 "score": 0, 538 "justification": "No code, data, or demo is released; the tool cannot be tried by readers." 539 }, 540 "brand_recognition": { 541 "score": 1, 542 "justification": "Authors are from UC Riverside and Columbia; tests GPT-5 and Claude Opus which are recognizable but the paper/lab is not from a major AI company." 543 } 544 } 545 }