scan.json (29281B)
1 { 2 "paper": { 3 "title": "RGFL: Reasoning Guided Fault Localization for Automated Program Repair Using Large Language Models", 4 "authors": [ 5 "Melika Sepidband", 6 "Hamed Taherkhani", 7 "Hung Viet Pham", 8 "Hadi Hemmati" 9 ], 10 "year": 2026, 11 "venue": "arXiv.org", 12 "arxiv_id": "2601.18044", 13 "doi": "10.48550/arXiv.2601.18044" 14 }, 15 "scan_version": 3, 16 "active_modules": ["experimental_rigor", "data_leakage"], 17 "methodology_tags": ["benchmark-eval"], 18 "key_findings": "RGFL introduces per-candidate LLM reasoning into fault localization for automated program repair, improving file-level Hit@1 from 71.4% to 85% and element-level exact match from 36% to 69% on SWE-bench Verified. End-to-end repair resolved rate improved from 51.6% to 58.2%. Embedding-based reranking of reasoning consistently underperformed LLM-based reranking. Counterfactual ablation showed file localization errors have the highest per-case repair impact (50% recovery), while line localization errors are most frequent but hardest to fix in isolation.", 19 "checklist": { 20 "artifacts": { 21 "code_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "Section 8 states: 'We release the source code of our experiments' with a GitHub URL: https://github.com/MelikaSepidband/RGFL." 25 }, 26 "data_released": { 27 "applies": true, 28 "answer": true, 29 "justification": "The paper uses publicly available SWE-bench Verified, SWE-bench Lite, and SWE-bench Java datasets, all referenced with Hugging Face dataset URLs." 30 }, 31 "environment_specified": { 32 "applies": true, 33 "answer": false, 34 "justification": "No requirements.txt, Dockerfile, conda environment, or detailed environment setup section is provided in the paper." 35 }, 36 "reproduction_instructions": { 37 "applies": true, 38 "answer": false, 39 "justification": "The paper provides a GitHub link but no step-by-step reproduction instructions, commands to run, or a 'Reproducing Results' section." 
40 } 41 }, 42 "statistical_methodology": { 43 "confidence_intervals_or_error_bars": { 44 "applies": true, 45 "answer": false, 46 "justification": "All results are reported as point estimates (e.g., '85%', '69%', '58.2%') with no confidence intervals or error bars." 47 }, 48 "significance_tests": { 49 "applies": true, 50 "answer": false, 51 "justification": "The paper claims RGFL 'improves' and 'outperforms' baselines by comparing raw percentages without any statistical significance tests (no p-values, t-tests, or bootstrap tests)." 52 }, 53 "effect_sizes_reported": { 54 "applies": true, 55 "answer": true, 56 "justification": "Effect sizes are reported with baseline context throughout: 'Hit@1 improves from 71.4% to 85% (19.05% improvement)', 'exact match increases to 69%, which is 91.67% improvement over Agentless', and absolute percentages with both before/after values." 57 }, 58 "sample_size_justified": { 59 "applies": true, 60 "answer": false, 61 "justification": "No justification provided for dataset sizes (500, 300, 91 instances). The paper uses standard benchmarks but does not discuss whether these sizes are adequate for the claims made." 62 }, 63 "variance_reported": { 64 "applies": true, 65 "answer": false, 66 "justification": "Despite generating 40 candidate patches per instance (10 samples × 4 runs), no variance, standard deviation, or spread measure is reported across runs." 67 } 68 }, 69 "evaluation_design": { 70 "baselines_included": { 71 "applies": true, 72 "answer": true, 73 "justification": "The paper compares against multiple baselines: Agentless, OpenHands, AutoCodeRover, OrcaLoca, and a RAG baseline with BM25 retrieval (Tables 4, 5, 6)." 74 }, 75 "baselines_contemporary": { 76 "applies": true, 77 "answer": true, 78 "justification": "Baselines include state-of-the-art systems from the SWE-bench leaderboard: OpenHands (Claude 4 Sonnet), OrcaLoca (Claude 4 Sonnet), Agentless, and TRAE — all very recent systems." 
79 }, 80 "ablation_study": { 81 "applies": true, 82 "answer": true, 83 "justification": "The paper ablates: (1) file-only vs file+element reasoning (Table 3, Table 6), (2) embedding-based vs LLM-based reranking (Table 2), (3) three different LLMs for reasoning, and (4) counterfactual ablation injecting ground-truth at each localization stage (Section 5.4)." 84 }, 85 "multiple_metrics": { 86 "applies": true, 87 "answer": true, 88 "justification": "Multiple metrics are used: Hit@k, Recall@k, MRR for file localization; Exact Match for element localization; Resolved Rate for end-to-end repair." 89 }, 90 "human_evaluation": { 91 "applies": true, 92 "answer": false, 93 "justification": "All evaluation is fully automated via test-suite-based patch validation and metric computation. No human evaluation of the system's outputs is performed." 94 }, 95 "held_out_test_set": { 96 "applies": true, 97 "answer": false, 98 "justification": "The paper uses SWE-bench Verified for both model selection (comparing Gemini 2.5 Pro, Claude 4 Sonnet, o4-mini in RQ1.1) and final result reporting (RQ3). The LLM chosen for downstream stages was selected based on performance on the same test set." 99 }, 100 "per_category_breakdown": { 101 "applies": true, 102 "answer": true, 103 "justification": "Results are broken down by dataset (Verified, Lite, Java), by localization level (file, element), by LLM (Gemini, Claude, o4-mini), and by error category (file miss, element miss, line miss, repair miss) in the Venn diagram analysis." 104 }, 105 "failure_cases_discussed": { 106 "applies": true, 107 "answer": true, 108 "justification": "Section 5.4 conducts detailed error analysis on 209 unresolved instances with a Venn diagram of failure categories, counterfactual ablation, and specific examples where element-level localization hurt performance (Django expressions.py case)." 
109 }, 110 "negative_results_reported": { 111 "applies": true, 112 "answer": true, 113 "justification": "Embedding-based reranking consistently underperformed (Table 2). The paper also reports that providing ground-truth elements can sometimes reduce repair power (9 cases where file-only was better), and o4-mini performed substantially worse than other LLMs." 114 } 115 }, 116 "claims_and_evidence": { 117 "abstract_claims_supported": { 118 "applies": true, 119 "answer": true, 120 "justification": "Abstract claims are directly supported: file-level Hit@1 71.4%→85% and MRR 81.8%→88.8% (Table 2), element-level Exact Match 36%→69% (Table 3), and 12.8% relative repair improvement ((58.2-51.6)/51.6 ≈ 12.8%, Table 6)." 121 }, 122 "causal_claims_justified": { 123 "applies": true, 124 "answer": true, 125 "justification": "Causal claims ('reasoning improves localization') are supported by controlled ablation studies that isolate the file and element localization components while keeping other pipeline stages unchanged. The counterfactual analysis further supports causal attribution." 126 }, 127 "generalization_bounded": { 128 "applies": true, 129 "answer": true, 130 "justification": "The abstract specifies 'Python and Java projects from SWE-bench Verified, Lite, and Java.' Section 6 explicitly acknowledges: 'results may not fully generalize to other ecosystems such as C++, JavaScript/TypeScript, or Rust.'" 131 }, 132 "alternative_explanations_discussed": { 133 "applies": true, 134 "answer": true, 135 "justification": "The paper tests embedding-based vs LLM-based reranking to distinguish reasoning from representation quality. Section 5.4 discusses cases where element localization can hurt. 
Section 6 discusses model choice dependency and notes OpenHands's improvement 'is very much LLM-dependent.'" 136 }, 137 "proxy_outcome_distinction": { 138 "applies": true, 139 "answer": false, 140 "justification": "The paper measures test-suite passing ('resolved rate') as a proxy for correct repair but does not discuss the gap between passing tests and actual patch correctness. A patch can pass tests while being semantically incorrect." 141 } 142 }, 143 "setup_transparency": { 144 "model_versions_specified": { 145 "applies": true, 146 "answer": false, 147 "justification": "Models are referred to by marketing names: 'Gemini 2.5 Pro', 'Claude 4 Sonnet', 'o4-mini'. No API versions, snapshot dates, or exact model IDs are provided." 148 }, 149 "prompts_provided": { 150 "applies": true, 151 "answer": true, 152 "justification": "Section 3.1 provides the full text of all four prompts used: file-level reasoning prompt, file ranking prompt, element-level reasoning prompt, and element ranking prompt. Fill values come from the public SWE-bench dataset, making prompts fully reconstructible." 153 }, 154 "hyperparameters_reported": { 155 "applies": true, 156 "answer": false, 157 "justification": "No temperature, top-p, max tokens, or other generation hyperparameters are reported for any of the LLMs used. Only the number of samples (10 per instance, 4 runs) is stated." 158 }, 159 "scaffolding_described": { 160 "applies": true, 161 "answer": true, 162 "justification": "The pipeline is described in detail in Section 3 with Figure 2 showing the overview: file retrieval → file-level reasoning → file ranking → element extraction → element-level reasoning → element ranking → line localization → repair → validation." 163 }, 164 "data_preprocessing_documented": { 165 "applies": true, 166 "answer": true, 167 "justification": "Section 4.1 describes the datasets and their composition. 
The methodology explains how Agentless converts the codebase to a tree structure, filters folders, embeds code chunks, and retrieves candidate files before reasoning is applied." 168 } 169 }, 170 "limitations_and_scope": { 171 "limitations_section_present": { 172 "applies": true, 173 "answer": true, 174 "justification": "Section 6 'Threats to Validity' provides a dedicated section with substantive discussion of multiple limitations." 175 }, 176 "threats_to_validity_specific": { 177 "applies": true, 178 "answer": true, 179 "justification": "Section 6 discusses specific threats: language limitation to Python and Java only (not C++/JS/Rust), Gemini 2.5 Pro selection may not generalize to other models, reliance on regression tests without reproduction tests, and repair evaluation limited to SWE-bench Verified only." 180 }, 181 "scope_boundaries_stated": { 182 "applies": true, 183 "answer": true, 184 "justification": "Section 6 explicitly states what was not tested: other languages (C++, JS/TS, Rust), other model configurations, and repair evaluation on Lite and Java datasets. It also notes that 'Extending repair evaluation to SWE-bench Lite and SWE-bench Java would strengthen the generality.'" 185 } 186 }, 187 "data_integrity": { 188 "raw_data_available": { 189 "applies": true, 190 "answer": false, 191 "justification": "Source code is released but per-instance experimental data (localization outputs, generated patches, reasoning text) is not mentioned as being released." 192 }, 193 "data_collection_described": { 194 "applies": true, 195 "answer": true, 196 "justification": "Section 4.1 describes SWE-bench Verified (500 human-verified instances), Lite (300 instances), and Java (91 instances) with fields and composition." 197 }, 198 "recruitment_methods_described": { 199 "applies": false, 200 "answer": false, 201 "justification": "No human participants. Data source is standard SWE-bench benchmarks." 
202 }, 203 "data_pipeline_documented": { 204 "applies": true, 205 "answer": true, 206 "justification": "The full pipeline from bug report through file retrieval, reasoning generation, ranking, element localization, line localization, patch generation, and validation is documented across Sections 3 and 4." 207 } 208 }, 209 "conflicts_of_interest": { 210 "funding_disclosed": { 211 "applies": true, 212 "answer": false, 213 "justification": "No acknowledgments section or funding information is provided anywhere in the paper." 214 }, 215 "affiliations_disclosed": { 216 "applies": true, 217 "answer": true, 218 "justification": "All four authors are listed as affiliated with York University, Canada. The authors do not evaluate their own commercial product." 219 }, 220 "funder_independent_of_outcome": { 221 "applies": true, 222 "answer": false, 223 "justification": "No funding information is disclosed, so funder independence cannot be assessed." 224 }, 225 "financial_interests_declared": { 226 "applies": true, 227 "answer": false, 228 "justification": "No competing interests or financial interests statement is present in the paper." 229 } 230 }, 231 "contamination": { 232 "training_cutoff_stated": { 233 "applies": true, 234 "answer": false, 235 "justification": "Training data cutoff dates are not stated for any of the LLMs used (Gemini 2.5 Pro, Claude 4 Sonnet, o4-mini)." 236 }, 237 "train_test_overlap_discussed": { 238 "applies": true, 239 "answer": false, 240 "justification": "SWE-bench instances come from public GitHub repositories. The paper does not discuss whether the LLMs may have been trained on the benchmark data or the underlying repositories." 241 }, 242 "benchmark_contamination_addressed": { 243 "applies": true, 244 "answer": false, 245 "justification": "SWE-bench issues and patches are publicly available on GitHub and Hugging Face. No discussion of whether models may have seen these during training." 
246 } 247 }, 248 "human_studies": { 249 "pre_registered": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "irb_or_ethics_approval": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "demographics_reported": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "inclusion_exclusion_criteria": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "randomization_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "blinding_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 }, 279 "attrition_reported": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants in this study." 283 } 284 }, 285 "cost_and_practicality": { 286 "inference_cost_reported": { 287 "applies": true, 288 "answer": true, 289 "justification": "Section 4.3 states: 'The average cost of RGFL per sample is around $4.4, so it incurs a higher computational cost than Agentless (3.7×).' Also mentions ~$400 for Gemini vs ~$2,000 for Claude on Verified." 290 }, 291 "compute_budget_stated": { 292 "applies": true, 293 "answer": false, 294 "justification": "Per-sample cost ($4.4) and comparative costs are mentioned, but total computational budget (total API spend, GPU hours, wall-clock time) is not explicitly stated." 295 } 296 }, 297 "experimental_rigor": { 298 "seed_sensitivity_reported": { 299 "applies": true, 300 "answer": false, 301 "justification": "Despite generating 40 patches per instance across 4 independent runs, no variance or sensitivity analysis across runs is reported." 
302 }, 303 "number_of_runs_stated": { 304 "applies": true, 305 "answer": true, 306 "justification": "Section 4.3 states: 'we generate 10 samples per instance, repeated across 4 independent runs, resulting in 40 candidate patches per instance.'" 307 }, 308 "hyperparameter_search_budget": { 309 "applies": true, 310 "answer": false, 311 "justification": "No hyperparameter search is described. Key design choices (top-k=3, 10 samples, 4 runs) appear fixed without search." 312 }, 313 "best_config_selection_justified": { 314 "applies": true, 315 "answer": true, 316 "justification": "The LLM selection is justified in RQ1.1: Gemini 2.5 Pro was chosen based on comparable accuracy to Claude 4 Sonnet at substantially lower cost (~$400 vs ~$2,000). The reasoning is transparent." 317 }, 318 "multiple_comparison_correction": { 319 "applies": true, 320 "answer": false, 321 "justification": "Multiple comparisons are made across datasets, models, and metrics, but no statistical tests are performed at all, let alone corrections for multiple comparisons." 322 }, 323 "self_comparison_bias_addressed": { 324 "applies": true, 325 "answer": false, 326 "justification": "The authors evaluate their own RGFL system against baselines. For Agentless, they use their own re-implementation with Gemini 2.5 Pro. For OrcaLoca, they ran Claude 4 Sonnet since trajectory data was unavailable. The self-comparison bias is not acknowledged." 327 }, 328 "compute_budget_vs_performance": { 329 "applies": true, 330 "answer": true, 331 "justification": "Section 4.3 discusses the cost-performance tradeoff: 'RGFL per sample is around $4.4 (3.7× Agentless)... 
the +12.8% absolute increase in resolved rate provides a compelling justification for the added cost.' Note that 12.8% is the relative gain (51.6% → 58.2%, i.e., +6.6 percentage points), not an absolute one." 332 }, 333 "benchmark_construct_validity": { 334 "applies": true, 335 "answer": false, 336 "justification": "SWE-bench is used without discussion of whether test-suite passing accurately measures repair correctness, or whether SWE-bench instances are representative of real-world bugs." 337 }, 338 "scaffold_confound_addressed": { 339 "applies": true, 340 "answer": false, 341 "justification": "Internal comparisons control for scaffold by modifying only the localization module. However, cross-system comparisons (OpenHands, OrcaLoca, AutoCodeRover) use different scaffolds and different LLMs simultaneously. The paper notes OpenHands's results are 'LLM-dependent' but does not systematically address the scaffold confound." 342 } 343 }, 344 "data_leakage": { 345 "temporal_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "SWE-bench issues originate from public GitHub repositories created before the LLMs' training cutoffs. Temporal leakage is not discussed." 349 }, 350 "feature_leakage_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of whether the evaluation setup leaks information (e.g., whether the problem statement contains hints about the fix location)." 354 }, 355 "non_independence_addressed": { 356 "applies": true, 357 "answer": false, 358 "justification": "No discussion of whether SWE-bench instances share structural similarities (e.g., multiple issues from the same repository like SymPy or Django)." 359 }, 360 "leakage_detection_method": { 361 "applies": true, 362 "answer": false, 363 "justification": "No leakage detection or prevention method is applied (no canary strings, membership inference, or decontamination)." 
364 } 365 } 366 }, 367 "claims": [ 368 { 369 "claim": "RGFL improves file-level localization: Hit@1 from 71.4% to 85% and MRR from 81.8% to 88.8% on SWE-bench Verified with Gemini 2.5 Pro.", 370 "evidence": "Table 2 reports Hit@k, Recall@k, and MRR for three LLMs with and without reasoning. Consistent improvements across all three models (Gemini, Claude, o4-mini). Section 5.1, RQ1.1.", 371 "supported": "strong" 372 }, 373 { 374 "claim": "RGFL improves element-level exact match from 36% to 69% on SWE-bench Verified, nearly closing the gap to the theoretical upper bound of 80%.", 375 "evidence": "Table 3 shows progressive improvement: Agentless files + Agentless elements = 36%, RGFL files + Agentless elements = 41%, RGFL files + RGFL elements = 69%, with upper bound 80%. Section 5.1, RQ1.2.", 376 "supported": "strong" 377 }, 378 { 379 "claim": "Embedding-based reranking of reasoning consistently underperforms LLM-based reranking and often degrades localization compared to the Agentless baseline.", 380 "evidence": "Table 2 shows Hit@1 drops for all embedding models: Gemini-embedding-001 (64.4% vs 71.4% baseline), Voyage 3.5 (60.8% vs 75.4%), text-embedding-3-small (52% vs 68.4%). Section 5.1.", 381 "supported": "strong" 382 }, 383 { 384 "claim": "RGFL generalizes across datasets and programming languages, achieving best Hit@3, Recall@3, and MRR on SWE-bench Verified, Lite, and Java.", 385 "evidence": "Table 4 reports cross-dataset file localization. Table 5 reports element-level exact match. RGFL achieves best or near-best performance on all three datasets. Section 5.2.", 386 "supported": "moderate" 387 }, 388 { 389 "claim": "Integrating RGFL into Agentless yields a 12.8% relative end-to-end repair improvement on SWE-bench Verified (51.6% → 58.2%).", 390 "evidence": "Table 6 shows resolved rate: Agentless 51.6%, RGFL file only 55.8%, RGFL file+element 58.2%. However, results are on a single benchmark with no variance reported. 
Section 5.3.", 391 "supported": "moderate" 392 }, 393 { 394 "claim": "File localization errors have the highest per-case impact on repair: correcting file misses resolves 50% of affected cases, vs 26% for element and 19% for line.", 395 "evidence": "Section 5.4 counterfactual ablation on 209 unresolved instances: GT files resolved 14/28 (50%), GT elements resolved 29/111 (26%), GT lines resolved 33/175 (19%).", 396 "supported": "moderate" 397 } 398 ], 399 "red_flags": [ 400 { 401 "flag": "No statistical significance testing", 402 "detail": "All comparative claims ('improves', 'outperforms') are based on raw percentage differences without any statistical tests. Given that results come from stochastic LLM outputs (40 patches per instance), variance could be substantial, and observed differences may not be statistically significant." 403 }, 404 { 405 "flag": "No variance reported despite multiple runs", 406 "detail": "The paper generates 40 candidate patches per instance (10 samples × 4 independent runs) but never reports variance across these runs. This conceals result stability and makes it impossible to assess whether differences are meaningful." 407 }, 408 { 409 "flag": "SWE-bench contamination risk unaddressed", 410 "detail": "SWE-bench issues and patches are publicly available on GitHub and Hugging Face. The LLMs used (Gemini 2.5 Pro, Claude 4 Sonnet, o4-mini) were likely trained on data that includes these repositories and possibly the benchmark itself. No training cutoffs are stated and no contamination analysis is performed." 411 }, 412 { 413 "flag": "Test set used for model selection", 414 "detail": "SWE-bench Verified was used both for selecting the best-performing LLM (Gemini 2.5 Pro over Claude 4 Sonnet and o4-mini in RQ1.1) and for reporting final results. This selection bias inflates reported performance." 
415 }, 416 { 417 "flag": "Cross-system comparisons confound model and scaffold", 418 "detail": "External baseline comparisons use different LLMs (e.g., OpenHands with Claude 4 Sonnet vs RGFL with Gemini 2.5 Pro). The paper acknowledges OpenHands's results are 'LLM-dependent' but still draws comparative conclusions from these confounded comparisons." 419 } 420 ], 421 "cited_papers": [ 422 { 423 "title": "Agentless: Demystifying LLM-based Software Engineering Agents", 424 "authors": ["Chunqiu Steven Xia", "Yinlin Deng", "Soren Dunn", "Lingming Zhang"], 425 "year": 2024, 426 "arxiv_id": "2407.01489", 427 "relevance": "Core baseline and foundation for RGFL's pipeline; state-of-the-art modular APR framework on SWE-bench." 428 }, 429 { 430 "title": "OpenHands: An Open Platform for AI Software Developers as Generalist Agents", 431 "authors": ["Xingyao Wang", "Boxuan Li", "Yufan Song"], 432 "year": 2024, 433 "arxiv_id": "2407.16741", 434 "relevance": "End-to-end LLM-based APR agent; key baseline achieving top SWE-bench leaderboard performance." 435 }, 436 { 437 "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?", 438 "authors": ["Carlos E Jimenez", "John Yang", "Alexander Wettig"], 439 "year": 2023, 440 "arxiv_id": "2310.06770", 441 "relevance": "Primary evaluation benchmark for LLM-based program repair; defines the task and evaluation protocol used in this paper." 442 }, 443 { 444 "title": "SWE-Agent: Agent-Computer Interfaces Enable Automated Software Engineering", 445 "authors": ["John Yang", "Carlos E Jimenez", "Alexander Wettig"], 446 "year": 2024, 447 "relevance": "Pioneering agentic APR system that equips LLMs with structured agent-computer interfaces for repository navigation and editing." 
448 }, 449 { 450 "title": "AutoCodeRover: Autonomous Program Improvement", 451 "authors": ["Yuntong Zhang", "Haifeng Ruan", "Zhiyu Fan", "Abhik Roychoudhury"], 452 "year": 2024, 453 "relevance": "Pipeline-based APR system using program-structure-aware search APIs; key baseline for localization comparison." 454 }, 455 { 456 "title": "OrcaLoca: An LLM Agent Framework for Software Issue Localization", 457 "authors": ["Zhongming Yu", "Hejia Zhang", "Yujie Zhao"], 458 "year": 2025, 459 "arxiv_id": "2502.00350", 460 "relevance": "Navigation-based fault localization approach with priority scheduling and distance-aware context pruning; competitive FL baseline." 461 }, 462 { 463 "title": "LocAgent: Graph-Guided LLM Agents for Code Localization", 464 "authors": ["Zhaoling Chen", "Xiangru Tang", "Gangda Deng"], 465 "year": 2025, 466 "arxiv_id": "2503.09089", 467 "relevance": "Graph-based LLM fault localization using repository-level code graphs; related FL-only approach." 468 }, 469 { 470 "title": "Evaluating Large Language Models Trained on Code", 471 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 472 "year": 2021, 473 "arxiv_id": "2107.03374", 474 "relevance": "Foundational paper on LLMs for code (Codex); establishes the paradigm of using LLMs for software engineering tasks." 475 }, 476 { 477 "title": "Agentic Software Issue Resolution with Large Language Models: A Survey", 478 "authors": ["Zhonghao Jiang", "David Lo", "Zhongxin Liu"], 479 "year": 2025, 480 "arxiv_id": "2512.22256", 481 "relevance": "Comprehensive survey of LLM-based APR approaches; provides the taxonomy used to position RGFL." 482 }, 483 { 484 "title": "TRAE Agent: An LLM-Based Agent for Software Engineering with Test-Time Scaling", 485 "authors": ["Pengfei Gao", "Zhao Tian", "Xiangxin Meng"], 486 "year": 2025, 487 "arxiv_id": "2507.23370", 488 "relevance": "End-to-end LLM agent for software engineering emphasizing test-time scaling and aggressive tool-based exploration." 
489 }, 490 { 491 "title": "RepairAgent: An Autonomous, LLM-Based Agent for Program Repair", 492 "authors": ["Islem Bouzenia", "Premkumar Devanbu", "Michael Pradel"], 493 "year": 2024, 494 "arxiv_id": "2403.17134", 495 "relevance": "Autonomous LLM-based APR agent; part of the growing landscape of agentic program repair approaches." 496 }, 497 { 498 "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks", 499 "authors": ["Patrick Lewis", "Ethan Perez", "Aleksandra Piktus"], 500 "year": 2020, 501 "relevance": "Foundational RAG paper; RGFL's approach extends RAG-based code retrieval with explicit reasoning." 502 } 503 ], 504 "engagement_factors": { 505 "practical_relevance": { 506 "score": 2, 507 "justification": "The method is directly applicable to APR tools and code is released, but requires significant infrastructure (LLM APIs, SWE-bench setup) to deploy." 508 }, 509 "surprise_contrarian": { 510 "score": 1, 511 "justification": "The finding that explicit reasoning helps LLMs is intuitive; the negative result that embedding-based reranking hurts is mildly surprising." 512 }, 513 "fear_safety": { 514 "score": 0, 515 "justification": "No AI safety or security concerns raised; this is a software engineering tool improvement paper." 516 }, 517 "drama_conflict": { 518 "score": 0, 519 "justification": "No controversy; straightforward method comparison with existing tools." 520 }, 521 "demo_ability": { 522 "score": 1, 523 "justification": "Code is released on GitHub but requires LLM API access, SWE-bench setup, and substantial compute to run." 524 }, 525 "brand_recognition": { 526 "score": 1, 527 "justification": "Uses well-known benchmarks (SWE-bench) and models (Gemini, Claude), but the authors and lab (York University) are not widely known." 528 } 529 } 530 }