scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (31092B)
      1 {
      2   "paper": {
      3     "title": "Repair Ingredients Are All You Need: Improving Large Language Model-Based Program Repair via Repair Ingredients Search",
      4     "authors": [
      5       "Jiayi Zhang",
      6       "Kai Huang",
      7       "Jian Zhang",
      8       "Yang Liu",
      9       "Chunyang Chen"
     10     ],
     11     "year": 2025,
     12     "venue": "ICSE 2026",
     13     "arxiv_id": "2506.23100",
     14     "doi": "10.48550/arXiv.2506.23100"
     15   },
     16   "scan_version": 3,
     17   "active_modules": ["experimental_rigor", "data_leakage"],
     18   "methodology_tags": ["benchmark-eval"],
     19   "key_findings": "ReinFix integrates internal repair ingredients (via Joern-based dependency analysis) and external repair ingredients (via enhanced RAG on historical bug-fix pairs) into an LLM-based ReAct agent for automated program repair. On Defects4J V1.2 and V2.0, ReinFix with GPT-4o fixes 146 and 145 bugs respectively, outperforming the best baselines by 32 and 38 bugs. An ablation study shows both internal (31 bug improvement) and external (23 bug improvement) ingredients contribute, with synergistic combination yielding 61 additional fixes over the base model. The framework generalizes to post-training-cutoff benchmarks (RWB) and multiple foundation models.",
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "The paper states 'Our source code and data are available at: https://sites.google.com/view/repairingredients' (Section 1, contributions). A URL is provided."
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The benchmarks used (Defects4J, RWB) are publicly available, and the authors state data is available at their project page. The TRANSFER dataset used for the vector database is also a publicly available corpus."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper mentions LangChain, Joern, and specific OpenAI models but provides no requirements.txt, Dockerfile, or detailed environment specification with library versions."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No step-by-step reproduction instructions are provided in the paper. The project page URL is given but the paper itself contains no reproduction guide."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "All results in Tables 3, 4, 5, and 6 are point estimates (counts of bugs fixed) with no confidence intervals or error bars."
     48       },
     49       "significance_tests": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The paper claims 'outperforms' and 'surpasses' multiple baselines based solely on comparing bug-fix counts without any statistical significance tests."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "The paper consistently reports absolute differences and percentage improvements with baseline context, e.g., 'fixed 32 more bugs (146 vs. 114)', '71.76% improvement' (Section 5.1, 5.3)."
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "No justification is given for benchmark sizes. The paper uses standard benchmarks (Defects4J V1.2: 391 bugs, V2.0: 438 bugs, RWB: 44 and 29 bugs) without discussing whether these sizes are adequate for the claims made."
     63       },
     64       "variance_reported": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "Despite using temperature=1 which introduces stochasticity, no variance, standard deviation, or results across multiple runs are reported. All results appear to be from single experimental runs."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Extensive baselines are included: ChatRepair, ThinkRepair, RepairAgent, FitRepair, GAMMA, TENURE, Tare, AlphaRepair, RAP-Gen, KNOD, Recoder, and TBar (Table 3)."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Baselines include recent 2024 works: ChatRepair (ISSTA 2024), ThinkRepair (ISSTA 2024), RepairAgent (2024). The baselines represent the state of the art in LLM-based APR."
     80       },
     81       "ablation_study": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "RQ3 (Section 5.3) presents a thorough ablation study with 6 variants: ReinFix_NN (no ingredients), ReinFix_DN (internal only), ReinFix_NP (external only), ReinFix_FP (FitRepair's approach), ReinFix_DR (RAP-Gen's approach), and ReinFix_DP (full system), shown in Table 5."
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "The paper reports correct fixes and plausible fixes (Table 3), unique fixes (Figure 8), per-scenario breakdowns (Table 4), and average cost per bug fix (Section 6)."
     90       },
     91       "human_evaluation": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Section 4.5 states 'the plausible patches are manually reviewed to verify the semantic correctness.' Human review distinguishes correct fixes from merely plausible (test-passing) patches."
     95       },
     96       "held_out_test_set": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Defects4J V1.2 and V2.0 are established held-out benchmarks. Additionally, RWB V1.0 and V2.0 were specifically constructed with bugs after LLM training cutoffs to serve as uncontaminated test sets (Section 4.2)."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Table 3 breaks down results by project (Chart, Closure, Lang, Math, Mockito, Time). Table 4 breaks down by repair scenario (multi-function, single-function, single-hunk, single-line). Table 6 breaks down RWB by project."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Section 5.3 discusses specific failure cases: Figure 10 shows Closure-102 where ReinFix_FP failed due to irrelevant donor code, and Section 5.3.2 discusses why ReinFix_DR retrieved irrelevant repair behavior for Closure-51."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The ablation variants that perform worse are reported (ReinFix_NN=85, ReinFix_FP=110 vs full=146). In the single-line scenario on Defects4J V2.0, ReinFix_GPT3.5 (47) slightly underperforms ChatRepair (48)."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "Abstract claims are supported: 'fixes 146 bugs, 32 more than baselines on V1.2' matches Table 3; '38 more bugs than SOTA on V2.0' matches Table 3 (145 vs 107); 'maintains best performance on recent benchmarks' matches Table 6."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "Causal claims ('repair ingredients enhance repair capabilities') are supported by the ablation study (Section 5.3) which systematically removes components. The controlled single-variable manipulation across 6 variants in Table 5 is adequate for the causal claims made."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The title 'Repair Ingredients Are All You Need' and abstract claim broad applicability to 'LLM-based program repair' generally. However, all experiments are on Java bugs only (Defects4J, RWB). The paper does not explicitly bound its claims to Java or discuss generalizability to other languages."
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "The threats section (Section 6) discusses data leakage and repair costs but does not consider alternative explanations such as: whether improvements come from increased prompt length/context rather than ingredient quality, whether the Joern-based analysis simply provides more tokens rather than better information, or confounds from different sampling budgets across baselines."
    137       },
    138       "proxy_outcome_distinction": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "The paper measures 'correct fixes' (human-verified semantic equivalence to ground truth) and 'plausible fixes' (pass all tests). These are direct measurements of repair capability and the claims match this granularity."
    142       }
    143     },
    144     "setup_transparency": {
    145       "model_versions_specified": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "The paper specifies 'gpt-3.5-turbo-0125' (ref [35]), 'gpt-4o-2024-05-13' (ref [39]), 'gpt-4-0613' (ref [36]), and 'gpt-4-1106-preview' (ref [38]) with specific version identifiers including snapshot dates."
    149       },
    150       "prompts_provided": {
    151         "applies": true,
    152         "answer": false,
    153         "justification": "No actual prompt text is provided. The paper describes the approach conceptually (ReAct framework, tool definitions) and shows tool formulations, but the system prompts, user prompts, and reasoning prompts used with the LLMs are not reproduced."
    154       },
    155       "hyperparameters_reported": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Section 4.5 reports: sampling temperature=1, max 3 repair attempts per bug, max 3 repair suggestions per attempt, max 5 candidate patches per suggestion (total patch space 3*3*5=45), top-1 plausible patch retained."
    159       },
    160       "scaffolding_described": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "The ReAct agent framework is described in detail: Algorithm 1 shows the full workflow, Table 1 describes 9 code analysis tools, Figure 3 shows the system architecture, and Figure 7 shows the execution chain with Thought-Action-Observation cycles."
    164       },
    165       "data_preprocessing_documented": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section 4.4 documents vector database construction: TRANSFER dataset → random 100K sample selection → GPT-4o root cause labeling → text-embedding-3-large embedding → exact match filtering against benchmarks → vector database storage."
    169       }
    170     },
    171     "limitations_and_scope": {
    172       "limitations_section_present": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "Section 6 'Threats to Validity' is a dedicated section discussing data leakage and repair costs as threats."
    176       },
    177       "threats_to_validity_specific": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "The threats section discusses specific concerns: training data cutoff dates for GPT-3.5 (September 2021) and DeepSeek-Coder (February 2023), specific cost figures ($0.06 and $1.45 per bug fix vs ChatRepair's $0.42), and how they used RWB benchmarks to mitigate data leakage."
    181       },
    182       "scope_boundaries_stated": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "The paper does not explicitly state scope boundaries. It evaluates only on Java bugs but does not acknowledge this limitation. It does not discuss what the results do NOT show (e.g., applicability to other languages, other bug types beyond Defects4J, real-world deployment scenarios without perfect fault localization)."
    186       }
    187     },
    188     "data_integrity": {
    189       "raw_data_available": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "The paper states 'Our source code and data are available at: https://sites.google.com/view/repairingredients' and uses publicly available benchmarks (Defects4J, RWB, TRANSFER dataset)."
    193       },
    194       "data_collection_described": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "The benchmarks are well-described: Defects4J V1.2 (391 bugs), V2.0 (438 bugs), RWB V1.0 (44 bugs after Oct 2021), V2.0 (29 bugs after March 2023). Vector database: 100K samples from TRANSFER dataset (Section 4.2, 4.4)."
    198       },
    199       "recruitment_methods_described": {
    200         "applies": false,
    201         "answer": false,
    202         "justification": "No human participants. All data sources are standard benchmarks (Defects4J, RWB) and publicly available datasets (TRANSFER)."
    203       },
    204       "data_pipeline_documented": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "The data pipeline is documented: TRANSFER → 100K random sample → GPT-4o root cause generation → embedding via text-embedding-3-large → exact match deduplication against benchmarks → vector database (Section 4.4, Figure 5)."
    208       }
    209     },
    210     "conflicts_of_interest": {
    211       "funding_disclosed": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No funding acknowledgment or grant information is mentioned anywhere in the paper."
    215       },
    216       "affiliations_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "Author affiliations are clearly listed: Nanyang Technological University (Singapore) and Technical University of Munich (Germany). The authors are not affiliated with OpenAI or any LLM provider whose models are evaluated."
    220       },
    221       "funder_independent_of_outcome": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "No funding is disclosed, making it impossible to assess funder independence. Absence of disclosure is not absence of conflict."
    225       },
    226       "financial_interests_declared": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No competing interests statement or financial interest declaration appears in the paper."
    230       }
    231     },
    232     "contamination": {
    233       "training_cutoff_stated": {
    234         "applies": true,
    235         "answer": true,
    236         "justification": "Section 4.2 states: 'pre-training data for ChatGPT (GPT-3.5) was collected before September 2021' and 'pre-training data for DeepSeek-Coder was collected from GitHub before February 2023.'"
    237       },
    238       "train_test_overlap_discussed": {
    239         "applies": true,
    240         "answer": true,
    241         "justification": "Section 4.2 and 6 discuss potential overlap. RWB benchmarks were specifically collected after training cutoffs (V1.0: after Oct 2021, V2.0: after March 2023). Section 4.4 describes exact match filtering to prevent vector database overlap with benchmarks."
    242       },
    243       "benchmark_contamination_addressed": {
    244         "applies": true,
    245         "answer": true,
    246         "justification": "Section 5.4 (RQ4) is dedicated to evaluating on post-training-cutoff benchmarks to mitigate contamination. Section 6 explicitly discusses data leakage risk and how the RWB evaluation and ablation study demonstrate gains are independent of data leakage."
    247       }
    248     },
    249     "human_studies": {
    250       "pre_registered": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in this study. All evaluation is on benchmark bug datasets."
    254       },
    255       "irb_or_ethics_approval": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants. The study evaluates automated program repair tools on code benchmarks."
    259       },
    260       "demographics_reported": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants."
    264       },
    265       "inclusion_exclusion_criteria": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants."
    269       },
    270       "randomization_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants."
    274       },
    275       "blinding_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants."
    279       },
    280       "attrition_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants."
    284       }
    285     },
    286     "cost_and_practicality": {
    287       "inference_cost_reported": {
    288         "applies": true,
    289         "answer": true,
    290         "justification": "Section 6 reports: 'ReinFix_GPT3.5 incurs an average cost of $0.06 per bug fix, while ReinFix_GPT4o averages $1.45 per bug fix,' compared to ChatRepair's $0.42 per bug fix."
    291       },
    292       "compute_budget_stated": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "Per-bug costs are reported but total computational budget (total API spend, total GPU hours for Joern analysis, embedding costs) is not stated."
    296       }
    297     },
    298     "experimental_rigor": {
    299       "seed_sensitivity_reported": {
    300         "applies": true,
    301         "answer": false,
    302         "justification": "No results across multiple random seeds are reported. The paper uses temperature=1 (introducing randomness) but reports single-run results without seed sensitivity analysis."
    303       },
    304       "number_of_runs_stated": {
    305         "applies": true,
    306         "answer": false,
    307         "justification": "The paper states the maximum patch space (3*3*5=45) but does not state how many independent experimental runs produced the reported results."
    308       },
    309       "hyperparameter_search_budget": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "No hyperparameter search is described. The settings (3 attempts, 3 suggestions, 5 patches, temperature=1) appear to be fixed a priori without justification or search."
    313       },
    314       "best_config_selection_justified": {
    315         "applies": true,
    316         "answer": false,
    317         "justification": "The paper does not explain why 3 attempts, 3 suggestions, and 5 patches were chosen as the configuration. No validation-based selection is described."
    318       },
    319       "multiple_comparison_correction": {
    320         "applies": true,
    321         "answer": false,
    322         "justification": "The paper makes numerous comparisons across 12+ baselines, 2 benchmarks, 4 repair scenarios, and 6 ablation variants without any statistical testing, let alone multiple comparison correction."
    323       },
    324       "self_comparison_bias_addressed": {
    325         "applies": true,
    326         "answer": false,
    327         "justification": "Section 4.3 states they 'reuse the reported results from previous studies instead of directly running the APR tools,' which is good practice. However, the authors do not explicitly discuss self-comparison bias or acknowledge this as a bias mitigation strategy."
    328       },
    329       "compute_budget_vs_performance": {
    330         "applies": true,
    331         "answer": false,
    332         "justification": "Sampling budgets vary dramatically across baselines: ReinFix uses 45 patches, ChatRepair 500, AlphaRepair 5000. While ReinFix's cost efficiency is noted in Section 6, no systematic performance-at-matched-compute analysis is provided."
    333       },
    334       "benchmark_construct_validity": {
    335         "applies": true,
    336         "answer": false,
    337         "justification": "The paper does not discuss whether Defects4J accurately measures general program repair capability. Known limitations of Defects4J (e.g., Java-only, curated bugs, perfect FL assumption) are not discussed."
    338       },
    339       "scaffold_confound_addressed": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "ReinFix introduces a ReAct agent scaffold with specialized tools, while baselines use different scaffolding (ChatRepair: conversational, ThinkRepair: CoT). Cross-baseline comparisons conflate model, scaffold, and ingredient effects. The ablation addresses internal scaffold effects but not cross-baseline scaffold confounds."
    343       }
    344     },
    345     "data_leakage": {
    346       "temporal_leakage_addressed": {
    347         "applies": true,
    348         "answer": true,
    349         "justification": "Section 4.2 discusses training cutoff dates and Section 5.4 evaluates on RWB benchmarks specifically constructed with bugs after LLM training cutoffs to prevent temporal leakage."
    350       },
    351       "feature_leakage_addressed": {
    352         "applies": true,
    353         "answer": false,
    354         "justification": "The paper uses perfect fault localization (Section 4.5) which provides the exact buggy location — information not available in real-world use. While this follows APR convention, it constitutes feature leakage that is not critically discussed."
    355       },
    356       "non_independence_addressed": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "The vector database uses 100K samples from the TRANSFER dataset, filtered by exact match against the benchmarks. However, near-duplicate or structurally similar bugs are not addressed: the paper provides no analysis of whether TRANSFER entries share structural similarities with Defects4J bugs beyond exact matches."
    360       },
    361       "leakage_detection_method": {
    362         "applies": true,
    363         "answer": true,
    364         "justification": "Two concrete methods are applied: (1) exact match filtering to remove overlapping samples between the TRANSFER vector database and benchmarks (Section 4.4), and (2) temporal splits using RWB benchmarks constructed after model training cutoffs (Section 4.2)."
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "ReinFix fixes 146 bugs on Defects4J V1.2, 32 more than the best baseline ChatRepair (114).",
    371       "evidence": "Table 3 shows ReinFix_GPT4o achieves 146 correct fixes vs ChatRepair's 114 on Defects4J V1.2 (Section 5.1).",
    372       "supported": "strong"
    373     },
    374     {
    375       "claim": "ReinFix fixes 145 bugs on Defects4J V2.0, 38 more than the best baseline ThinkRepair (107).",
    376       "evidence": "Table 3 shows ReinFix_GPT4o achieves 145 correct fixes vs ThinkRepair's 107 on Defects4J V2.0 (Section 5.1).",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "Internal dependency-analysis-based ingredient search improves repair by 31 bugs over the base model (116 vs 85).",
    381       "evidence": "Table 5 ablation: ReinFix_DN (116) vs ReinFix_NN (85) on Defects4J V1.2 (Section 5.3.1).",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "External pattern-matching-based ingredient search improves repair by 23 bugs over the base model (108 vs 85).",
    386       "evidence": "Table 5 ablation: ReinFix_NP (108) vs ReinFix_NN (85) on Defects4J V1.2 (Section 5.3.1).",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "The combined approach yields a 71.76% improvement over the base model, fixing 61 more bugs (146 vs 85).",
    391       "evidence": "Table 5: ReinFix_DP (146) vs ReinFix_NN (85) on Defects4J V1.2 (Section 5.3.1).",
    392       "supported": "strong"
    393     },
    394     {
    395       "claim": "ReinFix's dependency-analysis approach outperforms FitRepair's similarity-based approach by 36 bugs (146 vs 110).",
    396       "evidence": "Table 5: ReinFix_DP (146) vs ReinFix_FP (110) on Defects4J V1.2 (Section 5.3.2).",
    397       "supported": "moderate"
    398     },
    399     {
    400       "claim": "ReinFix maintains best performance on recent benchmarks free of data leakage risk, achieving 10.34% improvement over ThinkRepair on RWB.",
    401       "evidence": "Table 6 shows ReinFix_GPT3.5 fixes 20/44 on RWB V1.0 (vs ThinkRepair 19) and ReinFix_DSC fixes 12/29 on RWB V2.0 (vs ThinkRepair* 10) (Section 5.4).",
    402       "supported": "moderate"
    403     },
    404     {
    405       "claim": "ReinFix_GPT3.5 achieves 21 unique fixes compared to recent APR tools using the same base model.",
    406       "evidence": "Figure 8 Venn diagram on Defects4J V1.2 showing 21 unique fixes for ReinFix_GPT3.5 (Section 5.1).",
    407       "supported": "moderate"
    408     }
    409   ],
    410   "red_flags": [
    411     {
    412       "flag": "No uncertainty quantification",
    413       "detail": "All results are point estimates with no error bars, confidence intervals, or variance across runs, despite using temperature=1 which introduces substantial randomness. Different random seeds could yield different bug-fix counts."
    414     },
    415     {
    416       "flag": "Unequal sampling budgets across baselines",
    417       "detail": "ReinFix uses a maximum of 45 patches (3*3*5) while baselines use vastly different budgets: AlphaRepair 5000, FitRepair 4000, ChatRepair 500, ThinkRepair 125. Direct comparison of bug-fix counts without normalizing for compute effort is misleading. While ReinFix's lower cost is noted, no performance-at-matched-budget analysis is provided."
    418     },
    419     {
    420       "flag": "Reused baseline results",
    421       "detail": "Section 4.3 acknowledges baseline results are reused from prior publications rather than reproduced. Different papers may use different hardware, API versions, prompt formats, or even different subsets of the benchmark, making direct comparison less reliable."
    422     },
    423     {
    424       "flag": "Perfect fault localization assumption",
    425       "detail": "Section 4.5 uses perfect fault localization, meaning the exact buggy location is given to the tool. This significantly simplifies the repair task and is unrealistic for practical deployment, though it follows APR convention."
    426     },
    427     {
    428       "flag": "Java-only evaluation with broad claims",
    429       "detail": "The title claims 'Repair Ingredients Are All You Need' for LLM-based program repair generally, but all experiments are exclusively on Java bugs (Defects4J, RWB). No evidence is provided for other languages."
    430     }
    431   ],
    432   "cited_papers": [
    433     {
    434       "title": "RepairAgent: An Autonomous, LLM-Based Agent for Program Repair",
    435       "authors": ["Islem Bouzenia", "Premkumar Devanbu", "Michael Pradel"],
    436       "year": 2024,
    437       "arxiv_id": "2403.17134",
    438       "relevance": "LLM-based agent for autonomous program repair, directly comparable baseline and demonstrates the agent paradigm in APR."
    439     },
    440     {
    441       "title": "ThinkRepair: Self-Directed Automated Program Repair",
    442       "authors": ["Xin Yin", "Chao Ni", "Shaohua Wang", "Zhenhao Li", "Limin Zeng", "Xiaohu Yang"],
    443       "year": 2024,
    444       "relevance": "Key baseline using Chain-of-Thought prompting on ChatGPT for program repair with self-directed reasoning."
    445     },
    446     {
    447       "title": "Automated Program Repair via Conversation: Fixing 162 out of 337 Bugs for $0.42 Each using ChatGPT",
    448       "authors": ["Chunqiu Steven Xia", "Lingming Zhang"],
    449       "year": 2024,
    450       "relevance": "ChatRepair introduces conversational repair paradigm with ChatGPT, key baseline for LLM-based APR cost and effectiveness."
    451     },
    452     {
    453       "title": "The Plastic Surgery Hypothesis in the Era of Large Language Models",
    454       "authors": ["Chunqiu Steven Xia", "Yifeng Ding", "Lingming Zhang"],
    455       "year": 2023,
    456       "relevance": "FitRepair leverages repair ingredients via fine-tuning CodeT5 on buggy projects, directly related prior work on internal repair ingredients."
    457     },
    458     {
    459       "title": "RAP-Gen: Retrieval-Augmented Patch Generation with CodeT5 for Automatic Program Repair",
    460       "authors": ["Weishi Wang", "Yue Wang", "Shafiq Joty", "Steven CH Hoi"],
    461       "year": 2023,
    462       "relevance": "RAG-based approach to retrieve external bug-fix pairs for patch generation, directly related prior work on external repair ingredients."
    463     },
    464     {
    465       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    466       "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu", "Nan Du", "Izhak Shafran", "Karthik Narasimhan", "Yuan Cao"],
    467       "year": 2023,
    468       "relevance": "The ReAct framework for LLM-based agents that ReinFix is built upon, foundational for agentic AI workflows."
    469     },
    470     {
    471       "title": "A Unified Debugging Approach via LLM-Based Multi-Agent Synergy",
    472       "authors": ["Cheryl Lee", "Chunqiu Steven Xia", "Jen-tse Huang", "Zhouruixin Zhu", "Lingming Zhang", "Michael R Lyu"],
    473       "year": 2024,
    474       "arxiv_id": "2404.17153",
    475       "relevance": "FixAgent demonstrates multi-agent collaboration for debugging, relevant to LLM agent capabilities in software engineering."
    476     },
    477     {
    478       "title": "Large Language Model-Based Agents for Software Engineering: A Survey",
    479       "authors": ["Junwei Liu", "Kaixin Wang", "Yixuan Chen", "Xin Peng", "Zhenpeng Chen", "Lingming Zhang", "Yiling Lou"],
    480       "year": 2024,
    481       "arxiv_id": "2409.02977",
    482       "relevance": "Comprehensive survey of LLM-based agents in software engineering tasks including program repair."
    483     },
    484     {
    485       "title": "Copiloting the copilots: Fusing large language models with completion engines for automated program repair",
    486       "authors": ["Yuxiang Wei", "Chunqiu Steven Xia", "Lingming Zhang"],
    487       "year": 2023,
    488       "relevance": "Combines LLMs with code completion engines for APR, demonstrating hybrid approaches to enhance LLM repair capabilities."
    489     },
    490     {
    491       "title": "Demystifying LLM-Based Software Engineering Agents",
    492       "authors": ["Chunqiu Steven Xia", "Yinlin Deng", "Soren Dunn", "Lingming Zhang"],
    493       "year": 2025,
    494       "relevance": "Study of LLM-based software engineering agents examining their effectiveness, relevant to understanding agent-based approaches."
    495     },
    496     {
    497       "title": "Understanding Software Engineering Agents: A Study of Thought-Action-Result Trajectories",
    498       "authors": ["Islem Bouzenia", "Michael Pradel"],
    499       "year": 2025,
    500       "arxiv_id": "2506.18824",
    501       "relevance": "Analysis of SE agent trajectories relevant to understanding how LLM agents reason through software tasks."
    502     },
    503     {
    504       "title": "An Empirical Study on Fine-Tuning Large Language Models of Code for Automated Program Repair",
    505       "authors": ["Kai Huang", "Xiangxin Meng", "Jian Zhang", "Yang Liu", "Wenjie Wang", "Shuhao Li", "Yuqing Zhang"],
    506       "year": 2023,
    507       "relevance": "Empirical study on fine-tuning LLMs for APR, foundational work on LLM capabilities for code repair."
    508     }
    509   ],
    510   "engagement_factors": {
    511     "practical_relevance": {
    512       "score": 2,
    513       "justification": "ReinFix is a usable framework built on LangChain with released code, applicable to Java APR with any LLM, though it requires Joern setup and vector database construction."
    514     },
    515     "surprise_contrarian": {
    516       "score": 1,
    517       "justification": "Confirms the intuition that providing LLMs with more context (repair ingredients) helps, rather than challenging conventional wisdom."
    518     },
    519     "fear_safety": {
    520       "score": 0,
    521       "justification": "No safety, security, or AI risk concerns — the paper is about improving automated bug fixing."
    522     },
    523     "drama_conflict": {
    524       "score": 0,
    525       "justification": "No controversial claims or conflicts; straightforward empirical improvement over baselines."
    526     },
    527     "demo_ability": {
    528       "score": 1,
    529       "justification": "Code is released at a Google Sites page, but requires substantial setup (Joern, vector database, OpenAI API) to try."
    530     },
    531     "brand_recognition": {
    532       "score": 1,
    533       "justification": "Authors from NTU and TUM (reputable but not headline-grabbing labs); uses GPT-4o which is well-known but the tool itself is not a major brand."
    534     }
    535   }
    536 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs