scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30835B)
      1 {
      2   "paper": {
      3     "title": "Specification Vibing for Automated Program Repair",
      4     "authors": [
      5       "Taohong Zhu",
      6       "Lucas C. Cordeiro",
      7       "Mustafa A. Mustafa",
      8       "Youcheng Sun"
      9     ],
     10     "year": 2026,
     11     "venue": "arXiv",
     12     "arxiv_id": "2602.08263"
     13   },
     14   "scan_version": 3,
     15   "active_modules": ["experimental_rigor", "data_leakage"],
     16   "methodology_tags": ["benchmark-eval"],
     17   "key_findings": "VibeRepair, a specification-centric APR approach, repairs 174 bugs on Defects4J v1.2 (19% over the best baseline) and 178 on Defects4J v2.0 (23% improvement) using a much smaller patch space (max 15 candidates vs hundreds to thousands for baselines). Selectively enabling an optional reasoning component only on default-repair failures achieves the best effectiveness-cost tradeoff. The approach generalizes across GPT-4, GPT-3.5, and DeepSeek-Coder, consistently matching or exceeding baselines.",
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Section 9 states: 'Our source code is publicly available at https://anonymous.4open.science/r/VibeRepair-9D73/' and the contributions list mentions open-sourcing the complete framework."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "All benchmarks used are public: Defects4J is available from its official repository, and RWB datasets are available at https://github.com/vinci-grape/ThinkRepair/ as stated in Section 9."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper states VibeRepair is implemented on LangChain and uses specific LLM APIs, but provides no requirements.txt, Dockerfile, or detailed dependency list with library versions."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No step-by-step reproduction instructions are provided in the paper. A repository link is given but the paper itself lacks a 'Reproducing Results' section or specific commands to run."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "Tables 3-6 report only point estimates (raw counts of correct/plausible patches) with no confidence intervals or error bars, despite using a stochastic LLM with temperature=1."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "No statistical significance tests are used. All claims of superiority (e.g., '19% improvement', '23% improvement') are based on raw count comparisons without any hypothesis tests."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "The paper reports absolute differences and percentage improvements with baseline context, e.g., 'VibeRepair correctly repairs 174 bugs, exceeding the strongest state-of-the-art baseline by 28 bugs, which corresponds to a 19% improvement' (Section 5.1.1)."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No justification for sample sizes. Defects4J v1.2 has 391 bugs and v2.0 has 438, but no power analysis or discussion of whether these sizes are adequate for the comparative claims."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. With temperature=1, outputs are stochastic, but only single-run results appear to be reported."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Table 3 compares against 12 baselines including both LLM-based (ReinFix, ChatRepair, ThinkRepair, RepairAgent, FitRepair, AlphaRepair, RAP-Gen) and traditional (GAMMA, TENURE, Tare, KNOD, Recoder) approaches."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Baselines include ReinFix (2025), ChatRepair (2024), ThinkRepair (2024), and RepairAgent (2024), representing the current state of the art in LLM-based APR."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "RQ2 (Section 5.2, Table 5) compares three variants: no reasoning component (VibeRepair_GPT4o), minimal reasoning (miniR), and always-on reasoning (maxR), isolating the reasoning component's contribution."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Two evaluation metrics are used throughout: number of correct patches and number of plausible patches (Section 4.3). Table 5 additionally reports average time and average cost per bug."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Section 4.3 states: 'a correct patch is a plausible patch that is semantically or syntactically equivalent to the reference developer patch, as determined through manual inspection.'"
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Defects4J and RWB are standardized benchmarks with predefined test suites. No tuning is performed on the test bugs. RWB datasets specifically collected after LLM training cutoffs provide additional held-out evaluation."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Table 3 provides per-project breakdowns (Chart, Closure, Lang, Math, Mockito, Time). Table 4 provides per-scenario breakdowns (MF, SF, SH, SL). Figure 7 shows unique fix analysis."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Section 2 (Motivation) provides a detailed failure case analysis of code-centric repair on Cli-20. Section 5.2 discusses cases where always-on reasoning degrades performance, noting 'excessive auxiliary information can negatively affect the focus of LLM during repair.'"
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "RQ2 shows that VibeRepair_maxR (always-on reasoning) underperforms VibeRepair_miniR (174 vs 178 correct on D4J v2.0), demonstrating that more information can hurt. SL performance is slightly below ReinFix in some cases (Table 4)."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Abstract claims of 174 bugs on D4J v1.2 (19% improvement) and 178 bugs on D4J v2.0 (23% improvement) are directly supported by Table 3. Claims of generalizability are supported by Table 6 across multiple LLMs."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The paper claims the specification-centric approach causes improvements ('VibeRepair consistently outperforms'), but comparisons with baselines confound the spec-centric design with differences in prompt engineering, feedback loops, and scaffolding. The RQ2 ablation only isolates the reasoning component, not the core spec-centric idea vs. direct code repair."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The title 'Specification Vibing for Automated Program Repair' and conclusion ('a promising direction for LLM-based APR') suggest general applicability, but all evaluation is exclusively on Java bugs from Defects4J and RWB. No other languages, domains, or bug types are tested."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "Section 6 discusses threats to validity (data leakage, nondeterminism, manual inspection) but does not discuss alternative explanations for the observed improvements, such as whether prompt design quality, iterative feedback, or the specific LangChain implementation (rather than the spec-centric paradigm) drives the gains."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "The paper measures correct patches (semantically equivalent to developer fixes) and calls this 'repair effectiveness.' The measurement directly matches the claim with no proxy gap."
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "References [30]-[32] specify exact model version IDs: gpt-3.5-turbo-0125, gpt-4-0613, gpt-4o-2024-05-13. Section 4.4 links each experiment to these specific versions."
    147       },
    148       "prompts_provided": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "Figures 4-6 show prompt structure diagrams and the specification template is given verbatim, but the full actual prompt text sent to the LLM is not provided. The role designation, context briefing, and reasoning guidance are described in natural language rather than reproduced as sent."
    152       },
    153       "hyperparameters_reported": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Section 4.4 states 'For all LLMs, the temperature is set to 1.' Repair attempts are limited to 5 per bug with up to 3 rounds of feedback per attempt. Similarity threshold for example retrieval is 0.6 (Section 3.2.2)."
    157       },
    158       "scaffolding_described": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "The three-phase pipeline (transformation, repair, generation) is described in detail (Section 3). The reasoning component's tools library (Table 1), example database with embedding-based retrieval, and ReAct framework are all documented."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section 4.2 describes the datasets: Defects4J v1.2 (391 bugs, 6 projects) and v2.0 (438 bugs, 9 projects) with Table 2 showing distributions. RWB v1.0/v2.0 are described with collection dates and project counts. Perfect fault localization is stated as the assumption."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section 6 'Threats to Validity' contains dedicated Internal Validity and External Validity subsections with substantive discussion spanning multiple paragraphs."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 6 discusses study-specific threats: data leakage from LLM training corpora, nondeterminism of LLMs controlled by using same base model as ReinFix, ensuring same fault localization assumptions, manual inspection subjectivity mitigated by following prior criteria, and cost estimation tied to specific pricing at evaluation time."
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "Section 6 notes 'VibeRepair may not cover all possible repair scenarios' but does not explicitly state what was NOT tested — no mention of excluded languages, bug types, non-Java settings, or non-benchmark repair scenarios."
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section 9 states: 'Our implementation of VibeRepair, the experimental scripts, and the repair results are publicly available at https://anonymous.4open.science/r/VibeRepair-9D73/ to support reproducibility.'"
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Section 4.2 describes both benchmark datasets in detail: Defects4J (391+438 bugs from 15 Java projects, divided into v1.2 and v2.0), RWB v1.0 (27 bugs from 5 projects, after Oct 2021), RWB v2.0 (27 bugs, after March 2023)."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": false,
    199         "answer": false,
    200         "justification": "No human participants. Data sources are standard established benchmarks (Defects4J, RWB)."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "The pipeline is documented: buggy code → transformation to specification → specification repair (optionally with reasoning) → code generation → validation against test suite → manual correctness inspection. Section 4.4 describes the experimental pipeline with specific limits (5 attempts, 3 feedback rounds)."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No funding sources, acknowledgments, or grants are mentioned anywhere in the paper."
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Author affiliations are clearly listed: University of Manchester (Zhu, Cordeiro, Mustafa) and Mohamed bin Zayed University of Artificial Intelligence (Sun). They are not evaluating their own institution's products."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "Cannot assess funder independence since no funding is disclosed. The absence of funding disclosure prevents verification."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No competing interests statement, patent disclosures, or financial interest declarations are found in the paper."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "The paper references specific model versions (gpt-4o-2024-05-13, etc.) but does not explicitly state the training data cutoff dates. It implicitly relies on RWB collection dates being after cutoffs without stating the cutoffs themselves."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": true,
    238         "answer": true,
    239         "justification": "Section 6 states: 'To mitigate this risk, we follow prior work and evaluate VibeRepair on bug cases that were collected after the known cutoff dates of the LLM training corpora.' RWB datasets are used specifically for this purpose."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": true,
    243         "answer": true,
    244         "justification": "Section 4.2 explains RWB datasets are used 'to mitigate the risk of data leakage,' with RWB v1.0 bugs from after Oct 2021 and RWB v2.0 from after March 2023. Section 6 acknowledges contamination as a threat and describes their mitigation strategy."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study. Evaluation combines automated test-suite validation with manual patch inspection."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants. The study involves only automated benchmark evaluation."
    257       },
    258       "demographics_reported": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "randomization_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "blinding_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       },
    278       "attrition_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in this study."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": true,
    287         "answer": true,
    288         "justification": "Table 5 reports average cost per bug: $0.066-$0.221 on D4J v2.0 (GPT-4o) and $0.491-$1.738 on RWB v1.0 (GPT-4). Pricing rates are stated: GPT-4o at $5/$15 per 1M input/output tokens, GPT-4 at $30/$60 per 1M input/output tokens."
    289       },
    290       "compute_budget_stated": {
    291         "applies": true,
    292         "answer": true,
    293         "justification": "Table 5 reports average time per bug (107-161s on D4J v2.0, 278-358s on RWB v1.0) and average cost per bug across all three configurations, allowing total compute derivation."
    294       }
    295     },
    296     "experimental_rigor": {
    297       "seed_sensitivity_reported": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "No seed sensitivity analysis is reported. Temperature is set to 1 (stochastic) but results are reported from what appears to be a single experimental run with no variance across seeds."
    301       },
    302       "number_of_runs_stated": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "The number of full experimental runs is not stated. The paper specifies '5×3' as the sampling budget (5 repair attempts × 3 feedback rounds per attempt), but does not state whether the entire experiment was run multiple times."
    306       },
    307       "hyperparameter_search_budget": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "No hyperparameter search budget is reported. Temperature (1), repair attempts (5), and feedback rounds (3) appear to be fixed without search. No justification for why these specific values were chosen."
    311       },
    312       "best_config_selection_justified": {
    313         "applies": true,
    314         "answer": true,
    315         "justification": "RQ2 (Table 5) systematically compares three reasoning strategies (none, miniR, maxR) on both effectiveness and cost, justifying the selection of miniR as the recommended configuration through transparent comparison."
    316       },
    317       "multiple_comparison_correction": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "No statistical tests are performed at all, so no multiple comparison correction is applied. Comparisons across 12+ baselines, multiple benchmarks, and multiple scenarios are all done via raw count comparisons."
    321       },
    322       "self_comparison_bias_addressed": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "While baseline results are reused from prior studies (avoiding re-implementation bias for baselines), the paper does not explicitly acknowledge or discuss the bias of authors evaluating their own system. The manual correctness determination is done by the authors themselves."
    326       },
    327       "compute_budget_vs_performance": {
    328         "applies": true,
    329         "answer": true,
    330         "justification": "Table 3 header shows sampling budgets for all tools (VibeRepair: 5×3, baselines: 100-5000). Section 5.1.1 explicitly notes VibeRepair achieves improvements 'with the smallest exploration budget among the compared approaches.' Table 5 compares cost across configurations."
    331       },
    332       "benchmark_construct_validity": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "No discussion of whether Defects4J and RWB actually measure the claimed 'repair effectiveness' or whether they are representative of real-world bug distributions. The benchmarks are used without questioning their construct validity."
    336       },
    337       "scaffold_confound_addressed": {
    338         "applies": true,
    339         "answer": false,
    340         "justification": "VibeRepair uses LangChain with a custom 3-phase pipeline and optional reasoning agent, while baselines use entirely different scaffolding. When comparing VibeRepair_GPT4o vs ReinFix_GPT4o, the LLM is controlled but the scaffold differs substantially. This confound is not discussed."
    341       }
    342     },
    343     "data_leakage": {
    344       "temporal_leakage_addressed": {
    345         "applies": true,
    346         "answer": true,
    347         "justification": "Sections 4.2 and 6 discuss temporal leakage: RWB v1.0 bugs collected after Oct 2021 and RWB v2.0 bugs collected after March 2023 are specifically used to evaluate on data collected after the models' training cutoffs."
    348       },
    349       "feature_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "The study uses perfect fault localization (Section 4.4), providing the model with precise bug locations that would not be available in practice. This is acknowledged as standard practice but not discussed as a form of information leakage that inflates results."
    353       },
    354       "non_independence_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No discussion of whether Defects4J bugs from the same projects share structural similarities or whether training data may contain code from the same repositories, potentially creating non-independence between training and test data."
    358       },
    359       "leakage_detection_method": {
    360         "applies": true,
    361         "answer": true,
    362         "justification": "Temporal splits are used as a concrete prevention method: RWB datasets contain bugs collected after the models' training cutoff dates (Section 4.2, Section 6)."
    363       }
    364     }
    365   },
    366   "claims": [
    367     {
    368       "claim": "VibeRepair correctly repairs 174 bugs on Defects4J v1.2, exceeding the strongest baseline (ReinFix) by 28 bugs (19% improvement).",
    369       "evidence": "Table 3 shows VibeRepair_miniR_GPT4o achieves 174/223 correct/plausible patches vs ReinFix_GPT4o's 146/207 on D4J v1.2 (Section 5.1.1).",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "VibeRepair correctly repairs 178 bugs on Defects4J v2.0, outperforming prior approaches by 33 bugs (23% improvement).",
    374       "evidence": "Table 3 shows VibeRepair_miniR_GPT4o achieves 178/223 vs ReinFix_GPT4o's 145/190 on D4J v2.0 (Section 5.1.1).",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "VibeRepair achieves strong results with a much smaller patch exploration space (5×3 = max 15) compared to baselines (100-5000).",
    379       "evidence": "Table 3 header lists sampling budgets. Section 5.1.1 states VibeRepair has 'the smallest exploration budget among the compared approaches.'",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "Selectively enabling the reasoning component only upon default-repair failure (miniR) provides the best effectiveness-cost tradeoff.",
    384       "evidence": "Table 5 shows miniR achieves 178 correct fixes at $0.074/bug vs maxR's 174 at $0.221/bug on D4J v2.0 (Section 5.2).",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "VibeRepair generalizes well across different LLM backbones including GPT-4, GPT-3.5, and DeepSeek-Coder.",
    389       "evidence": "Table 6 shows VibeRepair_miniR_GPT4 repairs 13 bugs (vs ReinFix's 10) on RWB v1.0. VibeRepair_miniR_DSC matches ReinFix_DSC (10 each) on RWB v2.0 (Section 5.3).",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "VibeRepair produces 41 unique correct fixes on both Defects4J v1.2 and v2.0 that no other evaluated tool can fix.",
    394       "evidence": "Figure 7 Venn diagrams show unique fix counts across tools (Section 5.1.3).",
    395       "supported": "moderate"
    396     }
    397   ],
    398   "red_flags": [
    399     {
    400       "flag": "No variance or uncertainty quantification",
    401       "detail": "All results are reported from apparently single experimental runs with temperature=1 (stochastic). No standard deviations, confidence intervals, or multi-run statistics are provided. With stochastic LLM outputs, a single run may not be representative."
    402     },
    403     {
    404       "flag": "No statistical significance tests",
    405       "detail": "All comparisons across 12+ baselines are raw count differences. Claims like '19% improvement' and '23% improvement' are not supported by any hypothesis testing, making it impossible to assess whether differences are statistically meaningful."
    406     },
    407     {
    408       "flag": "Baseline results from prior papers, not re-run",
    409       "detail": "Section 4.3 states: 'we reuse the repair results reported in prior studies for these baseline techniques rather than re-running all tools.' While this avoids re-implementation bias, it means baseline results may have been produced under subtly different conditions."
    410     },
    411     {
    412       "flag": "Perfect fault localization assumption",
    413       "detail": "All experiments assume perfect fault localization (Section 4.4), providing the exact bug location. This is standard in APR research but significantly inflates absolute repair numbers relative to real-world applicability."
    414     },
    415     {
    416       "flag": "Java-only evaluation with broad claims",
    417       "detail": "All benchmarks (Defects4J, RWB) are exclusively Java. The title, abstract, and conclusion frame the contribution as generally applicable to 'APR' and 'the era of vibe coding' without bounding claims to Java."
    418     },
    419     {
    420       "flag": "Scaffold confound in baseline comparisons",
    421       "detail": "VibeRepair uses a fundamentally different scaffolding (LangChain 3-phase pipeline + optional reasoning agent) than baselines like ReinFix. Even when the same LLM is used, the scaffold difference confounds attribution of improvements to the specification-centric paradigm."
    422     }
    423   ],
    424   "cited_papers": [
    425     {
    426       "title": "Automated program repair via conversation: Fixing 162 out of 337 bugs for $0.42 each using chatgpt",
    427       "authors": ["Chunqiu Steven Xia", "Lingming Zhang"],
    428       "year": 2024,
    429       "relevance": "LLM-based conversational program repair baseline; demonstrates cost-effective LLM repair via iterative feedback."
    430     },
    431     {
    432       "title": "Thinkrepair: Self-directed automated program repair",
    433       "authors": ["Xin Yin", "Chao Ni", "Shaohua Wang", "Zhenhao Li", "Limin Zeng", "Xiaohu Yang"],
    434       "year": 2024,
    435       "relevance": "Chain-of-thought prompting for LLM-based APR; provides the RWB benchmark used in evaluation."
    436     },
    437     {
    438       "title": "Repair Ingredients Are All You Need: Improving Large Language Model-Based Program Repair via Repair Ingredients Search",
    439       "authors": ["Jiayi Zhang", "Kai Huang", "Jian Zhang", "Yang Liu", "Chunyang Chen"],
    440       "year": 2025,
    441       "arxiv_id": "2506.23100",
    442       "relevance": "Primary baseline using repair ingredients (internal/external elements) to guide LLM-based program repair."
    443     },
    444     {
    445       "title": "Repairagent: An autonomous, llm-based agent for program repair",
    446       "authors": ["Islem Bouzenia", "Premkumar Devanbu", "Michael Pradel"],
    447       "year": 2024,
    448       "arxiv_id": "2403.17134",
    449       "relevance": "Autonomous LLM agent for program repair; demonstrates agentic approach to APR."
    450     },
    451     {
    452       "title": "Less training, more repairing please: revisiting automated program repair via zero-shot learning",
    453       "authors": ["Chunqiu Steven Xia", "Lingming Zhang"],
    454       "year": 2022,
    455       "relevance": "AlphaRepair: zero-shot LLM-based APR using infilling, establishing the LLM-as-repair paradigm."
    456     },
    457     {
    458       "title": "SpecRover: Code intent extraction via llms",
    459       "authors": ["Haifeng Ruan", "Yuntong Zhang", "Abhik Roychoudhury"],
    460       "year": 2024,
    461       "arxiv_id": "2408.02232",
    462       "relevance": "Closest related work: extracts intended program behavior to guide LLM repair, but relies on issue statements and operates at code level."
    463     },
    464     {
    465       "title": "ReAct: Synergizing reasoning and acting in language models",
    466       "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu", "Nan Du", "Izhak Shafran", "Karthik R Narasimhan", "Yuan Cao"],
    467       "year": 2022,
    468       "relevance": "Foundational reasoning-and-acting framework used for VibeRepair's optional reasoning component."
    469     },
    470     {
    471       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    472       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    473       "year": 2022,
    474       "relevance": "Core prompting methodology used in VibeRepair's repair phase for step-by-step reasoning guidance."
    475     },
    476     {
    477       "title": "Automated program repair in the era of large pre-trained language models",
    478       "authors": ["Chunqiu Steven Xia", "Yuxiang Wei", "Lingming Zhang"],
    479       "year": 2023,
    480       "relevance": "Systematic empirical study of LLM-based APR across models and prompting strategies."
    481     },
    482     {
    483       "title": "Revisiting the plastic surgery hypothesis via large language models",
    484       "authors": ["Chunqiu Steven Xia", "Yifeng Ding", "Lingming Zhang"],
    485       "year": 2023,
    486       "arxiv_id": "2303.10494",
    487       "relevance": "FitRepair: LLM-based APR using the plastic surgery hypothesis for patch generation."
    488     },
    489     {
    490       "title": "Rap-gen: Retrieval-augmented patch generation with codet5 for automatic program repair",
    491       "authors": ["Weishi Wang", "Yue Wang", "Shafiq Joty", "Steven CH Hoi"],
    492       "year": 2023,
    493       "relevance": "Retrieval-augmented patch generation approach for APR using code models."
    494     },
    495     {
    496       "title": "Aligning Requirement for Large Language Model's Code Generation",
    497       "authors": ["Zhao Tian", "Junjie Chen"],
    498       "year": 2025,
    499       "arxiv_id": "2509.01313",
    500       "relevance": "Studies how LLMs may distort requirement-related information during code generation, motivating specification-centric approaches."
    501     }
    502   ],
    503   "engagement_factors": {
    504     "practical_relevance": {
    505       "score": 2,
    506       "justification": "APR practitioners could adopt the specification-centric pipeline, but it requires LLM API access, LangChain setup, and perfect fault localization."
    507     },
    508     "surprise_contrarian": {
    509       "score": 1,
    510       "justification": "The specification-centric vs code-centric framing is a reasonable shift but not deeply contrarian to prevailing views on LLM-based repair."
    511     },
    512     "fear_safety": {
    513       "score": 0,
    514       "justification": "No AI safety or security concerns; focuses on improving automated bug repair."
    515     },
    516     "drama_conflict": {
    517       "score": 0,
    518       "justification": "No controversy or conflict; a straightforward technical contribution."
    519     },
    520     "demo_ability": {
    521       "score": 1,
    522       "justification": "Anonymous review repository is provided but it is not a pip-installable tool or live demo."
    523     },
    524     "brand_recognition": {
    525       "score": 1,
    526       "justification": "Uses GPT-4o (recognized model) but authors are from University of Manchester and MBZUAI, not top-tier AI brand labs."
    527     }
    528   }
    529 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs