scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25111B)
      1 {
      2   "paper": {
      3     "title": "A Survey of LLM-based Software Repair: Taxonomies, Design Paradigms, and Applications",
      4     "authors": [
      5       "Boyang Yang",
      6       "Zijian Cai",
      7       "Fengling Liu",
      8       "Bach Le",
      9       "Lingming Zhang",
     10       "Tegawendé F. Bissyandé",
     11       "Yang Liu",
     12       "Haoye Tian"
     13     ],
     14     "year": 2025,
     15     "venue": "ACM Transactions on Software Engineering and Methodology",
     16     "arxiv_id": "2506.23749",
     17     "doi": "10.1145/nnnnnnn.nnnnnnn"
     18   },
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper provides a GitHub repository URL (https://github.com/GLEAM-Lab/ProgramRepair) multiple times (Section 1, Section 2.2, Section 2.4) containing their scripted survey pipeline, screening scripts, per-stage record lists, and analysis notebooks."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The paper states that their replication package includes per-stage record lists, extracted metadata, the representative papers list, and analysis notebooks. The artifacts at the GitHub repository are described as 'publicly available' (Section 1)."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No environment or dependency specifications (requirements.txt, library versions, etc.) are mentioned in the paper for running their survey pipeline scripts."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "While the paper mentions scripts and notebooks are available, no step-by-step reproduction instructions are provided in the paper itself. The paper states the artifacts are 'publicly available' but does not describe how to execute the pipeline."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "This is a survey paper that does not run its own experiments. It reports descriptive counts and published results from other papers, not its own statistical estimates."
     47       },
     48       "significance_tests": {
     49         "applies": false,
     50         "answer": false,
     51         "justification": "As a systematic survey, this paper does not make comparative claims based on its own experiments. It summarizes published results from other papers."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": false,
     55         "answer": false,
     56         "justification": "The paper does not run its own experiments; it summarizes effect sizes reported by the surveyed papers."
     57       },
     58       "sample_size_justified": {
     59         "applies": false,
     60         "answer": false,
     61         "justification": "As a survey, the paper does not run experiments with sample sizes. The corpus size (62 papers) is the result of a systematic screening process rather than a statistical sample."
     62       },
     63       "variance_reported": {
     64         "applies": false,
     65         "answer": false,
     66         "justification": "No experiments are conducted; this is a survey paper that reports published results from other studies."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The paper compares its taxonomy and coverage against nine prior surveys in Table 8 (Section 9, Related Work), identifying specific gaps each prior survey has and how this survey addresses them."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The prior surveys compared in Table 8 range from 2022 to 2025, including recent works by Zhang et al. (2024), Haque et al. (2025), He et al. (2025), and Zhou et al. (2025). These are contemporary and relevant comparisons."
     79       },
     80       "ablation_study": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "As a survey paper, there is no system with components to ablate."
     84       },
     85       "multiple_metrics": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "The paper does not run its own evaluation experiments. It reports metrics from surveyed papers (pass@k, accuracy, F1, etc.)."
     89       },
     90       "human_evaluation": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "This is a survey paper classifying and comparing existing systems. Human evaluation of system outputs is not relevant to the paper's claims, which are about taxonomy organization and coverage gaps."
     94       },
     95       "held_out_test_set": {
     96         "applies": false,
     97         "answer": false,
     98         "justification": "No experiments are conducted; this is a survey."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper provides per-paradigm breakdowns (Tables 3-9), per-year adoption counts (Table 4), per-benchmark trend snapshots (Tables 5-7), and per-sub-paradigm detailed system information (Table 9). Figure 4 shows defect scope distribution and Figure 5 shows venue distribution."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper discusses limitations and failure modes of each paradigm throughout Sections 4-7. Section 10 (Open Challenges) systematically addresses bottlenecks including evaluation reliability issues, workflow robustness problems, retrieval failures, and security concerns."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The paper reports negative findings: prompt sensitivity causing repair failures (Section 5.1), diminishing returns with verbose context (Section 5.1), benchmark contamination inflating scores (Section 10), weak test suites hiding semantic faults (Section 10), and high coordination costs offsetting accuracy gains in agentic systems (Section 7.3)."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The abstract claims a unified taxonomy of 62 systems into four paradigms with two augmentation layers, consolidation of evaluation practice on benchmarks, and public artifact release. All are supported: Table 3 maps all 62 systems, Tables 5-7 consolidate benchmark results, and the GitHub URL is provided."
    121       },
    122       "causal_claims_justified": {
    123         "applies": false,
    124         "answer": false,
    125         "justification": "The paper does not make causal claims. It describes taxonomic organization, adoption trends, and benchmark results from other papers. Language like 'adoption shifts' describes observed patterns, not causal effects."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "The paper explicitly states it treats the 62 papers as 'a representative sample of current LLM-based software repair research at these venues rather than a complete survey of all existing systems' (Section 2.4). External validity threats are discussed in Section 11.2."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "Section 8 (Answer to RQ3) explicitly discusses how protocol differences, not just model quality, shape reported repair rates. The paper notes that improvements may arise from 'better models, from more generous pass@k, or from stronger evaluation setups' (Section 8). Section 11 discusses threats to validity including internal, external, and construct validity."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "This is a survey paper that does not use any LLMs in its own methodology. It reports model versions used by the surveyed papers."
    143       },
    144       "prompts_provided": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "The paper does not use prompting in its own methodology; it surveys papers that use prompting."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": false,
    151         "answer": false,
    152         "justification": "No LLM experiments are conducted. The paper's methodology is a manual systematic review process."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used in this survey's own methodology."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "The paper documents its filtering pipeline in detail: 1,581 initial hits → 994 after repair-related filter → 726 after LLM-related filter → 449 after benchmark-related filter → 462 after snowballing → 62 representative works (Table 2). Filtering criteria at each stage are described in Sections 2.2-2.4, with inclusion/exclusion criteria explicitly stated in Section 2.3. Keyword families are listed in Table 1."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 11 ('Threats to Validity') presents a dedicated three-part discussion covering internal, external, and construct validity threats."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "The threats are specific to this study: representation focuses on tier-1 venues (excluding proprietary-data-only tools), headline scores reported as published without normalization of pass@k or budgets, no joint reevaluation performed, and coding scheme ambiguity is mitigated by inter-rater discussion (Sections 11.1-11.3)."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "The paper explicitly states its scope boundaries: year filter >=2022, tier-1 SE/AI/Security venues, papers with reproducible benchmark evaluation, and a representative rather than exhaustive sample of 62 papers. Section 2.3 lists detailed exclusion criteria. Section 11.2 states 'tools evaluated only on proprietary data fall outside our scope.'"
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "The paper states that 'Per-stage record lists, the scripts for fetching and filtering papers from multiple sources, and analysis notebooks that regenerate our tables and figures are provided in our replication package' (Section 2.4)."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 2.2 describes the data collection procedure in detail: structured queries on ACM DL, IEEE Xplore, DBLP, and arXiv; query templates combining core repair terms, LLM names, benchmark names, and tool names (Table 1); year filter >=2022; last query run on 31 October 2025; backward citation snowballing."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants are involved. The 'sample' is a corpus of research papers collected via database queries and snowballing."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Figure 3 shows the full pipeline workflow. Table 2 reports exact record counts at each filtering stage (1,581 → 994 → 726 → 449 → 462 → 62). Section 2.4 describes each stage including automatic filters, snowballing, and manual screening with inter-rater agreement via Cohen's kappa."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding or acknowledgments section is present in the paper. There is no mention of funding sources."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "All author affiliations are listed prominently on the first page. Several authors (Boyang Yang, Haoye Tian) are authors of multiple surveyed systems (CREF, KGCompass, MORepair, LANTERN), and this authorship overlap is visible through the reference list."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding is disclosed at all. Without knowing the funding source, independence cannot be assessed."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "This is a survey paper that does not evaluate any pre-trained model on a benchmark."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "This is a survey paper that does not evaluate any pre-trained model on a benchmark. (The paper does discuss contamination as an issue in the surveyed works, but this checklist item is about the paper's own methodology.)"
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "This is a survey paper that does not evaluate any pre-trained model on a benchmark."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants are involved in this survey paper."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants are involved in this survey paper."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants are involved in this survey paper."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants are involved in this survey paper."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants are involved in this survey paper."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants are involved in this survey paper."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants are involved in this survey paper."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "This is a survey paper. It does not propose or run its own method with inference costs. It reports costs from surveyed papers (e.g., ChatRepair at ~$0.42 per bug)."
    285       },
    286       "compute_budget_stated": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "This is a survey paper. Its own methodology involves literature search and manual screening, not compute-intensive model runs."
    290       }
    291     }
    292   },
    293   "claims": [
    294     {
    295       "claim": "Adoption of LLM-based software repair paradigms has shifted from single-turn prompting toward procedural and agentic workflows between 2022 and October 2025.",
    296       "evidence": "Table 4 shows year-by-year distribution: prompting dominated in 2022-2023 (8 of 14 papers), while procedural (8) and agentic (10) systems together account for 18 of 29 papers in 2025. (Section 3.4, Answer to RQ1)",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "The unified four-paradigm taxonomy (fine-tuning, prompting, procedural, agentic) with orthogonal retrieval and analysis augmentation layers organizes the LLM-based software repair design space more precisely than prior surveys.",
    301       "evidence": "Table 8 compares nine prior surveys and identifies three specific gaps: no unified design space treating all four as first-class paradigms, no orthogonal augmentation modeling, and no cell-wise benchmark comparison. The paper maps 62 systems into this framework with Table 3 and Table 9. (Section 9)",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "Reported repair rates are shaped as much by protocol design (fault localization assumptions, pass@k budgets, benchmark subsets) as by model quality.",
    306       "evidence": "Tables 5-7 record evaluation assumptions alongside scores, showing that systems evaluate on different subsets of Defects4J, use varying pass@k budgets (pass@1 to pass@5000), and make different fault localization assumptions (perfect FL vs. end-to-end). The answer to RQ3 makes this point explicit. (Section 8)",
    307       "supported": "strong"
    308     },
    309     {
    310       "claim": "Four bottlenecks remain for semantic correctness and repository-scale repair: shallow test suites, expensive multi-step workflows, noisy retrieval, and weak patch assurance.",
    311       "evidence": "Section 10 discusses each bottleneck with specific references to surveyed systems: data leakage and weak test suites inflating scores, token and compute costs in agentic workflows, embedding-based retrieval missing key files, and LLM-generated patches introducing latent bugs. (Section 10, Answer to RQ4)",
    312       "supported": "moderate"
    313     },
    314     {
    315       "claim": "Two authors independently screened 462 records with disagreements resolved through discussion and a third author consulted for borderline cases, using Cohen's kappa for inter-rater agreement.",
    316       "evidence": "Section 2.4 states this procedure but the actual Cohen's kappa value and detailed disagreement statistics are deferred to the replication package rather than reported in the paper.",
    317       "supported": "weak"
    318     }
    319   ],
    320   "methodology_tags": [
    321     "meta-analysis"
    322   ],
    323   "key_findings": "This systematic survey proposes a unified taxonomy of 62 LLM-based software repair systems organized along two axes: control authority (who steers the repair loop) and parameter adaptation (whether the base model is fine-tuned or frozen), yielding four paradigms (fine-tuning, prompting, procedural, agentic) with retrieval and analysis augmentation as orthogonal layers. The survey finds that adoption has shifted from single-turn prompting toward procedural and agentic workflows, and that reported repair rates are heavily influenced by evaluation protocol choices (benchmark subsets, pass@k budgets, fault localization assumptions), making cross-paper comparisons unreliable without accounting for these differences. The paper consolidates benchmark trends across Defects4J, HumanEval-Java, and SWE-bench, revealing that progress on repository-level repair is closely tied to better context management and feedback rather than model scaling alone.",
    324   "red_flags": [
    325     {
    326       "flag": "Authors survey their own systems",
    327       "detail": "Several authors (Boyang Yang, Haoye Tian, and others) are creators of multiple surveyed systems including CREF [126], KGCompass [127], MORepair [128], LANTERN [79], and Luo et al. [78]. At least 5 of the 62 surveyed papers are authored by members of the survey team. This potential conflict of interest is not disclosed or discussed."
    328     },
    329     {
    330       "flag": "Inter-rater agreement statistics deferred to replication package",
    331       "detail": "The paper states that Cohen's kappa was measured for the screening process but does not report the actual value in the paper. The coding guidelines and detailed statistics are deferred to the replication package, making it impossible to assess screening reliability from the paper alone."
    332     },
    333     {
    334       "flag": "Survey does not perform quality assessment of individual studies",
    335       "detail": "While the paper records 'basic quality indicators' for each paper (mentioned in Sections 2.3 and 2.4), it does not systematically assess the methodological quality of the 62 surveyed studies. The survey summarizes claimed results at face value without evaluating whether reported numbers are credible or methodologically sound, which risks laundering weak results."
    336     },
    337     {
    338       "flag": "No funding or competing interests disclosure",
    339       "detail": "The paper contains no acknowledgments section, no funding disclosure, and no competing interests statement, despite the authors being affiliated with multiple universities and having published several of the surveyed systems."
    340     }
    341   ],
    342   "cited_papers": [
    343     {
    344       "title": "Agentless: Demystifying LLM-Based Software Engineering Agents",
    345       "authors": ["Chunqiu Steven Xia", "Yinlin Deng", "Soren Dunn", "Lingming Zhang"],
    346       "year": 2025,
    347       "relevance": "Key procedural pipeline for repository-level repair, achieving competitive results on SWE-bench without agentic autonomy."
    348     },
    349     {
    350       "title": "SWE-Agent: Agent-Computer Interfaces Enable Automated Software Engineering",
    351       "authors": ["John Yang", "Carlos Jimenez", "Alexander Wettig", "Kilian Lieret", "Shunyu Yao", "Karthik Narasimhan", "Ofir Press"],
    352       "year": 2024,
    353       "relevance": "Foundational tool-augmented agent for software repair using ReAct loop with GPT-4 on SWE-bench."
    354     },
    355     {
    356       "title": "SWE-bench: Can Language Models Resolve Real-world Github Issues?",
    357       "authors": ["Carlos E Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik R Narasimhan"],
    358       "year": 2024,
    359       "relevance": "The primary repository-level repair benchmark used to evaluate agentic and procedural repair systems."
    360     },
    361     {
    362       "title": "AutoCodeRover: Autonomous Program Improvement",
    363       "authors": ["Yuntong Zhang", "Haifeng Ruan", "Zhiyu Fan", "Abhik Roychoudhury"],
    364       "year": 2024,
    365       "relevance": "Tool-augmented agent using AST navigation and spectrum-based fault localization for SWE-bench repair."
    366     },
    367     {
    368       "title": "ChatRepair: Automated Program Repair via Conversation",
    369       "authors": ["Chunqiu Steven Xia", "Lingming Zhang"],
    370       "year": 2024,
    371       "relevance": "Pioneering test-in-the-loop procedural approach demonstrating cost-effective conversational repair with ChatGPT."
    372     },
    373     {
    374       "title": "RepairAgent: An Autonomous, LLM-Based Agent for Program Repair",
    375       "authors": ["Islem Bouzenia", "Premkumar Devanbu", "Michael Pradel"],
    376       "year": 2025,
    377       "relevance": "Agent-based repair system using finite-state controller with tool access, achieving state-of-the-art on Defects4J."
    378     },
    379     {
    380       "title": "SWE-RL: Advancing LLM Reasoning via Reinforcement Learning on Open Software Evolution",
    381       "authors": ["Yuxiang Wei", "Olivier Duchenne", "Jade Copet"],
    382       "year": 2025,
    383       "relevance": "Demonstrates RL fine-tuning on real GitHub evolution traces for repository-level repair, state-of-the-art among open-source LLMs on SWE-bench Verified."
    384     },
    385     {
    386       "title": "OpenHands: An Open Platform for AI Software Developers as Generalist Agents",
    387       "authors": ["Xingyao Wang", "Boxuan Li", "Yufan Song"],
    388       "year": 2024,
    389       "relevance": "Open platform for tool-augmented agentic software development and repair on SWE-bench."
    390     },
    391     {
    392       "title": "Abstain and Validate: A Dual-LLM Policy for Reducing Noise in Agentic Program Repair",
    393       "authors": ["José Cambronero", "Michele Tufano", "Sherry Shi"],
    394       "year": 2025,
    395       "arxiv_id": "2510.03217",
    396       "relevance": "Industrial-scale LLM-as-judge approach with abstention and validation policies for production repair at Google."
    397     },
    398     {
    399       "title": "MAGIS: LLM-Based Multi-Agent Framework for GitHub Issue Resolution",
    400       "authors": ["Wei Tao", "Yucheng Zhou", "Yanlin Wang", "Wenqiang Zhang", "Hongyu Zhang", "Yu Cheng"],
    401       "year": 2024,
    402       "relevance": "Self-controlled multi-agent system with role separation for repository-level repair on SWE-bench."
    403     },
    404     {
    405       "title": "RepairLLaMA: Efficient Representations and Fine-Tuned Adapters for Program Repair",
    406       "authors": ["André Silva", "Sen Fang", "Martin Monperrus"],
    407       "year": 2025,
    408       "relevance": "Demonstrates parameter-efficient fine-tuning (LoRA) for program repair with competitive results at low compute cost."
    409     },
    410     {
    411       "title": "A Systematic Literature Review on Large Language Models for Automated Program Repair",
    412       "authors": ["Quanjun Zhang", "Chunrong Fang", "Yang Xie", "YuXiang Ma", "Weisong Sun", "Yun Yang", "Zhenyu Chen"],
    413       "year": 2024,
    414       "arxiv_id": "2405.01466",
    415       "relevance": "Most closely related prior survey classifying LLM-based repair by utilization modes, which this survey aims to supersede with a control-paradigm taxonomy."
    416     }
    417   ]
    418 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs