ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (23121B)


      1 {
      2   "paper": {
      3     "title": "A Systematic Literature Review on Large Language Models for Automated Program Repair",
      4     "authors": ["Quanjun Zhang", "Chunrong Fang", "Yang Xie", "Yuxiang Ma", "Weisong Sun", "Yun Yang", "Zhenyu Chen"],
      5     "year": 2024,
      6     "venue": "arXiv.org",
      7     "arxiv_id": "2405.01466",
      8     "doi": "10.48550/arXiv.2405.01466"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["survey_methodology"],
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": true,
     17         "justification": "The paper provides a GitHub repository at https://github.com/iSEngLab/AwesomeLLM4APR containing artifacts and bibliographic information for all included papers."
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The replication package repository includes all collected papers and their bibliographic information, as stated in Section 4.5: 'All included papers and their bibliographic information are publicly available in our replication repository.'"
     23       },
     24       "environment_specified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No environment or dependency specifications are provided. The paper is a survey with no computational experiments requiring environment setup, but analysis scripts (if any) have no documented environment."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No step-by-step reproduction instructions are provided for replicating the search, filtering, or quality assessment process beyond the methodology description in the paper."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": false,
     38         "answer": false,
     39         "justification": "This is a systematic literature review that reports counts and proportions of papers, not experimental results requiring confidence intervals."
     40       },
     41       "significance_tests": {
     42         "applies": false,
     43         "answer": false,
     44         "justification": "The survey reports descriptive statistics (counts, percentages, distributions) without making comparative claims requiring significance tests."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": false,
     48         "answer": false,
     49         "justification": "No experimental comparisons are made; the paper summarizes and categorizes existing literature."
     50       },
     51       "sample_size_justified": {
     52         "applies": false,
     53         "answer": false,
     54         "justification": "As a systematic review, the sample size is determined by the search and filtering process, not by experimental design requiring power analysis."
     55       },
     56       "variance_reported": {
     57         "applies": false,
     58         "answer": false,
     59         "justification": "No experimental runs are conducted; the paper is a literature survey."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Section 3 (Related Work) explicitly compares against prior surveys: Zhang et al. [294] on LLM4SE, Wang et al. [231] on LLM testing, Fan et al. [49] on LLMs in SE, Hou et al. [83] on LLM4SE SLR, Gazzola et al. [66] and Monperrus et al. [163] on APR, and Zhang et al. [292] on learning-based APR."
     67       },
     68       "baselines_contemporary": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "The related surveys cited are contemporary, including Zhang et al. [294], Hou et al. [83], and Wang et al. [231], all from 2023-2024."
     72       },
     73       "ablation_study": {
     74         "applies": false,
     75         "answer": false,
     76         "justification": "As a systematic literature review, there is no system with components to ablate."
     77       },
     78       "multiple_metrics": {
     79         "applies": false,
     80         "answer": false,
     81         "justification": "The paper does not run experiments with metrics; it categorizes and summarizes literature."
     82       },
     83       "human_evaluation": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "Human evaluation of system outputs is not applicable to a literature survey."
     87       },
     88       "held_out_test_set": {
     89         "applies": false,
     90         "answer": false,
     91         "justification": "No test set is used; this is a systematic review."
     92       },
     93       "per_category_breakdown": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "The paper provides extensive breakdowns: by year (Fig. 3), venue (Table 4), programming language (Fig. 4), contribution type (Table 5), LLM type (Table 6), utilization strategy (Table 7), bug type (Table 8), dataset (Table 13), and input form (Table 14)."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Section 9 discusses challenges and limitations of the field including data leakage, cost issues, limited human studies, and under-explored repair scenarios."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The paper reports negative findings: 53.85% of novel approaches fail to release artifacts (Section 8.5), Olausson et al. finding self-repair is 'not a panacea' (Section 8.4), and limited human studies in the field (only 3 papers)."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The abstract claims to analyze 189 papers from LLMs, APR, and integration perspectives, which is supported by the detailed analysis across Sections 5-8. Claims about categorizing LLMs, repair scenarios, and integration factors are all substantiated."
    114       },
    115       "causal_claims_justified": {
    116         "applies": false,
    117         "answer": false,
    118         "justification": "The paper is a systematic review that describes trends and categorizes work without making causal claims. Statements like 'LLMs have brought significant changes' are descriptive observations about the literature, not causal inferences."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "The paper explicitly bounds its scope to 2020-September 2025 (Section 4), to specific search databases (Google Scholar, ACM DL, IEEE Explore), and to papers meeting their inclusion/exclusion criteria. Section 10.3 discusses external validity limitations."
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "Section 10 (Threats to Validity) discusses construct, internal, and external validity threats including potential bias in paper selection, keyword limitations, and generalizability concerns."
    129       },
    130       "proxy_outcome_distinction": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The paper's claims match its measurements — it claims to summarize and categorize 189 papers, and that is what it does. It does not frame paper counts as proxies for broader phenomena without acknowledgment."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": false,
    139         "answer": false,
    140         "justification": "The paper does not use any LLMs in its own methodology; it is a survey reviewing papers that use LLMs."
    141       },
    142       "prompts_provided": {
    143         "applies": false,
    144         "answer": false,
    145         "justification": "No prompting is used in the survey methodology."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": false,
    149         "answer": false,
    150         "justification": "No models are run; this is a literature survey."
    151       },
    152       "scaffolding_described": {
    153         "applies": false,
    154         "answer": false,
    155         "justification": "No agentic scaffolding is used in this survey."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "The paper documents its filtering pipeline in detail: 7,253 initial papers → 285 after inclusion/exclusion criteria → 167 after quality assessment → 189 after snowballing (Sections 4.3-4.5, Fig. 2). Criteria are specified in Tables 2 and 3."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 10 (Threats to Validity) provides a dedicated discussion of construct validity (10.1), internal validity (10.2), and external validity (10.3)."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Section 10 discusses specific threats: ambiguity in paper categorization (10.1), potential bias from two-author review process (10.2), limitations of keyword search that may miss papers (10.3), and the rapidly evolving nature of the field."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "The paper explicitly states its scope: papers from 2020 to September 2025, specific search databases, English-language papers only, and papers meeting their quality threshold of 8/10 points. Section 10.3 acknowledges papers may have been missed."
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "The replication package at https://github.com/iSEngLab/AwesomeLLM4APR provides the full list of included papers and bibliographic information, enabling independent verification."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Section 4 describes the data collection in detail: manual search of 6 SE venues yielding 25 seed papers, automated search across 3 digital libraries yielding 7,253 papers, and snowballing search adding 21 papers."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "No human participants are involved; data sources are standard digital libraries and venues which are fully described."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Fig. 2 and Sections 4.3-4.5 document the full pipeline: manual search (25 papers) → automated search (7,253) → inclusion/exclusion filtering (285) → quality assessment (167) → snowballing (189), with counts at each stage and criteria described."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "The Acknowledgments section discloses funding from the Australian Research Council (DP200102491, DP230101790), National Natural Science Foundation of China (61932012, 62141215, 62372228), Natural Science Foundation of Jiangsu Province, and Fundamental Research Funds."
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "All author affiliations are clearly listed: Nanjing University of Science and Technology, Nanjing University, Nanyang Technological University, and Swinburne University of Technology."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "The funders are government research councils and university funds (Australian Research Council, NSFC, Jiangsu Province) with no financial stake in the survey's conclusions about LLM-based APR."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests or financial interests statement is provided in the paper."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": false,
    227         "answer": false,
    228         "justification": "This is a systematic literature review that does not evaluate any pre-trained model on a benchmark."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": false,
    232         "answer": false,
    233         "justification": "No model evaluation is performed in this survey."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "No model evaluation is performed in this survey."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants are involved in this systematic literature review."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants are involved."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants are involved."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants are involved."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants are involved."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants are involved."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants are involved."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "This is a survey paper with no computational method to report costs for."
    283       },
    284       "compute_budget_stated": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "This is a survey paper with no computational experiments."
    288       }
    289     },
    290     "survey_methodology": {
    291       "prisma_or_structured_protocol": {
    292         "applies": true,
    293         "answer": true,
    294         "justification": "The paper follows SEGRESS guidelines (Section 4), uses the Quasi-Gold Standard (QGS) method for search string construction, and documents a structured multi-stage search process with manual search, automated search, and snowballing (Fig. 2). Inclusion/exclusion criteria (Table 2) and quality assessment criteria (Table 3) are formally specified."
    295       },
    296       "quality_assessment_of_sources": {
    297         "applies": true,
    298         "answer": true,
    299         "justification": "Section 4.4.2 describes a 10-question quality assessment checklist (Table 3) where two authors independently rate papers on a three-tier scale (yes/partial/no). Papers scoring below 8 out of 10 are excluded. This reduced the pool from 285 to 167 papers."
    300       },
    301       "publication_bias_discussed": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "The paper does not discuss publication bias. There is no funnel plot, no discussion of negative-result underrepresentation, and no acknowledgment that the published LLM-based APR literature likely skews toward positive results."
    305       }
    306     }
    307   },
    308   "claims": [
    309     {
    310       "claim": "189 relevant LLM-based APR papers were published between 2020 and September 2025, showing a rapidly increasing trend.",
    311       "evidence": "Section 5.1 and Fig. 3 show the publication trend: 1 (2020), 5 (2021), 13 (2022), 53 (2023), 66 (2024), 51 (2025 partial). Power function fit achieves R²=0.98833.",
    312       "supported": "strong"
    313     },
    314     {
    315       "claim": "GPT-3.5 and GPT-4 are the two most popular LLMs in APR research, used in 72 and 51 papers respectively.",
    316       "evidence": "Table 6 provides detailed year-by-year usage counts for 78 LLMs across all 189 papers.",
    317       "supported": "strong"
    318     },
    319     {
    320       "claim": "LLM-based APR has been applied to 20 different bug types and 24 programming languages.",
    321       "evidence": "Table 8 lists all 20 bug types with paper counts, and Fig. 4 shows the distribution across 24 programming languages.",
    322       "supported": "strong"
    323     },
    324     {
    325       "claim": "62.99% of collected papers provide publicly available artifacts, rising to 86.84% for top-tier SE venues.",
    326       "evidence": "Section 8.5 states '80 studies provide the replication packages in their papers, accounting for 62.99% (80/189)' and '86.84% of papers (33/38) make related artifacts publicly open' for top venues.",
    327       "supported": "strong"
    328     },
    329     {
    330       "claim": "Semantic bugs constitute approximately 42.93% of LLM-based APR research, making it the dominant repair scenario.",
    331       "evidence": "Table 8 shows semantic bugs with the highest paper count across all years, with explicit percentage stated in Section 7.",
    332       "supported": "strong"
    333     },
    334     {
    335       "claim": "There is a shift from fine-tuning toward zero-shot prompting and agent-based approaches as LLMs have grown in capability.",
    336       "evidence": "Table 7 shows fine-tuning dominated early years (2021-2022) while zero-shot and agent approaches emerged in 2023-2025. Section 6.2.5 discusses this trend.",
    337       "supported": "moderate"
    338     }
    339   ],
    340   "methodology_tags": ["meta-analysis"],
    341   "key_findings": "This systematic review of 189 LLM-based APR papers (2020-2025) identifies 78 distinct LLMs used for program repair, with GPT-3.5 (72 papers) and GPT-4 (51 papers) dominating. The field shows a clear evolution from fine-tuning smaller models toward zero-shot prompting and agent-based approaches with larger models. LLM-based APR has expanded to 20 bug types and 24 programming languages, far exceeding traditional APR's scope. Open science remains a concern, with 53.85% of novel approaches failing to release artifacts, though top-tier venues show 86.84% availability.",
    342   "red_flags": [
    343     {
    344       "flag": "No publication bias analysis",
    345       "detail": "The survey does not discuss whether the LLM-based APR literature it reviews is biased toward positive results. Papers showing LLMs fail at APR tasks are less likely to be published, which could inflate the apparent effectiveness of the field."
    346     },
    347     {
    348       "flag": "Quality assessment lacks inter-rater reliability reporting",
    349       "detail": "While two authors independently performed quality assessment, the paper does not report inter-rater agreement statistics (e.g., Cohen's kappa) for the 10-question quality assessment, only stating disagreements were resolved through discussion."
    350     },
    351     {
    352       "flag": "No critical synthesis of methodological quality",
    353       "detail": "Despite having a quality assessment checklist, the paper does not report the aggregate quality scores or analyze how methodological quality varies across the 189 papers. It treats all papers that passed the threshold equally, potentially laundering weak results alongside strong ones."
    354     }
    355   ],
    356   "cited_papers": [
    357     {
    358       "title": "SWE-bench: Can Language Models Resolve Real-world Github Issues?",
    359       "authors": ["Carlos E Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik R Narasimhan"],
    360       "year": 2024,
    361       "arxiv_id": "2310.06770",
    362       "relevance": "Premier benchmark for evaluating LLM agents on real-world repository-level software engineering tasks."
    363     },
    364     {
    365       "title": "SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering",
    366       "authors": ["John Yang", "Carlos E Jimenez", "Alexander Wettig", "Kilian Lieret", "Shunyu Yao", "Karthik Narasimhan", "Ofir Press"],
    367       "year": 2024,
    368       "relevance": "Key agentic approach for repository-level program repair using LLM-based agents with tool interfaces."
    369     },
    370     {
    371       "title": "OpenHands: An Open Platform for AI Software Developers as Generalist Agents",
    372       "authors": ["Xingyao Wang", "Boxuan Li", "Yufan Song"],
    373       "year": 2024,
    374       "arxiv_id": "2407.16741",
    375       "relevance": "Open platform for AI coding agents with generalist capabilities for software development tasks."
    376     },
    377     {
    378       "title": "Automated Program Repair in the Era of Large Pre-trained Language Models",
    379       "authors": ["Chunqiu Steven Xia", "Yuxiang Wei", "Lingming Zhang"],
    380       "year": 2023,
    381       "relevance": "Comprehensive empirical study of 9 LLMs across 5 datasets for program repair, establishing key findings about LLM capabilities."
    382     },
    383     {
    384       "title": "A Survey on Large Language Models for Software Engineering",
    385       "authors": ["Quanjun Zhang", "Chunrong Fang", "Yang Xie"],
    386       "year": 2023,
    387       "arxiv_id": "2312.15223",
    388       "relevance": "Broader survey on LLMs in SE providing context for the APR-specific review."
    389     },
    390     {
    391       "title": "RepairAgent: An Autonomous, LLM-Based Agent for Program Repair",
    392       "authors": ["Islem Bouzenia", "Premkumar Devanbu", "Michael Pradel"],
    393       "year": 2024,
    394       "arxiv_id": "2403.17134",
    395       "relevance": "First autonomous LLM-based repair agent using ChatGPT with state machine for iterative debugging."
    396     },
    397     {
    398       "title": "AutoCodeRover: Autonomous Program Improvement",
    399       "authors": ["Yuntong Zhang", "Haifeng Ruan", "Zhiyu Fan", "Abhik Roychoudhury"],
    400       "year": 2024,
    401       "relevance": "AST-based repository-level repair agent demonstrating autonomous program improvement."
    402     },
    403     {
    404       "title": "Automated Program Repair via Conversation: Fixing 162 out of 337 Bugs for $0.42 Each using ChatGPT",
    405       "authors": ["Chunqiu Steven Xia", "Lingming Zhang"],
    406       "year": 2024,
    407       "relevance": "Pioneering conversational APR approach (ChatRepair) with cost analysis, demonstrating practical LLM-based repair."
    408     },
    409     {
    410       "title": "Is Self-Repair a Silver Bullet for Code Generation?",
    411       "authors": ["Theo X. Olausson", "Jeevana Priya Inala", "Chenglong Wang", "Jianfeng Gao", "Armando Solar-Lezama"],
    412       "year": 2024,
    413       "relevance": "Empirical study questioning the effectiveness of LLM self-repair, finding existing models often fail to provide reliable feedback."
    414     },
    415     {
    416       "title": "A Survey of Learning-Based Automated Program Repair",
    417       "authors": ["Quanjun Zhang", "Chunrong Fang", "Yuxiang Ma", "Weisong Sun", "Zhenyu Chen"],
    418       "year": 2023,
    419       "relevance": "Predecessor survey on learning-based APR that this work directly extends to the LLM era."
    420     },
    421     {
    422       "title": "SWE-RL: Advancing LLM Reasoning via Reinforcement Learning on Open Software Evolution",
    423       "authors": ["Yuxiang Wei"],
    424       "year": 2025,
    425       "arxiv_id": "2502.18449",
    426       "relevance": "RL-based approach to improving LLM reasoning for software engineering tasks."
    427     },
    428     {
    429       "title": "Agentless: Demystifying LLM-based Software Engineering Agents",
    430       "authors": ["Chunqiu Steven Xia", "Yinlin Deng", "Soren Dunn", "Lingming Zhang"],
    431       "year": 2025,
    432       "relevance": "Procedural approach to repository-level repair without agentic scaffolding, providing important baseline."
    433     }
    434   ]
    435 }

Impressum · Datenschutz