ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (21417B)


      1 {
      2   "paper": {
      3     "title": "Large Language Model-Based Agents for Software Engineering: A Survey",
      4     "authors": [
      5       "Junwei Liu",
      6       "Kaixin Wang",
      7       "Yixuan Chen",
      8       "Xin Peng",
      9       "Zhenpeng Chen",
     10       "Lingming Zhang",
     11       "Yiling Lou"
     12     ],
     13     "year": 2024,
     14     "venue": "arXiv",
     15     "arxiv_id": "2409.02977"
     16   },
     17   "scan_version": 2,
     18   "active_modules": ["survey_methodology"],
     19   "methodology_tags": ["meta-analysis"],
     20   "key_findings": "This survey collects 124 papers on LLM-based agents for software engineering, categorized from both SE task perspectives (requirements engineering, code generation, testing, debugging, IT operations, end-to-end development/maintenance) and agent architecture perspectives (planning, memory, perception, action, multi-agent systems). The survey finds that 59.7% of SE agents are multi-agent systems and that waterfall is the most popular process model for end-to-end development. Key challenges identified include reliance on test feedback quality, cascading errors from incorrect model feedback, and degraded long-context reasoning. The authors note that approximately 75% of collected papers are peer-reviewed publications.",
     21   "checklist": {
     22     "artifacts": {
     23       "code_released": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "The paper provides a GitHub repository at https://github.com/FudanSELab/Agent4SE-Paper-List for the paper list."
     27       },
     28       "data_released": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "The paper list repository serves as the released data artifact. The collected 124 papers and their categorizations are available via the GitHub repository."
     32       },
     33       "environment_specified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No environment or dependency specifications are provided. Although the paper is a survey, the scripts or tools used for paper collection and analysis are not specified."
     37       },
     38       "reproduction_instructions": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No step-by-step reproduction instructions are provided for replicating the survey methodology or paper selection process."
     42       }
     43     },
     44     "statistical_methodology": {
     45       "confidence_intervals_or_error_bars": {
     46         "applies": false,
     47         "answer": false,
     48         "justification": "This is a survey paper that does not run experiments or produce statistical results requiring confidence intervals."
     49       },
     50       "significance_tests": {
     51         "applies": false,
     52         "answer": false,
     53         "justification": "This is a survey paper with no statistical comparisons requiring significance tests."
     54       },
     55       "effect_sizes_reported": {
     56         "applies": false,
     57         "answer": false,
     58         "justification": "This is a survey paper with no experimental results requiring effect size reporting."
     59       },
     60       "sample_size_justified": {
     61         "applies": false,
     62         "answer": false,
     63         "justification": "This is a survey paper; no experimental sample sizes to justify."
     64       },
     65       "variance_reported": {
     66         "applies": false,
     67         "answer": false,
     68         "justification": "This is a survey paper with no experimental runs requiring variance reporting."
     69       }
     70     },
     71     "evaluation_design": {
     72       "baselines_included": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Section 2.4 discusses related surveys on LLMs for SE and general-domain LLM agents, positioning this survey against prior work including surveys by Hou et al. [2], Fan et al. [3], and He et al. [40]."
     76       },
     77       "baselines_contemporary": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "The compared surveys are recent (2023-2024), representing the current state of survey literature in this domain."
     81       },
     82       "ablation_study": {
     83         "applies": false,
     84         "answer": false,
     85         "justification": "Not applicable to a survey paper — there is no system with components to ablate."
     86       },
     87       "multiple_metrics": {
     88         "applies": false,
     89         "answer": false,
     90         "justification": "Not applicable — this is a survey paper without experimental evaluation requiring metrics."
     91       },
     92       "human_evaluation": {
     93         "applies": false,
     94         "answer": false,
     95         "justification": "Not applicable to a survey paper's claims, which are about coverage and categorization of literature."
     96       },
     97       "held_out_test_set": {
     98         "applies": false,
     99         "answer": false,
    100         "justification": "Not applicable — no experimental evaluation requiring held-out sets."
    101       },
    102       "per_category_breakdown": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "The survey provides detailed breakdowns by SE task category (Figure 4), publication venue distribution (Figure 3b), and temporal trends (Figure 3a). Tables throughout the paper break down agents by task type."
    106       },
    107       "failure_cases_discussed": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The paper discusses failure cases of LLM-based agents in each SE task category (Sections 4.2.3, 4.5.4) including coordination failures, cascading errors, and degraded long-context reasoning."
    111       },
    112       "negative_results_reported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper reports negative findings including that dual planners did not outperform single planners in Flows [102] (Section 4.2.1), and that autonomous localization did not lead to better performance (Section 4.8.8)."
    116       }
    117     },
    118     "claims_and_evidence": {
    119       "abstract_claims_supported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "The abstract claims to collect 124 papers and categorize them from SE and agent perspectives, which is substantiated throughout the paper. The claim about 'remarkable effectiveness' is supported by cited results throughout Section 4."
    123       },
    124       "causal_claims_justified": {
    125         "applies": false,
    126         "answer": false,
    127         "justification": "The paper is a survey that does not make causal claims. It summarizes and categorizes existing work."
    128       },
    129       "generalization_bounded": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The title claims 'Large Language Model-Based Agents for Software Engineering: A Survey' broadly, but the paper collection is bounded to July 2024 with DBLP keyword search. The scope section (3.1) defines boundaries but the title and abstract do not reflect these temporal and source limitations."
    133       },
    134       "alternative_explanations_discussed": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "As a survey/taxonomy paper presenting no empirical results of its own, alternative explanations are not applicable."
    138       },
    139       "proxy_outcome_distinction": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "Theoretical/survey paper with no measurements of its own."
    143       }
    144     },
    145     "setup_transparency": {
    146       "model_versions_specified": {
    147         "applies": false,
    148         "answer": false,
    149         "justification": "The paper does not use any LLM models — it is a survey paper."
    150       },
    151       "prompts_provided": {
    152         "applies": false,
    153         "answer": false,
    154         "justification": "The paper does not use prompting — it is a survey paper."
    155       },
    156       "hyperparameters_reported": {
    157         "applies": false,
    158         "answer": false,
    159         "justification": "No experiments requiring hyperparameters."
    160       },
    161       "scaffolding_described": {
    162         "applies": false,
    163         "answer": false,
    164         "justification": "No agentic scaffolding used — this is a survey paper."
    165       },
    166       "data_preprocessing_documented": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 3 describes the paper collection methodology in detail: keyword searching on DBLP (57 searches, 10,362 hits → 67 after manual filtering), snowballing (adding 41 papers), and author feedback collection (adding 16 more). Table 1 provides inclusion/exclusion criteria. Table 2 shows per-keyword hit counts."
    170       }
    171     },
    172     "limitations_and_scope": {
    173       "limitations_section_present": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Section 7.2 'Threats to Validity' provides a dedicated discussion of threats including manual inspection bias, publication status of collected papers, and validation gaps for specific strategies."
    177       },
    178       "threats_to_validity_specific": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 7.2 discusses specific threats: manual paper inspection may lead to inadvertent exclusion, several strategies are supported primarily by preprints (multi-agent RE, knowledge-enhanced bug detection, iterative coverage improvements, visual input), with specific counts of published vs. unpublished papers for each claim."
    182       },
    183       "scope_boundaries_stated": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "Section 3.1 explicitly defines scope boundaries: focuses on LLM-based agents that 'iteratively interact with the environment' (not standalone LLMs), SE tasks along the software life cycle, and positions as 'comprehensive survey rather than a systematic literature review.' The paper also states it does not conduct extra experimental analysis."
    187       }
    188     },
    189     "data_integrity": {
    190       "raw_data_available": {
    191         "applies": true,
    192         "answer": true,
    193         "justification": "The paper list is available at the GitHub repository https://github.com/FudanSELab/Agent4SE-Paper-List, allowing verification of the collected papers."
    194       },
    195       "data_collection_described": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Section 3.2 describes the collection process in detail: DBLP keyword searching on July 1, 2024, snowballing between July 1-10, 2024, and author feedback collection with 321 authors contacted and 36 responses received."
    199       },
    200       "recruitment_methods_described": {
    201         "applies": false,
    202         "answer": false,
    203         "justification": "No human participants. The data source is a standard literature database (DBLP) with documented search queries."
    204       },
    205       "data_pipeline_documented": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Table 2 shows the full pipeline: 10,362 hits → 67 after manual inspection → 108 after snowballing → 124 after author feedback. Each stage is described in Sections 3.2.1-3.2.3."
    209       }
    210     },
    211     "conflicts_of_interest": {
    212       "funding_disclosed": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "No funding sources are disclosed in the paper. There is no acknowledgments section listing grants or sponsors."
    216       },
    217       "affiliations_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "Author affiliations are clearly listed: Fudan University (Liu, Wang, Y. Chen, Peng), Nanyang Technological University (Z. Chen), and University of Illinois Urbana-Champaign (Zhang, Lou)."
    221       },
    222       "funder_independent_of_outcome": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No funding is disclosed, so independence cannot be assessed."
    226       },
    227       "financial_interests_declared": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No competing interests or financial interests statement is provided anywhere in the paper."
    231       }
    232     },
    233     "contamination": {
    234       "training_cutoff_stated": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "This is a survey paper that does not evaluate a pre-trained model on any benchmark."
    238       },
    239       "train_test_overlap_discussed": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "This is a survey paper that does not evaluate a pre-trained model on any benchmark."
    243       },
    244       "benchmark_contamination_addressed": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "This is a survey paper that does not evaluate a pre-trained model on any benchmark."
    248       }
    249     },
    250     "human_studies": {
    251       "pre_registered": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants in this survey paper."
    255       },
    256       "irb_or_ethics_approval": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants in this survey paper."
    260       },
    261       "demographics_reported": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants in this survey paper."
    265       },
    266       "inclusion_exclusion_criteria": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants in this survey paper."
    270       },
    271       "randomization_described": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants in this survey paper."
    275       },
    276       "blinding_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants in this survey paper."
    280       },
    281       "attrition_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants in this survey paper."
    285       }
    286     },
    287     "cost_and_practicality": {
    288       "inference_cost_reported": {
    289         "applies": false,
    290         "answer": false,
    291         "justification": "This is a survey paper with no method of its own requiring cost reporting."
    292       },
    293       "compute_budget_stated": {
    294         "applies": false,
    295         "answer": false,
    296         "justification": "This is a survey paper with no computational experiments."
    297       }
    298     },
    299     "survey_methodology": {
    300       "prisma_or_structured_protocol": {
    301         "applies": true,
    302         "answer": true,
    303         "justification": "Section 3 describes a structured protocol: keyword searching on DBLP with documented search strings, inclusion/exclusion criteria (Table 1), snowballing, and author feedback collection. The process follows established SE survey practices citing [41], [42], [43], [44], [45]. Although it does not follow PRISMA specifically, it is a documented, structured protocol."
    304       },
    305       "quality_assessment_of_sources": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "The survey does not assess the methodological quality of its 124 source papers. All papers meeting inclusion criteria are treated equally regardless of study rigor. Section 7.2 acknowledges that ~25% are preprints, but no quality scoring or risk-of-bias assessment is applied."
    309       },
    310       "publication_bias_discussed": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "The survey does not discuss publication bias. There is no consideration of whether the collected papers skew toward positive results, no funnel plot analysis, and no acknowledgment that papers showing agents don't work may be underrepresented."
    314       }
    315     }
    316   },
    317   "claims": [
    318     {
    319       "claim": "124 papers on LLM-based agents for SE were collected through keyword searching, snowballing, and author feedback.",
    320       "evidence": "Section 3 and Table 2 document the full collection pipeline: 67 from keyword searching, 41 from snowballing, 16 from author feedback.",
    321       "supported": "strong"
    322     },
    323     {
    324       "claim": "59.7% of existing agents for SE are multi-agent systems.",
    325       "evidence": "Section 5.2 states this statistic based on the collected papers.",
    326       "supported": "strong"
    327     },
    328     {
    329       "claim": "Approximately 75% of references are peer-reviewed publications from reputable journals and conferences.",
    330       "evidence": "Section 3.3 states this, with Figure 3b showing venue distribution.",
    331       "supported": "moderate"
    332     },
    333     {
    334       "claim": "Agentless, using a simplistic workflow, can outperform more complex fully autonomous agents on SWE-bench.",
    335       "evidence": "Section 4.8.8 and Figure 17 show Agentless achieves competitive resolve rates on SWE-bench Lite, confirmed by OpenAI [269].",
    336       "supported": "strong"
    337     },
    338     {
    339       "claim": "Pure autonomous localization methods do not necessarily lead to better performance in end-to-end software maintenance.",
    340       "evidence": "Section 4.8.8 notes SWE-agent, which allows fully autonomous localization, performed worst among all agents on SWE-bench Lite (Figure 17).",
    341       "supported": "moderate"
    342     },
    343     {
    344       "claim": "The Scrum model achieves the best and most stable performance among tested software process models for code generation.",
    345       "evidence": "Section 4.7.1 cites experiments from FlowGen [237] on function-level code generation benchmarks.",
    346       "supported": "weak"
    347     }
    348   ],
    349   "red_flags": [
    350     {
    351       "flag": "No quality assessment of source papers",
    352       "detail": "The survey treats all 124 papers equally regardless of methodological quality. ~25% are preprints that have not undergone peer review. Without quality assessment, the survey may lend weak results from low-quality studies the same credibility as rigorous work."
    353     },
    354     {
    355       "flag": "No publication bias discussion",
    356       "detail": "The survey does not consider whether negative results about LLM-based agents for SE are underrepresented in the literature. Given the hype around LLM agents, there is likely significant publication bias toward positive findings."
    357     },
    358     {
    359       "flag": "Self-citation concentration",
    360       "detail": "Several of the authors (from Fudan University and UIUC) have published papers included in the survey. While their affiliations are disclosed, potential bias in how their own work is characterized is not explicitly acknowledged."
    361     }
    362   ],
    363   "cited_papers": [
    364     {
    365       "title": "Large language models for software engineering: A systematic literature review",
    366       "authors": ["Xinyi Hou", "Yanjie Zhao", "Yue Liu"],
    367       "year": 2024,
    368       "relevance": "Comprehensive SLR on LLMs in SE, key comparison point for this survey's scope."
    369     },
    370     {
    371       "title": "SWE-bench: Can language models resolve real-world github issues?",
    372       "authors": ["Carlos E. Jimenez", "John Yang"],
    373       "year": 2024,
    374       "relevance": "Primary benchmark for end-to-end software maintenance agents, central to Section 4.8."
    375     },
    376     {
    377       "title": "Demystifying llm-based software engineering agents",
    378       "authors": ["Chunqiu Steven Xia", "Yinlin Deng"],
    379       "year": 2025,
    380       "relevance": "Agentless approach showing simpler workflows can outperform complex agents on SWE-bench."
    381     },
    382     {
    383       "title": "ChatDev: Communicative agents for software development",
    384       "authors": ["Chen Qian", "Wei Liu"],
    385       "year": 2024,
    386       "relevance": "Key multi-agent system for end-to-end software development using waterfall model with chat-based collaboration."
    387     },
    388     {
    389       "title": "MetaGPT: Meta programming for a multi-agent collaborative framework",
    390       "authors": ["Sirui Hong", "Mingchen Zhuge"],
    391       "year": 2024,
    392       "relevance": "Influential multi-agent framework with structured message passing and shared memory for software development."
    393     },
    394     {
    395       "title": "SWE-agent: Agent-computer interfaces enable automated software engineering",
    396       "authors": ["John Yang", "Carlos E. Jimenez"],
    397       "year": 2024,
    398       "relevance": "Pioneering agent-computer interface for autonomous software issue resolution."
    399     },
    400     {
    401       "title": "Reflexion: Language agents with verbal reinforcement learning",
    402       "authors": ["Noah Shinn", "Federico Cassano"],
    403       "year": 2023,
    404       "arxiv_id": "2303.11366",
    405       "relevance": "Foundational work on self-reflection and verbal feedback for iterative code generation."
    406     },
    407     {
    408       "title": "AutoGen: Enabling next-gen LLM applications via multi-agent conversations",
    409       "authors": ["Qingyun Wu", "Gagan Bansal"],
    410       "year": 2024,
    411       "relevance": "Major multi-agent framework enabling conversational collaboration between LLM agents."
    412     },
    413     {
    414       "title": "The rise and potential of large language model based agents: A survey",
    415       "authors": ["Zhiheng Xi", "Wenxiang Chen"],
    416       "year": 2025,
    417       "relevance": "General survey on LLM-based agents providing the agent framework taxonomy used in this survey."
    418     },
    419     {
    420       "title": "Fuzz4All: Universal fuzzing with large language models",
    421       "authors": ["Chunqiu Steven Xia", "Matteo Paltenghi"],
    422       "year": 2024,
    423       "relevance": "Universal LLM-based fuzzer for system-level testing across multiple programming languages."
    424     },
    425     {
    426       "title": "RepairAgent: An autonomous, LLM-based agent for program repair",
    427       "authors": ["Islem Bouzenia", "Premkumar T. Devanbu"],
    428       "year": 2025,
    429       "relevance": "Highly autonomous agent for program repair with state-machine-based middleware."
    430     },
    431     {
    432       "title": "MASAI: Modular architecture for software-engineering AI agents",
    433       "authors": ["Nalin Wadhwa", "Atharv Sonwane"],
    434       "year": 2024,
    435       "relevance": "Modular multi-agent architecture for end-to-end software maintenance with task decomposition."
    436     }
    437   ]
    438 }

Impressum · Datenschutz