scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (20923B)
      1 {
      2   "paper": {
      3     "title": "LLM-Based Multi-Agent Systems for Software Engineering: Literature Review, Vision and the Road Ahead",
      4     "authors": ["Junda He", "Christoph Treude", "David Lo"],
      5     "year": 2025,
      6     "venue": "ACM (journal not specified, publication date July 2025)",
      7     "arxiv_id": "2404.04834",
      8     "doi": "10.1145/nnnnnnn.nnnnnnn"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["survey_methodology"],
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": false,
     17         "justification": "No repository URL or code archive is provided. The paper does not release any analysis scripts or data extraction tools."
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": false,
     22         "justification": "No dataset of the 71 reviewed papers, extracted metadata, or case study artifacts is released."
     23       },
     24       "environment_specified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "For the ChatDev case studies, the paper states GPT-3.5-turbo with temperature 0.2 but provides no environment setup details (Python version, ChatDev version, dependencies)."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No step-by-step instructions are provided for reproducing the literature search or the case studies."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": false,
     38         "answer": false,
     39         "justification": "This is primarily a survey paper. The case studies report per-attempt results but do not perform statistical experiments requiring confidence intervals."
     40       },
     41       "significance_tests": {
     42         "applies": false,
     43         "answer": false,
     44         "justification": "No comparative statistical claims are made that would require significance testing."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": false,
     48         "answer": false,
     49         "justification": "Survey paper with illustrative case studies; no statistical effect sizes are relevant."
     50       },
     51       "sample_size_justified": {
     52         "applies": false,
     53         "answer": false,
     54         "justification": "No statistical experiments requiring sample size justification."
     55       },
     56       "variance_reported": {
     57         "applies": false,
     58         "answer": false,
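     59         "justification": "Survey paper; the case studies repeat a prompt across attempts (e.g., 10 Tetris attempts) but are illustrative rather than statistical experiments, so variance reporting is not required."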
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The paper does not compare its survey methodology or coverage against prior surveys on LMA systems in SE. No systematic comparison with related review papers."
     67       },
     68       "baselines_contemporary": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No comparison baselines are included at all."
     72       },
     73       "ablation_study": {
     74         "applies": false,
     75         "answer": false,
     76         "justification": "No system with components to ablate; this is a survey and case study paper."
     77       },
     78       "multiple_metrics": {
     79         "applies": false,
     80         "answer": false,
     81         "justification": "No system evaluation requiring metrics."
     82       },
     83       "human_evaluation": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "Human evaluation of system outputs is not relevant to a survey paper."
     87       },
     88       "held_out_test_set": {
     89         "applies": false,
     90         "answer": false,
     91         "justification": "No ML model or test set is involved in this survey."
     92       },
     93       "per_category_breakdown": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "The literature review is organized by SDLC phase (requirements engineering, code generation, quality assurance, maintenance, end-to-end), providing per-category breakdown of the reviewed work."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "The Tetris case study (Section 4.2) explicitly discusses failure: ChatDev failed 9 out of 10 attempts, and even the successful attempt lacked row-removal functionality."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The case studies report negative findings: ChatDev failed to produce a working Tetris game in 9/10 attempts, and the one successful version lacked core functionality (row removal). The paper explicitly states 'This highlights the limitations of current LMA systems.'"
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The abstract claims are largely descriptive: systematic review of primary studies, two case studies, identification of research gaps, and a research agenda. These are supported by the paper's content."
    114       },
    115       "causal_claims_justified": {
    116         "applies": false,
    117         "answer": false,
    118         "justification": "The paper does not make causal claims. It describes the landscape and proposes future directions. The case study observations are descriptive, not causal."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "The paper makes broad claims about LMA systems' potential ('autonomous problem-solving', 'scalable solutions') that go well beyond what the two ChatDev case studies demonstrate. The title claims coverage of 'Software Engineering' broadly but the review is limited to DBLP-indexed papers."
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "The case study failures are attributed to complexity limitations without discussing alternative explanations (e.g., prompt quality, model choice, ChatDev-specific limitations vs. general LMA limitations)."
    129       },
    130       "proxy_outcome_distinction": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The case studies use two games (Snake, Tetris) as proxies for 'complex software engineering tasks' without discussing whether game development generalizes to real-world SE challenges."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper says 'GPT-3.5-turbo' for the case studies but does not specify a version or snapshot date (e.g., gpt-3.5-turbo-0613)."
    141       },
    142       "prompts_provided": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "The actual prompts used for the Snake and Tetris case studies are provided in full in Section 4 (boxed prompt text)."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Temperature is reported as 0.2 for GPT-3.5-turbo, following the original ChatDev setting (Section 4)."
    151       },
    152       "scaffolding_described": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "ChatDev's scaffolding is described: three phases (designing, coding, testing) with specialized roles (CEO, CTO, programmer, reviewer, tester). The framework's structure is explained in Section 4."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "The literature search strategy is documented in Section 3 with keyword sets, search database (DBLP), inclusion/exclusion criteria, and the three-phase filtering process with snowballing. The search date (November 14, 2024) and final count (41 + 30 snowballed = 71) are stated."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 6.2 'Threat to Validity' discusses potential threats, though it is brief (one paragraph)."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": false,
    172         "justification": "The threats to validity section (6.2) contains only one generic threat: 'inadvertently excluding relevant studies.' It does not discuss specific threats like DBLP coverage limitations, case study representativeness, or selection bias in snowballing."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "The paper states scope boundaries: limited to papers after November 2022 (ChatGPT release), only LMA systems (not single agents), only SE-related tasks, and specific SDLC phases. Exclusion criteria are enumerated."
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "The list of 71 reviewed papers is not provided as a downloadable dataset. Only references are given inline."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Section 3 describes the DBLP search strategy with keyword sets, search date, inclusion/exclusion criteria, and snowballing process."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "No human participants; data source is a literature database search."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The three-phase filtering pipeline is documented: Phase 1 (removal of short papers and duplicates), Phase 2 (venue/title/abstract screening), Phase 3 (full-text review), plus snowballing. Counts at key stages: 41 from search + 30 from snowballing = 71 total."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "No funding information or acknowledgments section is present in the paper."
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "All three authors are disclosed as affiliated with Singapore Management University."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No funding source is disclosed, so independence cannot be assessed."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests statement is provided in the paper."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": false,
    227         "answer": false,
    228         "justification": "This is a survey paper with illustrative case studies. It does not evaluate a pre-trained model's capability on any benchmark."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": false,
    232         "answer": false,
    233         "justification": "No benchmark evaluation of model capability is performed."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "No benchmark evaluation of model capability is performed."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants in this study."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "This is a survey paper, so inference-cost reporting is not a requirement of its own methodology. For reference, the case studies do report costs ($0.019 and $0.020 per attempt)."
    283       },
    284       "compute_budget_stated": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "Survey paper; compute budget is not relevant to the paper's own methodology."
    288       }
    289     },
    290     "survey_methodology": {
    291       "prisma_or_structured_protocol": {
    292         "applies": true,
    293         "answer": true,
    294         "justification": "The paper follows a structured review protocol with defined keyword search strategy, DBLP database, explicit inclusion/exclusion criteria, three-phase filtering, and snowballing. While not PRISMA per se, it follows a recognized systematic review methodology with reproducible queries."
    295       },
    296       "quality_assessment_of_sources": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "The survey does not assess the methodological quality of its 71 source papers. All papers are treated equally regardless of their rigor, venue quality, or experimental validity."
    300       },
    301       "publication_bias_discussed": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "No discussion of publication bias, positive-result bias, or whether the reviewed papers skew toward successful LMA applications."
    305       }
    306     }
    307   },
    308   "claims": [
    309     {
    310       "claim": "LMA systems demonstrate strong performance in reasonably complex tasks like developing a Snake game, meeting all requirements within a few iterations.",
    311       "evidence": "Section 4.1: ChatDev generated a playable Snake game on the second attempt, fulfilling all prompt requirements. Average time 76 seconds, cost $0.019.",
    312       "supported": "weak"
    313     },
    314     {
    315       "claim": "Current LMA systems have limitations in handling more complex tasks that require deeper logical reasoning, as shown by the Tetris case study.",
    316       "evidence": "Section 4.2: ChatDev failed in 9 out of 10 attempts for Tetris. The successful attempt lacked row-removal functionality. Average time 70 seconds, cost $0.020.",
    317       "supported": "moderate"
    318     },
    319     {
    320       "claim": "71 primary studies on LMA systems in SE were identified through systematic literature search.",
    321       "evidence": "Section 3: 41 papers from DBLP keyword search + 30 from snowballing, using defined inclusion/exclusion criteria. Search conducted November 14, 2024.",
    322       "supported": "strong"
    323     }
    324   ],
    325   "methodology_tags": ["meta-analysis", "case-study"],
    326   "key_findings": "This systematic review identifies 71 primary studies applying LLM-based multi-agent systems across software engineering tasks including requirements engineering, code generation, quality assurance, and maintenance. Case studies with ChatDev show that current LMA systems handle moderate complexity (Snake game) but fail on harder tasks (Tetris, 9/10 failures, missing core functionality even when successful). The paper proposes a two-phase research agenda: enhancing individual agent capabilities and optimizing agent synergy.",
    327   "red_flags": [
    328     {
    329       "flag": "No quality assessment of reviewed papers",
    330       "detail": "The survey reviews 71 papers without any quality assessment rubric or risk-of-bias evaluation. All papers are treated as equally valid evidence regardless of their methodological quality, potentially laundering weak results."
    331     },
    332     {
    333       "flag": "Extremely limited empirical evidence for broad claims",
    334       "detail": "The paper makes sweeping claims about LMA systems' potential for 'autonomous problem-solving', 'scalability', and 'Software Engineering 2.0' based on a literature summary and two toy case studies (Snake and Tetris games) using a single framework (ChatDev) with a single model (GPT-3.5-turbo)."
    335     },
    336     {
    337       "flag": "Case study methodology concerns",
    338       "detail": "The case studies use only ChatDev with GPT-3.5-turbo, run the same prompt multiple times until success, and use toy game generation as a proxy for software engineering complexity. No comparison with other LMA frameworks or single-agent baselines."
    339     },
    340     {
    341       "flag": "Self-citation concentration",
    342       "detail": "Several references are by the same author group (He, Lo, et al.), and the paper heavily promotes the 'Software Engineering 2.0' vision from the last author's prior work without critical examination."
    343     },
    344     {
    345       "flag": "Research agenda without empirical grounding",
    346       "detail": "The research agenda (Section 5) is extensive and prescriptive but is not grounded in empirical findings from the review. The proposed directions read more like a vision paper than evidence-based recommendations."
    347     }
    348   ],
    349   "cited_papers": [
    350     {
    351       "title": "MetaGPT: Meta Programming for Multi-Agent Collaborative Framework",
    352       "authors": ["Sirui Hong", "Xiawu Zheng", "Jonathan Chen"],
    353       "year": 2023,
    354       "arxiv_id": "2308.00352",
    355       "relevance": "Foundational LMA framework for software development using role-specialized agents."
    356     },
    357     {
    358       "title": "ChatDev: Communicative Agents for Software Development",
    359       "authors": ["Chen Qian", "Wei Liu", "Hongzhang Liu"],
    360       "year": 2024,
    361       "relevance": "Primary LMA framework used in the case studies; models waterfall-style software development with communicative agents."
    362     },
    363     {
    364       "title": "MASAI: Modular Architecture for Software-engineering AI Agents",
    365       "authors": ["Daman Arora", "Atharv Sonwane", "Nalin Wadhwa"],
    366       "year": 2024,
    367       "arxiv_id": "2406.11638",
    368       "relevance": "Multi-agent architecture for software debugging with modular agent design."
    369     },
    370     {
    371       "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation Framework",
    372       "authors": ["Qingyun Wu", "Gagan Bansal", "Jieyu Zhang"],
    373       "year": 2023,
    374       "arxiv_id": "2308.08155",
    375       "relevance": "Major multi-agent conversation framework supporting RAG and agent-based workflows."
    376     },
    377     {
    378       "title": "AgentCoder: Multi-Agent-based Code Generation with Iterative Testing and Optimisation",
    379       "authors": ["Dong Huang", "Qingwen Bu", "Jie M Zhang"],
    380       "year": 2023,
    381       "arxiv_id": "2312.13010",
    382       "relevance": "Multi-agent code generation with iterative testing feedback loops."
    383     },
    384     {
    385       "title": "SWE-Search: Enhancing Software Agents with Monte Carlo Tree Search and Iterative Refinement",
    386       "authors": ["Antonis Antoniades", "Albert Örwall"],
    387       "year": 2024,
    388       "arxiv_id": "2410.20285",
    389       "relevance": "Multi-agent system for software engineering combining MCTS with LLM agents."
    390     },
    391     {
    392       "title": "CAMEL: Communicative Agents for 'Mind' Exploration of Large Language Model Society",
    393       "authors": ["Guohao Li", "Hasan Hammoud"],
    394       "year": 2024,
    395       "relevance": "Multi-agent communication framework for collaborative LLM agents."
    396     },
    397     {
    398       "title": "Fuzz4All: Universal Fuzzing with Large Language Models",
    399       "authors": ["Chunqiu Steven Xia", "Matteo Paltenghi"],
    400       "year": 2024,
    401       "relevance": "LMA system for fuzz testing across multiple programming languages."
    402     },
    403     {
    404       "title": "AgileCoder: Dynamic Collaborative Agents for Software Development based on Agile Methodology",
    405       "authors": ["Minh Huynh Nguyen", "Thang Phan Chau"],
    406       "year": 2024,
    407       "arxiv_id": "2406.11912",
    408       "relevance": "LMA framework applying Agile methodology with sprint-based collaboration."
    409     },
    410     {
    411       "title": "Agents4PLC: Automating Closed-loop PLC Code Generation and Verification in Industrial Control Systems using LLM-based Agents",
    412       "authors": ["Zihan Liu", "Ruinan Zeng"],
    413       "year": 2024,
    414       "arxiv_id": "2410.14209",
    415       "relevance": "Multi-agent system for industrial control system code generation and verification."
    416     },
    417     {
    418       "title": "BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions",
    419       "authors": ["Terry Yue Zhuo", "Minh Chien Vu"],
    420       "year": 2024,
    421       "arxiv_id": "2406.15877",
    422       "relevance": "Code generation benchmark relevant to evaluating LMA system capabilities."
    423     },
    424     {
    425       "title": "GPT-4 Technical Report",
    426       "authors": ["Josh Achiam", "Steven Adler"],
    427       "year": 2023,
    428       "arxiv_id": "2303.08774",
    429       "relevance": "Foundation model whose capabilities drive LMA system research."
    430     }
    431   ]
    432 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs