scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (19587B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "survey",
      4   "paper": {
      5     "title": "LLM-Based Multi-Agent Systems for Software Engineering: Vision and the Road Ahead",
      6     "authors": [
      7       "Junda He",
      8       "Christoph Treude",
      9       "David Lo"
     10     ],
     11     "year": 2024,
     12     "venue": "Unknown",
     13     "arxiv_id": "2404.04834",
     14     "doi": "10.48550/arXiv.2404.04834"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "All four stated contributions (systematic review of 71 papers, two case studies, gap identification, research agenda) are directly substantiated by paper content.",
     22         "source": "haiku"
     23       },
     24       "causal_claims_justified": {
     25         "applies": false,
     26         "answer": false,
     27         "justification": "The paper is a survey and vision piece with no causal claims; case study results are reported descriptively, not causally attributed.",
     28         "source": "haiku"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "Broad claims like 'LMA systems are expected to significantly speed up software development' and the 'Software Engineering 2.0' framing extend far beyond what two simple game-generation tasks and a narrative synthesis of 71 papers support.",
     34         "source": "haiku"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "Section 6.1 briefly compares LMA to Mixture of Experts, but the paper does not consider alternative explanations for either the case study failures or the broader capability assessments drawn from reviewed work.",
     40         "source": "haiku"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "Generating a playable Snake or Tetris game is used as evidence of LMA capability for software engineering broadly, with no discussion of how toy game generation proxies for real-world SE tasks.",
     46         "source": "haiku"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "Section 6.2 'Threat to Validity' exists as a dedicated subsection, though it is only a single paragraph.",
     54         "source": "haiku"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "Section 6.2 names only one threat (inadvertent exclusion of studies) with a generic mitigation; it ignores threats from single-database search, single-framework case studies, single LLM (GPT-3.5), and two trivial tasks.",
     60         "source": "haiku"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "The paper states its time window (post-ChatGPT) and the four SDLC phases covered but never explicitly states what the review does not show or what conclusions cannot be drawn from it.",
     66         "source": "haiku"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No funding acknowledgment or disclosure appears anywhere in the paper.",
     74         "source": "haiku"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "All three authors are listed with Singapore Management University affiliation and institutional email addresses on the title page.",
     80         "source": "haiku"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": false,
     84         "answer": false,
     85         "justification": "No funding is disclosed; criterion does not apply.",
     86         "source": "haiku"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests, patent, or financial interests statement appears in the paper.",
     92         "source": "haiku"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Sections 2.1–2.3 formally define 'autonomous agent', 'LLM-based autonomous agent' (with tuple notation ⟨L,O,M,P,A,R⟩), and 'LLM-Based Multi-Agent Systems' with orchestration platform and agent components.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Section 1 enumerates three bullet-pointed contributions explicitly: systematic review of 71 studies, two case studies, and a structured research agenda.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The paper cites 171 references, situates itself relative to prior LLM-for-SE surveys, and discusses how reviewed LMA frameworks (MetaGPT, ChatDev, AgileCoder) relate to and differ from each other.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "survey": {
    118       "search_and_selection": {
    119         "search_strategy_reproducible": {
    120           "applies": true,
    121           "answer": true,
    122           "justification": "The paper provides the full keyword matrix for both agent and SE dimensions, specifies DBLP as the database, states the exact search date (November 14, 2024), and describes the three-phase screening and snowballing procedure.",
    123           "source": "haiku"
    124         },
    125         "inclusion_exclusion_explicit": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "Eight criteria are explicitly listed using ! (inclusion) and % (exclusion) symbols with clear descriptions applied across three screening phases.",
    129           "source": "haiku"
    130         },
    131         "prisma_or_structured_protocol": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "The paper references prior methodology papers for SLR design but does not adopt or name PRISMA or any other formal structured review protocol.",
    135           "source": "haiku"
    136         },
    137         "search_terms_provided": {
    138           "applies": true,
    139           "answer": true,
    140           "justification": "Specific keyword strings are provided for both agent terms ('Agent OR LLM OR Large Language Model OR Collaborat') and each of the four SDLC phases with truncated variants.",
    141           "source": "haiku"
    142         },
    143         "databases_listed": {
    144           "applies": true,
    145           "answer": true,
    146           "justification": "DBLP is explicitly listed and described as indexing 7.5M publications; only one database is searched, which is a limitation but it is disclosed.",
    147           "source": "haiku"
    148         },
    149         "screening_process_documented": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "The paper reports final counts (41 from keyword search + 30 from snowballing = 71) but provides no intermediate counts showing how many papers were screened and excluded at each phase.",
    153           "source": "haiku"
    154         },
    155         "review_scope_justified": {
    156           "applies": true,
    157           "answer": false,
    158           "justification": "The post-ChatGPT time boundary is justified but the restriction to only DBLP and only four of many SDLC phases is not adequately justified; the choice to exclude design and deployment phases goes unremarked.",
    159           "source": "haiku"
    160         }
    161       },
    162       "synthesis_quality": {
    163         "conflicting_findings_acknowledged": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "The synthesis is predominantly positive; papers showing LMA failures or negative results are not systematically compared against those showing success.",
    167           "source": "haiku"
    168         },
    169         "quality_assessment_of_sources": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "Papers are included or excluded based on topic relevance but not assessed for methodological quality; no risk-of-bias tool, quality rubric, or structured evaluation of primary studies is applied.",
    173           "source": "haiku"
    174         },
    175         "publication_bias_discussed": {
    176           "applies": true,
    177           "answer": false,
    178           "justification": "Publication bias is not mentioned anywhere; the paper does not acknowledge that the reviewed literature may skew positive due to selective reporting of successful LMA results.",
    179           "source": "haiku"
    180         },
    181         "quantitative_synthesis_present": {
    182           "applies": true,
    183           "answer": false,
    184           "justification": "The synthesis is entirely narrative and descriptive; no meta-analysis, vote counting, effect size aggregation, or quantitative comparison across reviewed papers is performed.",
    185           "source": "haiku"
    186         },
    187         "recommendations_supported_by_evidence": {
    188           "applies": true,
    189           "answer": false,
    190           "justification": "The two-phase research agenda (Sections 5.1–5.2) consists largely of speculative future directions with detailed step-by-step proposals not grounded in specific findings from the 71 reviewed papers.",
    191           "source": "haiku"
    192         }
    193       }
    194     }
    195   },
    196   "claims": [
    197     {
    198       "claim": "LMA systems enable autonomous problem-solving, improve robustness, and provide scalable solutions for managing software project complexity.",
    199       "evidence": "Asserted in the introduction as motivation, supported only by citations to prior work; not empirically demonstrated in this paper.",
    200       "supported": "weak"
    201     },
    202     {
    203       "claim": "ChatDev successfully generated a fully functional Snake game meeting all prompt requirements within two iterations, averaging 76 seconds and $0.019 per attempt.",
    204       "evidence": "Direct observation in the case study (Section 4.1); screenshots provided.",
    205       "supported": "strong"
    206     },
    207     {
    208       "claim": "ChatDev required ten attempts to produce a Tetris game that still lacked core functionality (row removal), highlighting limitations of current LMA systems on complex tasks.",
    209       "evidence": "Direct observation in case study (Section 4.2); screenshots provided showing missing row-removal state.",
    210       "supported": "strong"
    211     },
    212     {
    213       "claim": "Multi-agent debate and discussion mechanisms enhance factuality, reasoning, and divergent thinking in LLMs.",
    214       "evidence": "Cited from Du et al. (2023) and Liang et al. (2023); not an original finding of this paper.",
    215       "supported": "moderate"
    216     },
    217     {
    218       "claim": "Current ChatGPT-based agents lack nuanced expertise for specialized SE roles such as vulnerability detection and security auditing.",
    219       "evidence": "Cited from Chen et al. (2024), Fu et al. (2023), and Sridhara et al. (2023); not empirically verified here.",
    220       "supported": "moderate"
    221     },
    222     {
    223       "claim": "Most existing LMA systems operate with static architectures of fixed agent roles and predefined communication patterns, lacking true dynamic adaptation.",
    224       "evidence": "Asserted in Section 5.2.5 based on survey of reviewed papers (MetaGPT, ChatDev); supported by the literature review but without systematic evidence.",
    225       "supported": "moderate"
    226     }
    227   ],
    228   "methodology_tags": [
    229     "qualitative",
    230     "case-study"
    231   ],
    232   "key_findings": "This paper reviews 71 primary studies on LLM-based multi-agent (LMA) systems for software engineering, organized across requirements engineering, code generation, quality assurance, maintenance, and end-to-end development. Two case studies using ChatDev show that current LMA systems can handle moderately complex tasks (Snake game generated successfully in ~2 attempts) but fail on higher complexity (Tetris required 10 attempts and still omits row removal). The paper identifies eight research gaps and proposes a two-phase agenda: enhancing individual agent capabilities via role specialization and agent-oriented prompting languages, and optimizing agent synergy through human-agent collaboration design, collaborative benchmarking, scalability mechanisms, industry-principle adoption, dynamic adaptation, and privacy-preserving data management.",
    233   "red_flags": [
    234     {
    235       "flag": "Single-database search",
    236       "detail": "Only DBLP was searched; major sources including ACM Digital Library, IEEE Xplore, Scopus, and direct arXiv search were excluded, likely missing a substantial portion of relevant work."
    237     },
    238     {
    239       "flag": "No quality assessment of sources",
    240       "detail": "The 71 reviewed papers are not assessed for methodological quality; all papers are treated equivalently in the narrative regardless of rigor."
    241     },
    242     {
    243       "flag": "Case study overgeneralization",
    244       "detail": "Two toy game-generation tasks using a single framework (ChatDev, GPT-3.5-turbo) are used to characterize the state of all LMA systems; the sample is not representative."
    245     },
    246     {
    247       "flag": "Minimal threats to validity",
    248       "detail": "Section 6.2 identifies only one threat (study exclusion risk) and ignores threats from single-database bias, single LLM, two trivial tasks, and cherry-picking positive-leaning literature."
    249     },
    250     {
    251       "flag": "No publication bias acknowledgment",
    252       "detail": "The survey makes no mention of publication bias despite reviewing a fast-moving field where negative results are rarely published, likely inflating the apparent effectiveness of LMA systems."
    253     },
    254     {
    255       "flag": "No funding disclosure",
    256       "detail": "No funding source is disclosed anywhere in the paper."
    257     }
    258   ],
    259   "cited_papers": [
    260     {
    261       "title": "MetaGPT: Meta Programming for Multi-Agent Collaborative Framework",
    262       "relevance": "Primary example of Waterfall-based LMA system for SE; extensively discussed in code generation and end-to-end sections."
    263     },
    264     {
    265       "title": "ChatDev: Communicative Agents for Software Development",
    266       "relevance": "Central to the paper — used in the literature review and as the sole framework in both case studies."
    267     },
    268     {
    269       "title": "Large Language Models for Software Engineering: A Systematic Literature Review",
    270       "relevance": "Closely related prior survey providing context for LLM limitations in SE roles; cited for capability gap analysis."
    271     },
    272     {
    273       "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation Framework",
    274       "relevance": "Major multi-agent framework discussed in the prompting and orchestration sections."
    275     },
    276     {
    277       "title": "AgileCoder: Dynamic Collaborative Agents for Software Development based on Agile Methodology",
    278       "relevance": "Key reviewed LMA framework representing Agile-based SE agent design."
    279     },
    280     {
    281       "title": "SWE-Search: Enhancing Software Agents with Monte Carlo Tree Search and Iterative Refinement",
    282       "relevance": "Advanced debugging/maintenance LMA system using MCTS, reviewed as a state-of-the-art approach."
    283     },
    284     {
    285       "title": "Trustworthy and Synergistic Artificial Intelligence for Software Engineering: Vision and Roadmaps",
    286       "relevance": "Foundational 'SE 2.0' vision paper by co-author David Lo that motivates the entire framing of this survey."
    287     },
    288     {
    289       "title": "A Survey on Large Language Model based Autonomous Agents",
    290       "relevance": "Broader LLM agent survey that contextualizes the SE-specific focus and formal agent definitions used in Section 2."
    291     }
    292   ],
    293   "engagement_factors": {
    294     "practical_relevance": {
    295       "score": 2,
    296       "justification": "Provides a structured map of LMA frameworks across SDLC stages and a concrete research agenda useful for both researchers and practitioners evaluating LMA adoption."
    297     },
    298     "surprise_contrarian": {
    299       "score": 1,
    300       "justification": "The Tetris failure (10 attempts, missing row removal) illustrates LMA limitations concretely, but the overall picture — LMA promising but not production-ready — is not surprising."
    301     },
    302     "fear_safety": {
    303       "score": 1,
    304       "justification": "Section 5.2.6 raises privacy and data sovereignty concerns for multi-organizational LMA deployment, framed as a research challenge rather than an alarm."
    305     },
    306     "drama_conflict": {
    307       "score": 1,
    308       "justification": "The LMA vs. Mixture of Experts comparison in Section 6.1 creates mild intellectual tension, but the paper avoids controversy."
    309     },
    310     "demo_ability": {
    311       "score": 2,
    312       "justification": "Case studies use ChatDev (publicly available, open source) to generate Snake and Tetris games — directly reproducible with the original ChatDev repository."
    313     },
    314     "brand_recognition": {
    315       "score": 1,
    316       "justification": "Authors are from Singapore Management University; David Lo and Christoph Treude are well-known SE researchers but this is not a top-tier industry lab publication."
    317     }
    318   },
    319   "hn_data": {
    320     "threads": [
    321       {
    322         "hn_id": "43685817",
    323         "title": "All-in-Memory Stochastic Computing Using ReRAM",
    324         "points": 30,
    325         "comments": 1,
    326         "url": "https://news.ycombinator.com/item?id=43685817",
    327         "created_at": "2025-04-14T20:14:29Z"
    328       },
    329       {
    330         "hn_id": "44934611",
    331         "title": "Scientific and technological knowledge grows linearly over time",
    332         "points": 4,
    333         "comments": 0,
    334         "url": "https://news.ycombinator.com/item?id=44934611",
    335         "created_at": "2025-08-17T20:22:11Z"
    336       },
    337       {
    338         "hn_id": "40671396",
    339         "title": "Doing Battle with the Sun: Lessons from Low Earth Orbit",
    340         "points": 3,
    341         "comments": 1,
    342         "url": "https://news.ycombinator.com/item?id=40671396",
    343         "created_at": "2024-06-13T16:07:32Z"
    344       },
    345       {
    346         "hn_id": "42566444",
    347         "title": "DeepSeek-V2: A Strong, Economical, and Efficient MOE Language Model",
    348         "points": 3,
    349         "comments": 0,
    350         "url": "https://news.ycombinator.com/item?id=42566444",
    351         "created_at": "2025-01-01T15:10:28Z"
    352       },
    353       {
    354         "hn_id": "39305148",
    355         "title": "Long Is More for Alignment: A Simple Baseline for Instruction Fine-Tuning",
    356         "points": 2,
    357         "comments": 1,
    358         "url": "https://news.ycombinator.com/item?id=39305148",
    359         "created_at": "2024-02-08T17:55:17Z"
    360       },
    361       {
    362         "hn_id": "44710581",
    363         "title": "Futureproof Static Memory Planning",
    364         "points": 2,
    365         "comments": 0,
    366         "url": "https://news.ycombinator.com/item?id=44710581",
    367         "created_at": "2025-07-28T13:16:11Z"
    368       },
    369       {
    370         "hn_id": "41128425",
    371         "title": "Things Come from Having Many Good Models",
    372         "points": 2,
    373         "comments": 0,
    374         "url": "https://news.ycombinator.com/item?id=41128425",
    375         "created_at": "2024-08-01T12:25:07Z"
    376       },
    377       {
    378         "hn_id": "39032813",
    379         "title": "Adapting Standard Retrieval Benchmarks to Evaluate Generated Answers",
    380         "points": 1,
    381         "comments": 0,
    382         "url": "https://news.ycombinator.com/item?id=39032813",
    383         "created_at": "2024-01-17T20:13:37Z"
    384       },
    385       {
    386         "hn_id": "26741246",
    387         "title": "The Production and Consumption of Social Media",
    388         "points": 1,
    389         "comments": 0,
    390         "url": "https://news.ycombinator.com/item?id=26741246",
    391         "created_at": "2021-04-08T17:08:04Z"
    392       }
    393     ],
    394     "top_points": 30,
    395     "total_points": 48,
    396     "total_comments": 3
    397   }
    398 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs