scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (18907B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "survey",
      4   "paper": {
      5     "title": "Designing LLM-based Multi-Agent Systems for Software Engineering Tasks: Quality Attributes, Design Patterns and Rationale",
      6     "authors": [
      7       "Yangxiao Cai",
      8       "Ruiyin Li",
      9       "Peng Liang",
     10       "Mojtaba Shahin",
     11       "Zengyang Li"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2511.08475",
     16     "doi": "10.48550/arXiv.2511.08475"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The abstract claims about Code Generation being the most common SE task (47.9%), Functional Suitability being the most considered QA (94.7%), Role-Based Cooperation being the most used pattern (46.8%), and Improving the Quality of Generated Code being the most common rationale (44.7%) are all supported by the data in Section 4.",
     24         "source": "opus"
     25       },
     26       "causal_claims_justified": {
     27         "applies": false,
     28         "answer": false,
     29         "justification": "The paper is descriptive, reporting what was found in the literature. It does not make causal claims about why certain patterns lead to certain outcomes.",
     30         "source": "opus"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The title claims broad coverage of 'LLM-based Multi-Agent Systems for Software Engineering Tasks' but the study is limited to 94 papers collected before September 30, 2024 from two surveys and arXiv only. No bounding of generalization to this specific corpus and time window is provided in the abstract or conclusions.",
     36         "source": "opus"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper does not discuss alternative explanations for its observed patterns, such as whether the dominance of Code Generation reflects genuine importance or simply publication bias toward easily benchmarked tasks.",
     42         "source": "opus"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper counts papers as a proxy for design practice importance but does not discuss whether paper frequency reflects actual real-world adoption or just publication trends.",
     48         "source": "opus"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 6 'Threats to Validity' discusses construct validity, external validity, and reliability threats.",
     56         "source": "opus"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "The threats section discusses specific issues: individual bias in manual data extraction (mitigated by pilot extraction and multi-author review), data source selection limited to two surveys and arXiv, and methodological uncertainties resolved through author discussions.",
     62         "source": "opus"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper does not explicitly state what its results do NOT show. It does not bound claims to the specific time window (before September 30, 2024), acknowledge that industrial/proprietary MAS designs are excluded, or note that its findings may not generalize beyond academic publications.",
     68         "source": "opus"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Acknowledgments section states support from NSFC Grant No. 62402348 and 62172311, and Major Science and Technology Project of Hubei Province Grant No. 2024BAA008.",
     76         "source": "opus"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All author affiliations are clearly listed: Wuhan University, RMIT University, and Central China Normal University.",
     82         "source": "opus"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "The funders (NSFC, Hubei Province) are government research agencies with no financial stake in the study outcomes.",
     88         "source": "opus"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement is present in the paper.",
     94         "source": "opus"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Key terms are explicitly defined: LLM-based MASs (multiple autonomous agents collaborating via communication and responsibility specialization), Quality Attributes (per ISO/IEC 25010:2023), and Design Rationale (guiding principles behind design choices).",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Three bullet-point contributions are explicitly enumerated: (1) SE task/QA identification, (2) design patterns and rationale extraction, (3) mapping relationships established among SE tasks, QAs, design patterns, and rationale.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 2 engages substantively with six related works across three sub-categories, explicitly comparing each to this work and articulating the gap (no prior study examined QAs + design patterns + rationale jointly for LLM-based MASs).",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "survey": {
    120       "search_and_selection": {
    121         "search_strategy_reproducible": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "The search is reproducible: two named seed surveys (Liu et al. 2024, Wang et al. 2025) plus arXiv SE category with the exact query '(\"large language model\" AND \"agent\")' for papers before September 30, 2024.",
    125           "source": "haiku"
    126         },
    127         "inclusion_exclusion_explicit": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "Three explicit inclusion criteria are stated and numbered: (1) must introduce at least one MAS, (2) agents must leverage LLMs, (3) must address at least one SE task.",
    131           "source": "haiku"
    132         },
    133         "prisma_or_structured_protocol": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "No PRISMA flowchart or equivalent structured review protocol is used. The paper describes a research process diagram (Figure 1) but does not follow PRISMA or cite any equivalent SLR methodology standard.",
    137           "source": "haiku"
    138         },
    139         "search_terms_provided": {
    140           "applies": true,
    141           "answer": true,
    142           "justification": "The exact search query is provided verbatim: '(\"large language model\" AND \"agent\")' applied to arXiv's SE category.",
    143           "source": "haiku"
    144         },
    145         "databases_listed": {
    146           "applies": true,
    147           "answer": true,
    148           "justification": "Sources are explicitly listed: Liu et al. (2024) survey, Wang et al. (2025) survey, and arXiv's SE category (arXiv is operated by Cornell University). No other databases were searched.",
    149           "source": "haiku"
    150         },
    151         "screening_process_documented": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "Initial counts per source are given (118 + 115 + 194 = 427, deduped to 236, final 94) but the paper does not break down how many papers were excluded by each criterion, leaving the 236→94 reduction undocumented.",
    155           "source": "haiku"
    156         },
    157         "review_scope_justified": {
    158           "applies": true,
    159           "answer": false,
    160           "justification": "The time cutoff is justified only as 'when we started this study.' The decision to search only arXiv and two surveys—excluding ACM DL, IEEE Xplore, and Springer—is not justified, which is a significant scope limitation.",
    161           "source": "haiku"
    162         }
    163       },
    164       "synthesis_quality": {
    165         "conflicting_findings_acknowledged": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The paper is a descriptive mapping study presenting frequency counts without acknowledging any contradictions or conflicting design approaches across the 94 papers or tensions between QAs.",
    169           "source": "haiku"
    170         },
    171         "quality_assessment_of_sources": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "No quality assessment of the 94 included papers is performed. All papers are treated as equal-weight sources regardless of methodological rigor, peer-review status, or empirical validity.",
    175           "source": "haiku"
    176         },
    177         "publication_bias_discussed": {
    178           "applies": true,
    179           "answer": false,
    180           "justification": "Publication bias is not discussed. The paper does not acknowledge that published/preprinted papers may systematically over-represent certain design patterns, QAs, or SE tasks due to reporting norms or positive framing.",
    181           "source": "haiku"
    182         },
    183         "quantitative_synthesis_present": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "Frequency counts and percentages are provided for all taxonomy categories and mapped across dimensions (Tables 2–6, Figures 2–4), constituting basic quantitative synthesis through vote counting, though not meta-analysis or effect size aggregation.",
    187           "source": "haiku"
    188         },
    189         "recommendations_supported_by_evidence": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "All six implications are explicitly tied to specific frequency findings (e.g., Implication 2 on Role-Based Cooperation and Maintainability directly cites Modularity being the top sub-QA and Role-Based Cooperation being the top pattern).",
    193           "source": "haiku"
    194         }
    195       }
    196     }
    197   },
    198   "claims": [
    199     {
    200       "claim": "Code Generation is the most common SE task addressed by LLM-based MASs (47.9%, 45/94 papers)",
    201       "evidence": "Figure 2 and Table 2 show 45 of 94 papers address Code Generation; next is Fault Localization at 9.6%",
    202       "supported": "strong"
    203     },
    204     {
    205       "claim": "Functional Suitability is the QA most frequently considered by designers (94.7%, 89/94 papers)",
    206       "evidence": "Figure 3 and Table 3 show 89 papers consider Functional Suitability, with Functional Correctness in 86 papers",
    207       "supported": "strong"
    208     },
    209     {
    210       "claim": "Role-Based Cooperation is the most frequently employed design pattern (46.8%, 44/94 papers)",
    211       "evidence": "Table 4 lists 44 papers using Role-Based Cooperation, followed by Self-Reflection at 36.2%",
    212       "supported": "strong"
    213     },
    214     {
    215       "claim": "Improving the Quality of Generated Code is the predominant design rationale (44.7%, 42/94 papers)",
    216       "evidence": "Table 5 shows 42 papers cite this rationale, followed by Simulating Human Processes at 29.8%",
    217       "supported": "strong"
    218     },
    219     {
    220       "claim": "Security is a low-priority QA among LLM-based MAS designers, considered by only 10.6% of papers",
    221       "evidence": "Table 3 shows only 10 papers address Security (9 Confidentiality, 1 Integrity), the same proportion as Compatibility",
    222       "supported": "strong"
    223     },
    224     {
    225       "claim": "LLM-based MASs are increasingly adopted to support the entire software lifecycle",
    226       "evidence": "Only 7 papers address End-to-End Software Development and 8 address End-to-End Software Maintenance (15/94 total, 16%), which is modest evidence for 'increasing' adoption",
    227       "supported": "weak"
    228     }
    229   ],
    230   "methodology_tags": [
    231     "qualitative",
    232     "meta-analysis"
    233   ],
    234   "key_findings": "This systematic mapping study of 94 LLM-based MAS papers for SE tasks identifies that Code Generation dominates (47.9%), with Functional Suitability—especially Functional Correctness (91.5%)—as the primary design concern, Role-Based Cooperation as the dominant design pattern (46.8%), and improving code quality as the primary design rationale (44.7%). The paper provides taxonomic mappings across SE tasks, quality attributes (structured by ISO/IEC 25010:2023), design patterns, and rationale, with practical implications for practitioners. However, the survey does not assess the methodological quality of included papers, omits publication bias discussion, restricts search to two seed surveys plus arXiv (excluding ACM DL and IEEE Xplore), and lacks inter-rater reliability metrics for data extraction.",
    235   "red_flags": [
    236     {
    237       "flag": "No quality assessment of included papers",
    238       "detail": "All 94 papers are treated as equal-weight sources regardless of methodological rigor, peer-review status, or empirical quality. Design patterns extracted from position papers or weak empirical studies carry the same weight as patterns from rigorous studies."
    239     },
    240     {
    241       "flag": "Limited database coverage",
    242       "detail": "The search covers only arXiv's SE category plus two seed surveys, excluding ACM Digital Library, IEEE Xplore, and Springer. This likely misses published conference and journal papers that are neither posted on arXiv nor covered by those two surveys."
    243     },
    244     {
    245       "flag": "Publication bias not discussed",
    246       "detail": "The survey does not acknowledge that published papers may systematically over-report certain design choices, QAs, or rationale due to framing conventions, potentially distorting the frequency distributions reported."
    247     },
    248     {
    249       "flag": "Incomplete screening documentation",
    250       "detail": "The paper reports 236 papers before inclusion criteria and 94 after, but provides no stage-by-stage breakdown of exclusions by criterion, preventing assessment of how consistently criteria were applied."
    251     },
    252     {
    253       "flag": "Single-coder primary extraction",
    254       "detail": "The first author performed all primary data extraction; second and third authors only reviewed afterward. No independent parallel coding with inter-rater reliability metrics (e.g., Cohen's kappa) was reported."
    255     },
    256     {
    257       "flag": "Proxy measurement unacknowledged",
    258       "detail": "What designers documented in papers is treated as equivalent to what they actually prioritized, without acknowledging selective reporting: papers emphasize what differentiates them rather than comprehensively documenting all design decisions."
    259     }
    260   ],
    261   "cited_papers": [
    262     {
    263       "title": "Large Language Model-Based Agents for Software Engineering: A Survey",
    264       "relevance": "Primary seed survey providing 118 initial papers and the main baseline this work builds upon and differentiates from"
    265     },
    266     {
    267       "title": "Agents in Software Engineering: Survey, Landscape, and Vision",
    268       "relevance": "Second primary seed survey providing 115 initial papers; key baseline for scope comparison"
    269     },
    270     {
    271       "title": "Agent Design Pattern Catalogue: A Collection of Architectural Patterns for Foundation Model based Agents",
    272       "relevance": "Foundational taxonomy used as starting point for design pattern classification in RQ3"
    273     },
    274     {
    275       "title": "LLM-Based Multi-Agent Systems for Software Engineering: Literature Review, Vision and the Road Ahead",
    276       "relevance": "Closely related survey by He et al. covering 71 papers; directly compared to this work to establish the research gap"
    277     },
    278     {
    279       "title": "MetaGPT: Meta Programming for A Multi-Agent Collaborative Framework",
    280       "relevance": "Canonical example of LLM-based MAS for End-to-End Software Development; used as illustrative example throughout results"
    281     },
    282     {
    283       "title": "ChatDev: Communicative Agents for Software Development",
    284       "relevance": "Key example of Role-Based Cooperation pattern and chat-chain task decomposition rationale"
    285     },
    286     {
    287       "title": "A Survey on LLM-based Multi-Agent System: Recent Advances and New Frontiers in Application",
    288       "relevance": "Related survey covering 125 papers from top AI venues 2023-2024; discussed in related work section"
    289     },
    290     {
    291       "title": "Why Do Multi-Agent LLM Systems Fail?",
    292       "relevance": "Empirical study of MAS failure modes through 200+ dialogues; provides context for reliability and fault tolerance QA findings"
    293     }
    294   ],
    295   "engagement_factors": {
    296     "practical_relevance": {
    297       "score": 2,
    298       "justification": "Identifies 16 reusable design patterns and mapping relationships that practitioners building multi-agent SE systems can directly reference."
    299     },
    300     "surprise_contrarian": {
    301       "score": 0,
    302       "justification": "Findings confirm expected patterns — code generation dominates, correctness matters most, role-based cooperation is common — with no counterintuitive results."
    303     },
    304     "fear_safety": {
    305       "score": 0,
    306       "justification": "Security is mentioned as a minor quality attribute (10.6%) but no novel risks or vulnerabilities are demonstrated."
    307     },
    308     "drama_conflict": {
    309       "score": 0,
    310       "justification": "A straightforward taxonomic survey with no controversy, no challenges to specific claims, and no conflict angle."
    311     },
    312     "demo_ability": {
    313       "score": 1,
    314       "justification": "Dataset is publicly available on GitHub but there is no runnable tool, demo, or interactive artifact to try."
    315     },
    316     "brand_recognition": {
    317       "score": 0,
    318       "justification": "From Wuhan University, RMIT University, and Central China Normal University — respected but not household-name labs — and the topic is an academic taxonomy rather than a famous product."
    319     }
    320   },
    321   "hn_data": {
    322     "threads": [
    323       {
    324         "hn_id": "29250329",
    325         "title": "Free Will Belief as a Consequence of Model-Based Reinforcement Learning",
    326         "points": 2,
    327         "comments": 4,
    328         "url": "https://news.ycombinator.com/item?id=29250329",
    329         "created_at": "2021-11-17T08:19:50Z"
    330       },
    331       {
    332         "hn_id": "33617429",
    333         "title": "High-level synthesis for packet processing pipelines",
    334         "points": 2,
    335         "comments": 0,
    336         "url": "https://news.ycombinator.com/item?id=33617429",
    337         "created_at": "2022-11-16T00:50:17Z"
    338       },
    339       {
    340         "hn_id": "10672783",
    341         "title": "Reverse Engineering Intel DRAM Addressing and Exploitation",
    342         "points": 2,
    343         "comments": 0,
    344         "url": "https://news.ycombinator.com/item?id=10672783",
    345         "created_at": "2015-12-03T21:28:18Z"
    346       }
    347     ],
    348     "top_points": 2,
    349     "total_points": 6,
    350     "total_comments": 4
    351   }
    352 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs