scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (19059B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "survey",
      4   "paper": {
      5     "title": "Designing LLM-based Multi-Agent Systems for Software Engineering Tasks: Quality Attributes, Design Patterns and Rationale",
      6     "authors": [
      7       "Yangxiao Cai",
      8       "Ruiyin Li",
      9       "Peng Liang",
     10       "Mojtaba Shahin",
     11       "Zengyang Li"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2511.08475",
     16     "doi": "10.48550/arXiv.2511.08475"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "All four main claims in the abstract (Code Generation 47.9%, Functional Suitability 94.7%, Role-Based Cooperation most common pattern at 46.8%, Improving Code Quality most common rationale at 44.7%) are directly supported by data tables in Sections 4.1–4.4.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The implications section makes causal-adjacent recommendations (e.g., 'Role-Based Cooperation can improve Maintainability') but the evidence is observational co-occurrence frequency from a literature mapping, which cannot support causal inference.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "Implications are framed as general design guidance without explicitly bounding conclusions to the 94-paper corpus from before September 2024 or the specific limited source databases (two surveys + arXiv only).",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The dominance of code generation (47.9%) could reflect benchmark availability or publication incentives rather than actual designer priorities; the paper does not consider such alternative explanations for the observed distributions.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper uses frequency of mentions in papers as a proxy for what designers 'mainly focus on' without acknowledging the gap between paper-level reporting and actual practitioner design priorities.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 6 'Threats to Validity' explicitly addresses construct validity, external validity, and reliability threats with dedicated discussion of each.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "The threats section cites specific mitigations: pilot extraction with 5 randomly selected papers, multi-round discussions among three named authors, and named specific sources (Liu et al. and Wang et al. surveys plus arXiv SE category).",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper states the date cutoff (Sept 30, 2024) and inclusion criteria but does not explicitly articulate what the results do NOT show, or bound the implications to the studied corpus and time period.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Funding is disclosed in acknowledgments: NSFC grants 62402348 and 62172311, and the Major Science and Technology Project of Hubei Province under Grant No. 2024BAA008.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are clearly listed: Wuhan University (Cai, R. Li, Liang), RMIT University (Shahin), and Central China Normal University (Z. Li).",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Funders are Chinese government science foundations (NSFC and Hubei Province), which have no direct commercial stake in the paper's taxonomic findings about LLM-based MAS design patterns.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interests declaration is present; there is no statement about patents, equity, or consulting relationships anywhere in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "LLM-based MASs are defined as 'multiple autonomous agents that collaborate through communication and responsibility specialization'; Quality Attributes are categorized per ISO/IEC 25010:2023; Design Patterns are defined as 'reusable solutions that help balance quality attributes'.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Three explicit contributions are listed in the introduction: (1) identifying SE tasks and QAs, (2) identifying design patterns and rationale, and (3) establishing mapping relationships among all four dimensions.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 2 provides structured review of related work across three subsections (LLM-based MASs for SE, MAS characteristics, prior surveys), and Section 2.4 explicitly articulates the research gap distinguishing this study from six named prior surveys.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "survey": {
    120       "search_and_selection": {
    121         "search_strategy_reproducible": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "The search query is provided ('large language model' AND 'agent' in arXiv SE category), the date cutoff (September 30, 2024) is specified, and the two prior surveys used as seed sources are named with citations.",
    125           "source": "haiku"
    126         },
    127         "inclusion_exclusion_explicit": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "Three explicit inclusion criteria are stated: papers must introduce at least one MAS, agents must leverage LLMs, and agents must address at least one SE task; these are applied consistently.",
    131           "source": "haiku"
    132         },
    133         "prisma_or_structured_protocol": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "No PRISMA or other named systematic review protocol is mentioned; the paper describes a custom 4-phase process without following an established review methodology or reporting guideline.",
    137           "source": "haiku"
    138         },
    139         "search_terms_provided": {
    140           "applies": true,
    141           "answer": true,
    142           "justification": "The search query is explicitly provided verbatim: (\"large language model\" AND \"agent\") applied to the arXiv SE category.",
    143           "source": "haiku"
    144         },
    145         "databases_listed": {
    146           "applies": true,
    147           "answer": true,
    148           "justification": "Sources are explicitly listed: the Liu et al. 2024 survey, the Wang et al. 2025 survey, and the arXiv SE category. However, major academic databases (IEEE Xplore, ACM DL, Scopus) were not searched.",
    149           "source": "haiku"
    150         },
    151         "screening_process_documented": {
    152           "applies": true,
    153           "answer": true,
    154           "justification": "Counts are provided at each major stage: 118 (Liu et al.) + 115 (Wang et al.) + 194 (arXiv) = 427 total, 236 after deduplication, 94 after applying inclusion criteria.",
    155           "source": "haiku"
    156         },
    157         "review_scope_justified": {
    158           "applies": true,
    159           "answer": false,
    160           "justification": "The paper justifies using arXiv and two surveys for comprehensiveness but does not justify excluding major academic databases (IEEE Xplore, ACM DL, Scopus), which is a significant coverage limitation left unaddressed.",
    161           "source": "haiku"
    162         }
    163       },
    164       "synthesis_quality": {
    165         "conflicting_findings_acknowledged": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The paper catalogues patterns across 94 papers without discussing contradictions or conflicts in findings among reviewed papers (e.g., no tension analysis is offered where papers advocate different patterns for the same SE task).",
    169           "source": "haiku"
    170         },
    171         "quality_assessment_of_sources": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "No quality rubric, risk-of-bias assessment, or structured quality evaluation is applied to the 94 included papers; all papers are treated as equally valid sources regardless of methodological rigor.",
    175           "source": "haiku"
    176         },
    177         "publication_bias_discussed": {
    178           "applies": true,
    179           "answer": false,
    180           "justification": "Publication bias is not discussed; the paper does not acknowledge that arXiv preprints and conference publications skew toward positive results, or that this may inflate apparent prevalence of certain design patterns.",
    181           "source": "haiku"
    182         },
    183         "quantitative_synthesis_present": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "The paper provides frequency counts and percentages across all four taxonomies and cross-dimensional mapping tables (e.g., Table 6 mapping QA sub-characteristics against SE tasks with paper counts), constituting vote-counting synthesis.",
    187           "source": "haiku"
    188         },
    189         "recommendations_supported_by_evidence": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "Each of the six implications in Section 5.2 references specific frequency findings (e.g., Implication 2 on Role-Based Cooperation cites the co-occurrence with Modularity across 46 papers), tying recommendations to observed data.",
    193           "source": "haiku"
    194         }
    195       }
    196     }
    197   },
    198   "claims": [
    199     {
    200       "claim": "Code Generation is the most common SE task addressed by LLM-based MASs at 47.9% of papers",
    201       "evidence": "45 of 94 papers focus on code generation, with all study references listed in Table 2",
    202       "supported": "strong"
    203     },
    204     {
    205       "claim": "Functional Suitability is the most frequently considered quality attribute, appearing in 94.7% of papers",
    206       "evidence": "89 of 94 papers explicitly consider Functional Suitability, with Functional Correctness the most common sub-QA (86 papers); documented in Table 3",
    207       "supported": "strong"
    208     },
    209     {
    210       "claim": "Role-Based Cooperation is the most commonly employed design pattern at 46.8% of papers",
    211       "evidence": "44 of 94 papers use Role-Based Cooperation, documented in Table 4 with full study citations",
    212       "supported": "strong"
    213     },
    214     {
    215       "claim": "Improving the Quality of Generated Code is the most common design rationale at 44.7% of papers",
    216       "evidence": "42 of 94 papers cite this rationale, listed with study references in Table 5",
    217       "supported": "strong"
    218     },
    219     {
    220       "claim": "End-to-end software lifecycle coverage remains nascent (7.4% development, 8.5% maintenance)",
    221       "evidence": "Only 7 papers on end-to-end development and 8 on end-to-end maintenance; attributed to lack of cross-stage benchmarks and evaluation criteria",
    222       "supported": "moderate"
    223     },
    224     {
    225       "claim": "Role-Based Cooperation improves Maintainability in LLM-based MASs",
    226       "evidence": "Observational co-occurrence: papers using role-based cooperation also consider modularity; no controlled comparison or causal study design",
    227       "supported": "weak"
    228     }
    229   ],
    230   "methodology_tags": [
    231     "qualitative",
    232     "meta-analysis"
    233   ],
    234   "key_findings": "A mapping study of 94 papers on LLM-based MASs for SE tasks reveals heavy concentration on Code Generation (48%), with Role-Based Cooperation (47%) and Self-Reflection (36%) as the dominant design patterns and Functional Correctness (92%) as the overwhelmingly prioritized quality attribute. Improving code quality is the primary design rationale (45%), with resource optimization and efficiency also prominent. End-to-end lifecycle support remains nascent (under 9% of papers) due to absent cross-stage benchmarks and evaluation criteria, while a taxonomy of 16 design patterns, 8 QAs, and 8 rationale categories provides the first structured mapping linking SE tasks to design decisions.",
    235   "red_flags": [
    236     {
    237       "flag": "Limited source databases",
    238       "detail": "The review relies on only two prior surveys and the arXiv SE category, omitting IEEE Xplore, ACM DL, and Scopus; work published only in peer-reviewed venues, with no arXiv preprint, may be systematically excluded."
    239     },
    240     {
    241       "flag": "No quality assessment of included papers",
    242       "detail": "All 94 papers are treated as equally credible sources regardless of study design or methodological rigor, conflating high-quality empirical evaluations with unvalidated system descriptions."
    243     },
    244     {
    245       "flag": "No PRISMA or established review protocol",
    246       "detail": "The review does not follow PRISMA or any named systematic review protocol, reducing reproducibility and making selection bias harder to assess."
    247     },
    248     {
    249       "flag": "Single-reviewer primary extraction without inter-rater reliability",
    250       "detail": "The first author conducted primary data extraction independently, with post-hoc review by co-authors only; no Cohen's kappa or other formal inter-rater reliability measure is reported."
    251     },
    252     {
    253       "flag": "Causal framing of observational findings",
    254       "detail": "Implications suggest patterns like Role-Based Cooperation 'improve' Maintainability, but evidence is observational co-occurrence frequency — no controlled evaluation of design pattern effects exists."
    255     },
    256     {
    257       "flag": "Publication bias unaddressed",
    258       "detail": "ArXiv self-selection and conference publication bias toward positive results are not discussed, potentially skewing the taxonomy toward approaches that appear to work rather than those commonly attempted."
    259     }
    260   ],
    261   "cited_papers": [
    262     {
    263       "title": "Large Language Model-Based Agents for Software Engineering: A Survey (Liu et al. 2024)",
    264       "relevance": "Primary seed source providing 118 of the initial paper pool; directly enables the starting corpus"
    265     },
    266     {
    267       "title": "Agents in Software Engineering: Survey, Landscape, and Vision (Wang et al. 2025)",
    268       "relevance": "Second seed source providing 115 papers; co-constitutes the initial corpus"
    269     },
    270     {
    271       "title": "Agent Design Pattern Catalogue: A Collection of Architectural Patterns for Foundation Model based Agents (Liu et al. 2025)",
    272       "relevance": "Used as classification framework for design patterns in RQ3; provides the starting taxonomy that this paper extends"
    273     },
    274     {
    275       "title": "MetaGPT: Meta Programming for A Multi-Agent Collaborative Framework (Hong et al. 2023)",
    276       "relevance": "Prominent example system cited across multiple findings including end-to-end development, role-based cooperation, and self-reflection"
    277     },
    278     {
    279       "title": "ChatDev: Communicative Agents for Software Development (Qian et al. 2024)",
    280       "relevance": "Key example of end-to-end software development MAS, cited for chat-chain task decomposition and role assignment"
    281     },
    282     {
    283       "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation (Wu et al.)",
    284       "relevance": "Prominent MAS framework used as example across multiple QAs, design patterns (RAG, Role-Based Cooperation), and rationale"
    285     },
    286     {
    287       "title": "LLM-Based Multi-Agent Systems for Software Engineering: Literature Review, Vision and the Road Ahead (He et al. 2025)",
    288       "relevance": "Related survey that this paper explicitly positions against to justify its distinct focus on design QAs and patterns rather than capabilities"
    289     },
    290     {
    291       "title": "ISO/IEC 25010:2023 Systems and software Quality Requirements and Evaluation",
    292       "relevance": "Foundational standard used to categorize all quality attributes in RQ2; central methodological reference throughout the paper"
    293     }
    294   ],
    295   "engagement_factors": {
    296     "practical_relevance": {
    297       "score": 2,
    298       "justification": "The taxonomy of 16 design patterns, 8 QAs, and mapping tables provides actionable checklists for practitioners designing LLM-based MASs for specific SE tasks."
    299     },
    300     "surprise_contrarian": {
    301       "score": 0,
    302       "justification": "Findings confirm conventional expectations: code generation dominates, role-based cooperation and self-reflection are most common — no surprising or contrarian results are surfaced."
    303     },
    304     "fear_safety": {
    305       "score": 0,
    306       "justification": "Security is among the least-discussed QAs (10.6%) and no significant AI safety risks are surfaced by the survey findings."
    307     },
    308     "drama_conflict": {
    309       "score": 0,
    310       "justification": "Purely descriptive taxonomy study with no controversy, competing claims between research groups, or inter-community conflict."
    311     },
    312     "demo_ability": {
    313       "score": 1,
    314       "justification": "Dataset is publicly available on GitHub (Caiyangxiao/MASDesign) allowing inspection of extraction results, though the paper itself is not a runnable artifact."
    315     },
    316     "brand_recognition": {
    317       "score": 0,
    318       "justification": "Authors are from Wuhan University, RMIT, and Central China Normal University — no famous lab or industry-affiliated product involved."
    319     }
    320   },
    321   "hn_data": {
    322     "threads": [
    323       {
    324         "hn_id": "29250329",
    325         "title": "Free Will Belief as a Consequence of Model-Based Reinforcement Learning",
    326         "points": 2,
    327         "comments": 4,
    328         "url": "https://news.ycombinator.com/item?id=29250329",
    329         "created_at": "2021-11-17T08:19:50Z"
    330       },
    331       {
    332         "hn_id": "33617429",
    333         "title": "High-level synthesis for packet processing pipelines",
    334         "points": 2,
    335         "comments": 0,
    336         "url": "https://news.ycombinator.com/item?id=33617429",
    337         "created_at": "2022-11-16T00:50:17Z"
    338       },
    339       {
    340         "hn_id": "10672783",
    341         "title": "Reverse Engineering Intel DRAM Addressing and Exploitation",
    342         "points": 2,
    343         "comments": 0,
    344         "url": "https://news.ycombinator.com/item?id=10672783",
    345         "created_at": "2015-12-03T21:28:18Z"
    346       }
    347     ],
    348     "top_points": 2,
    349     "total_points": 6,
    350     "total_comments": 4
    351   }
    352 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs