scan.json (22720B)
1 { 2 "paper": { 3 "title": "Designing LLM-based Multi-Agent Systems for Software Engineering Tasks: Quality Attributes, Design Patterns and Rationale", 4 "authors": [ 5 "Yangxiao Cai", 6 "Ruiyin Li", 7 "Peng Liang", 8 "Mojtaba Shahin", 9 "Zengyang Li" 10 ], 11 "year": 2025, 12 "venue": "ACM Transactions on Software Engineering and Methodology", 13 "arxiv_id": "2511.08475", 14 "doi": "10.48550/arXiv.2511.08475" 15 }, 16 "scan_version": 3, 17 "active_modules": [ 18 "survey_methodology" 19 ], 20 "methodology_tags": [ 21 "meta-analysis", 22 "qualitative" 23 ], 24 "key_findings": "This systematic study of 94 papers on LLM-based multi-agent systems for SE tasks finds that Code Generation is the most common task (47.9%), Functional Suitability the most prioritized quality attribute (94.7%), Role-Based Cooperation the most used design pattern (46.8%), and Improving the Quality of Generated Code the most common design rationale (44.7%). The study identifies 10 SE task categories, 16 design patterns, and 8 design rationale categories, providing mapping relationships among them.", 25 "checklist": { 26 "artifacts": { 27 "code_released": { 28 "applies": true, 29 "answer": true, 30 "justification": "The dataset is publicly available at a GitHub repository [5] (https://github.com/Caiyangxiao/MASDesign), containing extraction results in MS Excel and MAXQDA files." 31 }, 32 "data_released": { 33 "applies": true, 34 "answer": true, 35 "justification": "The paper states 'The dataset of this work has been made available at [5]' and the formal data extraction results are recorded in publicly available MS Excel and MAXQDA files." 36 }, 37 "environment_specified": { 38 "applies": true, 39 "answer": false, 40 "justification": "No environment or dependency specifications are provided. The study is primarily manual qualitative analysis, but the dataset format and tools (MAXQDA, MS Excel) are not specified with versions." 
41 }, 42 "reproduction_instructions": { 43 "applies": true, 44 "answer": false, 45 "justification": "No step-by-step reproduction instructions are provided. While the methodology is described, there are no specific instructions for replicating the analysis process." 46 } 47 }, 48 "statistical_methodology": { 49 "confidence_intervals_or_error_bars": { 50 "applies": false, 51 "answer": false, 52 "justification": "This is a qualitative survey/mapping study using open coding and constant comparison. It reports frequencies and percentages, not experimental measurements requiring confidence intervals." 53 }, 54 "significance_tests": { 55 "applies": false, 56 "answer": false, 57 "justification": "No statistical comparisons are made. The paper reports descriptive frequencies of SE tasks, QAs, design patterns, and rationale." 58 }, 59 "effect_sizes_reported": { 60 "applies": false, 61 "answer": false, 62 "justification": "No experimental effects are measured. The paper is a qualitative mapping study." 63 }, 64 "sample_size_justified": { 65 "applies": true, 66 "answer": false, 67 "justification": "The sample of 94 papers is described but no justification is given for why this size is sufficient for the claimed comprehensiveness. No saturation analysis for the open coding is reported." 68 }, 69 "variance_reported": { 70 "applies": false, 71 "answer": false, 72 "justification": "No experimental runs are conducted. The study is qualitative analysis of papers." 73 } 74 }, 75 "evaluation_design": { 76 "baselines_included": { 77 "applies": true, 78 "answer": true, 79 "justification": "The paper compares against and positions itself relative to prior surveys by Liu et al. [22], Wang et al. [32], Chen et al. [7], Yan et al. [33], Yu et al. [34], and He et al. [13], with a dedicated comparison in Section 2.4." 
80 }, 81 "baselines_contemporary": { 82 "applies": true, 83 "answer": true, 84 "justification": "The compared surveys are all from 2024-2025, which are recent and relevant works in the same domain." 85 }, 86 "ablation_study": { 87 "applies": false, 88 "answer": false, 89 "justification": "This is a mapping study with no system components to ablate." 90 }, 91 "multiple_metrics": { 92 "applies": false, 93 "answer": false, 94 "justification": "This is a qualitative mapping study, not an experiment with metrics." 95 }, 96 "human_evaluation": { 97 "applies": false, 98 "answer": false, 99 "justification": "Human evaluation of system outputs is not applicable to a survey/mapping study." 100 }, 101 "held_out_test_set": { 102 "applies": false, 103 "answer": false, 104 "justification": "No test set is involved; this is a qualitative mapping study." 105 }, 106 "per_category_breakdown": { 107 "applies": true, 108 "answer": true, 109 "justification": "Results are broken down by SE task category (10 categories), quality attribute category (8 main + sub-categories per ISO/IEC 25010), design pattern (16 patterns), and design rationale (8 categories)." 110 }, 111 "failure_cases_discussed": { 112 "applies": true, 113 "answer": false, 114 "justification": "The paper does not discuss limitations of specific categorization decisions, edge cases in coding, or papers that were difficult to classify." 115 }, 116 "negative_results_reported": { 117 "applies": true, 118 "answer": true, 119 "justification": "The paper notes underrepresented areas such as Security (10.6%) and Interaction Capability (9.6%) being rarely considered, and identifies gaps like the lack of end-to-end lifecycle benchmarks." 
120 } 121 }, 122 "claims_and_evidence": { 123 "abstract_claims_supported": { 124 "applies": true, 125 "answer": true, 126 "justification": "The abstract claims about Code Generation being the most common SE task (47.9%), Functional Suitability being the most considered QA (94.7%), Role-Based Cooperation being the most used pattern (46.8%), and Improving the Quality of Generated Code being the most common rationale (44.7%) are all supported by the data in Section 4." 127 }, 128 "causal_claims_justified": { 129 "applies": false, 130 "answer": false, 131 "justification": "The paper is descriptive, reporting what was found in the literature. It does not make causal claims about why certain patterns lead to certain outcomes." 132 }, 133 "generalization_bounded": { 134 "applies": true, 135 "answer": false, 136 "justification": "The title claims broad coverage of 'LLM-based Multi-Agent Systems for Software Engineering Tasks' but the study is limited to 94 papers collected before September 2024 from two surveys and arXiv only. No bounding of generalization to this specific corpus and time window is provided in the abstract or conclusions." 137 }, 138 "alternative_explanations_discussed": { 139 "applies": true, 140 "answer": false, 141 "justification": "The paper does not discuss alternative explanations for its observed patterns, such as whether the dominance of Code Generation reflects genuine importance or simply publication bias toward easily benchmarked tasks." 142 }, 143 "proxy_outcome_distinction": { 144 "applies": true, 145 "answer": false, 146 "justification": "The paper counts papers as a proxy for design practice importance but does not discuss whether paper frequency reflects actual real-world adoption or just publication trends." 147 } 148 }, 149 "setup_transparency": { 150 "model_versions_specified": { 151 "applies": false, 152 "answer": false, 153 "justification": "No AI models are used in the methodology. This is a manual qualitative study." 
154 }, 155 "prompts_provided": { 156 "applies": false, 157 "answer": false, 158 "justification": "No prompting is used. This is a manual qualitative study." 159 }, 160 "hyperparameters_reported": { 161 "applies": false, 162 "answer": false, 163 "justification": "No AI models or hyperparameters are involved in the methodology." 164 }, 165 "scaffolding_described": { 166 "applies": false, 167 "answer": false, 168 "justification": "No agentic scaffolding is used in the study methodology." 169 }, 170 "data_preprocessing_documented": { 171 "applies": true, 172 "answer": true, 173 "justification": "Section 3.2 documents the data collection pipeline: 118 papers from Liu et al., 115 from Wang et al., 194 from arXiv keyword search, deduplication to 236, then filtering by 3 criteria to 94 papers. Section 3.3 describes pilot and formal data extraction with specific rules." 174 } 175 }, 176 "limitations_and_scope": { 177 "limitations_section_present": { 178 "applies": true, 179 "answer": true, 180 "justification": "Section 6 'Threats on Validity' discusses construct validity, external validity, and reliability threats." 181 }, 182 "threats_to_validity_specific": { 183 "applies": true, 184 "answer": true, 185 "justification": "The threats section discusses specific issues: individual bias in manual data extraction (mitigated by pilot extraction and multi-author review), data source selection limited to two surveys and arXiv, and methodological uncertainties resolved through author discussions." 186 }, 187 "scope_boundaries_stated": { 188 "applies": true, 189 "answer": false, 190 "justification": "The paper does not explicitly state what its results do NOT show. It does not bound claims to the specific time window (pre-September 2024), acknowledge that industrial/proprietary MAS designs are excluded, or note that its findings may not generalize beyond academic publications." 
191 } 192 }, 193 "data_integrity": { 194 "raw_data_available": { 195 "applies": true, 196 "answer": true, 197 "justification": "The raw extraction data is available in the public dataset [5] at GitHub, containing MS Excel and MAXQDA files." 198 }, 199 "data_collection_described": { 200 "applies": true, 201 "answer": true, 202 "justification": "Section 3.2 describes data collection: papers sourced from two surveys and arXiv keyword search ('large language model' AND 'agent') in the SE category, with three inclusion criteria." 203 }, 204 "recruitment_methods_described": { 205 "applies": false, 206 "answer": false, 207 "justification": "No human participants are involved. The data source is academic papers from standard sources." 208 }, 209 "data_pipeline_documented": { 210 "applies": true, 211 "answer": true, 212 "justification": "The pipeline is documented: 118 + 115 + 194 papers collected → deduplicated to 236 → filtered by 3 criteria to 94 papers. Section 3.3 describes pilot then formal extraction with review rounds." 213 } 214 }, 215 "conflicts_of_interest": { 216 "funding_disclosed": { 217 "applies": true, 218 "answer": true, 219 "justification": "Acknowledgments section states support from NSFC Grant No. 62402348 and 62172311, and Major Science and Technology Project of Hubei Province Grant No. 2024BAA008." 220 }, 221 "affiliations_disclosed": { 222 "applies": true, 223 "answer": true, 224 "justification": "All author affiliations are clearly listed: Wuhan University, RMIT University, and Central China Normal University." 225 }, 226 "funder_independent_of_outcome": { 227 "applies": true, 228 "answer": true, 229 "justification": "The funders (NSFC, Hubei Province) are government research agencies with no financial stake in the study outcomes." 230 }, 231 "financial_interests_declared": { 232 "applies": true, 233 "answer": false, 234 "justification": "No competing interests statement is present in the paper." 
235 } 236 }, 237 "contamination": { 238 "training_cutoff_stated": { 239 "applies": false, 240 "answer": false, 241 "justification": "This is a survey/mapping study that does not evaluate any pre-trained model on a benchmark." 242 }, 243 "train_test_overlap_discussed": { 244 "applies": false, 245 "answer": false, 246 "justification": "No model evaluation is performed." 247 }, 248 "benchmark_contamination_addressed": { 249 "applies": false, 250 "answer": false, 251 "justification": "No model evaluation is performed." 252 } 253 }, 254 "human_studies": { 255 "pre_registered": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants are involved in this study." 259 }, 260 "irb_or_ethics_approval": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants are involved." 264 }, 265 "demographics_reported": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants are involved." 269 }, 270 "inclusion_exclusion_criteria": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants are involved." 274 }, 275 "randomization_described": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants are involved." 279 }, 280 "blinding_described": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants are involved." 284 }, 285 "attrition_reported": { 286 "applies": false, 287 "answer": false, 288 "justification": "No human participants are involved." 289 } 290 }, 291 "cost_and_practicality": { 292 "inference_cost_reported": { 293 "applies": false, 294 "answer": false, 295 "justification": "This is a survey paper with no computational method to cost." 296 }, 297 "compute_budget_stated": { 298 "applies": false, 299 "answer": false, 300 "justification": "This is a manual qualitative survey with no significant compute." 
301 } 302 }, 303 "survey_methodology": { 304 "prisma_or_structured_protocol": { 305 "applies": true, 306 "answer": true, 307 "justification": "Section 3.2 describes a structured search strategy: seeded from two prior surveys, supplemented with arXiv keyword search, with explicit inclusion criteria (3 criteria listed) and documented filtering from 236 to 94 papers. Figure 1 shows the research process overview." 308 }, 309 "quality_assessment_of_sources": { 310 "applies": true, 311 "answer": false, 312 "justification": "The survey does not assess the methodological quality of its 94 source papers. All papers are treated equally regardless of their rigor, venue quality, or evaluation strength." 313 }, 314 "publication_bias_discussed": { 315 "applies": true, 316 "answer": false, 317 "justification": "No discussion of publication bias. The paper does not consider whether the included studies skew toward positive results about MAS effectiveness, or whether negative results about MAS designs go unpublished." 
318 } 319 } 320 }, 321 "claims": [ 322 { 323 "claim": "Code Generation is the most common SE task addressed by LLM-based MASs (47.9% of 94 papers).", 324 "evidence": "Figure 2 and Table 2 show 45 of 94 papers address code generation, with detailed examples from each category.", 325 "supported": "strong" 326 }, 327 { 328 "claim": "Functional Suitability is the most commonly considered quality attribute (94.7%).", 329 "evidence": "Figure 3 and Table 3 show 89 of 94 papers consider Functional Suitability, with Functional Correctness as the dominant sub-attribute (86 papers).", 330 "supported": "strong" 331 }, 332 { 333 "claim": "Role-Based Cooperation is the most frequently employed design pattern (46.8%).", 334 "evidence": "Table 4 shows 44 papers use Role-Based Cooperation, with examples and study IDs listed.", 335 "supported": "strong" 336 }, 337 { 338 "claim": "Improving the Quality of Generated Code is the most common design rationale (44.7%).", 339 "evidence": "Table 5 shows 42 papers use this rationale, with specific examples cited.", 340 "supported": "strong" 341 }, 342 { 343 "claim": "LLM-based MASs are increasingly adopted to support the entire software lifecycle.", 344 "evidence": "15 papers address end-to-end development (7) or maintenance (8), discussed in Implication 4 (Section 5.2).", 345 "supported": "moderate" 346 } 347 ], 348 "red_flags": [ 349 { 350 "flag": "No quality assessment of included studies", 351 "detail": "The survey treats all 94 papers equally without assessing their methodological rigor. A paper with comprehensive evaluation and a position paper with no experiments contribute equally to the frequency counts, potentially laundering weak results." 352 }, 353 { 354 "flag": "Single-coder extraction bias", 355 "detail": "The first author conducted all formal data extraction independently, with second and third authors only reviewing. Despite a pilot study, subjective categorization by a single person introduces systematic bias. 
No inter-rater reliability metrics (e.g., Cohen's kappa) are reported." 356 }, 357 { 358 "flag": "Limited data sources", 359 "detail": "Papers were collected from only two prior surveys and arXiv. Major bibliographic databases like IEEE Xplore, ACM DL, and Scopus were not searched directly, potentially missing relevant work not covered by the seed surveys." 360 }, 361 { 362 "flag": "No publication bias consideration", 363 "detail": "The survey does not consider whether published papers skew toward successful MAS designs, potentially overstating the effectiveness of identified patterns and understating failure modes." 364 } 365 ], 366 "cited_papers": [ 367 { 368 "title": "MetaGPT: Meta Programming for A Multi-Agent Collaborative Framework", 369 "authors": [ 370 "Sirui Hong", 371 "Mingchen Zhuge", 372 "Jiaqi Chen" 373 ], 374 "year": 2023, 375 "arxiv_id": "2308.00352", 376 "relevance": "Major LLM-based MAS framework for end-to-end software development, frequently cited as exemplar of role-based cooperation." 377 }, 378 { 379 "title": "ChatDev: Communicative Agents for Software Development", 380 "authors": [ 381 "Chen Qian", 382 "Wei Liu", 383 "Hongzhang Liu" 384 ], 385 "year": 2024, 386 "relevance": "Communicative multi-agent system for software development using chat-based collaboration between role-specialized agents." 387 }, 388 { 389 "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", 390 "authors": [ 391 "Qingyun Wu", 392 "Gagan Bansal", 393 "Jieyu Zhang" 394 ], 395 "year": 2023, 396 "arxiv_id": "2308.08155", 397 "relevance": "Multi-agent conversation framework enabling RAG and tool use for code generation and question answering." 398 }, 399 { 400 "title": "SWE-AGENT: Agent-Computer Interfaces Enable Automated Software Engineering", 401 "authors": [ 402 "John Yang", 403 "Carlos E. 
Jimenez", 404 "Alexander Wettig" 405 ], 406 "year": 2024, 407 "arxiv_id": "2405.15793", 408 "relevance": "Defines agent-computer interfaces for automated SE, exemplifying design patterns for agent-environment interaction." 409 }, 410 { 411 "title": "Agent Design Pattern Catalogue: A Collection of Architectural Patterns for Foundation Model based Agents", 412 "authors": [ 413 "Yue Liu", 414 "Sin Kit Lo", 415 "Qinghua Lu" 416 ], 417 "year": 2025, 418 "relevance": "Provides the architectural pattern taxonomy used as starting point for design pattern classification in this study." 419 }, 420 { 421 "title": "Large Language Model-Based Agents for Software Engineering: A Survey", 422 "authors": [ 423 "Junwei Liu", 424 "Kaixin Wang", 425 "Yixuan Chen" 426 ], 427 "year": 2024, 428 "arxiv_id": "2409.02977", 429 "relevance": "One of two seed surveys used for data collection; surveys LLM-based agent systems for SE tasks." 430 }, 431 { 432 "title": "LLM-Based Multi-Agent Systems for Software Engineering: Literature Review, Vision and the Road Ahead", 433 "authors": [ 434 "Junda He", 435 "Christoph Treude", 436 "David Lo" 437 ], 438 "year": 2025, 439 "relevance": "Systematic literature review of LLM-based MASs for SE, proposing a research agenda for agent collaboration." 440 }, 441 { 442 "title": "Why Do Multi-Agent LLM Systems Fail?", 443 "authors": [ 444 "Mert Cemri", 445 "Melissa Z. Pan" 446 ], 447 "year": 2025, 448 "arxiv_id": "2503.13657", 449 "relevance": "Empirical study of failure modes in LLM-based MASs with taxonomy of failures from 200+ dialogues." 450 }, 451 { 452 "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?", 453 "authors": [ 454 "Carlos E. Jimenez", 455 "John Yang", 456 "Alexander Wettig" 457 ], 458 "year": 2024, 459 "relevance": "Major benchmark for evaluating LLM agents on real-world software engineering tasks." 
460 }, 461 { 462 "title": "A Survey on Trustworthy LLM Agents: Threats and Countermeasures", 463 "authors": [ 464 "Miao Yu", 465 "Fanci Meng", 466 "Xinyun Zhou" 467 ], 468 "year": 2025, 469 "arxiv_id": "2503.09648", 470 "relevance": "Survey on trustworthiness in LLM-based agents covering threats and countermeasures." 471 } 472 ], 473 "engagement_factors": { 474 "practical_relevance": { 475 "score": 2, 476 "justification": "Identifies 16 reusable design patterns and mapping relationships that practitioners building multi-agent SE systems can directly reference." 477 }, 478 "surprise_contrarian": { 479 "score": 0, 480 "justification": "Findings confirm expected patterns — code generation dominates, correctness matters most, role-based cooperation is common — with no counterintuitive results." 481 }, 482 "fear_safety": { 483 "score": 0, 484 "justification": "Security is mentioned as a minor quality attribute (10.6%) but no novel risks or vulnerabilities are demonstrated." 485 }, 486 "drama_conflict": { 487 "score": 0, 488 "justification": "A straightforward taxonomic survey with no controversy, no challenges to specific claims, and no conflict angle." 489 }, 490 "demo_ability": { 491 "score": 1, 492 "justification": "Dataset is publicly available on GitHub but there is no runnable tool, demo, or interactive artifact to try." 493 }, 494 "brand_recognition": { 495 "score": 0, 496 "justification": "From Wuhan University and RMIT — respected but not household-name labs — and the topic is an academic taxonomy rather than a famous product." 497 } 498 } 499 }