scan.json (30202B)
1 { 2 "paper": { 3 "title": "AGENTASK: Multi-Agent Systems Need to Ask", 4 "authors": [ 5 "Bohan Lin", 6 "Kuo Yang", 7 "Zelin Tan", 8 "Yingchuan Lai", 9 "Chen Zhang", 10 "Guibin Zhang", 11 "Xinlei Yu", 12 "Miao Yu", 13 "Xu Wang", 14 "Yudong Zhang", 15 "Yang Wang" 16 ], 17 "year": 2025, 18 "venue": "arXiv", 19 "arxiv_id": "2510.07593", 20 "doi": "10.48550/arXiv.2510.07593" 21 }, 22 "checklist": { 23 "artifacts": { 24 "code_released": { 25 "applies": true, 26 "answer": false, 27 "justification": "The abstract mentions a 'Code' link but no working URL or GitHub repository is provided anywhere in the paper text. The word 'Code' appears at the top of the abstract without an associated URL, which does not constitute a verifiable release." 28 }, 29 "data_released": { 30 "applies": true, 31 "answer": false, 32 "justification": "The 824 execution logs used for taxonomy annotation are not released. The benchmarks used for evaluation (GSM8K, MATH, MMLU, HumanEval, MBPP) are publicly available, but the annotated corpus from which the training data for AgentAsk was derived is not released." 33 }, 34 "environment_specified": { 35 "applies": true, 36 "answer": false, 37 "justification": "No requirements.txt, Dockerfile, or environment specification file is referenced or provided. Model names are given (Qwen-3-4B, Llama-3.2-3B, GPT-4o-mini-0718) but no dependency list or software environment details are included." 38 }, 39 "reproduction_instructions": { 40 "applies": true, 41 "answer": false, 42 "justification": "No step-by-step instructions for reproducing experiments are provided. Algorithm 1 provides pseudocode for training but this is not sufficient for a researcher to reproduce experimental results without the data, code, and environment." 
43 } 44 }, 45 "statistical_methodology": { 46 "confidence_intervals_or_error_bars": { 47 "applies": true, 48 "answer": true, 49 "justification": "Table 1 reports standard deviations (e.g., '94.52 ± 1.27') for the +GPT-5 and +AgentAsk conditions across five benchmarks, though the origin baselines are reported as single-run point estimates without uncertainty." 50 }, 51 "significance_tests": { 52 "applies": true, 53 "answer": false, 54 "justification": "The paper makes comparative claims (e.g., '+AgentAsk improves framework-level averages by about +3.29 to +3.45 points') but does not report any statistical significance tests (no p-values, t-tests, or equivalents). Comparisons are based solely on numeric differences." 55 }, 56 "effect_sizes_reported": { 57 "applies": true, 58 "answer": false, 59 "justification": "Improvements are reported in percentage-point differences (e.g., +3.29 to +3.45 points) relative to baselines, but no standardized effect sizes (Cohen's d, odds ratios) are reported. The improvements look modest (3-5 pp) but without reference to variance or baseline context this is hard to interpret." 60 }, 61 "sample_size_justified": { 62 "applies": true, 63 "answer": false, 64 "justification": "The paper annotates 824 execution logs for the taxonomy but provides no justification for why 824 was sufficient for the claims about error distribution. The evaluation uses standard benchmarks with no sample-size justification." 65 }, 66 "variance_reported": { 67 "applies": true, 68 "answer": true, 69 "justification": "Standard deviations are reported in Table 1 for +GPT-5 and +AgentAsk conditions (e.g., '94.52 ± 1.27'), providing spread information for those results. However, the origin baselines report single point estimates without variance." 
70 } 71 }, 72 "evaluation_design": { 73 "baselines_included": { 74 "applies": true, 75 "answer": true, 76 "justification": "Multiple baselines are included: single-model prompting (IO, CoT, Self-Refine) and four multi-agent frameworks (GPTSwarm, AFlow, MaAS, MasRouter). AgentAsk is compared against both the unmodified frameworks and a +GPT-5 upper bound." 77 }, 78 "baselines_contemporary": { 79 "applies": true, 80 "answer": true, 81 "justification": "The multi-agent baselines (GPTSwarm, AFlow, MaAS, MasRouter) are all recent papers from 2024-2025. The paper also compares against a +GPT-5 upper bound clarifier. These are competitive contemporary baselines." 82 }, 83 "ablation_study": { 84 "applies": true, 85 "answer": true, 86 "justification": "Section 5.4 and Table 3 present ablation experiments removing individual reward components (rpar, reff, rfmt) and comparing to R-only training. Table 9 compares E-GRPO against PPO and GRPO across four frameworks and five benchmarks." 87 }, 88 "multiple_metrics": { 89 "applies": true, 90 "answer": true, 91 "justification": "The evaluation uses accuracy (for GSM8K, MATH, MMLU) and Pass@1 (for HumanEval, MBPP), plus latency and extra cost as efficiency metrics. Multiple dimensions are assessed simultaneously." 92 }, 93 "human_evaluation": { 94 "applies": false, 95 "answer": false, 96 "justification": "Human evaluation of model outputs is not relevant here; the paper evaluates on automated benchmarks with objective metrics (accuracy, Pass@1). Human annotators are used to build the taxonomy, not to evaluate AgentAsk's outputs." 97 }, 98 "held_out_test_set": { 99 "applies": true, 100 "answer": false, 101 "justification": "The paper does not clarify whether the benchmark splits used for evaluation are held-out test sets or whether any tuning decisions were made on the same splits. The training procedure uses multi-agent execution logs, but the relationship between training data and evaluation splits is not made explicit." 
102 }, 103 "per_category_breakdown": { 104 "applies": true, 105 "answer": true, 106 "justification": "Results are broken down per benchmark (GSM8K, MATH, HumanEval, MMLU, MBPP) and per framework (GPTSwarm, AFlow, MaAS, MasRouter). Section 5.5 additionally breaks down performance by error type (Data Gap, Signal Corruption, Referential Drift, Capability Gap)." 107 }, 108 "failure_cases_discussed": { 109 "applies": true, 110 "answer": true, 111 "justification": "Section 5.5 discusses error types that AgentAsk handles less effectively: 'For tasks involving more Referential Drift and Capability Gap...the improvements are more limited.' The Limitations section also discusses AgentAsk's inability to handle errors arising from internal model limitations like hallucinations." 112 }, 113 "negative_results_reported": { 114 "applies": true, 115 "answer": true, 116 "justification": "Table 4 shows that AgentAsk with Qwen-3-4B SFT on MaAS slightly decreases GSM8K accuracy (-0.20 pp) and Table 6 shows a -0.08 pp MMLU decrease, which are reported transparently. Resolved@Edge rates also reveal limitations for Referential Drift (58.3%) and Capability Gap (49.5%) error types." 117 } 118 }, 119 "claims_and_evidence": { 120 "abstract_claims_supported": { 121 "applies": true, 122 "answer": true, 123 "justification": "The abstract claims 'improving accuracy by up to 4.69%' which is confirmed in Table 1 (MaAS +GPT-5 achieves 84.71, +4.69 over 80.02). The claim of 'latency and extra costs below 10%' is supported by efficiency data in Tables 2 and 5-7." 124 }, 125 "causal_claims_justified": { 126 "applies": true, 127 "answer": true, 128 "justification": "The ablation studies in Section 5.4 provide controlled single-variable manipulation to support causal claims about each reward component's contribution. Removing individual components (rpar, reff, rfmt) consistently degrades performance, supporting causal interpretation. 
The plug-and-play framework design holds orchestration fixed, isolating the effect of the clarifier." 129 }, 130 "generalization_bounded": { 131 "applies": true, 132 "answer": false, 133 "justification": "The paper uses language like 'edge-level issues are a common source of failure' and 'improving the reliability of multi-agent systems' broadly, but evaluation is limited to five benchmarks (math reasoning, QA, code generation) with a specific executor (GPT-4o-mini-0718) and four specific frameworks. Claims about general applicability to arbitrary MAS settings are not adequately bounded." 134 }, 135 "alternative_explanations_discussed": { 136 "applies": true, 137 "answer": false, 138 "justification": "The paper attributes gains specifically to edge-level clarification but does not substantively discuss alternative explanations, such as whether gains might partly be due to simply calling the LLM an additional time (not necessarily for clarification), increased token budget, or the effect of adding any additional processing step. The +GPT-4o-mini baseline partially addresses this but is not framed as an alternative explanation." 139 } 140 }, 141 "setup_transparency": { 142 "model_versions_specified": { 143 "applies": true, 144 "answer": true, 145 "justification": "The paper specifies 'GPT-4o-mini-0718' (with version date) and 'Qwen-3-4B' and 'Llama-3.2-3B' as backbone models. GPT-5 is referenced by an OpenAI URL accessed 2025-08-07. These are sufficiently specific." 146 }, 147 "prompts_provided": { 148 "applies": true, 149 "answer": true, 150 "justification": "Appendix F (Figure 7) provides the full edge-level clarifier prompt with the complete instruction text including the four error types (DG, SC, RD, CG), when to ask, what to ask, who to ask, and the NONE condition. This is the actual prompt text sent to the model." 
151 }, 152 "hyperparameters_reported": { 153 "applies": true, 154 "answer": false, 155 "justification": "The paper mentions temperature=0.3 in the robustness experiments (Appendix C.1) but does not provide a complete hyperparameter table. Key RL training hyperparameters such as learning rate, number of training steps, lambda_ask, lambda_sw, alpha_eff, alpha_fmt, epsilon, and beta are referenced in equations but their actual values are not reported." 156 }, 157 "scaffolding_described": { 158 "applies": true, 159 "answer": true, 160 "justification": "AgentAsk's scaffolding is described in detail: Sections 4.1-4.3 describe the edge-local state, action space, policy structure, two-stage training (SFT then RL), and reward design. Algorithm 1 provides complete pseudocode for the training procedure. The interaction graph structure is also described." 161 }, 162 "data_preprocessing_documented": { 163 "applies": true, 164 "answer": false, 165 "justification": "The training corpus is described as being built 'from a set of logged multi-agent executions' with labels assigned by 'a teacher model,' but the paper does not document how execution logs were collected, filtered, or preprocessed. The number of training examples N is referenced but not stated. The criteria for including or excluding execution logs are not described." 166 } 167 }, 168 "limitations_and_scope": { 169 "limitations_section_present": { 170 "applies": true, 171 "answer": true, 172 "justification": "There is a dedicated 'Limitations' section after the Conclusion that discusses that AgentAsk cannot eliminate errors arising from internal model limitations like hallucinations and inconsistencies." 173 }, 174 "threats_to_validity_specific": { 175 "applies": true, 176 "answer": false, 177 "justification": "The limitations section is largely generic: it states AgentAsk 'cannot fully eliminate all errors' arising from 'well-known limitations in large models, such as hallucinations.' 
This does not constitute specific threats to validity for the experimental results, such as potential contamination of benchmarks, selection of favorable frameworks, or confounds in the efficiency measurements." 178 }, 179 "scope_boundaries_stated": { 180 "applies": true, 181 "answer": false, 182 "justification": "The paper does not explicitly state what the results do NOT show. There is no explicit discussion of settings where AgentAsk would not work, tasks or domains not covered, or limitations of the benchmark choice. The limitation paragraph discusses general model limitations rather than scope boundaries." 183 } 184 }, 185 "data_integrity": { 186 "raw_data_available": { 187 "applies": true, 188 "answer": false, 189 "justification": "The 824 annotated execution logs that form the basis of the taxonomy and the training corpus are not made available. Without access to these logs, the taxonomy distribution claims (Data Gap 29.1%, Referential Drift 27.3%, Signal Corruption 36.8%, Capability Gap 6.8%) cannot be independently verified." 190 }, 191 "data_collection_described": { 192 "applies": true, 193 "answer": false, 194 "justification": "The paper describes auditing '824 execution logs' but does not explain how these logs were collected: from which frameworks, which tasks, over what time period, or with what inclusion/exclusion criteria. Appendix A.2 states logs come 'from a set of logged multi-agent executions' without further detail." 195 }, 196 "recruitment_methods_described": { 197 "applies": false, 198 "answer": false, 199 "justification": "This paper involves annotators for the taxonomy but not participant recruitment in the human-subjects sense. The annotators are described as 'multiple professional annotators with expertise in MAS' but this is annotation quality control, not participant recruitment for a study." 
200 }, 201 "data_pipeline_documented": { 202 "applies": true, 203 "answer": false, 204 "justification": "The pipeline from execution logs to training data is not fully documented. We know logs are annotated by a teacher model into (type, agent, question) triples, but the filtering criteria, the size of the training dataset, and any preprocessing steps between raw logs and the SFT dataset are not described." 205 } 206 }, 207 "conflicts_of_interest": { 208 "funding_disclosed": { 209 "applies": true, 210 "answer": false, 211 "justification": "No acknowledgments section is present in the paper, and no funding sources are disclosed. The author affiliations (USTC, Shanghai AI Lab, Xi'an Jiaotong, NUS) are listed but no grants or funding agencies are mentioned." 212 }, 213 "affiliations_disclosed": { 214 "applies": true, 215 "answer": true, 216 "justification": "Author affiliations are clearly disclosed on the title page: University of Science and Technology of China, Shanghai AI Laboratory, Xi'an Jiaotong University, and National University of Singapore. No author is affiliated with OpenAI, whose models (GPT-4o-mini, GPT-5) are used, or with DeepSeek, whose GRPO method underlies E-GRPO." 217 }, 218 "funder_independent_of_outcome": { 219 "applies": true, 220 "answer": false, 221 "justification": "No funding is disclosed despite authors being affiliated with major research institutions (USTC, Shanghai AI Lab, NUS) where unfunded research is implausible. Since funding is not disclosed, funder independence cannot be assessed. The schema says 'NA if unfunded' but the absence of a funding disclosure from well-resourced institutions is not the same as confirmed unfunded status." 222 }, 223 "financial_interests_declared": { 224 "applies": true, 225 "answer": false, 226 "justification": "There is no competing interests statement anywhere in the paper. Absence of disclosure is not the same as absence of conflict." 
227 } 228 }, 229 "contamination": { 230 "training_cutoff_stated": { 231 "applies": true, 232 "answer": false, 233 "justification": "The paper evaluates GPT-4o-mini-0718 and GPT-5 on benchmarks including HumanEval and MMLU but does not state the training cutoff dates for these models. GPT-5 is referenced with an OpenAI URL accessed 2025-08-07 but no training cutoff is provided." 234 }, 235 "train_test_overlap_discussed": { 236 "applies": true, 237 "answer": false, 238 "justification": "The paper uses well-known public benchmarks (HumanEval published 2021, MMLU published 2021, GSM8K 2021) with models that were almost certainly trained on data that includes these benchmarks. No discussion of potential train/test overlap is provided." 239 }, 240 "benchmark_contamination_addressed": { 241 "applies": true, 242 "answer": false, 243 "justification": "HumanEval and MMLU were published in 2021, well before any of the evaluated models were trained. The paper does not discuss whether these benchmarks may be in the models' training data, which is a significant concern for evaluating true reasoning ability." 244 } 245 }, 246 "human_studies": { 247 "pre_registered": { 248 "applies": false, 249 "answer": false, 250 "justification": "While annotators are used for the taxonomy classification, this is not a human subjects study. The annotators classify agent execution logs, not human behavior." 251 }, 252 "irb_or_ethics_approval": { 253 "applies": false, 254 "answer": false, 255 "justification": "The paper has no human participants in a research sense. Professional annotators classify execution logs but this is not a human subjects study requiring IRB approval." 256 }, 257 "demographics_reported": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants in the research sense. The 'professional annotators with expertise in MAS' are not research participants whose demographics need to be reported." 
261 }, 262 "inclusion_exclusion_criteria": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants whose inclusion/exclusion criteria would need to be specified." 266 }, 267 "randomization_described": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human subjects experiment with conditions requiring randomization." 271 }, 272 "blinding_described": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human subjects experiment requiring blinding." 276 }, 277 "attrition_reported": { 278 "applies": false, 279 "answer": false, 280 "justification": "No human participants who could drop out of a study." 281 } 282 }, 283 "cost_and_practicality": { 284 "inference_cost_reported": { 285 "applies": true, 286 "answer": true, 287 "justification": "Table 2 and subsequent tables report 'Extra Cost' as a relative normalized metric (with origin=0 as baseline). The paper reports that AgentAsk achieves extra costs of 4.9-7.7 on the normalized scale while +GPT-5 costs 24-44 units. However, these are relative rather than absolute API costs." 288 }, 289 "compute_budget_stated": { 290 "applies": true, 291 "answer": false, 292 "justification": "No GPU hours, hardware specifications, total API spend, or training compute are reported. The paper describes that AgentAsk is trained with SFT followed by RL (E-GRPO) but provides no quantification of the compute required for this training." 293 } 294 } 295 }, 296 "claims": [ 297 { 298 "claim": "AgentAsk consistently improves accuracy by up to 4.69% across five benchmarks and four multi-agent frameworks while keeping latency and extra costs below 10% compared to baseline MAS.", 299 "evidence": "Table 1 shows framework-level average improvements of +3.29 to +3.45 percentage points for +AgentAsk across four frameworks. The maximum individual gain is +4.69 (MaAS+GPT-5 on average). Table 2 shows extra cost of 4.9 for AgentAsk vs. 
34.0 for +GPT-5.", 300 "supported": "strong" 301 }, 302 { 303 "claim": "An audit of 824 execution logs reveals four dominant error types: Data Gap (29.1%), Signal Corruption (36.8%), Referential Drift (27.3%), and Capability Gap (6.8%).", 304 "evidence": "Section 3.5 states these percentages from the annotated corpus (N=824). Fleiss' Kappa of 0.84 is reported for inter-rater reliability. Figure 2 shows the empirical distribution.", 305 "supported": "moderate" 306 }, 307 { 308 "claim": "AgentAsk achieves near upper-bound (+GPT-5) accuracy at substantially lower cost, with only a 0.24 point accuracy gap despite 21-point reduction in latency and 29.1-point reduction in extra cost.", 309 "evidence": "Table 2 (MasRouter@GSM8K): +GPT-5 achieves 95.10 accuracy at latency 129 and extra cost 34.0; +AgentAsk achieves 94.86 at latency 108 and extra cost 4.9.", 310 "supported": "strong" 311 }, 312 { 313 "claim": "Each reward component (rpar, reff, rfmt) is necessary for optimal performance in AgentAsk.", 314 "evidence": "Table 3 shows that removing each component individually leads to accuracy decreases and latency/cost increases. E.g., removing reff drops accuracy from 83.55 to 80.05 (-3.50 pp) on MBPP.", 315 "supported": "strong" 316 }, 317 { 318 "claim": "AgentAsk is architecture-agnostic and can be transferred across frameworks: trained on one framework and then integrated into others.", 319 "evidence": "Section 5.1 states 'we trained AgentAsk on one framework and then gradually integrated it into other frameworks.' Tables 1 and 4 show consistent performance across all four evaluated frameworks.", 320 "supported": "moderate" 321 } 322 ], 323 "methodology_tags": [ 324 "benchmark-eval", 325 "case-study" 326 ], 327 "key_findings": "AgentAsk proposes a lightweight edge-level clarification module for multi-agent LLM systems that intervenes at message handoffs to prevent error propagation. 
Based on an annotation of 824 execution logs, the authors identify four error types (Data Gap, Signal Corruption, Referential Drift, Capability Gap) and train AgentAsk with SFT followed by a novel E-GRPO reinforcement learning objective to balance clarification quality against cost. Evaluated across five benchmarks and four MAS frameworks, AgentAsk improves average accuracy by 3.29-3.45 percentage points while keeping extra cost below 10% of baseline, recovering most of the gain achievable with the much more expensive GPT-5 clarifier.", 328 "red_flags": [ 329 { 330 "flag": "No training hyperparameters reported", 331 "detail": "The paper uses SFT followed by E-GRPO but does not report key hyperparameter values (learning rate, lambda_ask, alpha_eff, alpha_fmt, epsilon, beta, number of training steps/epochs). Without these, the method cannot be reproduced." 332 }, 333 { 334 "flag": "Training data not released or fully described", 335 "detail": "The 824-execution-log corpus used for taxonomy annotation and the SFT training dataset are neither released nor fully described. The number of training examples N is referenced but never stated. Without this data, training AgentAsk from scratch is not possible." 336 }, 337 { 338 "flag": "Benchmark contamination unaddressed", 339 "detail": "The paper evaluates on HumanEval (published 2021), MMLU (2021), and GSM8K (2021) using GPT-4o-mini and GPT-5, which were almost certainly trained on data containing these benchmarks. The paper does not discuss this contamination risk at all." 340 }, 341 { 342 "flag": "Relative cost units lack absolute grounding", 343 "detail": "Latency and extra cost are reported as relative normalized values (latency indexed to origin=100; extra cost measured from origin=0), making it impossible to determine actual API costs or wall-clock times. This obscures whether AgentAsk is practically affordable at scale." 
344 }, 345 { 346 "flag": "Taxonomy annotation corpus opacity", 347 "detail": "The 824 execution logs are described as coming from unspecified frameworks and tasks. The collection procedure, selection criteria, and representativeness of this corpus are not described, making it impossible to assess whether the reported error type distributions generalize." 348 }, 349 { 350 "flag": "No funding disclosure", 351 "detail": "The paper has no acknowledgments section and discloses no funding sources, making it impossible to assess potential conflicts of interest." 352 } 353 ], 354 "cited_papers": [ 355 { 356 "title": "Why do multi-agent LLM systems fail?", 357 "authors": [ 358 "Mert Cemri", 359 "Melissa Z Pan", 360 "Shuyi Yang", 361 "Lakshya A Agrawal", 362 "Kurt Keutzer", 363 "Matei Zaharia", 364 "Joseph E. Gonzalez", 365 "Ion Stoica" 366 ], 367 "year": 2025, 368 "relevance": "Large-scale audit of multi-agent LLM system failures, directly motivating the problem studied in AgentAsk and providing empirical evidence that MAS often fail to outperform strong single-agent baselines." 369 }, 370 { 371 "title": "GPTSwarm: Language agents as optimizable graphs", 372 "authors": [ 373 "Mingchen Zhuge", 374 "Wenyi Wang", 375 "Louis Kirsch", 376 "Francesco Faccio", 377 "Dmitrii Khizbullin", 378 "Jürgen Schmidhuber" 379 ], 380 "year": 2024, 381 "relevance": "One of four multi-agent frameworks used as baselines in AgentAsk evaluation, treating agents and communication links as an optimizable graph." 382 }, 383 { 384 "title": "AFlow: Automating agentic workflow generation", 385 "authors": [ 386 "Jiayi Zhang", 387 "Jinyu Xiang", 388 "Zhaoyang Yu", 389 "Fengwei Teng", 390 "Jiaqi Chen", 391 "Mingchen Zhuge", 392 "Bang Liu", 393 "Chenglin Wu" 394 ], 395 "year": 2025, 396 "arxiv_id": "2501.08944", 397 "relevance": "One of four multi-agent frameworks used as baselines, searching over code-represented workflows via Monte Carlo tree search." 
398 }, 399 { 400 "title": "Multi-agent architecture search via agentic supernet", 401 "authors": [ 402 "Guibin Zhang", 403 "Luyang Niu", 404 "Junfeng Fang", 405 "Kun Wang", 406 "Lei Bai", 407 "Xiang Wang" 408 ], 409 "year": 2025, 410 "relevance": "One of four multi-agent frameworks used as baselines (MaAS), sampling query-dependent sub-architectures from an agentic supernet." 411 }, 412 { 413 "title": "MasRouter: Learning to route LLMs for multi-agent systems", 414 "authors": [ 415 "Yanwei Yue", 416 "Guibin Zhang", 417 "Boyang Liu", 418 "Guancheng Wan", 419 "Kun Wang", 420 "Dawei Cheng", 421 "Yiyan Qi" 422 ], 423 "year": 2025, 424 "relevance": "One of four multi-agent frameworks used as baselines in evaluation, learning a cascaded controller for collaboration mode, role allocation, and LLM selection." 425 }, 426 { 427 "title": "MetaGPT: Meta programming for a multi-agent collaborative framework", 428 "authors": [ 429 "Sirui Hong", 430 "Mingchen Zhuge", 431 "Jonathan Chen", 432 "Xiawu Zheng", 433 "Yuheng Cheng", 434 "Jinlin Wang", 435 "Chenglin Wu", 436 "Jürgen Schmidhuber" 437 ], 438 "year": 2024, 439 "relevance": "Early influential MAS framework organizing multiple LLMs into role-specialized teams with structured workflows, representing the genre of systems that AgentAsk targets." 440 }, 441 { 442 "title": "ChatDev: Communicative agents for software development", 443 "authors": [ 444 "Chen Qian", 445 "Wei Liu", 446 "Hongzhang Liu", 447 "Nuo Chen", 448 "Zhiyuan Liu", 449 "Maosong Sun" 450 ], 451 "year": 2024, 452 "relevance": "Multi-agent framework for software development that demonstrates collaborative LLM-based programming, in scope as an example of MAS evaluated in related work." 
453 }, 454 { 455 "title": "Reflexion: language agents with verbal reinforcement learning", 456 "authors": [ 457 "Noah Shinn", 458 "Federico Cassano", 459 "Ashwin Gopinath", 460 "Karthik Narasimhan", 461 "Shunyu Yao" 462 ], 463 "year": 2023, 464 "relevance": "Self-feedback loop approach for improving agent reliability that AgentAsk is compared against conceptually as an alternative method for error correction." 465 }, 466 { 467 "title": "AgentBench: Evaluating LLMs as agents", 468 "authors": [ 469 "Xiao Liu", 470 "Hao Yu", 471 "Hanchen Zhang", 472 "Yifan Xu", 473 "Xuanyu Lei", 474 "Yu Gu", 475 "Tianjun Zhang", 476 "Yu Su", 477 "Huan Sun" 478 ], 479 "year": 2024, 480 "relevance": "Benchmark for evaluating LLMs as agents, representing the broader benchmark evaluation methodology relevant to this survey." 481 }, 482 { 483 "title": "Talk isn't always cheap: Understanding failure modes in multi-agent debate", 484 "authors": [ 485 "Andrea Wynn", 486 "Harsh Satija", 487 "Gillian Hadfield" 488 ], 489 "year": 2025, 490 "arxiv_id": "2509.05396", 491 "relevance": "Analysis of conditions where multi-agent debate can fail or degrade accuracy, directly relevant to understanding MAS reliability." 492 }, 493 { 494 "title": "Which agent causes task failures and when? On automated failure attribution of LLM multi-agent systems", 495 "authors": [ 496 "Shaokun Zhang", 497 "Ming Yin", 498 "Jieyu Zhang", 499 "Jiale Liu", 500 "Chi Wang", 501 "Qingyun Wu" 502 ], 503 "year": 2025, 504 "relevance": "Automated failure attribution methodology for LLM multi-agent systems, complementary to the error taxonomy developed in AgentAsk." 
505 }, 506 { 507 "title": "MultiAgentBench: Evaluating the collaboration and competition of LLM agents", 508 "authors": [ 509 "Kunlun Zhu", 510 "Hongyi Du", 511 "Zhaochen Hong", 512 "Xiaocheng Yang", 513 "Shuyi Guo", 514 "Heng Ji", 515 "Jiaxuan You" 516 ], 517 "year": 2025, 518 "arxiv_id": "2503.01935", 519 "relevance": "Benchmark for evaluating LLM agent collaboration and competition, relevant to the broader multi-agent evaluation literature this paper contributes to." 520 }, 521 { 522 "title": "DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning", 523 "authors": [ 524 "DeepSeek-AI", 525 "Daya Guo", 526 "Dejian Yang" 527 ], 528 "year": 2025, 529 "arxiv_id": "2501.12948", 530 "relevance": "The GRPO optimization method from DeepSeek-R1 is the foundation for AgentAsk's E-GRPO training algorithm." 531 } 532 ] 533 }