scan.json (28206B)
1 { 2 "paper": { 3 "title": "MARSHAL: Incentivizing Multi-Agent Reasoning via Self-Play with Strategic LLMs", 4 "authors": ["Huining Yuan", "Zelai Xu", "Zheyue Tan", "Xiangmin Yi", "Mo Guang", "Kaiwen Long", "Haojia Hui", "Boxun Li", "Xinlei Chen", "Bo Zhao", "Xiao-Ping Zhang", "Chao Yu", "Yu Wang"], 5 "year": 2025, 6 "venue": "ICLR 2026", 7 "arxiv_id": "2510.15414" 8 }, 9 "scan_version": 2, 10 "active_modules": ["experimental_rigor", "data_leakage"], 11 "checklist": { 12 "artifacts": { 13 "code_released": { 14 "applies": true, 15 "answer": true, 16 "justification": "The paper states 'All code, model checkpoints, and training scripts required to reproduce the findings of this paper are publicly available at https://github.com/thu-nics/MARSHAL' in the Reproducibility Statement." 17 }, 18 "data_released": { 19 "applies": true, 20 "answer": true, 21 "justification": "The games use OpenSpiel and VS-Bench (public frameworks), and evaluation uses standard public benchmarks (MATH500, GSM8K, AIME24, GPQA-Diamond, etc.). No proprietary datasets were created." 22 }, 23 "environment_specified": { 24 "applies": true, 25 "answer": false, 26 "justification": "The paper mentions using ROLL, vLLM, Megatron-LM, and OpenSpiel but does not provide a requirements.txt, Dockerfile, or detailed library version specifications. Only framework names are given, not specific versions." 27 }, 28 "reproduction_instructions": { 29 "applies": true, 30 "answer": true, 31 "justification": "The Reproducibility Statement says the repository 'includes all necessary configurations to replicate our key experiments.' Combined with detailed hyperparameters in Table 5 and Appendix B-D, this provides step-by-step reproducibility." 32 } 33 }, 34 "statistical_methodology": { 35 "confidence_intervals_or_error_bars": { 36 "applies": true, 37 "answer": false, 38 "justification": "All results in Tables 1, 3, 4, and Figures 3-6 are reported as point estimates without confidence intervals or error bars." 
39 }, 40 "significance_tests": { 41 "applies": true, 42 "answer": false, 43 "justification": "The paper claims improvements (e.g., 'up to 10.0% on AIME') based solely on comparing numbers without any statistical significance tests." 44 }, 45 "effect_sizes_reported": { 46 "applies": true, 47 "answer": true, 48 "justification": "The paper consistently reports improvements with baseline context, e.g., 'up to 28.7% performance improvements in held-out games', '10.0% on AIME' (from 56.67% to 66.67%), and percentage point differences throughout Tables 1, 6-7." 49 }, 50 "sample_size_justified": { 51 "applies": true, 52 "answer": false, 53 "justification": "For game evaluation, the paper states '1000 games' but provides no justification for why 1000 is sufficient. For reasoning benchmarks, sample sizes depend on the benchmark (AIME has only 30 problems, AMC has 40) with no discussion of statistical power." 54 }, 55 "variance_reported": { 56 "applies": true, 57 "answer": false, 58 "justification": "No standard deviations, variance, or spread measures are reported across experimental runs. All results appear to be single-run numbers." 59 } 60 }, 61 "evaluation_design": { 62 "baselines_included": { 63 "applies": true, 64 "answer": true, 65 "justification": "The paper compares against Qwen3-4B base model and SPIRAL (a recent self-play method) across all evaluations (Section 4.1)." 66 }, 67 "baselines_contemporary": { 68 "applies": true, 69 "answer": true, 70 "justification": "SPIRAL (Liu et al., 2025) is a concurrent/recent work on self-play for LLMs. Qwen3-4B is a current model. Both are appropriate contemporary baselines." 71 }, 72 "ablation_study": { 73 "applies": true, 74 "answer": true, 75 "justification": "Section 4.5 ablates both key components: turn-level advantage estimator and agent-specific advantage normalization. Additional ablations compare self-play vs. fixed-opponent (Table 3) and decoupling algorithm vs. game environments (Appendix F)." 
76 }, 77 "multiple_metrics": { 78 "applies": true, 79 "answer": true, 80 "justification": "The paper evaluates across multiple dimensions: game return (normalized), multiple reasoning benchmarks (MATH500, GSM8K, AQUA-RAT, AIME24, AMC23, MMLU-STEM, GPQA-Diamond), and failure mode analysis." 81 }, 82 "human_evaluation": { 83 "applies": true, 84 "answer": false, 85 "justification": "All evaluation is automated — game returns against MCTS/NE opponents and automated benchmark scoring. No human evaluation of reasoning quality or game play is included." 86 }, 87 "held_out_test_set": { 88 "applies": true, 89 "answer": true, 90 "justification": "The paper explicitly partitions games into training (Tic-Tac-Toe, Kuhn Poker, Mini Hanabi) and held-out testing (Connect Four, Leduc Hold'em, Simple Hanabi) sets. Reasoning benchmarks serve as zero-shot out-of-domain evaluation." 91 }, 92 "per_category_breakdown": { 93 "applies": true, 94 "answer": true, 95 "justification": "Results are broken down by individual game (Figure 3), by individual benchmark (Table 1), by first-move/second-move player roles, and by failure category (Figure 5)." 96 }, 97 "failure_cases_discussed": { 98 "applies": true, 99 "answer": true, 100 "justification": "Section 4.4 provides a detailed failure mode analysis on GPQA-Diamond within MAD, categorizing failures into System Design Issues, Inter-Agent Misalignment, and Task Verification (Figure 5), with sub-category breakdown." 101 }, 102 "negative_results_reported": { 103 "applies": true, 104 "answer": true, 105 "justification": "The fixed-opponent ablation (Table 3) shows severe performance degradation (strategic mode collapse). The ablation removing components shows performance drops. The Hanabi specialist underperforms in MAD (competitive setting), showing skill mismatch." 
106 } 107 }, 108 "claims_and_evidence": { 109 "abstract_claims_supported": { 110 "applies": true, 111 "answer": true, 112 "justification": "The abstract's claim of 'up to 28.7% performance improvements in held-out games' matches the Leduc Hold'em generalist results. The claims of '10.0% on AIME, 7.6% on GPQA-Diamond, and 3.5% on average' are supported by Table 1 data." 113 }, 114 "causal_claims_justified": { 115 "applies": true, 116 "answer": true, 117 "justification": "The paper makes causal claims ('MARSHAL incentivizes multi-agent reasoning') and supports them with controlled ablation studies (Section 4.5) that isolate individual components. The self-play vs. fixed-opponent comparison and algorithm-vs-games decoupling (Appendix F) provide reasonable causal evidence." 118 }, 119 "generalization_bounded": { 120 "applies": true, 121 "answer": false, 122 "justification": "The title claims 'Multi-Agent Reasoning' generally but experiments are limited to Qwen3-4B and Qwen3-8B, two-player games only, and specific MAS frameworks (MAD, AutoGen). The Discussion acknowledges the two-player limitation but the abstract and title do not bound the claims to these settings." 123 }, 124 "alternative_explanations_discussed": { 125 "applies": true, 126 "answer": false, 127 "justification": "The paper does not discuss alternative explanations for the generalization results. For example, the reasoning improvements could stem from additional training compute or exposure to structured turn-based reasoning rather than specifically from game-theoretic self-play. No robustness checks against these confounds are reported." 128 }, 129 "proxy_outcome_distinction": { 130 "applies": true, 131 "answer": false, 132 "justification": "The paper claims game self-play develops 'multi-agent reasoning capabilities' but measures this via benchmark accuracy (MATH, AIME, GPQA). The gap between benchmark scores and 'reasoning capability' is not acknowledged. 
Benchmark accuracy is a proxy for reasoning ability but the paper treats them as equivalent." 133 } 134 }, 135 "setup_transparency": { 136 "model_versions_specified": { 137 "applies": true, 138 "answer": false, 139 "justification": "The paper uses 'Qwen3-4B' and 'Qwen3-8B' without specifying exact model versions, snapshot dates, or checkpoint identifiers. The reference is to Yang et al. (2025) but no specific version string is given." 140 }, 141 "prompts_provided": { 142 "applies": true, 143 "answer": true, 144 "justification": "Full game prompts are provided in Appendix I (Listings 1-5) for all five games, including system prompts and user prompts with complete text." 145 }, 146 "hyperparameters_reported": { 147 "applies": true, 148 "answer": true, 149 "justification": "Table 5 provides comprehensive hyperparameters including learning rate (1e-6), batch size (128), PPO clip (0.2), KL coefficient (0.2), sampling temperature (0.5/0.6), top-P (0.99), top-K (100), and optimizer settings." 150 }, 151 "scaffolding_described": { 152 "applies": true, 153 "answer": true, 154 "justification": "The self-play framework is described in detail: Figure 2 shows the architecture, Section 3 describes the GRPO-based training loop, game environments serve as the interaction scaffold, and the player trajectory generation is fully documented." 155 }, 156 "data_preprocessing_documented": { 157 "applies": true, 158 "answer": true, 159 "justification": "The game environments are standardized via OpenSpiel/VS-Bench. The reward normalization across games is documented (Section 3.4). Evaluation protocols including opponent types and evaluation scripts are specified in Appendix D." 
160 } 161 }, 162 "limitations_and_scope": { 163 "limitations_section_present": { 164 "applies": true, 165 "answer": true, 166 "justification": "Section 6 (Discussion) includes substantive discussion of limitations: the study uses only two-player games, and scaling to N-player environments introduces 'non-stationarity, population diversity, and credit assignment' challenges." 167 }, 168 "threats_to_validity_specific": { 169 "applies": true, 170 "answer": true, 171 "justification": "The Discussion identifies specific threats: two-player games may not capture N-player dynamics, classic games differ from complex social sandboxes, and the challenges of 'non-stationarity, population diversity, and credit assignment' in scaling." 172 }, 173 "scope_boundaries_stated": { 174 "applies": true, 175 "answer": false, 176 "justification": "While the Discussion mentions two-player limitation, the paper does not explicitly state what the results do NOT show. No explicit statement about model-specificity (only tested on Qwen3), framework-specificity (only MAD and AutoGen), or language-specificity of the results." 177 } 178 }, 179 "data_integrity": { 180 "raw_data_available": { 181 "applies": true, 182 "answer": false, 183 "justification": "No raw experimental data (game trajectories, per-example benchmark results) is made available. Only aggregated results in tables and figures are provided." 184 }, 185 "data_collection_described": { 186 "applies": true, 187 "answer": true, 188 "justification": "Data collection is well-described: self-play trajectories generated via the MARSHAL framework, 1000 games for strategic evaluation, standard benchmark datasets with evaluation scripts specified (Appendix D)." 189 }, 190 "recruitment_methods_described": { 191 "applies": false, 192 "answer": false, 193 "justification": "No human participants. All data comes from model self-play and standard benchmark evaluation." 
194 }, 195 "data_pipeline_documented": { 196 "applies": true, 197 "answer": true, 198 "justification": "The pipeline is documented: self-play generates trajectories → turn-level advantage estimation → GRPO training. Evaluation pipeline: model generates responses → evaluated against MCTS/NE opponents or benchmark scoring scripts." 199 } 200 }, 201 "conflicts_of_interest": { 202 "funding_disclosed": { 203 "applies": true, 204 "answer": true, 205 "justification": "Acknowledgments section lists funding: National Natural Science Foundation of China, Ant Group, BNRist, Beijing Innovation Center for Future Chips, Shenzhen Key Laboratory, Tsinghua Shenzhen International Graduate School." 206 }, 207 "affiliations_disclosed": { 208 "applies": true, 209 "answer": true, 210 "justification": "Author affiliations are clearly listed: Tsinghua University (multiple departments), Aalto University, Li Auto Inc., and Infinigence AI." 211 }, 212 "funder_independent_of_outcome": { 213 "applies": true, 214 "answer": false, 215 "justification": "Ant Group (a major tech company) is listed as a sponsor. Li Auto Inc. and Infinigence AI are corporate affiliations of some authors. These entities have potential commercial interest in LLM agent capabilities but this conflict is not discussed." 216 }, 217 "financial_interests_declared": { 218 "applies": true, 219 "answer": false, 220 "justification": "No competing interests statement is present. Authors from Li Auto Inc. and Infinigence AI may have financial interests related to LLM agent capabilities, but no declaration is made." 221 } 222 }, 223 "contamination": { 224 "training_cutoff_stated": { 225 "applies": true, 226 "answer": false, 227 "justification": "The paper does not state the training data cutoff for Qwen3-4B or Qwen3-8B. These models are evaluated on benchmarks like MATH500, GSM8K, and AIME24 which may be in training data." 
228 }, 229 "train_test_overlap_discussed": { 230 "applies": true, 231 "answer": false, 232 "justification": "No discussion of whether benchmark problems (MATH500, GSM8K, AIME24, etc.) appeared in Qwen3's training data. These are well-known public benchmarks likely included in training corpora." 233 }, 234 "benchmark_contamination_addressed": { 235 "applies": true, 236 "answer": false, 237 "justification": "GSM8K (2021), MATH (2021), MMLU (2020), and AIME24 are all publicly available benchmarks that predate Qwen3's training. No contamination analysis is performed." 238 } 239 }, 240 "human_studies": { 241 "pre_registered": { 242 "applies": false, 243 "answer": false, 244 "justification": "No human participants in this study." 245 }, 246 "irb_or_ethics_approval": { 247 "applies": false, 248 "answer": false, 249 "justification": "No human participants in this study." 250 }, 251 "demographics_reported": { 252 "applies": false, 253 "answer": false, 254 "justification": "No human participants in this study." 255 }, 256 "inclusion_exclusion_criteria": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human participants in this study." 260 }, 261 "randomization_described": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants in this study." 265 }, 266 "blinding_described": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants in this study." 270 }, 271 "attrition_reported": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants in this study." 275 } 276 }, 277 "cost_and_practicality": { 278 "inference_cost_reported": { 279 "applies": true, 280 "answer": false, 281 "justification": "No inference cost, latency, or tokens consumed is reported for any evaluation setting." 
282 }, 283 "compute_budget_stated": { 284 "applies": true, 285 "answer": true, 286 "justification": "Appendix B states 'Models were trained on a single server with 8 NVIDIA H100 GPUs' and training was for 200 optimization steps. Total GPU hours and wall-clock time are not explicitly stated; only the hardware configuration and the number of optimization steps are provided." 287 } 288 }, 289 "experimental_rigor": { 290 "seed_sensitivity_reported": { 291 "applies": true, 292 "answer": false, 293 "justification": "No results across multiple random seeds are reported. All results appear to be from single training runs." 294 }, 295 "number_of_runs_stated": { 296 "applies": true, 297 "answer": false, 298 "justification": "The number of training runs is not stated. For game evaluation, 1000 games are stated but it is unclear whether this is a single evaluation pass or an average over multiple passes." 299 }, 300 "hyperparameter_search_budget": { 301 "applies": true, 302 "answer": false, 303 "justification": "No hyperparameter search budget is reported. Table 5 presents a single configuration with no discussion of how it was selected or how many configurations were tried." 304 }, 305 "best_config_selection_justified": { 306 "applies": true, 307 "answer": false, 308 "justification": "The paper presents one hyperparameter configuration (Table 5) without justifying how it was selected. Only the length penalty α is ablated (Appendix H) — other hyperparameters appear chosen without documented justification." 309 }, 310 "multiple_comparison_correction": { 311 "applies": true, 312 "answer": false, 313 "justification": "No statistical tests are performed at all, so no multiple-comparison correction is applied, even though the paper makes many simultaneous comparisons across 7 benchmarks, 6 games, and multiple model variants." 
314 }, 315 "self_comparison_bias_addressed": { 316 "applies": true, 317 "answer": false, 318 "justification": "The authors compare their MARSHAL system against their own re-implementation of SPIRAL and MT-GRPO without acknowledging potential author-evaluation bias." 319 }, 320 "compute_budget_vs_performance": { 321 "applies": true, 322 "answer": false, 323 "justification": "MARSHAL requires additional RL training beyond the base model, but performance is not analyzed as a function of compute. SPIRAL presumably uses similar compute but this is not verified or discussed." 324 }, 325 "benchmark_construct_validity": { 326 "applies": true, 327 "answer": false, 328 "justification": "The paper uses AIME, GPQA-Diamond, MATH500 etc. as proxies for 'multi-agent reasoning capability' without discussing whether these benchmarks actually measure what is claimed. Benchmark accuracy in a multi-agent framework may reflect many factors beyond multi-agent reasoning." 329 }, 330 "scaffold_confound_addressed": { 331 "applies": true, 332 "answer": true, 333 "justification": "The paper uses the same MAS frameworks (MAD, AutoGen via MASLab) across all model comparisons, controlling for the scaffold confound. The comparison is models within the same framework, not across different frameworks." 334 } 335 }, 336 "data_leakage": { 337 "temporal_leakage_addressed": { 338 "applies": true, 339 "answer": false, 340 "justification": "No discussion of whether Qwen3's training data includes solutions to MATH500, GSM8K, AIME24, or other benchmarks used for evaluation." 341 }, 342 "feature_leakage_addressed": { 343 "applies": true, 344 "answer": false, 345 "justification": "No discussion of whether the game observation format or benchmark evaluation setup leaks information not available in realistic settings." 346 }, 347 "non_independence_addressed": { 348 "applies": true, 349 "answer": false, 350 "justification": "No discussion of independence between training and test data. 
The same model architecture and training procedure are used across all evaluations." 351 }, 352 "leakage_detection_method": { 353 "applies": true, 354 "answer": false, 355 "justification": "No leakage detection or prevention method is applied for any of the evaluation benchmarks." 356 } 357 } 358 }, 359 "claims": [ 360 { 361 "claim": "MARSHAL agents trained from Qwen3-4B achieve up to 28.7% performance improvements in held-out games.", 362 "evidence": "Figure 3 and Tables 14-15 show the generalist model achieves 28.7% improvement on Leduc Hold'em (first move) compared to the Qwen3-4B baseline.", 363 "supported": "strong" 364 }, 365 { 366 "claim": "MARSHAL achieves zero-shot performance gains of up to 10.0% on AIME, 7.6% on GPQA-Diamond, and 3.5% on average when integrated into MAS frameworks.", 367 "evidence": "Table 1: AutoGen generalist achieves 66.67% on AIME vs 56.67% baseline (+10.0%), MAD generalist achieves 45.45% on GPQA vs 37.88% baseline (+7.57%), and 3.51% average gain in MAD.", 368 "supported": "moderate" 369 }, 370 { 371 "claim": "Self-play is essential for generalization; training against fixed opponents leads to overfitting and strategic mode collapse.", 372 "evidence": "Table 3 shows the Kuhn Poker fixed-opponent model collapses to 0.0% on non-poker games, while the self-play model generalizes across all games.", 373 "supported": "strong" 374 }, 375 { 376 "claim": "Both turn-level advantage estimator and agent-specific normalization are critical components.", 377 "evidence": "Table 4 shows removing either component degrades performance. Turn-level removal causes largest drop in long-horizon games; agent-specific removal causes largest drop in competitive games with asymmetric returns.", 378 "supported": "strong" 379 }, 380 { 381 "claim": "MARSHAL cultivates generalizable multi-agent reasoning skills including role understanding and intent recognition.", 382 "evidence": "Table 2 provides qualitative examples of reasoning traces. 
Figure 5 shows quantitative reduction in Inter-Agent Misalignment failures (11.5% reduction) on GPQA-Diamond.", 383 "supported": "moderate" 384 }, 385 { 386 "claim": "MARSHAL scales to larger models (Qwen3-8B) with consistent improvements.", 387 "evidence": "Tables 6-7 (Appendix E) show improvements on 8B model: 2.60% average gain in MAD, 3.90% average in AutoGen.", 388 "supported": "moderate" 389 } 390 ], 391 "methodology_tags": ["benchmark-eval"], 392 "key_findings": "MARSHAL, an end-to-end RL framework using self-play in cooperative and competitive games, significantly improves multi-agent reasoning in LLMs. Training Qwen3-4B via self-play yields up to 28.7% improvement in held-out games and consistent zero-shot gains (up to 10% on AIME) when integrated into multi-agent systems like MAD and AutoGen. The turn-level advantage estimator and agent-specific normalization are both critical for learning, and self-play is essential over fixed-opponent training to avoid strategic mode collapse. Skills acquired in games (role understanding, intent recognition) transfer to downstream multi-agent reasoning tasks.", 393 "red_flags": [ 394 { 395 "flag": "No variance or uncertainty quantification", 396 "detail": "All results are reported as single point estimates across all experiments (games, benchmarks, ablations). With small benchmark sizes (AIME has 30 problems, AMC has 40), a few-problem difference can appear as a large percentage gain. Without variance across seeds or runs, the statistical reliability of improvements is unknown." 397 }, 398 { 399 "flag": "Small benchmark sample sizes inflate apparent gains", 400 "detail": "AIME24 has only 30 problems. The claimed '10.0% improvement' on AIME corresponds to 3 additional correct answers (from 17 to 20 out of 30). AMC23 has ~40 problems. These tiny absolute differences may not be statistically significant." 
401 }, 402 { 403 "flag": "No contamination analysis for standard benchmarks", 404 "detail": "GSM8K, MATH, MMLU, and AIME problems are widely available online. Qwen3 likely saw them during pretraining. The MARSHAL training adds RL on games but the base model's contamination is never assessed. Improvements in MAS settings could partially reflect the RL training helping the model better retrieve memorized solutions." 405 }, 406 { 407 "flag": "Qualitative evidence presented as primary for generalization mechanism", 408 "detail": "The core claim that game skills generalize to reasoning is supported primarily by cherry-picked reasoning trace examples (Table 2) and a failure mode analysis on a single benchmark (GPQA in MAD). Alternative explanations (e.g., additional training compute, regularization effects of multi-task learning) are not considered." 409 } 410 ], 411 "cited_papers": [ 412 { 413 "title": "DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning", 414 "authors": ["Daya Guo"], 415 "year": 2025, 416 "arxiv_id": "2501.12948", 417 "relevance": "Key work on RL for LLM reasoning that MARSHAL builds upon, demonstrating reward design for reasoning models." 418 }, 419 { 420 "title": "AutoGen: Enabling next-gen LLM applications via multi-agent conversations", 421 "authors": ["Qingyun Wu"], 422 "year": 2024, 423 "relevance": "Cooperative multi-agent framework used as evaluation testbed for MARSHAL's generalization to MAS." 424 }, 425 { 426 "title": "Encouraging divergent thinking in large language models through multi-agent debate", 427 "authors": ["Tian Liang"], 428 "year": 2023, 429 "arxiv_id": "2305.19118", 430 "relevance": "MAD framework used as competitive evaluation testbed; foundational work on LLM multi-agent debate." 
431 }, 432 { 433 "title": "ChatDev: Communicative agents for software development", 434 "authors": ["Chen Qian"], 435 "year": 2023, 436 "arxiv_id": "2307.07924", 437 "relevance": "Multi-agent system for software development, representative of static role-based MAS workflows." 438 }, 439 { 440 "title": "MetaGPT: Meta programming for a multi-agent collaborative framework", 441 "authors": ["Sirui Hong"], 442 "year": 2024, 443 "relevance": "Multi-agent framework with specialized roles for software development tasks." 444 }, 445 { 446 "title": "SPIRAL: Self-play on zero-sum games incentivizes reasoning via multi-agent multi-turn reinforcement learning", 447 "authors": ["Bo Liu"], 448 "year": 2025, 449 "arxiv_id": "2506.24119", 450 "relevance": "Primary baseline; concurrent work on self-play for LLM reasoning in competitive games only." 451 }, 452 { 453 "title": "Self-playing adversarial language game enhances LLM reasoning", 454 "authors": ["Pengyu Cheng"], 455 "year": 2024, 456 "relevance": "SPAG: prior work showing self-play in adversarial language games can enhance LLM reasoning." 457 }, 458 { 459 "title": "CAMEL: Communicative agents for 'mind' exploration of large language model society", 460 "authors": ["Guohao Li"], 461 "year": 2023, 462 "relevance": "Cooperative multi-agent framework for general reasoning and question answering." 463 }, 464 { 465 "title": "SWE-RL: Advancing LLM reasoning via reinforcement learning on open software evolution", 466 "authors": ["Yuxiang Wei"], 467 "year": 2025, 468 "arxiv_id": "2502.18449", 469 "relevance": "Applies RL to software engineering tasks, extending reasoning RL to coding domain." 470 }, 471 { 472 "title": "Why do multi-agent LLM systems fail?", 473 "authors": ["Mert Cemri"], 474 "year": 2025, 475 "arxiv_id": "2503.13657", 476 "relevance": "Failure taxonomy for multi-agent LLM systems adopted for MARSHAL's failure mode analysis." 
477 }, 478 { 479 "title": "Reinforcing multi-turn reasoning in LLM agents via turn-level credit assignment", 480 "authors": ["Siliang Zeng"], 481 "year": 2025, 482 "arxiv_id": "2505.11821", 483 "relevance": "MT-GRPO: concurrent work on turn-level credit assignment that MARSHAL compares against." 484 }, 485 { 486 "title": "DeepSeekMath: Pushing the limits of mathematical reasoning in open language models", 487 "authors": ["Zhihong Shao"], 488 "year": 2024, 489 "arxiv_id": "2402.03300", 490 "relevance": "Introduces GRPO which forms the algorithmic foundation for MARSHAL's training objective." 491 } 492 ] 493 }