ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (29257B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "MARSHAL: Incentivizing Multi-Agent Reasoning via Self-Play with Strategic LLMs",
      6     "authors": [
      7       "Huining Yuan",
      8       "Zelai Xu",
      9       "Zheyue Tan",
     10       "Xiangmin Yi",
     11       "Mo Guang",
     12       "Kaiwen Long",
     13       "Haojia Hui",
     14       "Boxun Li",
     15       "Xinlei Chen",
     16       "Bo Zhao",
     17       "Xiao-Ping Zhang",
     18       "Chao Yu",
     19       "Yu Wang"
     20     ],
     21     "year": 2025,
     22     "venue": "ICLR 2026",
     23     "arxiv_id": "2510.15414",
     24     "doi": null
     25   },
     26   "checklist": {
     27     "claims_and_evidence": {
     28       "abstract_claims_supported": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "Abstract claims (28.7% improvement in held-out games, 10.0% AIME gain in AutoGen, 7.57% GPQA gain in MAD) are directly verified by Figure 3 and Table 1 — AIME goes from 56.67% to 66.67% in AutoGen for the generalist, and GPQA from 37.88% to 45.45% in MAD.",
     32         "source": "haiku"
     33       },
     34       "causal_claims_justified": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "Section 4.5 and Appendices F–G provide ablation studies that isolate the contribution of turn-level advantage estimation, agent-specific normalization, self-play vs. fixed opponent, and algorithm vs. game selection, supporting causal claims about these design choices.",
     38         "source": "haiku"
     39       },
     40       "generalization_bounded": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "The conclusion claims 'self-play as a scalable paradigm for training LLM agents' based on one model family (Qwen3-4B/8B) across six narrow two-player games; scaling to N-player and diverse game types is acknowledged as future work but not tested.",
     44         "source": "haiku"
     45       },
     46       "alternative_explanations_discussed": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "Appendix F explicitly decouples algorithm vs. game-environment contributions by running SPIRAL's RAE on MARSHAL's game set and MARSHAL's algorithm on competitive-only games, directly testing the alternative explanations for the observed gains.",
     50         "source": "haiku"
     51       },
     52       "proxy_outcome_distinction": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "The paper explicitly treats game performance as a proxy for multi-agent reasoning capability and separately validates transfer to downstream reasoning benchmarks as a distinct evaluation step in Section 4.3.",
     56         "source": "haiku"
     57       }
     58     },
     59     "limitations_and_scope": {
     60       "limitations_section_present": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Section 6 (Discussion) contains a substantive limitations paragraph identifying two-player game constraints and N-player scaling challenges; this goes beyond a single concluding sentence.",
     64         "source": "haiku"
     65       },
     66       "threats_to_validity_specific": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "Limitations mention two-player game constraints and N-player scaling generically but omit critical threats: benchmark contamination in Qwen3 training data, single-run variability in RL training, and single-architecture generalizability.",
     70         "source": "haiku"
     71       },
     72       "scope_boundaries_stated": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "The paper explicitly states results pertain to two-player games and identifies N-player environments and complex social sandboxes as out-of-scope, representing meaningful scope boundaries.",
     76         "source": "haiku"
     77       }
     78     },
     79     "conflicts_of_interest": {
     80       "funding_disclosed": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Funding is disclosed in the Acknowledgments: NSFC grants (62406159, 62325405), Ant Group, Beijing National Research Center for Information Science and Technology, and the Shenzhen Pengrui Foundation.",
     84         "source": "haiku"
     85       },
     86       "affiliations_disclosed": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Author affiliations are clearly listed: Tsinghua University (SIGS and EE departments), Aalto University, Li Auto Inc., and Infinigence AI.",
     90         "source": "haiku"
     91       },
     92       "funder_independent_of_outcome": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Ant Group is a funder and Li Auto Inc. is a co-author affiliation rather than a listed funder, but the paper evaluates general-purpose multi-agent reasoning rather than any Li Auto or Ant Group product; primary funding is national/academic.",
     96         "source": "haiku"
     97       },
     98       "financial_interests_declared": {
     99         "applies": true,
    100         "answer": false,
    101         "justification": "No competing interests statement is present despite co-authors employed at Li Auto Inc. and Infinigence AI; the Ethics Statement discusses only dual-use concerns, not financial interests or equity.",
    102         "source": "haiku"
    103       }
    104     },
    105     "scope_and_framing": {
    106       "key_terms_defined": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Key terms including 'multi-agent system,' 'credit assignment,' 'turn-level advantage estimator,' and 'self-play' are precisely defined with mathematical formulations in Sections 2–3.",
    110         "source": "haiku"
    111       },
    112       "intended_contribution_clear": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Three contributions are explicitly enumerated in bullet points at the end of Section 1: the MARSHAL framework, two novel algorithmic techniques, and empirical validation of generalizable multi-agent reasoning.",
    116         "source": "haiku"
    117       },
    118       "engagement_with_prior_work": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "Section 5 actively positions MARSHAL against SPIRAL, MT-GRPO, SPAG, and prior MAS works (AutoGen, MAD, MetaGPT), explaining mechanistic differences rather than merely listing citations.",
    122         "source": "haiku"
    123       }
    124     }
    125   },
    126   "type_checklist": {
    127     "empirical": {
    128       "artifacts": {
    129         "code_released": {
    130           "applies": true,
    131           "answer": true,
    132           "justification": "The Reproducibility Statement explicitly confirms code, model checkpoints, and training scripts are publicly available at https://github.com/thu-nics/MARSHAL with 'all necessary configurations.'",
    133           "source": "haiku"
    134         },
    135         "data_released": {
    136           "applies": true,
    137           "answer": true,
    138           "justification": "All evaluation benchmarks (MATH500, GSM8K, AQUA-RAT, AIME24, AMC23, MMLU-STEM, GPQA-Diamond) are standard public datasets; game environments use open-source OpenSpiel and VS-Bench.",
    139           "source": "haiku"
    140         },
    141         "environment_specified": {
    142           "applies": true,
    143           "answer": true,
    144           "justification": "The full software stack is specified in Appendix B: ROLL framework, vLLM for inference, Megatron-LM for distributed training, OpenSpiel and VS-Bench for games, on 8 NVIDIA H100 GPUs.",
    145           "source": "haiku"
    146         },
    147         "reproduction_instructions": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "The paper defers reproduction to the GitHub repository without step-by-step instructions in the paper itself; Appendix B describes implementation choices but not executable reproduction steps.",
    151           "source": "haiku"
    152         }
    153       },
    154       "statistical_methodology": {
    155         "confidence_intervals_or_error_bars": {
    156           "applies": true,
    157           "answer": false,
    158           "justification": "All results across 18 tables are reported as single point estimates with no confidence intervals or error bars, despite evaluating stochastic game outcomes over 1000 games and stochastic RL training.",
    159           "source": "haiku"
    160         },
    161         "significance_tests": {
    162           "applies": true,
    163           "answer": false,
    164           "justification": "No statistical significance tests are applied to any comparative results; performance differences between methods are reported without assessing whether they exceed random variation.",
    165           "source": "haiku"
    166         },
    167         "effect_sizes_reported": {
    168           "applies": true,
    169           "answer": true,
    170           "justification": "Absolute percentage-point improvements over baseline are consistently reported (e.g., +10.0pp on AIME, +7.57pp on GPQA, +28.7pp on Leduc Hold'em) with baseline values provided for context.",
    171           "source": "haiku"
    172         },
    173         "sample_size_justified": {
    174           "applies": true,
    175           "answer": false,
    176           "justification": "1000 games per evaluation is stated but not justified via power analysis; it is unclear whether this is sufficient to detect the effect sizes claimed given the high variance of game outcomes.",
    177           "source": "haiku"
    178         },
    179         "variance_reported": {
    180           "applies": true,
    181           "answer": false,
    182           "justification": "No standard deviations, variance, or run-to-run variability is reported for any experiment; results from single training runs are presented throughout, which is particularly problematic for high-variance RL.",
    183           "source": "haiku"
    184         }
    185       },
    186       "evaluation_design": {
    187         "baselines_included": {
    188           "applies": true,
    189           "answer": true,
    190           "justification": "The unmodified Qwen3-4B base model and SPIRAL (a competitive prior method) serve as baselines throughout all game and benchmark evaluations.",
    191           "source": "haiku"
    192         },
    193         "baselines_contemporary": {
    194           "applies": true,
    195           "answer": true,
    196           "justification": "SPIRAL (Liu et al., 2025) is described as a concurrent work directly addressing the same problem; MT-GRPO (Zeng et al., 2025) is a concurrent method also compared; Qwen3-4B is a state-of-the-art base model.",
    197           "source": "haiku"
    198         },
    199         "ablation_study": {
    200           "applies": true,
    201           "answer": true,
    202           "justification": "Section 4.5 ablates self-play vs. fixed opponent, turn-level vs. trajectory-level advantage, and agent-specific vs. global normalization; Appendix F decouples algorithm from game environment; Appendix H ablates length penalty weight.",
    203           "source": "haiku"
    204         },
    205         "multiple_metrics": {
    206           "applies": true,
    207           "answer": true,
    208           "justification": "Seven reasoning benchmarks (MATH500, GSM8K, AQUA-RAT, AIME24, AMC23, MMLU-STEM, GPQA-Diamond) and six game environments with multiple role conditions provide comprehensive evaluation coverage.",
    209           "source": "haiku"
    210         },
    211         "human_evaluation": {
    212           "applies": false,
    213           "answer": false,
    214           "justification": "Human evaluation is not relevant for this RL training framework evaluated on strategic games and standard reasoning benchmarks.",
    215           "source": "haiku"
    216         },
    217         "held_out_test_set": {
    218           "applies": true,
    219           "answer": true,
    220           "justification": "Three held-out games (Connect Four, Leduc Hold'em, Simple Hanabi) that were never seen during training serve as OOD generalization tests; standard reasoning benchmarks are also held-out test sets.",
    221           "source": "haiku"
    222         },
    223         "per_category_breakdown": {
    224           "applies": true,
    225           "answer": true,
    226           "justification": "Results are broken down per game, per benchmark, per specialist vs. generalist agent, per multi-agent framework (MAD vs. AutoGen), and per player role (first-move vs. second-move).",
    227           "source": "haiku"
    228         },
    229         "failure_cases_discussed": {
    230           "applies": true,
    231           "answer": true,
    232           "justification": "Section 4.4 presents a quantitative failure mode analysis using the taxonomy of Cemri et al. (2025) with three categories and six sub-categories, showing which failure types MARSHAL reduces most.",
    233           "source": "haiku"
    234         },
    235         "negative_results_reported": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "The fixed-opponent ablation shows severe generalization failure (Kuhn Poker specialist collapses completely on non-poker games); removing length penalty degrades cooperative game performance substantially.",
    239           "source": "haiku"
    240         }
    241       },
    242       "setup_transparency": {
    243         "model_versions_specified": {
    244           "applies": true,
    245           "answer": false,
    246           "justification": "'Qwen3-4B' and 'Qwen3-8B' are named without specific checkpoint hashes, revision IDs, or snapshot dates; the model name alone cannot pin down the exact version used.",
    247           "source": "haiku"
    248         },
    249         "prompts_provided": {
    250           "applies": true,
    251           "answer": true,
    252           "justification": "Appendix I provides complete system prompts and user prompts for five of the six game environments (Tic-Tac-Toe, Kuhn Poker, Mini Hanabi, Connect Four, Leduc Hold'em) with full detail including example game states.",
    253           "source": "haiku"
    254         },
    255         "hyperparameters_reported": {
    256           "applies": true,
    257           "answer": true,
    258           "justification": "Table 5 provides a comprehensive hyperparameter table covering model configuration, training settings (batch size, optimizer, learning rate, scheduler), and RL settings (PPO clip, KL coefficient, entropy, gamma, lambda).",
    259           "source": "haiku"
    260         },
    261         "scaffolding_described": {
    262           "applies": true,
    263           "answer": true,
    264           "justification": "The agentic scaffolding is described in detail: ROLL for multi-turn rollouts, vLLM for inference, Megatron-LM for distributed training, with the turn-level MDP formulation in Section 2.",
    265           "source": "haiku"
    266         },
    267         "data_preprocessing_documented": {
    268           "applies": true,
    269           "answer": true,
    270           "justification": "Self-play trajectory generation procedure is described in Section 3.1 and Figure 2 with reward normalization steps; evaluation uses cited standard scripts (Qwen2.5-Math eval and lm-evaluation-harness).",
    271           "source": "haiku"
    272         }
    273       },
    274       "data_integrity": {
    275         "raw_data_available": {
    276           "applies": true,
    277           "answer": false,
    278           "justification": "Self-play training trajectories are not released as a dataset; only code and model checkpoints are available, so the raw training data distribution cannot be independently inspected.",
    279           "source": "haiku"
    280         },
    281         "data_collection_described": {
    282           "applies": true,
    283           "answer": true,
    284           "justification": "Self-play data collection is described in Section 3.1: fully online training where trajectories are generated via nucleus sampling (temperature 0.6, Top-P 0.99, Top-K 100) and immediately used for policy updates.",
    285           "source": "haiku"
    286         },
    287         "recruitment_methods_described": {
    288           "applies": false,
    289           "answer": false,
    290           "justification": "No human participants are involved; the paper uses standard benchmarks and synthetic game environments requiring no recruitment.",
    291           "source": "haiku"
    292         },
    293         "data_pipeline_documented": {
    294           "applies": true,
    295           "answer": true,
    296           "justification": "The full pipeline from self-play trajectory generation to GRPO policy update is documented in Section 3 and Figure 2, with implementation details in Appendix B including framework dependencies.",
    297           "source": "haiku"
    298         }
    299       },
    300       "contamination": {
    301         "training_cutoff_stated": {
    302           "applies": true,
    303           "answer": false,
    304           "justification": "The Qwen3-4B training data cutoff is never stated in the paper; this is material since AIME24 and AMC23 are used as primary evaluation benchmarks and may overlap with Qwen3's training corpus.",
    305           "source": "haiku"
    306         },
    307         "train_test_overlap_discussed": {
    308           "applies": true,
    309           "answer": false,
    310           "justification": "No discussion of potential overlap between Qwen3's pre-training data and the evaluation benchmarks (AIME24, AMC23, GPQA-Diamond); this is a notable omission for benchmark-based reasoning evaluation.",
    311           "source": "haiku"
    312         },
    313         "benchmark_contamination_addressed": {
    314           "applies": true,
    315           "answer": false,
    316           "justification": "AIME24 and AMC23 are used as primary evaluation benchmarks without any analysis of whether these problems were available in Qwen3's training corpus, which is a critical validity concern.",
    317           "source": "haiku"
    318         }
    319       },
    320       "human_studies": {
    321         "pre_registered": {
    322           "applies": false,
    323           "answer": false,
    324           "justification": "No human participants involved in this study.",
    325           "source": "haiku"
    326         },
    327         "irb_or_ethics_approval": {
    328           "applies": false,
    329           "answer": false,
    330           "justification": "No human participants involved in this study.",
    331           "source": "haiku"
    332         },
    333         "demographics_reported": {
    334           "applies": false,
    335           "answer": false,
    336           "justification": "No human participants involved in this study.",
    337           "source": "haiku"
    338         },
    339         "inclusion_exclusion_criteria": {
    340           "applies": false,
    341           "answer": false,
    342           "justification": "No human participants involved in this study.",
    343           "source": "haiku"
    344         },
    345         "randomization_described": {
    346           "applies": false,
    347           "answer": false,
    348           "justification": "No human participants involved in this study.",
    349           "source": "haiku"
    350         },
    351         "blinding_described": {
    352           "applies": false,
    353           "answer": false,
    354           "justification": "No human participants involved in this study.",
    355           "source": "haiku"
    356         },
    357         "attrition_reported": {
    358           "applies": false,
    359           "answer": false,
    360           "justification": "No human participants involved in this study.",
    361           "source": "haiku"
    362         }
    363       },
    364       "cost_and_practicality": {
    365         "inference_cost_reported": {
    366           "applies": true,
    367           "answer": false,
    368           "justification": "Inference cost and latency for self-play rollout or downstream evaluation are not reported; hardware is mentioned (8 H100 GPUs) but without timing or cost figures.",
    369           "source": "haiku"
    370         },
    371         "compute_budget_stated": {
    372           "applies": true,
    373           "answer": false,
    374           "justification": "Hardware (8 H100 GPUs) and training steps (200) are stated but total GPU-hours, wall-clock time, or FLOPs are not reported, making it impossible to assess practical cost.",
    375           "source": "haiku"
    376         }
    377       }
    378     }
    379   },
    380   "claims": [
    381     {
    382       "claim": "MARSHAL generalist agents achieve up to 28.7% performance improvement over the Qwen3-4B baseline in held-out strategic games.",
    383       "evidence": "Figure 3 and Section 4.2 show 28.7% improvement on Leduc Hold'em and 22.9% on Simple Hanabi for the generalist; Table 15 provides raw game returns supporting these normalized scores.",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "Game-trained MARSHAL agents achieve 10.0% gain on AIME24 and 7.57% on GPQA-Diamond in zero-shot multi-agent evaluation.",
    388       "evidence": "Table 1 shows AIME in AutoGen: Qwen3-4B 56.67% → Generalist 66.67% (+10.0pp); GPQA in MAD: Qwen3-4B 37.88% → Generalist 45.45% (+7.57pp).",
    389       "supported": "strong"
    390     },
    391     {
    392       "claim": "The turn-level 'sum-then-normalize' advantage estimator is critical for long-horizon multi-turn learning and outperforms trajectory-level and concurrent MT-GRPO approaches.",
    393       "evidence": "Table 4 shows removing turn-level estimation drops Mini Hanabi from 50.48% to 34.80% and Simple Hanabi from 29.75% to 19.05%; Table 11 shows MARSHAL outperforms MT-GRPO on all held-out games.",
    394       "supported": "strong"
    395     },
    396     {
    397       "claim": "Agent-specific advantage normalization is necessary in competitive games with asymmetric player return distributions but provides smaller benefit in cooperative games.",
    398       "evidence": "Figure 6 demonstrates asymmetric returns in Tic-Tac-Toe (player 0 mean=0.49 vs player 1 mean=-0.28) vs. near-symmetric in Hanabi; Table 4 shows normalization helps competitive specialists more than Hanabi specialist.",
    399       "supported": "strong"
    400     },
    401     {
    402       "claim": "Self-play training is essential for generalization; training against fixed opponents causes strategic overfitting and catastrophic failure on out-of-distribution games.",
    403       "evidence": "Table 3 shows the fixed-opponent Kuhn Poker specialist achieves 0.00 on Connect Four, Leduc Hold'em, and Simple Hanabi, while the self-play version maintains non-zero performance across all.",
    404       "supported": "strong"
    405     },
    406     {
    407       "claim": "Cooperative game training is necessary for cooperative downstream generalization; competitive-only training reduces AutoGen performance.",
    408       "evidence": "Table 10 shows MARSHAL with competitive-only games averages 80.34% in AutoGen vs. 82.15% for full MARSHAL; RAE+our games also underperforms at 80.39%.",
    409       "supported": "moderate"
    410     }
    411   ],
    412   "methodology_tags": [
    413     "benchmark-eval"
    414   ],
    415   "key_findings": "MARSHAL introduces turn-level advantage estimation ('sum-then-normalize') and agent-specific advantage normalization to enable stable GRPO-based RL training in multi-turn, multi-agent strategic games. Skills acquired through self-play in simple two-player games (Tic-Tac-Toe, Kuhn Poker, Hanabi) transfer zero-shot to improve performance in standard reasoning benchmarks when agents are embedded in existing MAS frameworks, with up to 10.0pp gain on AIME24 in AutoGen and 7.57pp on GPQA-Diamond in MAD. Both cooperative and competitive training environments are individually necessary: competitive games build role-aware strategy for competitive MAS (MAD), while cooperative games build intent recognition needed for cooperative MAS (AutoGen). Self-play is critical for generalization — fixed-opponent training causes catastrophic strategic overfitting to the specific opponent.",
    416   "red_flags": [
    417     {
    418       "flag": "No statistical uncertainty reported",
    419       "detail": "All results across 18 tables are single point estimates with no confidence intervals, error bars, or standard deviations; RL training is notoriously high-variance and single-run results cannot establish reliability of the observed improvements."
    420     },
    421     {
    422       "flag": "Benchmark contamination unaddressed",
    423       "detail": "AIME24 and AMC23 are primary evaluation benchmarks but Qwen3-4B's training data cutoff is never stated and no analysis of potential contamination is performed, undermining the validity of benchmark comparisons."
    424     },
    425     {
    426       "flag": "No competing interests statement",
    427       "detail": "Co-authors are employed at Li Auto Inc. and Infinigence AI without an explicit competing interests declaration; the Ethics Statement addresses only dual-use concerns, not financial interests."
    428     },
    429     {
    430       "flag": "Single training run per configuration",
    431       "detail": "No evidence of multiple training seeds or runs; given the stochasticity of RL training with self-play, single-run results may not represent typical performance for any configuration."
    432     },
    433     {
    434       "flag": "Compute costs unreported",
    435       "detail": "Training on 8 H100 GPUs for 200 steps is mentioned but no total GPU-hours or wall-clock time is provided, making it impossible to assess the practical cost and accessibility of the approach."
    436     },
    437     {
    438       "flag": "Generalization claims exceed tested scope",
    439       "detail": "The conclusion claims 'self-play as a scalable paradigm for training LLM agents' based on one model family (Qwen3), six two-player games, and two MAS frameworks — the 8B scaling experiment uses identical games and does not meaningfully broaden the claim."
    440     }
    441   ],
    442   "cited_papers": [
    443     {
    444       "title": "SPIRAL: Self-play on zero-sum games incentivizes reasoning via multi-agent multi-turn reinforcement learning",
    445       "relevance": "Primary baseline and concurrent work; MARSHAL directly extends SPIRAL beyond competitive-only games and introduces algorithmic improvements over SPIRAL's Role-Conditioned Advantage Estimation."
    446     },
    447     {
    448       "title": "DeepSeekMath: Pushing the limits of mathematical reasoning in open language models (GRPO)",
    449       "relevance": "Introduces GRPO, the base RL algorithm that MARSHAL modifies with turn-level and agent-specific extensions for multi-agent settings."
    450     },
    451     {
    452       "title": "DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning",
    453       "relevance": "Key motivation showing RL with verifiable rewards enhances single-agent reasoning; MARSHAL extends this paradigm to multi-agent cooperative and competitive settings."
    454     },
    455     {
    456       "title": "Encouraging divergent thinking in large language models through multi-agent debate (MAD)",
    457       "relevance": "One of two downstream MAS frameworks used to evaluate MARSHAL generalization; competitive debate setting where competitive-game-trained agents excel."
    458     },
    459     {
    460       "title": "AutoGen: Enabling next-gen LLM applications via multi-agent conversations",
    461       "relevance": "Second downstream MAS evaluation framework; cooperative setting where MARSHAL's Hanabi-trained cooperative agents show strongest gains."
    462     },
    463     {
    464       "title": "Reinforcing multi-turn reasoning in LLM agents via turn-level credit assignment (MT-GRPO)",
    465       "relevance": "Concurrent work addressing the same turn-level credit assignment problem; MARSHAL's 'sum-then-normalize' is theoretically and empirically compared to MT-GRPO's turn-by-turn normalization in Appendix G."
    466     },
    467     {
    468       "title": "Why do multi-agent LLM systems fail?",
    469       "relevance": "Provides the failure mode taxonomy (System Design Issues, Inter-Agent Misalignment, Task Verification) used for MARSHAL's quantitative failure analysis on GPQA-Diamond."
    470     },
    471     {
    472       "title": "OpenSpiel: A framework for reinforcement learning in games",
    473       "relevance": "Game environment infrastructure underlying all six strategic game implementations in MARSHAL's training and evaluation pipeline."
    474     },
    475     {
    476       "title": "Self-playing adversarial language game enhances LLM reasoning (SPAG)",
    477       "relevance": "Prior work showing self-play in adversarial language games generalizes to single-agent reasoning; MARSHAL extends the paradigm to multi-agent cooperative and competitive settings with formal RL."
    478     },
    479     {
    480       "title": "Qwen3 technical report",
    481       "relevance": "Base model for all MARSHAL experiments; understanding the model's pre-training is essential for interpreting benchmark results and assessing contamination risks."
    482     }
    483   ],
    484   "engagement_factors": {
    485     "practical_relevance": {
    486       "score": 2,
    487       "justification": "Code and checkpoints are released and the approach can be applied to improve any LLM's multi-agent reasoning, though requiring 8 H100 GPUs for training limits accessibility."
    488     },
    489     "surprise_contrarian": {
    490       "score": 2,
    491       "justification": "Training on simple two-player games like Tic-Tac-Toe transferring to improve abstract AIME reasoning is counterintuitive and challenges the assumption that capability requires task-specific training."
    492     },
    493     "fear_safety": {
    494       "score": 1,
    495       "justification": "The paper acknowledges dual-use concerns (strategic reasoning agents capable of competition and deception), but safety implications are briefly noted rather than investigated."
    496     },
    497     "drama_conflict": {
    498       "score": 1,
    499       "justification": "Direct empirical comparisons against SPIRAL and MT-GRPO as concurrent works add some competitive framing, but the field is collaborative and the comparisons are constructive rather than adversarial."
    500     },
    501     "demo_ability": {
    502       "score": 2,
    503       "justification": "Code, model checkpoints, and training scripts are publicly released on GitHub, allowing practitioners to run the trained agents on strategic games immediately."
    504     },
    505     "brand_recognition": {
    506       "score": 2,
    507       "justification": "Tsinghua University is a top-tier global institution; Li Auto and Ant Group are major Chinese technology companies; publication at ICLR 2026 provides significant credibility."
    508     }
    509   },
    510   "hn_data": {
    511     "threads": [
    512       {
    513         "hn_id": "42807387",
    514         "title": "A Faster Quantum Fourier Transform",
    515         "points": 89,
    516         "comments": 6,
    517         "url": "https://news.ycombinator.com/item?id=42807387",
    518         "created_at": "2025-01-23T19:49:59Z"
    519       },
    520       {
    521         "hn_id": "41900729",
    522         "title": "Black Holes Inside and Out 2024",
    523         "points": 2,
    524         "comments": 1,
    525         "url": "https://news.ycombinator.com/item?id=41900729",
    526         "created_at": "2024-10-21T04:40:15Z"
    527       }
    528     ],
    529     "top_points": 89,
    530     "total_points": 91,
    531     "total_comments": 7
    532   }
    533 }

Impressum · Datenschutz