scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26868B)
      1 {
      2   "paper": {
      3     "title": "Stronger-MAS: Multi-Agent Reinforcement Learning for Collaborative LLMs",
      4     "authors": ["Yujie Zhao", "Lanxiang Hu", "Yang Wang", "Minmin Hou", "Hao Zhang", "Ke Ding", "Jishen Zhao"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2510.11062"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "methodology_tags": ["benchmark-eval"],
     12   "key_findings": "AT-GRPO, an agent- and turn-wise grouped RL algorithm for multi-agent LLM systems, dramatically raises accuracy on long-horizon planning tasks from 14-47% to 96-99.5%. On coding and math benchmarks, it yields average gains of 3.87-7.62% and 9.0-17.93%, respectively, over single-agent baselines. The paper shows that the choice between role-sharing and role-specialized policies depends on task characteristics, and that training agents jointly in a MAS environment is critical for emergent collaboration.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": true,
     18         "justification": "The paper provides a GitHub link: https://github.com/pettingllms-ai/PettingLLMs in the abstract. The reproducibility statement also says 'source code used for our experiments is included in the supplementary material.'"
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The paper states 'All datasets used in this study are publicly available' (Sec. 8) and uses standard public benchmarks: APPS, LiveCodeBench, CodeContests, AIME24/25, OlympiadBench, and programmatically generated Sudoku/Sokoban/Plan-Path instances."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No requirements.txt, Dockerfile, or dependency specifications are mentioned. The paper mentions 8×H100 GPUs and Qwen3 models but does not specify library versions or environment setup details."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper promises 'Upon acceptance, we will release the complete, documented source code' but does not include step-by-step reproduction instructions in the paper itself. Hyperparameters are listed in Appendix C.1 but no runnable commands or scripts are documented."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "Tables 1 and 2 report only point estimates (e.g., '99.00', '24.00') with no confidence intervals or error bars. Figure 6(a) shows shaded bands but the main results tables have no uncertainty quantification."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper makes numerous comparative claims ('outperforms', 'gains of 3.87-7.62%') but no statistical significance tests (p-values, t-tests, etc.) are reported anywhere."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "Tables 1 and 2 report absolute gains and relative improvements with baseline context (e.g., '+84% (+560.0%)' in Fig. 1, gains from baseline shown in parentheses throughout). The reader can assess magnitude of effects."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No justification is given for the number of evaluation instances used for each benchmark. The paper does not discuss whether the sample sizes are sufficient for the claims being made."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "Figure 6(a) shows shaded variability bands for training curves, but the main results in Tables 1-4 report single numbers with no standard deviation, variance, or spread measures across runs."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The paper compares five method families (Sec. 5.1): Single Agent, Single Agent + GRPO, MAS (prompt-only), MAS + GRPO, and MAS + AT-GRPO, the last in both shared-policy and per-role-policy variants (six configurations in total). It also compares against MAPORL, MARFT, and CURE in Table 3."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Baselines include contemporary works: MAPORL (2025), MARFT (2025), CURE (2025), and GRPO. These are recent and relevant to the multi-agent RL for LLMs space."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Section 5.4 and Table 4 present ablation studies: training agents in SA vs. MAS, swapped policies (96% → 6%), outcome-only rewards (Table 6), and multi-turn single-agent variants (Tables 7-8)."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "The paper evaluates on accuracy across multiple domains: game (Sudoku, Sokoban), planning (Plan-Path), code (LiveCodeBench, APPS, CodeContests as Pass@1), and math (AIME24, AIME25, OlympiadBench)."
     83       },
     84       "human_evaluation": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "Human evaluation is not relevant here — all tasks have automated verifiers (unit tests, symbolic checkers, math verifiers)."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Sec. 5.1 states 'we generate distinct training and validation sets using different random seeds and verify there is no overlap' for Sudoku/Sokoban/Plan-Path. Code uses separate training (APPS/CodeContests train) and evaluation (LiveCodeBench, APPS test, CodeContests test) sets."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Tables 1 and 2 provide per-task breakdowns across all 9 benchmarks for each method variant, rather than just aggregate numbers."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Section 5.2 discusses 'Limitations of MAS-GRPO' where directly applying GRPO to MAS causes performance degradation (e.g., CodeContests 17.60→10.30). Case studies in Appendix H show before/after RL failure modes."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper reports that MAS+GRPO sometimes degrades performance (Tab. 2: CodeContests 17.60→10.30, OlympiadBench 56.50→53.20). Tables 7-8 show multi-turn SA hurts performance. Role-specialized policies sometimes underperform shared policies (e.g., OlympiadBench 39.60 shared vs 35.20 per-role)."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The abstract's claims of 96.0-99.5% on planning (Tables 1-2), 3.87-7.62% coding gains, and 9.0-17.93% math gains are supported by the experimental results, though some individual benchmark results are more modest."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The paper makes causal claims via ablation studies (Sec. 5.4): removing MAS training drops Plan-Path from 96% to 16%; swapping policies drops to 6%. These controlled single-variable manipulations adequately support the causal claims about AT-GRPO's contribution."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The title claims 'Multi-Agent Reinforcement Learning for Collaborative LLMs' broadly but experiments are limited to Qwen3 models at 1.7B and 8B scales. No other model families are tested. The paper does not bound claims to Qwen3 specifically."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The paper does not substantively discuss alternative explanations for the gains. For example, the MAS framework provides more compute (multiple agents, multiple turns) which could explain gains independently of the AT-GRPO algorithm. The outcome-only ablation partially addresses reward engineering but not compute confounds."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "The paper's claims match its measurements: accuracy on specific benchmarks (Sudoku, Sokoban, LiveCodeBench, AIME, etc.). It does not make broader claims about general 'intelligence' or 'reasoning ability' beyond the tested tasks."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "The paper specifies 'Qwen3 models at 1.7B and 8B in the no-thinking mode' (Sec. 5.1) and cites the Qwen3 technical report. For comparisons, specific models are named: Phi-3-mini-128k (3.4B), Qwen2.5-Coder-3B-Instruct, Qwen-2.5-7B-Instruct."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Appendix C.2 provides full prompt text for all agents across all workflows: Code MAS (coder and tester, generation and refinement phases), Math MAS, Sudoku MAS, Plan-Path (Appendix D), and Sokoban."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Appendix C.1 provides detailed hyperparameters: max response length 4096, batch size 128, mini-batch 64, learning rate 1e-6, weight decay 0.01, γ=1.0, λ=1.0, temperature 1.0, top-p=1.0, K=4 samples, T=4 turns, α=1, 150 training steps."
    152       },
    153       "scaffolding_described": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "The MAS workflows are described in detail: Fig. 2 shows domain-specific workflows, Sec. 4.2 describes the training system architecture (Fig. 4), Algorithm 1 provides the full AT-GRPO procedure, and Appendix C.2 details all agent interaction patterns."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Sec. 5.1 describes data sources and preprocessing: APPS introductory subset for 1.7B, CodeContests for 8B, Polaris-Dataset-53K for math. Train/test splits are described with random seed separation and overlap verification."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "There is no dedicated Limitations section. There is an Ethics Statement (Sec. 7) and Reproducibility Statement (Sec. 8) but neither serves as a limitations discussion. The brief mentions of limitations in Sec. 5.2 ('Limitations of MAS-GRPO') refer to a baseline method, not limitations of the proposed approach."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "No threats to validity are discussed. The ethics statement mentions generic dual-use risks and societal biases but does not discuss specific threats to the validity of the experimental findings."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "The paper does not explicitly state what the results do NOT show. It does not bound claims to Qwen3 models, does not discuss whether results transfer to other model families, and does not state limitations of the tested domains."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "No raw experimental data (individual run outputs, per-example results, training logs) is made available for independent verification."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Sec. 5.1 describes data sources: APPS training split (introductory), CodeContests, Polaris-Dataset-53K, and programmatic generation of Sudoku/Sokoban/Plan-Path instances with distinct seeds for train/test."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants. All data comes from standard benchmarks and programmatically generated instances."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "The data pipeline is documented: Sec. 5.1 describes dataset selection, Appendix C.1 covers training details, and reward designs for each task are fully specified in Appendix B."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding or acknowledgments section is present in the paper. Authors are from UCSD and Intel Corporation but no funding disclosures are made."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are clearly listed: University of California, San Diego and Intel Corporation."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "Intel Corporation is listed as an affiliation for three authors (Yang Wang, Minmin Hou, Ke Ding). Intel has commercial interest in LLM training infrastructure. The paper does not discuss whether this creates a conflict and provides no funding independence statement."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial interests statement is included in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "The paper does not state the training data cutoff for the Qwen3 models used. This is relevant since benchmarks like APPS, AIME, and CodeContests may overlap with training data."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "For the programmatically generated tasks (Sudoku, Sokoban, Plan-Path), the paper verifies no train/test overlap via distinct seeds. However, for public benchmarks (APPS, LiveCodeBench, AIME, CodeContests), there is no discussion of whether Qwen3's pretraining data included these benchmarks."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "No discussion of contamination risk for APPS (2021), AIME (2024/2025), CodeContests, or OlympiadBench. These public benchmarks could plausibly be in Qwen3's training data."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this study."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": true,
    283         "justification": "Appendix G.3 provides empirical latency: 'one on-policy iteration for the single-agent baseline requires approximately 4 minutes for rollout and 1 minute for AT-GRPO training' on 4×H100 GPUs. MAS setting takes ~8 min rollout + 2 min training."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": true,
    288         "justification": "Sec. 5.1 states 'All runs use a single node with 8× H100 GPUs' and Appendix C.1 specifies 150 training steps. Appendix G.3 provides per-iteration timing."
    289       }
    290     },
    291     "experimental_rigor": {
    292       "seed_sensitivity_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "No results across multiple random seeds are reported. Tables 1-2 show single numbers per configuration with no seed sensitivity analysis."
    296       },
    297       "number_of_runs_stated": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "The paper does not state how many runs produced the reported results. It appears to be single-run results given the absence of variance reporting."
    301       },
    302       "hyperparameter_search_budget": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "No hyperparameter search budget is reported. The paper states α=1 'without further tuning' (Sec. 5.1) but does not disclose whether other hyperparameters were searched."
    306       },
    307       "best_config_selection_justified": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "No justification for why K=4, T=4, learning rate 1e-6, etc. were chosen. Some appear fixed ('without further tuning') but the selection process is not documented."
    311       },
    312       "multiple_comparison_correction": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "The paper makes many comparisons across 9 benchmarks × 6 methods × 2 model scales with no statistical tests at all, let alone correction for multiple comparisons."
    316       },
    317       "self_comparison_bias_addressed": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "The authors compare their AT-GRPO system against their own implementations of baselines (GRPO, MAS) without acknowledging potential bias. Results from MAPORL, MARFT, and CURE are cited from original papers but not independently replicated."
    321       },
    322       "compute_budget_vs_performance": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "MAS+AT-GRPO uses multiple agents with tree sampling (K=4 branches per agent per turn), consuming substantially more compute than single-agent GRPO. Appendix G discusses complexity scaling but does not compare performance at matched compute budgets."
    326       },
    327       "benchmark_construct_validity": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "No discussion of whether the benchmarks actually measure what is claimed. For example, Sudoku and Sokoban are used as proxies for 'game' and 'planning' ability without discussing construct validity."
    331       },
    332       "scaffold_confound_addressed": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "The MAS framework itself acts as a scaffold. When comparing SA+GRPO vs MAS+AT-GRPO, the difference includes both the algorithm (AT-GRPO vs GRPO) and the scaffold (MAS vs SA). The ablation in Table 4 partially addresses this but does not fully disentangle scaffold from algorithm effects."
    336       }
    337     },
    338     "data_leakage": {
    339       "temporal_leakage_addressed": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "No discussion of temporal leakage. APPS (2021), AIME24/25, CodeContests, and OlympiadBench could all be in Qwen3's pretraining data."
    343       },
    344       "feature_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "No discussion of whether the evaluation setup leaks information. For the RL-trained models, the training and test benchmarks are separated, but pretraining leakage is not addressed."
    348       },
    349       "non_independence_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "For programmatic tasks, independence is ensured by distinct random seeds. For public benchmarks, no discussion of whether train/test examples share structural similarities."
    353       },
    354       "leakage_detection_method": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No leakage detection methods (canary strings, n-gram overlap, membership inference) are applied to any benchmark."
    358       }
    359     }
    360   },
    361   "claims": [
    362     {
    363       "claim": "AT-GRPO boosts long-horizon planning accuracy from 14-47% (single-agent RL) to 96.0-99.5%",
    364       "evidence": "Tables 1-2: Plan-Path goes from 11%/47% (SA+GRPO) to 96-97%/93-96% (MAS+AT-GRPO) for 1.7B/8B; Sokoban from 3%/14% to 10-11.5%/96-98%",
    365       "supported": "strong"
    366     },
    367     {
    368       "claim": "Average coding gains of 3.87-7.62% and math gains of 9.0-17.93% over single-agent baseline",
    369       "evidence": "Tables 1-2 show per-benchmark gains. Coding gains range from +1.20 to +16.30 across benchmarks; math from +1.60 to +38.70. Average figures appear computed across benchmarks.",
    370       "supported": "moderate"
    371     },
    372     {
    373       "claim": "RL training on MAS reinforces role-specific specialization",
    374       "evidence": "Table 4 ablation: swapping trained role-specific policies drops Plan-Path from 96% to 6%. Fig. 6(a) shows diverging reward curves for tool and plan agents.",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "MAS+AT-GRPO scales effectively with increasing number of agents while MAS+GRPO saturates",
    379       "evidence": "Fig. 5(b): AT-GRPO scales from 18.2% to 47.7% on AIME24 as agent count increases, while GRPO saturates at 34.1%.",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "On-policy RL training within MAS is critical; training agents in SA and combining in MAS gives limited benefit",
    384       "evidence": "Table 4: Training in SA, eval in MAS achieves only 16% on Plan-Path vs 96% for MAS RL training.",
    385       "supported": "strong"
    386     }
    387   ],
    388   "red_flags": [
    389     {
    390       "flag": "No variance or significance reporting",
    391       "detail": "All main results (Tables 1-4) are single point estimates with no error bars, standard deviations, or significance tests. Given that RL training is known to have high variance across seeds (Henderson et al. 2018), single-run results are insufficient to support the comparative claims."
    392     },
    393     {
    394       "flag": "Compute confound not addressed",
    395       "detail": "MAS+AT-GRPO uses multiple agents with tree-structured sampling (K=4 branches per agent per turn), consuming far more compute than single-agent GRPO. Performance is never compared at matched compute budgets, making it unclear whether gains come from the algorithm or simply from more compute."
    396     },
    397     {
    398       "flag": "No limitations section",
    399       "detail": "The paper lacks any dedicated limitations discussion despite making broad claims about a general-purpose MAS RL training method."
    400     },
    401     {
    402       "flag": "Industry affiliation without conflict disclosure",
    403       "detail": "Three authors are from Intel Corporation, which has commercial interest in LLM training infrastructure. No conflicts of interest or funding statement is provided."
    404     }
    405   ],
    406   "cited_papers": [
    407     {
    408       "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
    409       "authors": ["Qingyun Wu", "Gagan Bansal", "Jieyu Zhang"],
    410       "year": 2023,
    411       "relevance": "Foundational multi-agent LLM framework with role-sharing architecture."
    412     },
    413     {
    414       "title": "MetaGPT: Meta Programming for a Multi-Agent Collaborative Framework",
    415       "authors": ["Sirui Hong"],
    416       "year": 2024,
    417       "relevance": "Multi-agent collaborative framework for software development tasks."
    418     },
    419     {
    420       "title": "MAGIS: LLM-Based Multi-Agent Framework for GitHub Issue Resolution",
    421       "authors": ["Wei Tao"],
    422       "year": 2024,
    423       "relevance": "Multi-agent LLM system applied to software engineering (GitHub issue resolution)."
    424     },
    425     {
    426       "title": "ToolRL: Reward is All Tool Learning Needs",
    427       "authors": ["Cheng Qian"],
    428       "year": 2025,
    429       "arxiv_id": "2504.13958",
    430       "relevance": "RL-based tool use training for LLMs, relevant to agentic AI capabilities."
    431     },
    432     {
    433       "title": "CURE: Co-evolving LLM Coder and Unit Tester via Reinforcement Learning",
    434       "authors": ["Yinjie Wang"],
    435       "year": 2025,
    436       "arxiv_id": "2506.03136",
    437       "relevance": "RL co-training of coder and tester agents, directly compared baseline."
    438     },
    439     {
    440       "title": "MAPORL: Multi-Agent Post-Co-Training for Collaborative Large Language Models with Reinforcement Learning",
    441       "authors": ["Chanwoo Park"],
    442       "year": 2025,
    443       "relevance": "Multi-agent RL post-training for LLMs via debate, directly compared baseline."
    444     },
    445     {
    446       "title": "MARFT: Multi-Agent Reinforcement Fine-Tuning",
    447       "authors": ["Junwei Liao"],
    448       "year": 2025,
    449       "arxiv_id": "2504.16129",
    450       "relevance": "Multi-agent reinforcement fine-tuning framework, directly compared baseline."
    451     },
    452     {
    453       "title": "SPIRAL: Self-Play on Zero-Sum Games Incentivizes Reasoning via Multi-Agent Multi-Turn Reinforcement Learning",
    454       "authors": ["Bo Liu"],
    455       "year": 2025,
    456       "arxiv_id": "2506.24119",
    457       "relevance": "Self-play RL for LLMs in multi-agent settings."
    458     },
    459     {
    460       "title": "RAGen: Understanding Self-Evolution in LLM Agents via Multi-Turn Reinforcement Learning",
    461       "authors": ["Zihan Wang"],
    462       "year": 2025,
    463       "arxiv_id": "2504.20073",
    464       "relevance": "Multi-turn RL for LLM agent self-evolution."
    465     },
    466     {
    467       "title": "Why Do Multi-Agent LLM Systems Fail?",
    468       "authors": ["Mert Cemri"],
    469       "year": 2025,
    470       "arxiv_id": "2503.13657",
    471       "relevance": "Failure analysis of multi-agent LLM systems, relevant to understanding MAS limitations."
    472     },
    473     {
    474       "title": "GiGPO: Group-in-Group Policy Optimization for LLM Agent Training",
    475       "authors": ["Lang Feng"],
    476       "year": 2025,
    477       "arxiv_id": "2505.10978",
    478       "relevance": "Group-relative policy optimization for LLM agents, foundational to AT-GRPO's design."
    479     },
    480     {
    481       "title": "CodeSteer: Symbolic-Augmented Language Models via Code/Text Guidance",
    482       "authors": ["Yongchao Chen"],
    483       "year": 2025,
    484       "arxiv_id": "2502.04350",
    485       "relevance": "Symbolic task benchmark (SymBench) used for Sudoku/Sokoban/Plan-Path evaluation."
    486     }
    487   ]
    488 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs