scan.json (25058B)
1 { 2 "paper": { 3 "title": "Multi-Agent Evolve: LLM Self-Improve through Co-evolution", 4 "authors": ["Yixing Chen", "Yiding Wang", "Siqi Zhu", "Haofei Yu", "Tao Feng", "Muhan Zhang", "Mostofa Patwary", "Jiaxuan You"], 5 "year": 2025, 6 "venue": "arXiv preprint", 7 "arxiv_id": "2510.23595", 8 "doi": "10.48550/arXiv.2510.23595" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "Multi-Agent Evolve (MAE) proposes a Proposer-Solver-Judge triad instantiated from a single LLM (Qwen2.5-3B-Instruct), trained with RL to self-improve without human-curated ground truth. MAE achieves an average 4.54% improvement over the base model across 22 benchmarks spanning math, coding, reasoning, and general knowledge. The 'half reference' setting (balanced exploration and reference) yields the best overall average of 59.87%, outperforming SFT (53.87%) and AZR (57.72%). Ablations confirm each agent role and quality filtering are necessary components.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "GitHub link provided: https://github.com/ulab-uiuc/Multi-agent-Evolve (on page 1)." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The paper uses standard public benchmarks (GSM8K, MATH, ARC, MMLU, etc.) for evaluation. Seed data composition is documented in Appendix D (Table 5) with 967 questions drawn from 14 public datasets." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "No requirements.txt, Dockerfile, or environment specification is provided. Hyperparameters are listed in Table 3 but no dependency/environment setup details." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No step-by-step reproduction instructions are provided in the paper. 
The GitHub link is given but no README contents or reproduction commands are described." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "Tables 1 and 2 report point estimates only (e.g., '82.20', '65.80') with no confidence intervals, error bars, or ± notation." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper claims MAE outperforms baselines based on comparing raw numbers. No statistical significance tests (p-values, t-tests, etc.) are reported." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "The paper reports percentage improvements with baseline context, e.g., 'MATH, 60.40 → 68.20', 'Overall Avg. (58.51 vs. 55.33)', and 'average improvement of 4.54%', providing enough context to assess magnitude." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "No justification for why 300 training steps, batch size 128, or 967 seed questions were chosen. No power analysis or sample size reasoning." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "No standard deviations, variance, or spread measures are reported across runs. All results appear to be single-run numbers." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "The paper compares against the Base model, SFT baseline (LoRA with ground-truth answers), and AZR (Absolute Zero Reasoner) using its official implementation." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "AZR (Zhao et al., 2025) is a contemporary self-play baseline. The paper also compares against SFT, a standard baseline. These are appropriate for the self-improving LLM paradigm." 
74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "Table 2 presents a comprehensive ablation study removing each agent role (no solver training, no proposer training, no judge training) and removing format reward and quality filtering." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Results are reported on 22 benchmarks across math, coding, reasoning, commonsense, and general knowledge. ID Avg., OOD Avg., and Overall Avg. are also computed." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": false, 88 "justification": "No human evaluation is included. Evaluation is entirely automated via benchmarks and an LLM judge (Nemotron-70B). Given claims about general reasoning improvement, human evaluation of output quality would be relevant." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "The paper explicitly separates in-distribution (ID) benchmarks corresponding to seed data and held-out out-of-distribution (OOD) benchmarks. Test sets are standard public benchmarks not used in training." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Tables 1 and 2 provide per-benchmark breakdowns across 22 individual benchmarks, plus ID/OOD/Overall averages." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Section 5.2.1 discusses stability issues and potential collapse. Figure 4 shows examples of format failures. The paper discusses how suboptimal prompts can lead the proposer to generate only open-ended writing questions, degrading performance." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "The paper reports that MAE (with reference) sometimes underperforms the base model on specific benchmarks (e.g., GSM8K drops from 85.20 to 76.00). SFT is shown to degrade overall performance. 
MAE (zero) shows drops on GPQA (34.67→29.42) and Winogrande." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The abstract claims 'average improvement of 4.54% on multiple benchmarks' which is supported by Table 1 (Overall Avg. 59.87 vs 55.33 for base). The claim of outperforming baselines is supported by the results." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "The paper makes causal claims about each component's contribution (e.g., 'removing any role results in falling short'). These are supported by controlled ablation studies in Table 2 where single variables are removed." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The abstract claims MAE 'enables LLMs to self-evolve in solving diverse tasks' and is 'a scalable, data-efficient method for enhancing general reasoning abilities,' but all experiments use only Qwen2.5-3B-Instruct. No other model sizes or families are tested. The title and framing are broader than what one 3B model demonstrates." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "The paper does not discuss alternative explanations for the observed improvements. For example, it does not consider whether the gains come from the RL training procedure itself rather than the multi-agent structure, or whether the Judge merely teaches format compliance." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": false, 135 "justification": "The paper measures benchmark accuracy and frames it as 'general reasoning abilities' and 'self-improvement.' No discussion of whether benchmark scores actually measure general reasoning capability, or whether gains transfer to real-world tasks." 
136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": true, 142 "justification": "The paper specifies 'Qwen2.5-3B-Instruct' as the base model and 'nvidia/llama-3.1-nemotron-70b-instruct' as the evaluation judge (Table 4). These are specific model identifiers." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": true, 147 "justification": "Full prompt text for all agents (Solver, Proposer with/without reference, Judge for answers, Judge for questions) is provided in Appendix A.1 (Listings 1-5). Evaluation prompts are in Appendix B." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": true, 152 "justification": "Table 3 provides comprehensive training hyperparameters (learning rate 1e-6, batch size 128, optimizer AdamW, etc.). Table 4 provides evaluation LLM judge configuration. Reward weights are stated in Section 4." 153 }, 154 "scaffolding_described": { 155 "applies": true, 156 "answer": true, 157 "justification": "The three-agent framework (Proposer, Solver, Judge) is described in detail in Section 4, including the coordination pipeline in Algorithm 1, reward design for each agent, and quality filtering mechanism." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "The seed data composition is documented in Appendix D (Table 5) with counts per benchmark. Quality filtering criteria (Judge score ≥ 0.7) are specified. The paper describes how questions flow through the pipeline." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": false, 169 "justification": "There is no dedicated limitations section. The conclusion briefly mentions 'Future work includes scaling to larger backbones' but does not substantively discuss limitations." 
170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": false, 174 "justification": "No threats to validity are discussed. The paper does not address issues like evaluation with LLM-as-judge reliability, single-model testing, or single-run results." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": false, 179 "justification": "No explicit scope boundaries are stated. The paper does not clarify what the results do NOT show (e.g., that this only works on 3B models, or that improvements may not hold for larger models)." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "No raw experimental data (model outputs, training logs, per-example scores) is provided. Only aggregated benchmark accuracies in tables." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "The seed data collection is described: 967 questions sampled from 14 training sets (Table 5). The self-generated data pipeline is documented in Algorithm 1 with quality filtering criteria." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. Data sources are standard public benchmarks." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "Algorithm 1 documents the full data pipeline: question generation → quality filtering (≥0.7 threshold) → solver answering → pair dataset collection → judge scoring → synchronized update." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding information or acknowledgments section is present in the paper." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliations are clearly listed: UIUC, Peking University, and NVIDIA. 
One author (Mostofa Patwary) is from NVIDIA, and one author (Jiaxuan You) has dual affiliation with UIUC and NVIDIA." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "NVIDIA affiliation means there is a potential commercial interest (NVIDIA benefits from demonstrating RL training methods for LLMs). No funding disclosure to assess independence." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests statement is present in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "The paper does not state the training data cutoff date for Qwen2.5-3B-Instruct. Benchmark contamination via the base model's pre-training data is not addressed." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "No discussion of whether Qwen2.5-3B-Instruct may have seen the evaluation benchmarks (GSM8K, MATH, MMLU, etc.) during pre-training." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "Many benchmarks used (GSM8K 2021, MMLU 2020, HumanEval 2021) were published years before Qwen2.5's training. Contamination risk is not discussed." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 
263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": false, 284 "justification": "No inference cost, API costs, or per-example cost is reported despite the method involving multiple LLM calls per training step (Proposer + Solver + Judge + difficulty estimation with N=5 samples)." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "No GPU hours, training time, or total compute budget is stated. The paper mentions 300 training steps with batch size 128 but gives no wall-clock time or hardware specification." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "No results across multiple random seeds are reported. All results appear to be single-run." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "The number of experimental runs is never explicitly stated. Results appear to be from single runs." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "No hyperparameter search budget is reported. The chosen hyperparameters (Table 3) are stated without explaining how they were selected." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": false, 311 "justification": "The paper does not explain how the best configuration was selected. Reward weights are all set to 1/3 or 0.5 without justification for these specific values."
312 }, 313 "multiple_comparison_correction": { 314 "applies": true, 315 "answer": false, 316 "justification": "The paper compares across 22 benchmarks without any correction for multiple comparisons." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The authors implement their own framework and compare against their re-implementations/runs of baselines (AZR 'official implementation' run by them). No acknowledgment of self-comparison bias." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": false, 326 "justification": "No comparison at matched compute budgets. MAE uses 3 agent roles per step with multiple rollouts (5 samples for difficulty estimation), while AZR uses a simpler setup. Compute differences are not discussed." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "No discussion of whether the 22 benchmarks actually measure 'general reasoning ability' as claimed, or whether benchmark improvements reflect genuine capability gains." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "The paper trains and evaluates a single model, not comparing models within different scaffolds. The framework IS the thing being tested." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "No discussion of temporal leakage. Many evaluation benchmarks (GSM8K, MMLU, HumanEval) predate Qwen2.5's training, meaning solutions could be in the pre-training data." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of feature leakage. The evaluation uses an LLM judge (Nemotron-70B) with ground truth answers, but no analysis of whether the base model's pre-training leaks benchmark answers." 
349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of non-independence between seed training data and test data. Seed data is sampled from training sets of the same benchmarks used for evaluation." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No leakage detection or prevention method is used." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "MAE achieves an average improvement of 4.54% on multiple benchmarks over the base Qwen2.5-3B-Instruct model.", 365 "evidence": "Table 1: MAE (half reference) Overall Avg. 59.87% vs Base 55.33%. Section 5.1 discusses per-benchmark improvements.", 366 "supported": "moderate" 367 }, 368 { 369 "claim": "MAE (zero) can self-evolve from minimal seed (16 questions) without real-world data, outperforming AZR baseline.", 370 "evidence": "Table 1 upper: MAE (zero) Overall Avg. 58.51 vs AZR 57.72 vs Base 55.33.", 371 "supported": "weak" 372 }, 373 { 374 "claim": "All MAE variants outperform SFT despite SFT using ground-truth answers while MAE does not.", 375 "evidence": "Table 1 lower: SFT Overall Avg. 53.87, all MAE variants ≥57.11. 
SFT actually degrades from base (55.33).", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "Each agent role (Proposer, Solver, Judge) is necessary for full performance.", 380 "evidence": "Table 2 ablation: removing Solver training drops 2.08 percentage points, Proposer 1.97, Judge 2.63 in Overall Avg.", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "Question quality filtering is critical, with 3.72% performance drop when removed.", 385 "evidence": "Table 2: MAE (half reference) 59.87% vs MAE (no quality filtering) 56.15%, a 3.72 percentage-point drop.", 386 "supported": "moderate" 387 } 388 ], 389 "red_flags": [ 390 { 391 "flag": "Single-run results with no variance reporting", 392 "detail": "All results appear to be from single experimental runs with no error bars, confidence intervals, or standard deviations across seeds. For RL training which is known to have high variance (Henderson et al. 2018), this is a significant concern." 393 }, 394 { 395 "flag": "No compute cost comparison", 396 "detail": "MAE requires 3 agent roles per step plus 5 difficulty estimation samples, significantly more compute than baselines. No compute budget, GPU hours, or cost-matched comparison is provided." 397 }, 398 { 399 "flag": "LLM-as-judge evaluation without reliability analysis", 400 "detail": "Most benchmarks are evaluated by Nemotron-70B rather than exact-match. No inter-annotator agreement, human-judge correlation, or judge reliability analysis is provided for this evaluation methodology." 401 }, 402 { 403 "flag": "Single model tested, broad generalization claims", 404 "detail": "All experiments use Qwen2.5-3B-Instruct only. The paper claims scalability and general applicability but provides no evidence on other model families or sizes." 405 }, 406 { 407 "flag": "No limitations section", 408 "detail": "The paper lacks any dedicated discussion of limitations, threats to validity, or scope boundaries."
409 }, 410 { 411 "flag": "Benchmark contamination unaddressed", 412 "detail": "Many evaluation benchmarks (GSM8K, MMLU, HumanEval) were published years before Qwen2.5's training. Improvements could partially reflect better elicitation of memorized solutions rather than genuine capability gains." 413 } 414 ], 415 "cited_papers": [ 416 { 417 "title": "Absolute Zero: Reinforced Self-Play Reasoning with Zero Data", 418 "authors": ["Andrew Zhao", "Yiran Wu", "Yang Yue"], 419 "year": 2025, 420 "arxiv_id": "2505.03335", 421 "relevance": "Primary baseline; self-play RL for LLM reasoning without human data, relies on verifiable environments." 422 }, 423 { 424 "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning", 425 "authors": ["Daya Guo"], 426 "year": 2025, 427 "arxiv_id": "2501.12948", 428 "relevance": "Key RL-for-LLM-reasoning work demonstrating significant gains from RL training." 429 }, 430 { 431 "title": "Self-Rewarding Language Models", 432 "authors": ["Weizhe Yuan", "Richard Yuanzhe Pang"], 433 "year": 2025, 434 "arxiv_id": "2401.10020", 435 "relevance": "Self-rewarding approach where LLMs provide their own training signals, directly related to the Judge component." 436 }, 437 { 438 "title": "SPIRAL: Self-Play on Zero-Sum Games Incentivizes Reasoning via Multi-Agent Multi-Turn Reinforcement Learning", 439 "authors": ["Bo Liu"], 440 "year": 2025, 441 "arxiv_id": "2506.24119", 442 "relevance": "Self-play RL for LLM reasoning under zero-sum game settings." 443 }, 444 { 445 "title": "A Survey on LLM-as-a-Judge", 446 "authors": ["Jiawei Gu"], 447 "year": 2024, 448 "arxiv_id": "2411.15594", 449 "relevance": "Survey on using LLMs as evaluators, foundational to the Judge component of MAE." 
450 }, 451 { 452 "title": "Why Do Multi-Agent LLM Systems Fail?", 453 "authors": ["Mert Cemri"], 454 "year": 2025, 455 "arxiv_id": "2503.13657", 456 "relevance": "Analysis of multi-agent LLM system failure modes, relevant to understanding MAE's stability challenges." 457 }, 458 { 459 "title": "MALT: Improving Reasoning with Multi-Agent LLM Training", 460 "authors": ["Sumeet Ramesh Motwani"], 461 "year": 2025, 462 "arxiv_id": "2412.01928", 463 "relevance": "Multi-agent LLM training with voting mechanism for reasoning enhancement." 464 }, 465 { 466 "title": "Evaluating Large Language Models Trained on Code", 467 "authors": ["Mark Chen"], 468 "year": 2021, 469 "arxiv_id": "2107.03374", 470 "relevance": "HumanEval benchmark used for evaluation; foundational LLM code evaluation work." 471 }, 472 { 473 "title": "REINFORCE++: An Efficient RLHF Algorithm with Robustness to Both Prompt and Reward Models", 474 "authors": ["Jian Hu"], 475 "year": 2025, 476 "arxiv_id": "2501.03262", 477 "relevance": "RL algorithm used as the basis for Task-Relative REINFORCE++ in MAE." 478 }, 479 { 480 "title": "R-Zero: Self-Evolving Reasoning LLM from Zero Data", 481 "authors": ["Chengsong Huang"], 482 "year": 2025, 483 "arxiv_id": "2508.05004", 484 "relevance": "Related self-evolving LLM work; discussed in context of training stability comparison." 485 } 486 ] 487 }