scan.json (29131B)
1 { 2 "paper": { 3 "title": "n-Musketeers: Reinforcement Learning Shapes Collaboration Among Language Models", 4 "authors": [ 5 "Ryozo Masukawa", 6 "Sanggeon Yun", 7 "Hyunwoo Oh", 8 "SungHeon Jeong", 9 "Raheeb Hassan", 10 "Hanning Chen", 11 "Wenjun Huang", 12 "Mahdi Imani", 13 "Pietro Mercati", 14 "Nathaniel D. Bastian", 15 "Mohsen Imani" 16 ], 17 "year": 2026, 18 "venue": "arXiv", 19 "arxiv_id": "2602.09173" 20 }, 21 "scan_version": 2, 22 "active_modules": ["experimental_rigor", "data_leakage"], 23 "methodology_tags": ["benchmark-eval"], 24 "key_findings": "Soft hidden-state collaboration, where frozen SLM experts expose hidden representations to a trainable policy via Perceiver-style cross-attention, can improve RLVR performance on arithmetic tasks (up to 22.9% over single-model baselines) but degrades or provides negligible benefit on logic, algorithmic, and GSM8K tasks where single-model RLVR already saturates. RLVR induces structured expert utilization patterns without routing supervision, with routing entropy decreasing as reward improves (mean Spearman ρ = −0.48). Ablations suggest gains are largely attributable to prefix-style latent conditioning rather than fine-grained expert reasoning traces.", 25 "checklist": { 26 "artifacts": { 27 "code_released": { 28 "applies": true, 29 "answer": false, 30 "justification": "No repository URL, GitHub link, or code archive is mentioned anywhere in the paper." 31 }, 32 "data_released": { 33 "applies": true, 34 "answer": true, 35 "justification": "The paper uses publicly available benchmarks: Reasoning Gym (procedurally generated) and GSM8K (standard public benchmark with standard train/test split). No proprietary data was collected." 36 }, 37 "environment_specified": { 38 "applies": true, 39 "answer": false, 40 "justification": "The paper mentions 'single NVIDIA H200 GPU' and 'PyTorch' but provides no requirements.txt, Dockerfile, or detailed library version specifications needed to recreate the environment." 
41 }, 42 "reproduction_instructions": { 43 "applies": true, 44 "answer": false, 45 "justification": "No step-by-step reproduction instructions are provided. The appendix includes YAML training configs but no README-style instructions for running experiments." 46 } 47 }, 48 "statistical_methodology": { 49 "confidence_intervals_or_error_bars": { 50 "applies": true, 51 "answer": true, 52 "justification": "Table 1 reports mean ± standard deviation across 3 runs for all methods. Figure 4 shows shaded regions indicating standard deviation across runs." 53 }, 54 "significance_tests": { 55 "applies": true, 56 "answer": false, 57 "justification": "No statistical significance tests are reported. Comparative claims (e.g., 'substantially outperforms') are based on comparing point estimates with standard deviations, without p-values or formal tests." 58 }, 59 "effect_sizes_reported": { 60 "applies": true, 61 "answer": true, 62 "justification": "Accuracy values are reported with baseline context (e.g., 75.26% vs 52.34% for Arithmetic), and the paper states 'reaching up to 22.9% in the best-performing cases.' Spearman ρ values are reported for entropy-reward correlations." 63 }, 64 "sample_size_justified": { 65 "applies": true, 66 "answer": false, 67 "justification": "The choice of 16,000 training points, 4,000 validation points, and 3 random seeds is not justified. No power analysis or discussion of whether these sizes are sufficient for the claims made." 68 }, 69 "variance_reported": { 70 "applies": true, 71 "answer": true, 72 "justification": "Standard deviation across 3 runs is reported in Table 1 for all methods. 
The paper explicitly states: 'Each experiment is repeated three times with different random seeds, and we report the mean and standard deviation.'" 73 } 74 }, 75 "evaluation_design": { 76 "baselines_included": { 77 "applies": true, 78 "answer": true, 79 "justification": "Four baselines are compared: Single-Model RLVR, Output-Level Collaboration, Hard Expert Routing (Top-1), and the authors' method without cross-attention. All share the same final policy model and GRPO setup." 80 }, 81 "baselines_contemporary": { 82 "applies": true, 83 "answer": true, 84 "justification": "Baselines represent current paradigms in multi-LM collaboration: debate-style output aggregation, router-based selection, and single-model RLVR. Expert models include recent releases (Qwen-2.5, Llama-3.2)." 85 }, 86 "ablation_study": { 87 "applies": true, 88 "answer": true, 89 "justification": "Extensive ablations: with/without context tokens C (Figure 5), without cross-attention (Table 1), different pooling strategies (Table 2), latent query count m (Figure 6), and different expert team compositions (Default, Generalist, Naive, LLM teams)." 90 }, 91 "multiple_metrics": { 92 "applies": true, 93 "answer": false, 94 "justification": "Only accuracy is reported as the evaluation metric across all tasks. No secondary metrics (e.g., output quality, reasoning chain analysis, token efficiency) are used." 95 }, 96 "human_evaluation": { 97 "applies": true, 98 "answer": false, 99 "justification": "No human evaluation is performed. All evaluation is automated (accuracy on benchmark tasks). Human inspection of reasoning chains or expert utilization quality could have provided additional insight." 100 }, 101 "held_out_test_set": { 102 "applies": true, 103 "answer": true, 104 "justification": "For Reasoning Gym: 'we train on 16,000 procedurally generated data points and report performance on a held-out validation set of 4,000 samples.' 
For GSM8K: 'we train on the standard training split and evaluate directly on the official test set.'" 105 }, 106 "per_category_breakdown": { 107 "applies": true, 108 "answer": true, 109 "justification": "Table 1 provides per-task-family breakdowns (Algorithmic, Arithmetic, Logic, GSM8K). Figure 4 shows per-expert utilization across different tasks and team compositions." 110 }, 111 "failure_cases_discussed": { 112 "applies": true, 113 "answer": true, 114 "justification": "The paper explicitly discusses failures: 'On Algorithmic tasks, improvements are negligible,' 'For Logic and especially GSM8K, adding expert context provides limited benefit and often degrades performance,' and discusses high variance as evidence of failure to exploit SLM information." 115 }, 116 "negative_results_reported": { 117 "applies": true, 118 "answer": true, 119 "justification": "Several negative results are reported: Generalist Team on GSM8K degrades to 41.02% from 64.32% baseline; Logic performance drops from 96.88% to 82.81%; Hard Routing fails dramatically across all tasks. The paper acknowledges expert conditioning can be 'counterproductive.'" 120 } 121 }, 122 "claims_and_evidence": { 123 "abstract_claims_supported": { 124 "applies": true, 125 "answer": true, 126 "justification": "The abstract says the approach 'is competitive with strong single-model RLVR baselines,' which is generous for GSM8K (61.59% vs 64.32%) but the abstract also qualifies with 'gains varying across tasks.' Claims about emergent expert utilization are supported by Figure 4 analysis." 127 }, 128 "causal_claims_justified": { 129 "applies": true, 130 "answer": true, 131 "justification": "Causal claims like 'expert-derived latent context can provide a useful inductive bias' and 'RLVR induces structured expert utilization' are supported by controlled ablations: removing context tokens C (Figure 5), removing cross-attention (Table 1), varying expert teams, and varying pooling strategies (Table 2)." 
132 }, 133 "generalization_bounded": { 134 "applies": true, 135 "answer": true, 136 "justification": "The paper explicitly bounds claims: 'hidden state collaboration should be applied selectively, rather than treated as a general-purpose improvement over single-model RLVR.' Results are reported per-task and the paper acknowledges task-dependent effectiveness." 137 }, 138 "alternative_explanations_discussed": { 139 "applies": true, 140 "answer": true, 141 "justification": "Section 4.3 discusses 'implicit capacity bias' as an alternative to functional expertise: 'higher-capacity models are favored irrespective of nominal domain specialization.' Section 4.4 discusses that 'a coarse and relatively prompt-invariant representation of each SLM is sufficient,' suggesting gains come from static model signatures rather than reasoning traces." 142 }, 143 "proxy_outcome_distinction": { 144 "applies": true, 145 "answer": true, 146 "justification": "The paper measures accuracy on benchmark tasks and claims improvements in accuracy. There is no proxy gap — claims match the granularity of measurements. The paper does not frame benchmark accuracy as a broader capability." 147 } 148 }, 149 "setup_transparency": { 150 "model_versions_specified": { 151 "applies": true, 152 "answer": true, 153 "justification": "Specific model names are provided: Qwen-2.5-3B-Instruct, Qwen2.5-Math-1.5B-Instruct, Qwen2.5-Coder-7B-Instruct, Llama-3.2-3B-Instruct, Phi-3.5-mini-instruct, gemma-2-2b-it, Mistral-7B-Instruct-v0.3, GPT-2, GPT-OSS-20B. For open models, these names identify specific HuggingFace checkpoints." 154 }, 155 "prompts_provided": { 156 "applies": true, 157 "answer": false, 158 "justification": "The YAML configs reference 'DeepSeekZero' as the developer_prompt but the actual prompt text is not provided. For the output collaboration baseline, expert prompts are described ('prompted to generate a short textual hint') but the actual text is not given." 
159 }, 160 "hyperparameters_reported": { 161 "applies": true, 162 "answer": true, 163 "justification": "Extensive hyperparameter reporting in Section 4.1 and Appendix: LoRA rank 8, latent dimension 512, GRPO group size 8, KL coefficient β = 0.04, batch size 32, learning rate 1e-4, 500/233 training steps, 128 token generation budget, m = 8 latent queries, 8 attention heads, weight decay 1e-2, grad clip 1.0." 164 }, 165 "scaffolding_described": { 166 "applies": false, 167 "answer": false, 168 "justification": "No agentic scaffolding is used. The method is a training-time adapter that integrates frozen expert hidden states via cross-attention, not an agent framework." 169 }, 170 "data_preprocessing_documented": { 171 "applies": true, 172 "answer": true, 173 "justification": "Appendix Listings 1-4 provide exact YAML configs for each task family including dataset composition, weights, and curriculum settings. Reasoning Gym tasks use default settings with procedural generation. GSM8K uses the standard train/test split." 174 } 175 }, 176 "limitations_and_scope": { 177 "limitations_section_present": { 178 "applies": true, 179 "answer": false, 180 "justification": "There is no dedicated limitations section. The Conclusions (Section 6) briefly mentions that the hard-routing comparison 'does not exhaust the space of possible router designs' and that the framework 'prioritizes understanding expert utilization... rather than computational efficiency,' but this is not a substantive dedicated section." 181 }, 182 "threats_to_validity_specific": { 183 "applies": true, 184 "answer": false, 185 "justification": "Specific concerns are raised throughout the paper (high variance, capacity bias vs functional expertise, task-dependent benefits), but these are embedded in the analysis rather than collected in a threats-to-validity discussion. No dedicated discussion of specific threats to the study's validity." 
186 }, 187 "scope_boundaries_stated": { 188 "applies": true, 189 "answer": false, 190 "justification": "The paper states 'hidden state collaboration should be applied selectively' but does not explicitly state what the results do NOT show. The conclusions mention future work directions but not specific negative scope boundaries." 191 } 192 }, 193 "data_integrity": { 194 "raw_data_available": { 195 "applies": true, 196 "answer": false, 197 "justification": "No raw experimental data, model outputs, or training logs are made available. Only aggregated results in tables and figures are presented." 198 }, 199 "data_collection_described": { 200 "applies": true, 201 "answer": true, 202 "justification": "Data sources are clearly described: Reasoning Gym procedurally generates problems from specified task families, and GSM8K is a standard benchmark. The appendix configs detail exact generation settings." 203 }, 204 "recruitment_methods_described": { 205 "applies": false, 206 "answer": false, 207 "justification": "No human participants. All data comes from standard benchmarks (Reasoning Gym, GSM8K)." 208 }, 209 "data_pipeline_documented": { 210 "applies": true, 211 "answer": true, 212 "justification": "The pipeline is documented: procedural generation with specified configs → GRPO training with stated hyperparameters → evaluation on held-out sets. Appendix YAML configs make the pipeline reproducible at a high level." 213 } 214 }, 215 "conflicts_of_interest": { 216 "funding_disclosed": { 217 "applies": true, 218 "answer": false, 219 "justification": "No funding acknowledgments, grants, or sponsor information is provided anywhere in the paper." 220 }, 221 "affiliations_disclosed": { 222 "applies": true, 223 "answer": true, 224 "justification": "Author affiliations are clearly listed: UC Irvine (majority), Intel Corporation (Mercati), West Point (Bastian), Northeastern University (Mahdi Imani). The Intel affiliation is disclosed prominently." 
225 }, 226 "funder_independent_of_outcome": { 227 "applies": true, 228 "answer": false, 229 "justification": "No funding is disclosed, so independence cannot be assessed. An Intel co-author raises potential conflict since Intel has commercial interest in efficient multi-model systems, but this is not discussed." 230 }, 231 "financial_interests_declared": { 232 "applies": true, 233 "answer": false, 234 "justification": "No competing interests or financial disclosure statement is present. Intel affiliation creates a potential conflict that is not addressed." 235 } 236 }, 237 "contamination": { 238 "training_cutoff_stated": { 239 "applies": true, 240 "answer": false, 241 "justification": "No training data cutoff dates are stated for any of the frozen expert models (Qwen-2.5, Llama-3.2, Phi-3.5, Gemma-2, Mistral-7B, GPT-2). All of these except GPT-2 (released in 2019, before GSM8K existed) could have seen GSM8K during pre-training." 242 }, 243 "train_test_overlap_discussed": { 244 "applies": true, 245 "answer": false, 246 "justification": "No discussion of whether the frozen expert models' pre-training data includes GSM8K test problems. Reasoning Gym's procedural generation mitigates contamination for those tasks, but this is not explicitly discussed." 247 }, 248 "benchmark_contamination_addressed": { 249 "applies": true, 250 "answer": false, 251 "justification": "GSM8K was published in 2021; every expert model except GPT-2 (2019) was trained after 2021 and may have seen it. The paper does not discuss contamination risk. Reasoning Gym's procedural generation inherently mitigates contamination but this advantage is not explicitly noted." 252 } 253 }, 254 "human_studies": { 255 "pre_registered": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants in this study." 259 }, 260 "irb_or_ethics_approval": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants in this study." 
264 }, 265 "demographics_reported": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants in this study." 269 }, 270 "inclusion_exclusion_criteria": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants in this study." 274 }, 275 "randomization_described": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants in this study." 279 }, 280 "blinding_described": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants in this study." 284 }, 285 "attrition_reported": { 286 "applies": false, 287 "answer": false, 288 "justification": "No human participants in this study." 289 } 290 }, 291 "cost_and_practicality": { 292 "inference_cost_reported": { 293 "applies": true, 294 "answer": false, 295 "justification": "No inference cost or latency is reported despite the method requiring forward passes through multiple frozen expert models per input, which adds significant computational overhead." 296 }, 297 "compute_budget_stated": { 298 "applies": true, 299 "answer": false, 300 "justification": "The paper mentions 'single NVIDIA H200 GPU' but does not quantify total GPU hours, training time, or cost for any experiment." 301 } 302 }, 303 "experimental_rigor": { 304 "seed_sensitivity_reported": { 305 "applies": true, 306 "answer": true, 307 "justification": "The paper states: 'Each experiment is repeated three times with different random seeds, and we report the mean and standard deviation.' Results across seeds are visible in Table 1 standard deviations." 308 }, 309 "number_of_runs_stated": { 310 "applies": true, 311 "answer": true, 312 "justification": "Explicitly stated in Section 4.1: 'Each experiment is repeated three times with different random seeds.'" 313 }, 314 "hyperparameter_search_budget": { 315 "applies": true, 316 "answer": false, 317 "justification": "No hyperparameter search budget is reported. 
The paper fixes key hyperparameters (m = 8, latent dim 512, LoRA rank 8) without describing how these values were selected or how many configurations were explored." 318 }, 319 "best_config_selection_justified": { 320 "applies": true, 321 "answer": false, 322 "justification": "The default configuration (m = 8 latent queries) is used without justification. Figure 6 shows m = 8 is optimal in a sweep, but this sweep was presumably done on the same evaluation data used for reporting, and no separate validation set for hyperparameter selection is mentioned." 323 }, 324 "multiple_comparison_correction": { 325 "applies": true, 326 "answer": false, 327 "justification": "Multiple comparisons are made across 4 tasks, 5+ methods, and several team configurations without any correction for multiple comparisons." 328 }, 329 "self_comparison_bias_addressed": { 330 "applies": true, 331 "answer": false, 332 "justification": "All baselines (Hard Routing, Output Collaboration) are the authors' own implementations. The paper does not acknowledge the bias of evaluating their own system against their own implementations of competing approaches." 333 }, 334 "compute_budget_vs_performance": { 335 "applies": true, 336 "answer": false, 337 "justification": "The method requires multiple frozen expert forward passes plus cross-attention per input. The paper acknowledges 'Not minimal compute' in Figure 2 but never reports performance as a function of compute budget or provides compute-matched comparisons." 338 }, 339 "benchmark_construct_validity": { 340 "applies": true, 341 "answer": false, 342 "justification": "No discussion of whether Reasoning Gym or GSM8K actually measures the 'reasoning' capabilities the paper claims to evaluate. The paper uses these benchmarks without questioning their construct validity." 343 }, 344 "scaffold_confound_addressed": { 345 "applies": false, 346 "answer": false, 347 "justification": "No agentic scaffolding is involved. 
All methods share the same policy model and RLVR training setup; they differ only in how expert context tokens are constructed." 348 } 349 }, 350 "data_leakage": { 351 "temporal_leakage_addressed": { 352 "applies": true, 353 "answer": false, 354 "justification": "Not discussed. GSM8K (2021) predates the training of every expert model except GPT-2 (2019); solutions may appear in the later models' training data. Reasoning Gym's procedural generation inherently avoids temporal leakage, but this is not explicitly noted." 355 }, 356 "feature_leakage_addressed": { 357 "applies": true, 358 "answer": false, 359 "justification": "Not discussed. The frozen experts see the same problem prompt as the policy, but whether this introduces any feature leakage through their hidden representations is not examined." 360 }, 361 "non_independence_addressed": { 362 "applies": true, 363 "answer": false, 364 "justification": "Not discussed. Reasoning Gym's procedural generation likely ensures independence, but this is not explicitly verified. For GSM8K, no analysis of train/test independence is provided." 365 }, 366 "leakage_detection_method": { 367 "applies": true, 368 "answer": false, 369 "justification": "No leakage detection or prevention method is applied. No canary strings, membership inference, or decontamination pipelines are mentioned." 370 } 371 } 372 }, 373 "claims": [ 374 { 375 "claim": "Soft hidden-state collaboration improves reasoning performance over single-model RLVR, with gains up to 22.9% in the best-performing case (Arithmetic).", 376 "evidence": "Table 1: Default Team achieves 75.26% on Arithmetic vs 52.34% for Single-model RLVR. 
However, performance degrades on Logic (82.81% vs 96.88%) and GSM8K (61.59% vs 64.32%).", 377 "supported": "moderate" 378 }, 379 { 380 "claim": "RLVR induces structured, task-dependent expert utilization without routing or expert-role supervision.", 381 "evidence": "Figure 4 shows routing entropy decreasing monotonically during training across tasks, with mean Spearman ρ = −0.48 between entropy and reward. Expert attention distributions shift from uniform to concentrated allocations.", 382 "supported": "strong" 383 }, 384 { 385 "claim": "A substantial fraction of performance gains can be attributed to prefix-style latent conditioning rather than fine-grained expert reasoning traces.", 386 "evidence": "Table 2: First-token pooling (75.13%) matches last-token pooling (75.26%) on Arithmetic, suggesting the method relies on coarse model signatures rather than prompt-specific reasoning. Section 4.4 states experts act as 'stable model signatures that regularize the policy.'", 387 "supported": "moderate" 388 }, 389 { 390 "claim": "The learned interface can suppress low-utility experts like GPT-2 and redistribute attention toward higher-capacity experts like GPT-OSS.", 391 "evidence": "Figure 4: GPT-2 is suppressed throughout training in the Naive LM Team; GPT-OSS receives pronounced attention redistribution in the LLM Team. Qualitative evidence from attention weight evolution.", 392 "supported": "moderate" 393 }, 394 { 395 "claim": "Hidden state collaboration is task-dependent: beneficial when the policy has remaining capacity for improvement, counterproductive when the task is near saturation.", 396 "evidence": "Table 1: Clear improvement on Arithmetic (52.34% → 75.26%), negligible on Algorithmic (51.56% → 51.82%), degradation on Logic (96.88% → 82.81%) and GSM8K (64.32% → 61.59%). 
Section 4.2 analysis.", 397 "supported": "strong" 398 } 399 ], 400 "red_flags": [ 401 { 402 "flag": "Best-case emphasis in abstract", 403 "detail": "The abstract highlights 'up to 22.9% improvement' from the best setting (Arithmetic, Default Team). Most other task-team combinations show marginal or negative effects. The word 'competitive' in the abstract is generous given GSM8K degradation." 404 }, 405 { 406 "flag": "Extreme variance suggesting training instability", 407 "detail": "Generalist Team on GSM8K shows 41.02 ± 29.01, meaning at least one of the three runs likely collapsed near zero. This extreme variance is acknowledged but not adequately explained or resolved." 408 }, 409 { 410 "flag": "No compute-matched comparison", 411 "detail": "The method runs multiple frozen expert forward passes per input in addition to the policy. Single-model RLVR uses only the policy. No experiment controls for the additional compute, so improvements on Arithmetic could partly reflect compute advantages." 412 }, 413 { 414 "flag": "Only 3 random seeds", 415 "detail": "With only 3 seeds and high run-to-run variance, the standard deviations have very wide confidence intervals themselves. Many differences between methods may not be statistically reliable." 416 }, 417 { 418 "flag": "Missing contamination analysis for GSM8K", 419 "detail": "GSM8K (2021) is one of the most widely contaminated benchmarks. All frozen expert models except GPT-2 (2019) were trained after 2021 and may encode GSM8K solutions in their hidden states. This confound is not discussed." 420 } 421 ], 422 "cited_papers": [ 423 { 424 "title": "Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning", 425 "authors": ["Daya Guo"], 426 "year": 2025, 427 "arxiv_id": "2501.12948", 428 "relevance": "Foundational work on RLVR for reasoning in LLMs, demonstrating that structured reasoning can be transferred to smaller models through RL and distillation." 
429 }, 430 { 431 "title": "Phi-4 technical report", 432 "authors": ["Marah Abdin"], 433 "year": 2024, 434 "arxiv_id": "2412.08905", 435 "relevance": "Demonstrates that small language models can reach or surpass GPT-3.5-class reasoning performance, supporting the modular SLM paradigm." 436 }, 437 { 438 "title": "AutoGen: Enabling next-gen LLM applications via multi-agent conversations", 439 "authors": ["Qingyun Wu"], 440 "year": 2024, 441 "relevance": "Multi-agent LLM conversation framework representing the output-level collaboration paradigm that this paper compares against." 442 }, 443 { 444 "title": "MetaGPT: Meta programming for a multi-agent collaborative framework", 445 "authors": ["Sirui Hong"], 446 "year": 2024, 447 "relevance": "Multi-agent collaborative framework for software development, representing agentic AI coordination approaches." 448 }, 449 { 450 "title": "Improving factuality and reasoning in language models through multiagent debate", 451 "authors": ["Yilun Du"], 452 "year": 2023, 453 "relevance": "Foundational work on multi-LLM debate for reasoning improvement, a key baseline paradigm for multi-model collaboration." 454 }, 455 { 456 "title": "Reasoning gym: Reasoning environments for reinforcement learning with verifiable rewards", 457 "authors": ["Zafir Stojanovski"], 458 "year": 2025, 459 "relevance": "Primary benchmark framework used in this paper for evaluating RLVR-based reasoning, with procedurally generated tasks." 460 }, 461 { 462 "title": "Training verifiers to solve math word problems", 463 "authors": ["Karl Cobbe"], 464 "year": 2021, 465 "arxiv_id": "2110.14168", 466 "relevance": "GSM8K benchmark paper, a standard mathematical reasoning evaluation used as a secondary benchmark in this work." 
467 }, 468 { 469 "title": "LoRA: Low-rank adaptation of large language models", 470 "authors": ["Edward J Hu"], 471 "year": 2022, 472 "relevance": "Parameter-efficient fine-tuning method used for the trainable policy in this work, central to the lightweight adapter approach." 473 }, 474 { 475 "title": "Small language models are the future of agentic ai", 476 "authors": ["Peter Belcak"], 477 "year": 2025, 478 "arxiv_id": "2506.02153", 479 "relevance": "Argues for composing collections of off-the-shelf expert SLMs rather than scaling monolithic LLMs, directly motivating this paper's approach." 480 }, 481 { 482 "title": "Router-r1: Teaching LLMs multi-round routing and aggregation via reinforcement learning", 483 "authors": ["Haozhen Zhang"], 484 "year": 2025, 485 "relevance": "RL-based LLM routing approach representing the hard expert routing paradigm that this paper compares against." 486 }, 487 { 488 "title": "DAPO: An open-source LLM reinforcement learning system at scale", 489 "authors": ["Qiying Yu"], 490 "year": 2025, 491 "relevance": "Open-source RL system for LLM training; DAPO is one of the RL algorithms the paper's RLVR framework generalizes over." 492 }, 493 { 494 "title": "Does reinforcement learning really incentivize reasoning capacity in LLMs beyond the base model?", 495 "authors": ["Yang Yue"], 496 "year": 2025, 497 "relevance": "Questions whether RLVR induces new reasoning capabilities or merely reshapes existing behaviors, providing important context for interpreting this paper's claims." 498 } 499 ] 500 }