scan.json (35019B)
1 { 2 "paper": { 3 "title": "MagicAgent: Towards Generalized Agent Planning", 4 "authors": [ 5 "Xuhui Ren", 6 "Shaokang Dong", 7 "Chen Yang", 8 "Qing Gao", 9 "Yunbin Zhao", 10 "Yongsheng Liu", 11 "Xinwei Geng", 12 "Xiang Li", 13 "Demei Yan", 14 "Yanqing Li", 15 "Chenhao Huang", 16 "Dingwei Zhu", 17 "Junjie Ye", 18 "Boxuan Yue", 19 "Yingnan Fu", 20 "Mengzhe Lv", 21 "Zezeng Feng", 22 "Boshen Zhou", 23 "Bocheng Wang", 24 "Xuanjing Huang", 25 "Yu-Gang Jiang", 26 "Tao Gui", 27 "Qi Zhang", 28 "Yunke Zhang" 29 ], 30 "year": 2026, 31 "venue": "arXiv", 32 "arxiv_id": "2602.19000" 33 }, 34 "scan_version": 2, 35 "active_modules": ["experimental_rigor", "data_leakage"], 36 "methodology_tags": ["benchmark-eval"], 37 "key_findings": "MagicAgent introduces a lightweight synthetic data framework for diverse planning tasks and a two-stage SFT+RL training paradigm to address multi-task interference. MagicAgent-32B and MagicAgent-30B-A3B achieve strong results across five public benchmarks (Worfbench, NaturalPlan, τ2-Bench, BFCL-v3, ACEBench), outperforming many sub-100B and several ultra-scale models. The proposed χPO online RL algorithm matches or exceeds prior methods on ALFWorld with a smaller model (Qwen2.5-3B). A global load-balancing strategy with z-loss stabilizes MoE training for heterogeneous agent tasks.", 38 "checklist": { 39 "artifacts": { 40 "code_released": { 41 "applies": true, 42 "answer": false, 43 "justification": "The paper states 'The MagicAgent series models will be released soon' (footnote, page 1). This is a promise of future release, not an actual release. No repository URL is provided." 44 }, 45 "data_released": { 46 "applies": true, 47 "answer": false, 48 "justification": "The synthetic training data pipeline is described in detail (Sections 3.1-3.5) but the actual generated training data is not released. The in-house MagicEval benchmarks are proprietary. Evaluation uses public benchmarks (NaturalPlan, Worfbench, etc.) but the training data is not shared." 
49 }, 50 "environment_specified": { 51 "applies": true, 52 "answer": false, 53 "justification": "Table 8 lists training hyperparameters (optimizer, learning rate, bf16 precision) and mentions 32 GPUs (4 machines × 8 GPUs); Table 7 reports inference on NVIDIA A800 GPUs. However, no requirements.txt, Dockerfile, or detailed software dependency list is provided — not enough detail to recreate the environment." 54 }, 55 "reproduction_instructions": { 56 "applies": true, 57 "answer": false, 58 "justification": "No reproduction instructions, README, or scripts to replicate experiments. The paper describes the methodology but does not provide step-by-step instructions for reproducing results." 59 } 60 }, 61 "statistical_methodology": { 62 "confidence_intervals_or_error_bars": { 63 "applies": true, 64 "answer": false, 65 "justification": "Main results in Tables 2, 3, and 4 report point estimates only (e.g., '80.3%', '69.7%') with no confidence intervals or error bars. Even the ALFWorld results (Table 5), which use 5 seeds, only report Succ.* (best) and Succ. (average) without CIs." 66 }, 67 "significance_tests": { 68 "applies": true, 69 "answer": false, 70 "justification": "No statistical significance tests are reported anywhere in the paper. Claims like 'substantially outperforming' and 'surpassing GPT-5.2 by approximately 19.5%' are based solely on comparing raw numbers without any statistical test." 71 }, 72 "effect_sizes_reported": { 73 "applies": true, 74 "answer": true, 75 "justification": "The paper reports percentage improvements with baseline context throughout: e.g., '27.0% relative improvement over the base Qwen3-32B model (48.8%)' (Section 6.1.2), 'reduction of 28.9% to 59.6% in Average TTFT' (Section 6.4, Table 7). Tables provide all baseline numbers for comparison." 76 }, 77 "sample_size_justified": { 78 "applies": true, 79 "answer": false, 80 "justification": "No justification for benchmark sizes, number of training examples, or number of experimental runs. 
The sizes of the synthetic training datasets are not stated, nor is there any power analysis or discussion of whether the benchmark sizes are adequate for the claims." 81 }, 82 "variance_reported": { 83 "applies": true, 84 "answer": false, 85 "justification": "Main results (Tables 2-4) appear to be single-run numbers with no variance reported. For ALFWorld (Table 5), they state '5 random seeds' but report only Succ.* (mean of best) and Succ. (average after convergence) without standard deviation, IQR, or any spread measure." 86 } 87 }, 88 "evaluation_design": { 89 "baselines_included": { 90 "applies": true, 91 "answer": true, 92 "justification": "Extensive baselines in Tables 2-5: ultra-scale models (GPT-5.2, Kimi-K2, DeepSeek-V3.1, GLM-4.7, Qwen3-235B, Qwen3-MAX) and large-scale models (Qwen3-32B, Llama3.3-70B, ERNIE-4.5, Olmo-3.1, GLM-4.7-Flash). For ALFWorld, additional RL baselines (AgentGym, SFT, GRPO, GiGPO, RLVMR, EPO)." 93 }, 94 "baselines_contemporary": { 95 "applies": true, 96 "answer": true, 97 "justification": "Baselines include GPT-5.2 (2025), DeepSeek-V3.1, Kimi-K2, GLM-4.7, Qwen3 series — all state-of-the-art models contemporary with the paper's 2026 publication date." 98 }, 99 "ablation_study": { 100 "applies": true, 101 "answer": true, 102 "justification": "Table 6 presents ablation of MoE optimization strategies (z-loss, global-batch LBL, micro-batch LBL, variance loss, expert capacity). Section 6.3 compares χPO components. Figure 8 analyzes entropy dynamics. Both dense and MoE variants are compared." 103 }, 104 "multiple_metrics": { 105 "applies": true, 106 "answer": true, 107 "justification": "Multiple metrics across benchmarks: F1 Chain and F1 Graph (Worfbench), accuracy per subtask (NaturalPlan: Trip/Meeting/Calendar), domain-specific accuracy (τ2-Bench: Retail/Airline), Live/Non-Live accuracy (BFCL-v3), English/Chinese accuracy (ACEBench), plus Step/Embedding/LLM metrics for MagicEval-Plan and Tool Name/Argument accuracy for MagicEval-Tool." 
108 }, 109 "human_evaluation": { 110 "applies": true, 111 "answer": false, 112 "justification": "All evaluation is fully automated using benchmark metrics and automated judges (GPT-5.2 as LLM-as-a-Judge for MagicEval-Plan). No human evaluation of model outputs is included." 113 }, 114 "held_out_test_set": { 115 "applies": true, 116 "answer": true, 117 "justification": "Evaluations follow official benchmark protocols. ALFWorld explicitly uses separate 'seen' and 'unseen' validation splits (Section 6.3). Public benchmarks (NaturalPlan, Worfbench, BFCL-v3, ACEBench, τ2-Bench) have their own test splits, and the paper states 'we strictly followed the official instructions' (Section 6.1.1)." 118 }, 119 "per_category_breakdown": { 120 "applies": true, 121 "answer": true, 122 "justification": "Detailed breakdowns: NaturalPlan by Trip/Meeting/Calendar, τ2-Bench by Retail/Airline, BFCL-v3 by Live/Non-Live, ACEBench by English/Chinese, MagicEval by General/Dependency/Condition/Context Inheritance. Tables 2-4 provide per-subtask numbers." 123 }, 124 "failure_cases_discussed": { 125 "applies": true, 126 "answer": false, 127 "justification": "No failure case analysis or error analysis is presented. The paper discusses training dynamics (Figure 8) and ablation results but never examines what types of planning tasks MagicAgent fails on or why." 128 }, 129 "negative_results_reported": { 130 "applies": true, 131 "answer": true, 132 "justification": "Table 6 shows ablation configurations that fail to maintain balanced performance: 'variance loss and sequence-level LBL exhibit suboptimal performance on τ2-Bench: retail and airline scores are mostly below 45' (Section 6.4). 
The paper explicitly notes that 'most mainstream training tricks fail to maintain balanced performance.'" 133 } 134 }, 135 "claims_and_evidence": { 136 "abstract_claims_supported": { 137 "applies": true, 138 "answer": true, 139 "justification": "Abstract claims (75.1% Worfbench, 55.9% NaturalPlan, 57.5% τ2-Bench, 86.9% BFCL-v3, 81.2% ACEBench) match the numbers derived from Table 2 to within rounding (e.g., Worfbench: average of F1 Chain 80.3 and F1 Graph 69.7 = 75.0 vs. the claimed 75.1; NaturalPlan: average of Trip/Meeting/Calendar ≈ 55.9). Claims of outperforming sub-100B models are supported by Table 2 comparisons." 140 }, 141 "causal_claims_justified": { 142 "applies": true, 143 "answer": true, 144 "justification": "The paper makes causal claims about training components ('our planning-centric training can effectively enhance the performance'). Table 6 provides controlled ablation studies (single-variable manipulation of training strategies with identical data/hyperparameters), which constitutes adequate causal justification for component-level claims." 145 }, 146 "generalization_bounded": { 147 "applies": true, 148 "answer": false, 149 "justification": "The title claims 'Towards Generalized Agent Planning' but results are limited to five specific benchmarks. The abstract claims models 'substantially outperform existing sub-100B models' broadly, without bounding this to the tested benchmarks. No discussion of what planning domains or task types were NOT tested." 150 }, 151 "alternative_explanations_discussed": { 152 "applies": true, 153 "answer": false, 154 "justification": "No discussion of alternative explanations for the results. The improvements could be due to the larger synthetic training data volume rather than the proposed training methodology, or due to data similarity to benchmark formats. No threats-to-validity or confound analysis is provided." 
155 }, 156 "proxy_outcome_distinction": { 157 "applies": true, 158 "answer": false, 159 "justification": "The paper measures accuracy on specific benchmarks (Worfbench F1, NaturalPlan accuracy, etc.) but frames results as 'generalized agent planning' capability. The gap between benchmark performance and actual generalized planning ability is not discussed. No acknowledgment that benchmark accuracy may not reflect real-world planning effectiveness." 160 } 161 }, 162 "setup_transparency": { 163 "model_versions_specified": { 164 "applies": true, 165 "answer": false, 166 "justification": "The paper uses marketing names without version snapshots or API versions: 'GPT-5.2', 'Kimi-K2-Instruct', 'DeepSeek-V3.1', 'GLM-4.7', 'Qwen3-32B', 'Qwen3-30B-A3B-Instruct-2507'. While some model names include date suffixes (e.g., '-2507'), exact model snapshot IDs or API versions are not provided." 167 }, 168 "prompts_provided": { 169 "applies": true, 170 "answer": false, 171 "justification": "System prompts and training prompts are described in natural language (e.g., 'heterogeneous, task-specific prompts to further distinguish tasks') but never shown in full. Appendices C-G show data format examples, not the actual prompts used for data generation or model inference." 172 }, 173 "hyperparameters_reported": { 174 "applies": true, 175 "answer": true, 176 "justification": "Table 8 provides comprehensive training hyperparameters: learning rates (1e-5 SFT, 1e-6 RL), optimizer settings (Adam β1=0.9, β2=0.95), generation parameters (temperature=0.9, top_p=0.96, top_k=20), batch sizes, parallelism configuration, MoE loss coefficients, and more." 177 }, 178 "scaffolding_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "The multi-agent data synthesis pipeline is described in detail: Reasoning Agent, Planner, Simulator Agent roles (Section 3.2, Figure 3). The ReAct paradigm for inference is described (Section 3.2). 
State transition mechanisms and workflow are documented with figures (Figures 2-6)." 182 }, 183 "data_preprocessing_documented": { 184 "applies": true, 185 "answer": true, 186 "justification": "Data preprocessing is documented: SimHash deduplication (Section 3.1), schema validation enforcing strict structural format, semantic verification via GPT-5.2, negative sampling with distractor tools (Section 3.2), rejection sampling for constraint scheduling (Section 3.3), DAG verification (Section 3.4), NovelSum metric for sample selection (Section 4.1)." 187 } 188 }, 189 "limitations_and_scope": { 190 "limitations_section_present": { 191 "applies": true, 192 "answer": false, 193 "justification": "No dedicated limitations section exists in the paper. The conclusion (Section 7) mentions future work ('advancing long-horizon reasoning and code-augmented task completion') but does not discuss limitations of the current approach." 194 }, 195 "threats_to_validity_specific": { 196 "applies": true, 197 "answer": false, 198 "justification": "No threats to validity are discussed anywhere in the paper. There is no consideration of specific methodological limitations, potential confounds, or scenarios where the approach might fail." 199 }, 200 "scope_boundaries_stated": { 201 "applies": true, 202 "answer": false, 203 "justification": "No explicit scope boundaries are stated. The paper does not specify what planning tasks, domains, or scenarios were NOT tested, nor what claims the authors are NOT making. The broad title ('Generalized Agent Planning') is not bounded by explicit caveats." 204 } 205 }, 206 "data_integrity": { 207 "raw_data_available": { 208 "applies": true, 209 "answer": false, 210 "justification": "No raw training data, synthetic datasets, or MagicEval benchmark data is released. Only processed results are shown in tables. The synthetic data generation process is described but the actual data is not available for verification." 
211 }, 212 "data_collection_described": { 213 "applies": true, 214 "answer": true, 215 "justification": "Sections 3.1-3.5 describe data collection in detail: 5,000+ tools aggregated from open-source APIs and internal repository (Section 3.1), tool dependency graphs constructed, atomic plan synthesis pipeline, constraint satisfaction framework for scheduling data (Section 3.3), and long-horizon trajectory generation process (Section 3.5)." 216 }, 217 "recruitment_methods_described": { 218 "applies": true, 219 "answer": false, 220 "justification": "For MagicEval benchmarks, the paper states data was collected from 'actual user interaction logs from our production environment' (Section 6.2.1) with PII removed, but does not describe what users, how many, what time period, or whether this introduces selection bias. For training data, the sampling of 5,000+ tools is described only as 'from open-source APIs and our internal repository' without detailing the selection process." 221 }, 222 "data_pipeline_documented": { 223 "applies": true, 224 "answer": false, 225 "justification": "The pipeline stages are described (tool curation → graph construction → atomic synthesis → trajectory composition → filtering), but counts at each filtering stage are not provided. The paper does not state how many examples were generated, how many passed each filter, or how many were retained in the final training set." 226 } 227 }, 228 "conflicts_of_interest": { 229 "funding_disclosed": { 230 "applies": true, 231 "answer": false, 232 "justification": "No funding section, acknowledgments, or grant information is provided in the paper. The work appears to be funded by Honor Device Co. (corporate employer of 16 of 24 authors) but this is not explicitly disclosed as funding." 233 }, 234 "affiliations_disclosed": { 235 "applies": true, 236 "answer": true, 237 "justification": "Author affiliations are clearly listed: Honor Device Co., Ltd (superscript 1) and Fudan University (superscript 2). 
The corporate affiliation is transparent on the first page." 238 }, 239 "funder_independent_of_outcome": { 240 "applies": true, 241 "answer": false, 242 "justification": "Honor Device Co. is both the employer of the majority of authors and the company deploying MagicAgent in 'Honor's intelligent assistant' (Section 7). The company has a direct financial interest in demonstrating superior performance of their model. This is a clear non-independent funder." 243 }, 244 "financial_interests_declared": { 245 "applies": true, 246 "answer": false, 247 "justification": "No competing interests or financial interests statement is included in the paper. No patent disclosures or equity declarations despite the model being commercially deployed by Honor." 248 } 249 }, 250 "contamination": { 251 "training_cutoff_stated": { 252 "applies": true, 253 "answer": false, 254 "justification": "The base model is described as 'Qwen3 series' (Section 4.1) but no training data cutoff date is stated for either the Qwen3 base model or the synthetic training data. It is unknown whether the base model's training data includes benchmark examples." 255 }, 256 "train_test_overlap_discussed": { 257 "applies": true, 258 "answer": false, 259 "justification": "No discussion of whether any of the evaluation benchmarks (NaturalPlan, Worfbench, BFCL-v3, ACEBench, τ2-Bench) or their constituent problems appeared in the Qwen3 base model's training data or in the synthetic training data." 260 }, 261 "benchmark_contamination_addressed": { 262 "applies": true, 263 "answer": false, 264 "justification": "All public evaluation benchmarks (everything except the in-house MagicEval sets) are openly available and predate the paper. No contamination analysis is performed. For example, NaturalPlan (2024), Worfbench (2025), and ALFWorld (2020) could plausibly be in Qwen3's training data, but this is never addressed." 
265 } 266 }, 267 "human_studies": { 268 "pre_registered": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in this study. All experiments involve model evaluation on automated benchmarks." 272 }, 273 "irb_or_ethics_approval": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants. The MagicEval data uses anonymized production logs but this is not a human subjects study." 277 }, 278 "demographics_reported": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants in experiments." 282 }, 283 "inclusion_exclusion_criteria": { 284 "applies": false, 285 "answer": false, 286 "justification": "No human participants in experiments." 287 }, 288 "randomization_described": { 289 "applies": false, 290 "answer": false, 291 "justification": "No human participants in experiments." 292 }, 293 "blinding_described": { 294 "applies": false, 295 "answer": false, 296 "justification": "No human participants in experiments." 297 }, 298 "attrition_reported": { 299 "applies": false, 300 "answer": false, 301 "justification": "No human participants in experiments." 302 } 303 }, 304 "cost_and_practicality": { 305 "inference_cost_reported": { 306 "applies": true, 307 "answer": true, 308 "justification": "Table 7 reports inference efficiency metrics: Average TTFT (time to first token), Average TPOT (time per output token), and Average Latency across concurrency levels 1/10/50/100, comparing dense vs MoE models on 4× NVIDIA A800 GPUs with 1024 input/output tokens." 309 }, 310 "compute_budget_stated": { 311 "applies": true, 312 "answer": false, 313 "justification": "Table 8 mentions 32 GPUs (4 machines × 8 GPUs) and A800 GPUs for inference, but total GPU hours, training wall-clock time, and total computational cost are not stated. The training budget for the synthetic data generation (LLM calls to GPT-5.2 for verification) is also unstated." 
314 } 315 }, 316 "experimental_rigor": { 317 "seed_sensitivity_reported": { 318 "applies": true, 319 "answer": false, 320 "justification": "For ALFWorld (Section 6.3), experiments use 5 random seeds and report Succ.* and Succ. metrics. However, main benchmark results (Tables 2-4) do not report any seed sensitivity. It is unclear whether main results are single-run or averaged, and no standard deviations are provided." 321 }, 322 "number_of_runs_stated": { 323 "applies": true, 324 "answer": false, 325 "justification": "ALFWorld experiments explicitly state '5 random seeds' (Section 6.3). However, for the main benchmark evaluations (Tables 2-4), the number of runs is never stated. It is unclear whether results are from a single run or averaged." 326 }, 327 "hyperparameter_search_budget": { 328 "applies": true, 329 "answer": false, 330 "justification": "Table 8 lists final hyperparameters but does not describe any search process. No information on how many configurations were tried, what search method was used, or total compute spent on hyperparameter tuning." 331 }, 332 "best_config_selection_justified": { 333 "applies": true, 334 "answer": false, 335 "justification": "The final training configuration is presented without justification for how it was selected. The MoE ablation (Table 6) compares strategies but doesn't describe how the final joint configuration was arrived at or validated." 336 }, 337 "multiple_comparison_correction": { 338 "applies": true, 339 "answer": false, 340 "justification": "No statistical tests are performed at all, so no multiple comparison correction is possible. The paper makes many comparisons across models, benchmarks, and conditions without any statistical testing framework." 341 }, 342 "self_comparison_bias_addressed": { 343 "applies": true, 344 "answer": false, 345 "justification": "The authors (Honor employees) evaluate their own MagicAgent system against baselines using their own evaluation code and in-house benchmarks. 
No acknowledgment of author-evaluation bias or self-comparison bias. Baseline results appear to use the authors' own evaluation runs." 346 }, 347 "compute_budget_vs_performance": { 348 "applies": true, 349 "answer": false, 350 "justification": "MagicAgent-32B is compared against ultra-scale models like GPT-5.2 and Qwen3-235B without discussing compute budget differences. While Table 7 compares dense vs MoE inference efficiency, no performance-vs-compute analysis is provided for the main results." 351 }, 352 "benchmark_construct_validity": { 353 "applies": true, 354 "answer": false, 355 "justification": "No discussion of whether the selected benchmarks actually measure 'generalized agent planning' as claimed. The paper does not question whether Worfbench F1, NaturalPlan accuracy, or BFCL-v3 scores are valid proxies for the claimed capability of generalized planning." 356 }, 357 "scaffold_confound_addressed": { 358 "applies": true, 359 "answer": false, 360 "justification": "While the paper uses official evaluation protocols for all benchmarks, it does not explicitly discuss scaffolding as a variable. For τ2-Bench, it notes that GLM uses 'an optimized user simulator' but does not analyze whether different evaluation setups affect comparability. The scaffold confound is not treated as an explicit factor." 361 } 362 }, 363 "data_leakage": { 364 "temporal_leakage_addressed": { 365 "applies": true, 366 "answer": false, 367 "justification": "No discussion of temporal leakage. The Qwen3 base model's training data likely includes published benchmarks (ALFWorld 2020, NaturalPlan 2024, Worfbench 2025), but the temporal relationship between training data and benchmarks is never analyzed." 368 }, 369 "feature_leakage_addressed": { 370 "applies": true, 371 "answer": false, 372 "justification": "No discussion of feature leakage. 
The paper does not analyze whether the synthetic training data generation process (which uses benchmark-style task formulations) could leak structural information about the evaluation benchmarks." 373 }, 374 "non_independence_addressed": { 375 "applies": true, 376 "answer": false, 377 "justification": "No discussion of whether training and test data share structural similarities. The synthetic data uses similar task types (tool calling, scheduling, workflow orchestration) to the evaluation benchmarks, creating a potential overlap in task distributions, but this is not addressed." 378 }, 379 "leakage_detection_method": { 380 "applies": true, 381 "answer": false, 382 "justification": "No concrete leakage detection or prevention method is used. No canary strings, n-gram overlap analysis, membership inference tests, or decontamination pipelines are mentioned." 383 } 384 } 385 }, 386 "claims": [ 387 { 388 "claim": "MagicAgent-32B achieves state-of-the-art among sub-100B models with 75.1% on Worfbench, 55.9% on NaturalPlan, 57.5% on τ2-Bench, 86.9% on BFCL-v3, and 81.2% on ACEBench, surpassing several closed-source ultra-scale models.", 389 "evidence": "Table 2 shows MagicAgent-32B outperforming all large-scale models and several ultra-scale models (GPT-5.2, DeepSeek-V3.1) across these benchmarks. Specific numbers: Worfbench F1 Chain 80.3%/F1 Graph 69.7%, NaturalPlan Trip 48.6%/Meeting 57.7%/Calendar 61.5%.", 390 "supported": "moderate" 391 }, 392 { 393 "claim": "The two-stage SFT+RL training paradigm mitigates the 'seesaw effect' in multi-task learning, enabling synergistic utilization of heterogeneous planning data.", 394 "evidence": "Section 4 describes the paradigm conceptually. Table 6 ablates MoE optimization strategies, showing that joint optimization (LBLgbl + z_loss) achieves the best balanced performance. 
However, no direct measurement of the seesaw effect (e.g., showing per-task performance degradation without the proposed method) is provided.", 395 "supported": "weak" 396 }, 397 { 398 "claim": "χPO achieves state-of-the-art on ALFWorld, matching or exceeding larger models with Qwen2.5-3B: 91.7% IID Succ.*, 91.8% OOD Succ.*, 78.2% IID Succ., 75.9% OOD Succ.", 399 "evidence": "Table 5 shows χPO with Qwen2.5-3B matching EPO's IID Succ.* (91.7%) while exceeding it on OOD Succ.* (91.8% vs 89.6%), IID Succ. (78.2% vs 75.8%), and OOD Succ. (75.9% vs 75.4%). Results are averaged over 5 random seeds.", 400 "supported": "moderate" 401 }, 402 { 403 "claim": "Global batch load balancing combined with z-loss achieves the best balanced MoE training for heterogeneous agent tasks.", 404 "evidence": "Table 6 compares seven strategies. BP + LBLgbl + z_loss achieves the best τ2-Bench performance (52.6% retail, 52.5% airline) with competitive Worfbench/NaturalPlan scores. Figure 9 shows MaxVio stabilization. Figure 10 shows task-specific expert specialization.", 405 "supported": "moderate" 406 }, 407 { 408 "claim": "MagicAgent achieves 98.0% step accuracy, 97.7% tool name accuracy, and 87.3% tool argument accuracy on in-house MagicEval benchmarks.", 409 "evidence": "Tables 3 and 4 show these results under the General scenario. MagicAgent-32B substantially outperforms all baselines across all MagicEval settings (General, Dependency, Condition, Context Inheritance).", 410 "supported": "weak" 411 }, 412 { 413 "claim": "The MoE variant (MagicAgent-30B-A3B) reduces inference latency by up to 54.7% compared to the dense model (MagicAgent-32B) while maintaining competitive performance.", 414 "evidence": "Table 7 shows latency reductions across concurrency levels: 54.7% at concurrency=1, 39.9% at 10, 35.8% at 50, 40.6% at 100. 
Table 2 shows MagicAgent-30B-A3B achieves comparable benchmark scores to MagicAgent-32B.", 415 "supported": "moderate" 416 } 417 ], 418 "red_flags": [ 419 { 420 "flag": "Company evaluating its own product", 421 "detail": "Honor Device Co. employees (16 of 24 authors) develop and evaluate MagicAgent, which is deployed in 'Honor's intelligent assistant' (Section 7). The company has a direct commercial interest in demonstrating superior performance. No conflicts of interest statement is provided." 422 }, 423 { 424 "flag": "No error bars on main results", 425 "detail": "Tables 2, 3, and 4 report point estimates only with no uncertainty quantification. It is unclear whether results are single-run or averaged. Without variance information, it is impossible to assess whether performance differences are meaningful." 426 }, 427 { 428 "flag": "Proprietary in-house benchmarks", 429 "detail": "MagicEval-Plan and MagicEval-Tool benchmarks (Tables 3-4) are proprietary and cannot be independently verified or reproduced. The paper shows near-perfect scores (98%) on these benchmarks while competitors score much lower, but external validation is impossible." 430 }, 431 { 432 "flag": "Models and data not released", 433 "detail": "Both the trained models and synthetic training data are not released. The paper only promises future model release ('will be released soon'). Without access to models or data, no claims can be independently verified." 434 }, 435 { 436 "flag": "No limitations section", 437 "detail": "The paper lacks any limitations section, threats to validity, or discussion of scenarios where the approach might fail. For a paper making broad claims about 'generalized agent planning,' the absence of scope boundaries is concerning." 438 }, 439 { 440 "flag": "Missing contamination analysis", 441 "detail": "The Qwen3 base model's training data likely includes several evaluation benchmarks (ALFWorld published 2020, NaturalPlan 2024). 
The synthetic training data uses similar task formats to the benchmarks. No decontamination or overlap analysis is performed." 442 }, 443 { 444 "flag": "Cherry-picked baseline comparisons", 445 "detail": "Several baseline entries in Table 2 show suspiciously poor performance (DeepSeek-V3.1-nothink at 1.4% on NaturalPlan Trip, Qwen3-MAX at 1.2%) suggesting the 'nothink' mode may be disadvantageous. Baselines are run with thinking disabled while MagicAgent may benefit from its specialized training." 446 } 447 ], 448 "cited_papers": [ 449 { 450 "title": "React: Synergizing reasoning and acting in language models", 451 "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu", "Nan Du", "Izhak Shafran", "Karthik R Narasimhan", "Yuan Cao"], 452 "year": 2022, 453 "relevance": "Foundational agent framework (Reasoning + Acting) used as the inference paradigm in MagicAgent and as a baseline in ALFWorld experiments." 454 }, 455 { 456 "title": "ToolLLM: Facilitating large language models to master 16000+ real-world APIs", 457 "authors": ["Yujia Qin", "Shihao Liang", "Yining Ye"], 458 "year": 2023, 459 "arxiv_id": "2307.16789", 460 "relevance": "Large-scale tool-learning dataset using real-world APIs with depth-first search trajectory collection; directly relevant to tool-augmented planning." 461 }, 462 { 463 "title": "AgentTuning: Enabling generalized agent abilities for LLMs", 464 "authors": ["Aohan Zeng", "Mingdao Liu", "Rui Lu"], 465 "year": 2024, 466 "relevance": "SFT-based approach for adapting LLMs to agentic tasks; a key prior work in agent model fine-tuning." 467 }, 468 { 469 "title": "Agent-FLAN: Designing data and methods of effective agent tuning for large language models", 470 "authors": ["Zehui Chen", "Kuikun Liu", "Qiuchen Wang"], 471 "year": 2024, 472 "arxiv_id": "2403.12881", 473 "relevance": "Agent fine-tuning methodology using curated instruction corpora and interaction trajectories." 
474 }, 475 { 476 "title": "AgentGym: Evolving large language model-based agents across diverse environments", 477 "authors": ["Z. Xi", "Y. Ding", "W. Chen"], 478 "year": 2024, 479 "arxiv_id": "2406.04151", 480 "relevance": "RL-based agent optimization framework used as a baseline in ALFWorld experiments." 481 }, 482 { 483 "title": "WebRL: Training LLM web agents via self-evolving online curriculum reinforcement learning", 484 "authors": ["Zehan Qi", "Xiao Liu"], 485 "year": 2024, 486 "arxiv_id": "2411.02337", 487 "relevance": "Self-supervised RL method for web agents; relevant to online RL training methodology." 488 }, 489 { 490 "title": "Natural plan: Benchmarking LLMs on natural language planning", 491 "authors": ["Huaixiu Steven Zheng", "Swaroop Mishra", "Hugh Zhang"], 492 "year": 2024, 493 "arxiv_id": "2406.04520", 494 "relevance": "Multi-constraint scheduling benchmark (Trip, Meeting, Calendar) used as primary evaluation in this paper." 495 }, 496 { 497 "title": "Benchmarking agentic workflow generation", 498 "authors": ["Shuofei Qiao", "Runnan Fang", "Zhisong Qiu"], 499 "year": 2025, 500 "arxiv_id": "2410.07869", 501 "relevance": "WorfBench benchmark for procedural logic orchestration; used as primary evaluation benchmark in this paper." 502 }, 503 { 504 "title": "τ2-bench: Evaluating conversational agents in a dual-control environment", 505 "authors": ["Victor Barres", "Honghua Dong", "Soham Ray", "Xujie Si", "Karthik Narasimhan"], 506 "year": 2025, 507 "arxiv_id": "2506.07982", 508 "relevance": "Long-horizon tool execution benchmark (Retail/Airline domains) used as primary evaluation in this paper." 509 }, 510 { 511 "title": "DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning", 512 "authors": ["Daya Guo", "Dejian Yang", "Haowei Zhang"], 513 "year": 2025, 514 "arxiv_id": "2501.12948", 515 "relevance": "Reinforcement learning with verifiable rewards methodology; foundational approach for the RLVR design in MagicAgent." 
516 }, 517 { 518 "title": "EPO: Entropy-regularized policy optimization for LLM agents reinforcement learning", 519 "authors": ["Wujiang Xu", "Wentian Zhao", "Zhenting Wang"], 520 "year": 2025, 521 "arxiv_id": "2509.22576", 522 "relevance": "Entropy-regularized RL algorithm for agents; direct competitor and inspiration for the proposed χPO algorithm. Used as primary baseline on ALFWorld." 523 }, 524 { 525 "title": "MagNet: Multi-turn tool-use data synthesis and distillation via graph translation", 526 "authors": ["Fan Yin", "Zifeng Wang", "I-Hung Hsu"], 527 "year": 2025, 528 "arxiv_id": "2503.07826", 529 "relevance": "Bidirectional translation methodology for tool-augmented data synthesis; directly used in MagicAgent's data generation pipeline." 530 }, 531 { 532 "title": "ALFWorld: Aligning text and embodied environments for interactive learning", 533 "authors": ["Mohit Shridhar", "Xingdi Yuan", "Marc-Alexandre Côté"], 534 "year": 2020, 535 "relevance": "Interactive household task environment used for online RL evaluation of χPO algorithm." 536 } 537 ] 538 }