scan-v5.json (29308B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "MagicAgent: Towards Generalized Agent Planning", 6 "authors": [ 7 "Xuhui Ren", 8 "Shaokang Dong", 9 "Cheng Yang", 10 "Qingying Gao", 11 "Yunbin Zhao", 12 "Yongsheng Liu", 13 "Xinwei Geng", 14 "Xiang Li", 15 "Demei Yan", 16 "Yanqing Li", 17 "Chenhao Huang", 18 "Dingwei Zhu", 19 "Junjie Ye", 20 "Boxuan Yue", 21 "Yingnan Fu", 22 "Mengzhe Lv", 23 "Zezeng Feng", 24 "Boshen Zhou", 25 "Bocheng Wang", 26 "Xuanjing Huang", 27 "Yu-Gang Jiang", 28 "Tao Gui", 29 "Qi Zhang", 30 "Yunke Zhang" 31 ], 32 "year": 2026, 33 "venue": "arXiv", 34 "arxiv_id": "2602.19000", 35 "doi": null 36 }, 37 "checklist": { 38 "claims_and_evidence": { 39 "abstract_claims_supported": { 40 "applies": true, 41 "answer": false, 42 "justification": "The abstract claims the model 'even surpasses leading closed-source models,' but Table 2 shows MagicAgent-32B scoring 62.0 on τ²-Bench Retail while GPT-5.2 scores 75.2, and Kimi-K2 outperforms on several BFCL-v3 and ACEBench subtasks. The absolute accuracy numbers are supported but the sweeping 'surpass' claim is overstated.", 43 "source": "haiku" 44 }, 45 "causal_claims_justified": { 46 "applies": true, 47 "answer": true, 48 "justification": "Ablation studies in Section 6.4 (Table 6) compare individual training tricks (z-loss, various LBL strategies, variance loss) against the joint optimization strategy; Section 6.3 (Table 5) compares GRPO vs χPO vs EPO under controlled settings. 
These ablations provide adequate causal grounding for the design choice claims.", 49 "source": "haiku" 50 }, 51 "generalization_bounded": { 52 "applies": true, 53 "answer": false, 54 "justification": "The paper's title and framing claim 'generalized agent planning,' but evaluation is limited to 5 specific benchmark types (hierarchical decomposition, tool-augmented planning, multi-constraint scheduling, procedural logic orchestration, long-horizon tool execution) with no explicit acknowledgment that this does not cover the broader space of agentic tasks.", 55 "source": "haiku" 56 }, 57 "alternative_explanations_discussed": { 58 "applies": true, 59 "answer": false, 60 "justification": "No alternative explanations are considered, such as whether the gains come primarily from the Qwen3 base model quality rather than the training pipeline, or whether simply fine-tuning on the public benchmark data would yield similar results.", 61 "source": "haiku" 62 }, 63 "proxy_outcome_distinction": { 64 "applies": true, 65 "answer": false, 66 "justification": "Benchmark accuracy scores (WorfBench F1, NaturalPlan accuracy) are equated with 'generalized agent planning' capability without discussing the gap between benchmark performance and real-world agentic behavior; the connection between these proxies and the claimed generalization ability is asserted rather than argued.", 67 "source": "haiku" 68 } 69 }, 70 "limitations_and_scope": { 71 "limitations_section_present": { 72 "applies": true, 73 "answer": false, 74 "justification": "There is no dedicated limitations or threats-to-validity section. 
Section 7 (Conclusions) briefly mentions future work directions but does not discuss limitations of the current study.", 75 "source": "haiku" 76 }, 77 "threats_to_validity_specific": { 78 "applies": true, 79 "answer": false, 80 "justification": "No specific threats to validity are discussed anywhere in the paper — no mention of benchmark contamination, in-house evaluation bias, synthetic data quality limitations, or generalizability beyond tested task types.", 81 "source": "haiku" 82 }, 83 "scope_boundaries_stated": { 84 "applies": true, 85 "answer": false, 86 "justification": "The paper does not explicitly state what its results do NOT show; the sweeping framing around 'generalized' planning implies broader applicability than the 5 tested task types with no explicit scope caveat.", 87 "source": "haiku" 88 } 89 }, 90 "conflicts_of_interest": { 91 "funding_disclosed": { 92 "applies": true, 93 "answer": false, 94 "justification": "No funding source is disclosed anywhere in the paper. The institutional affiliation is Honor Device Co., Ltd, a commercial entity, but no formal funding statement appears.", 95 "source": "haiku" 96 }, 97 "affiliations_disclosed": { 98 "applies": true, 99 "answer": true, 100 "justification": "Author affiliations are clearly disclosed: Honor Device Co., Ltd (affiliation 1) and Fudan University (affiliation 2) are listed on the title page.", 101 "source": "haiku" 102 }, 103 "funder_independent_of_outcome": { 104 "applies": true, 105 "answer": false, 106 "justification": "The majority of authors are from Honor Device Co., Ltd, and the conclusion explicitly states the model 'has been deployed across multiple high-value scenarios under Honor's intelligent assistant' — the developing company is evaluating its own commercial product.", 107 "source": "haiku" 108 }, 109 "financial_interests_declared": { 110 "applies": true, 111 "answer": false, 112 "justification": "No competing interests statement or financial disclosure is present in the paper.", 
113 "source": "haiku" 114 } 115 }, 116 "scope_and_framing": { 117 "key_terms_defined": { 118 "applies": true, 119 "answer": false, 120 "justification": "'Generalized planning' — the paper's central claim — is never formally defined; the paper distinguishes it from 'single-point optimization' but provides no criterion for what would or would not count as generalized. The five task types are described but the scope boundary of 'generalized' is left implicit.", 121 "source": "haiku" 122 }, 123 "intended_contribution_clear": { 124 "applies": true, 125 "answer": true, 126 "justification": "Section 1 explicitly lists four contributions: (1) scalable synthetic data framework, (2) two-stage multi-task optimization, (3) load-balanced MoE strategy, and (4) SOTA performance results.", 127 "source": "haiku" 128 }, 129 "engagement_with_prior_work": { 130 "applies": true, 131 "answer": true, 132 "justification": "Section 2 reviews related work across LLM agent models, training paradigms (SFT, RL), and MoE training, positioning MagicAgent relative to AgentTuning, AgentFLAN, AgentGym, WebRL, and others, explaining how the proposed approach addresses gaps in prior work.", 133 "source": "haiku" 134 } 135 } 136 }, 137 "type_checklist": { 138 "empirical": { 139 "artifacts": { 140 "code_released": { 141 "applies": true, 142 "answer": false, 143 "justification": "The paper states 'The MagicAgent series models will be released soon' — this is a promise of future release, not an actual release.", 144 "source": "haiku" 145 }, 146 "data_released": { 147 "applies": true, 148 "answer": false, 149 "justification": "The synthetic training data pipeline is described but the data itself is not released. The in-house MagicEval benchmark is proprietary. 
Public benchmarks used for evaluation are available but the paper does not release its own datasets.", 150 "source": "haiku" 151 }, 152 "environment_specified": { 153 "applies": true, 154 "answer": false, 155 "justification": "Table 8 specifies training hyperparameters and GPU hardware (NVIDIA A800), but no environment specification file (requirements.txt, Dockerfile, or equivalent) is provided.", 156 "source": "haiku" 157 }, 158 "reproduction_instructions": { 159 "applies": true, 160 "answer": false, 161 "justification": "The paper describes the methodology and refers to official codebases for benchmark evaluation, but provides no step-by-step instructions to reproduce training or inference results for MagicAgent itself.", 162 "source": "haiku" 163 } 164 }, 165 "statistical_methodology": { 166 "confidence_intervals_or_error_bars": { 167 "applies": true, 168 "answer": false, 169 "justification": "No confidence intervals or error bars are reported in any of the main results tables (Tables 2, 3, 4, 5, 6, 7). The ALFWorld experiment uses 5 random seeds but reports only mean best and mean steady-state without standard deviation.", 170 "source": "haiku" 171 }, 172 "significance_tests": { 173 "applies": true, 174 "answer": false, 175 "justification": "No statistical significance tests are applied to any comparative claims despite numerous model comparisons with sometimes small differences (e.g., χPO vs EPO in Table 5).", 176 "source": "haiku" 177 }, 178 "effect_sizes_reported": { 179 "applies": true, 180 "answer": true, 181 "justification": "Percentage improvements are cited in the results discussion (e.g., '27.0% relative improvement over the base Qwen3-32B model' on τ²-Bench Retail, '19.5% and 30.8%' improvement over GPT-5.2 on WorfBench).", 182 "source": "haiku" 183 }, 184 "sample_size_justified": { 185 "applies": true, 186 "answer": false, 187 "justification": "No justification or power analysis is provided for the size of any evaluation benchmark. 
The number of evaluation examples per benchmark is not even reported.", 188 "source": "haiku" 189 }, 190 "variance_reported": { 191 "applies": true, 192 "answer": false, 193 "justification": "No variance, standard deviation, or spread statistics are reported for any main results. Even Table 5's ALFWorld results across 5 seeds report only best and average values without standard deviation.", 194 "source": "haiku" 195 } 196 }, 197 "evaluation_design": { 198 "baselines_included": { 199 "applies": true, 200 "answer": true, 201 "justification": "Extensive baselines are included: 6 ultra-scale closed-source and open-weight models (GPT-5.2, Kimi-K2, GLM-4.7, DeepSeek-V3.1, Qwen3-235B, Qwen3-MAX) and 6 large-scale open-source models (Tables 2-5).", 202 "source": "haiku" 203 }, 204 "baselines_contemporary": { 205 "applies": true, 206 "answer": true, 207 "justification": "Baselines include very recent (2025-2026) models such as Kimi-K2 (2025), GLM-4.7 (2025), Qwen3-235B-A22B-Instruct-2507, and GPT-5.2 (December 2025), representing the current state-of-the-art.", 208 "source": "haiku" 209 }, 210 "ablation_study": { 211 "applies": true, 212 "answer": true, 213 "justification": "Section 6.4 (Table 6) provides an ablation study comparing 7 individual MoE optimization strategies against the joint strategy. Section 6.3 (Table 5) compares multiple RL training algorithms (GRPO, EPO, χPO, GiGPO, RLVMR).", 214 "source": "haiku" 215 }, 216 "multiple_metrics": { 217 "applies": true, 218 "answer": true, 219 "justification": "Multiple metrics are used: WorfBench uses F1 Chain and F1 Graph; NaturalPlan uses Trip, Meeting, and Calendar accuracy; τ²-Bench uses Retail and Airline; BFCL-v3 uses Live and Non-Live; ACEBench uses En and Zh; MagicEval uses Step, Embedding, and LLM metrics.", 220 "source": "haiku" 221 }, 222 "human_evaluation": { 223 "applies": true, 224 "answer": false, 225 "justification": "No human evaluation of system outputs is conducted. 
MagicEval uses LLM-as-a-Judge (GPT-5.2) for qualitative assessment, which is automated rather than human evaluation.", 226 "source": "haiku" 227 }, 228 "held_out_test_set": { 229 "applies": true, 230 "answer": true, 231 "justification": "Public benchmarks use their official test sets; ALFWorld evaluation uses seen (IID) and unseen (OOD) splits; the in-house MagicEval uses production interaction logs not used in training.", 232 "source": "haiku" 233 }, 234 "per_category_breakdown": { 235 "applies": true, 236 "answer": true, 237 "justification": "All benchmarks are broken down by subtask: NaturalPlan (Trip/Meeting/Calendar), τ²-Bench (Retail/Airline), BFCL-v3 (Live/Non-Live), ACEBench (En/Zh), and MagicEval (General/Dependency/Condition/Context Inheritance).", 238 "source": "haiku" 239 }, 240 "failure_cases_discussed": { 241 "applies": true, 242 "answer": false, 243 "justification": "No failure cases or error analysis are discussed. The paper only reports success metrics without examining where or why the model fails.", 244 "source": "haiku" 245 }, 246 "negative_results_reported": { 247 "applies": true, 248 "answer": true, 249 "justification": "Table 6 reports that variance loss and sequence-level LBL perform worse than the joint optimization strategy on τ²-Bench (retail and airline scores mostly below 45%), and that BP+LBLseq degrades WorfBench performance relative to the baseline.", 250 "source": "haiku" 251 } 252 }, 253 "setup_transparency": { 254 "model_versions_specified": { 255 "applies": true, 256 "answer": false, 257 "justification": "Baselines are listed with identifiers (e.g., Qwen3-32B-nothink, Qwen3-30B-A3B-Instruct-2507) but no snapshot dates or commit hashes are provided for training base models. 
The paper says 'We adopt the Qwen3 series as the base model' without specifying the exact checkpoint used.", 258 "source": "haiku" 259 }, 260 "prompts_provided": { 261 "applies": true, 262 "answer": false, 263 "justification": "Appendices show data format examples (with actual content in Chinese, partially obscured) but the actual evaluation prompts and system instructions used for benchmarking are not provided.", 264 "source": "haiku" 265 }, 266 "hyperparameters_reported": { 267 "applies": true, 268 "answer": true, 269 "justification": "Table 8 (Appendix A) provides comprehensive hyperparameters: learning rate (1e-5 SFT, 1e-6 RL), batch size, gradient accumulation, temperature (0.9), top-p (0.96), top-k (20), weight decay, scheduler type, MoE parallel settings, and reward weighting coefficients.", 270 "source": "haiku" 271 }, 272 "scaffolding_described": { 273 "applies": true, 274 "answer": true, 275 "justification": "The agentic scaffolding is described in detail: Section 3.2 describes the ReAct-based reasoning-action-observation format; Section 4.2.3 details the think-action entropy smoothing; Table 1 specifies the unified data format for each task type.", 276 "source": "haiku" 277 }, 278 "data_preprocessing_documented": { 279 "applies": true, 280 "answer": true, 281 "justification": "Sections 3.1-3.5 document preprocessing and filtering for each data type: SimHash deduplication, schema validation, and semantic verification (via GPT-5.2) for hierarchical decomposition; JSON schema validation, semantic auditing, and negative sampling for tool-augmented planning.", 282 "source": "haiku" 283 } 284 }, 285 "data_integrity": { 286 "raw_data_available": { 287 "applies": true, 288 "answer": false, 289 "justification": "No raw training data or evaluation data is released. 
Training data is synthetic and proprietary; in-house MagicEval data is from production logs and not released.", 290 "source": "haiku" 291 }, 292 "data_collection_described": { 293 "applies": true, 294 "answer": true, 295 "justification": "The synthetic data collection pipeline is described in substantial detail across Sections 3.1-3.5, including tool graph construction, atomic plan synthesis, filtering steps, and scaling strategies. In-house MagicEval is described as anonymized production user interaction logs.", 296 "source": "haiku" 297 }, 298 "recruitment_methods_described": { 299 "applies": false, 300 "answer": false, 301 "justification": "No human participants are involved; standard synthetic and benchmark data are used.", 302 "source": "haiku" 303 }, 304 "data_pipeline_documented": { 305 "applies": true, 306 "answer": true, 307 "justification": "The full synthetic data pipeline is documented end-to-end in Sections 3.1-3.5, from tool curation through graph construction, atomic synthesis, trajectory composition, and quality filtering. In-house data collection is briefly described as anonymized production logs.", 308 "source": "haiku" 309 } 310 }, 311 "contamination": { 312 "training_cutoff_stated": { 313 "applies": true, 314 "answer": false, 315 "justification": "The training data cutoff for the Qwen3 base model is never stated. 
Fine-tuning uses synthetic data but the base model's pretraining cutoff, which affects benchmark contamination risk, is not mentioned.", 316 "source": "haiku" 317 }, 318 "train_test_overlap_discussed": { 319 "applies": true, 320 "answer": false, 321 "justification": "No discussion of whether the synthetic training data could overlap with benchmark test cases, or whether the Qwen3 base model's pretraining included the benchmark data.", 322 "source": "haiku" 323 }, 324 "benchmark_contamination_addressed": { 325 "applies": true, 326 "answer": false, 327 "justification": "Public benchmarks (WorfBench, NaturalPlan, BFCL-v3, ACEBench) predate this paper and may have been included in Qwen3's pretraining corpus; this potential contamination is never acknowledged or addressed.", 328 "source": "haiku" 329 } 330 }, 331 "human_studies": { 332 "pre_registered": { 333 "applies": false, 334 "answer": false, 335 "justification": "No human participants.", 336 "source": "haiku" 337 }, 338 "irb_or_ethics_approval": { 339 "applies": false, 340 "answer": false, 341 "justification": "No human participants.", 342 "source": "haiku" 343 }, 344 "demographics_reported": { 345 "applies": false, 346 "answer": false, 347 "justification": "No human participants.", 348 "source": "haiku" 349 }, 350 "inclusion_exclusion_criteria": { 351 "applies": false, 352 "answer": false, 353 "justification": "No human participants.", 354 "source": "haiku" 355 }, 356 "randomization_described": { 357 "applies": false, 358 "answer": false, 359 "justification": "No human participants.", 360 "source": "haiku" 361 }, 362 "blinding_described": { 363 "applies": false, 364 "answer": false, 365 "justification": "No human participants.", 366 "source": "haiku" 367 }, 368 "attrition_reported": { 369 "applies": false, 370 "answer": false, 371 "justification": "No human participants.", 372 "source": "haiku" 373 } 374 }, 375 "cost_and_practicality": { 376 "inference_cost_reported": { 377 "applies": true, 378 "answer": true, 
379 "justification": "Table 7 provides detailed inference efficiency comparisons (TTFT, TPOT, average latency) between dense and MoE models at concurrency levels 1, 10, 50, and 100 on 4 NVIDIA A800 GPUs.", 380 "source": "haiku" 381 }, 382 "compute_budget_stated": { 383 "applies": true, 384 "answer": false, 385 "justification": "Table 8 states 32 GPUs (4 machines × 8 GPUs) and 1 training epoch, but total GPU-hours, wall-clock time, or FLOPs consumed for training are not reported.", 386 "source": "haiku" 387 } 388 } 389 } 390 }, 391 "claims": [ 392 { 393 "claim": "MagicAgent-32B achieves 75.1% average on WorfBench, substantially outperforming GPT-5.2 (49.6%) and all other tested sub-100B and ultra-scale models.", 394 "evidence": "Table 2 shows F1 Chain 80.3% and F1 Graph 69.7% for MagicAgent-32B vs F1 Chain 60.1% and F1 Graph 39.1% for GPT-5.2. Figure 1 presents aggregated averages.", 395 "supported": "strong" 396 }, 397 { 398 "claim": "The proposed χPO online RL algorithm outperforms EPO on the ALFWorld benchmark.", 399 "evidence": "Table 5 shows χPO (Qwen2.5-3B) achieving IID Succ.* 91.7%/Succ. 78.2% and OOD Succ.* 91.8%/Succ. 
75.9% vs EPO's 91.7%/75.8% and 89.6%/75.4%.", 400 "supported": "moderate" 401 }, 402 { 403 "claim": "Joint optimization of global-batch LBL and z-loss outperforms all individual MoE training strategies across benchmarks.", 404 "evidence": "Table 6 ablation shows BP+LBLgbl+z_loss achieving balanced performance across WorfBench, NaturalPlan, and τ²-Bench subtasks where individual strategies each show weaknesses.", 405 "supported": "moderate" 406 }, 407 { 408 "claim": "MagicAgent-30B-A3B (MoE) achieves up to 54.7% inference latency reduction compared to MagicAgent-32B (dense) with competitive accuracy.", 409 "evidence": "Table 7 reports average latency reduction of 54.7% at concurrency 1 (7.582s vs 16.722s) while benchmark scores remain within 1-2pp.", 410 "supported": "strong" 411 }, 412 { 413 "claim": "The two-stage SFT+RL training mitigates the 'seesaw effect' where gains in one planning task degrade others.", 414 "evidence": "Section 6.4 shows joint training outperforms naive multi-task approaches in ablation, and Figure 8b shows steady reward increases in both offline and online RL.", 415 "supported": "weak" 416 }, 417 { 418 "claim": "MagicAgent substantially outperforms existing sub-100B models across all five planning benchmarks.", 419 "evidence": "Tables 2-4 show MagicAgent-32B consistently above Qwen3-32B, Llama3.3-70B, and other large-scale models. E.g., WorfBench F1 Chain: 80.3% vs best competing 65.8%.", 420 "supported": "strong" 421 } 422 ], 423 "methodology_tags": [ 424 "benchmark-eval", 425 "empirical" 426 ], 427 "key_findings": "MagicAgent-32B and MagicAgent-30B-A3B achieve top performance among sub-100B models on five agent planning benchmarks (WorfBench, NaturalPlan, τ²-Bench, BFCL-v3, ACEBench), with particular dominance on WorfBench procedural logic orchestration (80.3% F1 Chain vs GPT-5.2's 60.1%). The proposed χPO online RL algorithm marginally outperforms EPO on ALFWorld while using a 3B rather than 7B model. 
The MoE variant (MagicAgent-30B-A3B) achieves comparable task performance with 35-55% inference latency reduction versus the dense model. However, MagicAgent does not uniformly surpass closed-source models — GPT-5.2 outperforms on τ²-Bench Retail (75.2% vs 62.0%), and several ultra-scale models match or exceed performance on specific subtasks.", 428 "red_flags": [ 429 { 430 "flag": "Reproducibility blocked", 431 "detail": "The model weights are only promised ('will be released soon'), code and training data are not released, and the in-house MagicEval benchmark is proprietary. Third-party verification of the results is currently impossible." 432 }, 433 { 434 "flag": "Overstated generalization claim", 435 "detail": "The paper claims 'generalized agent planning' in its title and throughout, but 'generalized' is never formally defined and evaluation is restricted to 5 specific pre-selected benchmark types." 436 }, 437 { 438 "flag": "Circular evaluation with GPT-5.2", 439 "detail": "GPT-5.2 is used as the quality validator for synthetic training data generation AND as an LLM-as-a-Judge for in-house MagicEval evaluation, while simultaneously being a primary benchmark baseline — creating potential bias toward GPT-5.2-aligned outputs." 440 }, 441 { 442 "flag": "No statistical rigor", 443 "detail": "No confidence intervals, error bars, or significance tests anywhere in the paper despite numerous model comparisons with sometimes small margins (e.g., χPO vs EPO differ by 2.4pp IID Succ. and 0.5pp OOD Succ.)." 444 }, 445 { 446 "flag": "Company evaluating own product", 447 "detail": "The majority of authors are from Honor Device Co., Ltd; the conclusion explicitly states the model is deployed in Honor's commercial assistant, and no independent evaluation or conflict-of-interest disclosure is present." 
448 }, 449 { 450 "flag": "No limitations section", 451 "detail": "Despite making broad claims about 'generalized' planning and surpassing large closed-source models, the paper includes no limitations or threats-to-validity section." 452 }, 453 { 454 "flag": "Benchmark contamination unaddressed", 455 "detail": "Public benchmarks (WorfBench, NaturalPlan, BFCL-v3, etc.) predate this paper and may be in Qwen3's pretraining corpus; neither the base model's training cutoff nor potential train-test overlap is mentioned." 456 } 457 ], 458 "cited_papers": [ 459 { 460 "title": "ReAct: Synergizing Reasoning and Acting in Language Models", 461 "relevance": "Core ReAct scaffolding paradigm used as evaluation framework for ALFWorld and as basis for trajectory data generation in tool-augmented planning module." 462 }, 463 { 464 "title": "Benchmarking Agentic Workflow Generation (WorfBench)", 465 "relevance": "Primary benchmark for procedural logic orchestration evaluation; also provides source data for DAG-based procedural logic orchestration training data." 466 }, 467 { 468 "title": "Natural Plan: Benchmarking LLMs on Natural Language Planning", 469 "relevance": "Primary benchmark for multi-constraint scheduling evaluation (Trip, Meeting, Calendar subtasks)." 470 }, 471 { 472 "title": "ALFWorld: Aligning Text and Embodied Environments for Interactive Learning", 473 "relevance": "Online embodied RL environment used to evaluate χPO against competing RL algorithms." 474 }, 475 { 476 "title": "EPO: Entropy-Regularized Policy Optimization for LLM Agents Reinforcement Learning", 477 "relevance": "Direct baseline for χPO online RL algorithm; χPO extends EPO's exploration-exploitation framework with information bottleneck and think-action disentanglement." 
478 }, 479 { 480 "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning", 481 "relevance": "Validates the RLVR (Reinforcement Learning with Verifiable Rewards) approach that MagicAgent's unified reward function builds upon." 482 }, 483 { 484 "title": "AgentTuning: Enabling Generalized Agent Abilities for LLMs", 485 "relevance": "Key prior work on agent-specific fine-tuning; direct comparison baseline and represents the SFT-only paradigm that MagicAgent extends with RL." 486 }, 487 { 488 "title": "AgentGym: Evolving Large Language Model-Based Agents Across Diverse Environments", 489 "relevance": "Prior RL-for-agents framework used as baseline in ALFWorld evaluation (Table 5)." 490 }, 491 { 492 "title": "τ²-Bench: Evaluating Conversational Agents in a Dual-Control Environment", 493 "relevance": "Primary benchmark for long-horizon tool execution evaluation across Retail and Airline domains." 494 }, 495 { 496 "title": "Magnet: Multi-Turn Tool-Use Data Synthesis and Distillation via Graph Translation", 497 "relevance": "Related work on bidirectional translation methodology for tool-augmented planning data generation that MagicAgent's framework draws upon." 498 } 499 ], 500 "engagement_factors": { 501 "practical_relevance": { 502 "score": 3, 503 "justification": "Model is already deployed in production at Honor's commercial intelligent assistant, directly applicable to any tool-calling or task-planning agent system." 504 }, 505 "surprise_contrarian": { 506 "score": 1, 507 "justification": "The core result (curated fine-tuning on diverse planning data improves benchmark performance) is expected; χPO is a modest algorithmic novelty over EPO." 508 }, 509 "fear_safety": { 510 "score": 0, 511 "justification": "No safety, alignment, or risk discussion anywhere in the paper." 
512 }, 513 "drama_conflict": { 514 "score": 1, 515 "justification": "Claims to outperform GPT-5.2 on WorfBench by ~25pp absolute are notable, but the framing is straightforward product improvement rather than provocative." 516 }, 517 "demo_ability": { 518 "score": 2, 519 "justification": "Model is deployed in Honor's assistant (usable indirectly) and will be released; Appendix H shows three concrete demo application scenarios." 520 }, 521 "brand_recognition": { 522 "score": 1, 523 "justification": "Honor Device Co., Ltd is a large Chinese phone manufacturer and Fudan University is a prominent institution, but neither is a tier-1 AI research lab." 524 } 525 }, 526 "hn_data": { 527 "threads": [ 528 { 529 "hn_id": "46991734", 530 "title": "RL on GPT-5 to write better kernels", 531 "points": 4, 532 "comments": 1, 533 "url": "https://news.ycombinator.com/item?id=46991734", 534 "created_at": "2026-02-12T17:22:31Z" 535 }, 536 { 537 "hn_id": "47004259", 538 "title": "Fine-Tuning GPT-5 for GPU Kernel Generation", 539 "points": 4, 540 "comments": 0, 541 "url": "https://news.ycombinator.com/item?id=47004259", 542 "created_at": "2026-02-13T16:04:35Z" 543 } 544 ], 545 "top_points": 4, 546 "total_points": 8, 547 "total_comments": 1 548 } 549 }