scan.json (25075B)
1 { 2 "paper": { 3 "title": "Single-Agent Scaling Fails Multi-Agent Intelligence: Towards Foundation Models with Native Multi-Agent Intelligence", 4 "authors": ["Shuyue Hu", "Haoyang Yan", "Yiqun Zhang", "Yang Chen", "Dongzhan Zhou", "Lei Bai"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2512.08743", 8 "doi": "10.48550/arXiv.2512.08743" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "Across 41 LLMs from the Qwen and LLaMA families (0.5B–235B parameters), single-agent task performance improves substantially across model generations while multi-agent understanding gains are modest and multi-agent planning gains are near-flat. Logarithmic regression shows diminishing returns: even models with 0.6–0.8 SA accuracy show high variance in MA planning (R²≈0.6). The authors argue multi-agent intelligence requires intentional training beyond single-agent scaling.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "No repository URL, code archive, or mention of code release found anywhere in the paper." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "All 7 benchmarks used are publicly available (MATH-500, HumanEval, MMLU-Pro, GPQA, ToMBench, EmoBench, CoordinationQA) and all models are open-weight (Qwen, LLaMA families on HuggingFace)." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "The paper mentions using vLLM with 'officially recommended hyperparameters' but provides no environment specification, dependency list, or hardware details." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No reproduction instructions, README, or scripts provided. The experimental procedure is described at a high level but lacks step-by-step reproducibility guidance." 
35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "Tables 1 and 2 report only point estimates (single accuracy values) with no confidence intervals, error bars, or uncertainty quantification. Figure 3 shows 95% CI bands for its logarithmic regressions, but the per-model benchmark scores themselves carry no uncertainty estimates." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper claims SA scaling does not yield MA improvement but provides no statistical tests for these comparisons. The logarithmic regressions in Figure 3 report R² and p-values but no tests are applied to the generational comparisons that form the paper's central claim." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "The paper reports effect sizes in context: 'accuracy nearly triples from Qwen-1-1.8B to Qwen-3-1.7B' for SA tasks vs. 'only about 30%' increase for MA planning. Specific accuracy values (e.g., 0.23→0.64 for SA, 0.44→0.55 for MA understanding) provide baseline context." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "No justification for why 41 models were selected, why only two model families were used, or whether this sample is sufficient to support the generalization claims." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "All results appear to be single-run numbers. No standard deviation, variance across seeds, or spread measures reported for any benchmark." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "The paper's design inherently uses single-agent benchmarks (MATH-500, HumanEval, MMLU-Pro, GPQA) as baselines for comparison against multi-agent benchmarks, and compares across model generations." 
69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "Models span from 2023 to 2025 including recent releases (Qwen3, LLaMA-3.3). Benchmarks include contemporary choices like MMLU-Pro and GPQA." 74 }, 75 "ablation_study": { 76 "applies": false, 77 "answer": false, 78 "justification": "The paper evaluates existing models on existing benchmarks; there is no system with components to ablate." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Results are reported across 7 benchmarks with multiple sub-scores (e.g., CoordinationQA has EC, ToM, JP; EmoBench has EU, EA; ToMBench has Tasks and Abilities)." 84 }, 85 "human_evaluation": { 86 "applies": false, 87 "answer": false, 88 "justification": "Human evaluation is not relevant here; the paper evaluates LLM performance on existing benchmarks with automated scoring." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "All benchmarks used are standard held-out test sets (MATH-500, HumanEval, MMLU-Pro, GPQA, ToMBench, EmoBench, CoordinationQA). The paper does not tune on these benchmarks." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Tables 1 and 2 in the Appendix provide per-benchmark, per-model breakdowns. CoordinationQA is broken into EC, ToM, and JP sub-scores. EmoBench into EU and EA. ToMBench into Tasks and Abilities." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": false, 103 "justification": "No discussion of failure cases, error analysis, or qualitative examples of where models fail on multi-agent tasks." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "The central finding is essentially a negative result: single-agent scaling fails to produce multi-agent intelligence. 
The paper also notes cases where MA planning accuracy declines across generations (e.g., LLaMA-3 to LLaMA-3.1 for 70B)." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The abstract claims 'scaling single-agent performance alone does not automatically yield robust multi-agent intelligence' across 41 LLMs and 7 benchmarks, which is supported by the results in Section 3, Figures 2-3, and Tables 1-2." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": false, 120 "justification": "The paper makes causal claims like 'scaling single-agent abilities does not yield comparable improvements in multi-agent abilities' (Section 3.2). However, the study is observational — it observes correlations across model generations but cannot control for confounds like different training data, RLHF procedures, or instruction tuning approaches that vary across generations." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The title and abstract claim broadly about 'foundation models' and 'multi-agent intelligence' but only test two open-weight model families (Qwen, LLaMA). Closed-source models (GPT, Claude, Gemini) and other open families are not tested. The paper does not bound its claims to the tested families." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "The paper does not discuss alternative explanations for the observed pattern. For example, it does not consider whether the MA benchmarks are simply harder (ceiling effects), whether instruction tuning varies across generations, or whether the MA benchmarks have lower construct validity." 
131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": false, 135 "justification": "The paper equates performance on ToMBench, EmoBench, and CoordinationQA with 'multi-agent intelligence' without discussing whether these QA-style benchmarks actually capture the multi-agent capabilities described in Section 2 (real-time adaptation, efficient communication, decentralized planning). The paper even acknowledges in Section 4.2 that QA-style evaluation is limited, yet does not apply this caveat to its own results." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": true, 142 "justification": "Tables 1-2 list specific model versions (e.g., 'Qwen-1.8B-Chat', 'Qwen2.5-72B-Instruct', 'Llama-2-7b-chat-hf', 'Meta-Llama-3-70B-Instruct') and the paper states these are 'official checkpoints' from HuggingFace." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": false, 147 "justification": "No prompts or system instructions are provided. The paper does not describe how benchmarks were administered to the models beyond mentioning vLLM deployment." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": false, 152 "justification": "The paper states only 'officially recommended hyperparameters' without specifying temperature, top-p, max tokens, or any other setting." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "No agentic scaffolding is used; models are evaluated directly on QA-style benchmarks." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": false, 162 "justification": "No description of how benchmark data was preprocessed, formatted, or administered to the models. No mention of answer extraction or parsing procedures." 
163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": false, 169 "justification": "There is no limitations or threats-to-validity section in the paper." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": false, 174 "justification": "No threats to validity are discussed anywhere in the paper." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": false, 179 "justification": "The paper does not explicitly state what its results do NOT show. It does not bound claims to the tested model families or acknowledge that QA-style MA benchmarks may not capture the full scope of multi-agent intelligence." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "Only aggregated accuracy scores are provided in Tables 1-2. No raw model outputs, per-example predictions, or detailed results are available." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section 3.1 describes the model selection criteria (two open-weight families with documented histories, 41 models from 0.5B to 235B, instruction-tuned versions) and the 7 benchmarks used with their characteristics." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants; data sources are standard public benchmarks." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": false, 201 "justification": "No documentation of the pipeline from running benchmarks to final accuracy numbers. Answer extraction, parsing, and scoring procedures are not described." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding source or acknowledgments section is present in the paper." 
209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "All authors are listed as affiliated with Shanghai Artificial Intelligence Laboratory, with email addresses at pjlab.org.cn." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding is disclosed, so independence cannot be assessed. Shanghai AI Laboratory is a research institution that could have interests in the outcomes of FM evaluation research." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests statement or financial interest disclosure found in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "The paper evaluates 41 pre-trained models on benchmarks but never states any model's training data cutoff date." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "No discussion of whether any benchmark data appeared in the models' training data, despite using well-known benchmarks like HumanEval (published 2021) and MATH-500." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "Several benchmarks (HumanEval, MMLU-Pro) were available online before many tested models were trained. No discussion of contamination risk, which could differentially affect SA vs MA benchmarks and confound the central claim." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 
253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": false, 284 "justification": "No inference costs, token counts, or wall-clock time reported despite running 41 models across 7 benchmarks." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "No mention of total GPU hours, hardware used, or computational budget for running the experiments." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "No mention of multiple random seeds. All results appear to be single-run evaluations." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "The number of experimental runs is never stated. Results are presented as single accuracy values." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "No hyperparameter search is described. The paper mentions 'officially recommended hyperparameters' but does not report what these are or whether alternatives were tried." 
307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": false, 311 "justification": "No discussion of configuration selection. The paper uses 'officially recommended hyperparameters' without justification or comparison to alternatives." 312 }, 313 "multiple_comparison_correction": { 314 "applies": true, 315 "answer": false, 316 "justification": "The paper compares 41 models across 7 benchmarks but applies no multiple comparison correction. The logarithmic regressions report p-values without correction." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": false, 320 "answer": false, 321 "justification": "The paper evaluates existing models on existing benchmarks; there is no 'own system' being compared against baselines." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": false, 326 "justification": "No performance-vs-compute analysis despite comparing models ranging from 0.5B to 235B parameters with vastly different compute requirements." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "The paper uses QA-style benchmarks (ToMBench, EmoBench, CoordinationQA) as proxies for 'multi-agent intelligence' without discussing whether these benchmarks actually measure the capabilities described in Section 2. Section 4.2 acknowledges QA limitations generally but does not apply this critique to its own evaluation." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "No scaffolding is used; models are evaluated directly on benchmarks." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "HumanEval (2021), MATH-500 (2023), and other SA benchmarks predate many tested models. If SA benchmarks are more contaminated than MA benchmarks, this confounds the central claim. Not discussed." 
344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether evaluation setups leak information through context or prompt formatting." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of whether benchmark examples share structural similarities with training data." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No leakage detection or prevention method applied despite the critical confound: if SA benchmarks are more contaminated than MA benchmarks, the observed SA>MA improvement pattern could be an artifact." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "Scaling single-agent performance does not automatically yield robust multi-agent intelligence across 41 LLMs and 7 benchmarks.", 365 "evidence": "Section 3.2, Figures 2-3: SA accuracy of ~8B Qwen models rises from 0.23 to 0.64 across generations while MA understanding only increases from 0.44 to 0.55 and MA planning remains 0.2-0.35.", 366 "supported": "moderate" 367 }, 368 { 369 "claim": "Multi-agent planning performance shows little to no improvement across model generations and in some cases declines.", 370 "evidence": "Section 3.2, Figure 2: For ~8B models, MA planning accuracy stabilizes between 0.2-0.35 across Qwen-1 to Qwen-2.5 and LLaMA-2 to LLaMA-3.1. For ~70B LLaMA models, accuracy slightly declines from LLaMA-3 to LLaMA-3.1.", 371 "supported": "moderate" 372 }, 373 { 374 "claim": "Even models with high single-agent accuracy (0.6-0.8) exhibit substantial variability in multi-agent planning accuracy.", 375 "evidence": "Figure 3: Logarithmic regression with R²=0.608 (Qwen) and R²=0.576 (LLaMA) shows substantial unexplained variance. 
95% CI bands are wide.", 376 "supported": "moderate" 377 } 378 ], 379 "red_flags": [ 380 { 381 "flag": "Contamination confound threatens central claim", 382 "detail": "SA benchmarks like HumanEval (2021) and MATH-500 (2023) are older and more likely contaminated in newer model training data than MA benchmarks like CoordinationQA. Differential contamination could inflate SA improvements relative to MA, making the SA-MA gap an artifact. This is never discussed." 383 }, 384 { 385 "flag": "No limitations section", 386 "detail": "The paper lacks any limitations, threats to validity, or scope boundary discussion despite making strong generalization claims about 'foundation models' based on only two model families." 387 }, 388 { 389 "flag": "Benchmark difficulty confound", 390 "detail": "The paper does not consider whether MA benchmarks are simply harder or have different performance ceilings than SA benchmarks, which could explain slower improvement independent of any fundamental SA-MA gap." 391 }, 392 { 393 "flag": "No variance or multi-run results", 394 "detail": "All 287 data points (41 models × 7 benchmarks) appear to be single-run evaluations with no uncertainty quantification, making it impossible to assess result stability." 395 }, 396 { 397 "flag": "Overclaiming from observational data", 398 "detail": "The paper treats the observed correlation between model generation and differential SA/MA improvement as evidence of a fundamental inability, but model generations differ in training data, RLHF, and instruction tuning — not just scale. The causal claim is not supported by the observational design." 399 } 400 ], 401 "cited_papers": [ 402 { 403 "title": "Large language model based multi-agents: A survey of progress and challenges", 404 "authors": ["Taicheng Guo", "Xiuying Chen"], 405 "year": 2024, 406 "arxiv_id": "2402.01680", 407 "relevance": "Comprehensive survey of LLM-based multi-agent systems, relevant to the survey's coverage of agentic AI." 
408 }, 409 { 410 "title": "Are emergent abilities of large language models a mirage?", 411 "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"], 412 "year": 2023, 413 "relevance": "Challenges claims of emergent LLM abilities, directly relevant to methodology quality assessment in AI evaluation." 414 }, 415 { 416 "title": "Evaluating Large Language Models Trained on Code", 417 "authors": ["Mark Chen", "Jerry Tworek"], 418 "year": 2021, 419 "arxiv_id": "2107.03374", 420 "relevance": "HumanEval benchmark paper, foundational for AI code generation evaluation." 421 }, 422 { 423 "title": "Scaling laws for neural language models", 424 "authors": ["Jared Kaplan", "Sam McCandlish"], 425 "year": 2020, 426 "arxiv_id": "2001.08361", 427 "relevance": "Foundational scaling laws paper, central to the debate about whether capabilities emerge from scale." 428 }, 429 { 430 "title": "ToMBench: Benchmarking Theory of Mind in Large Language Models", 431 "authors": ["Zhuang Chen", "Jincenzi Wu"], 432 "year": 2024, 433 "arxiv_id": "2402.15052", 434 "relevance": "Multi-agent understanding benchmark used in this study, relevant to LLM capability evaluation." 435 }, 436 { 437 "title": "MMLU-Pro: A More Robust and Challenging Multi-Task Language Understanding Benchmark", 438 "authors": ["Yubo Wang", "Xueguang Ma"], 439 "year": 2024, 440 "relevance": "Major LLM evaluation benchmark, relevant to AI capability assessment methodology." 441 }, 442 { 443 "title": "GPQA: A Graduate-Level Google-Proof Q&A Benchmark", 444 "authors": ["David Rein", "Betty Li Hou"], 445 "year": 2024, 446 "relevance": "Graduate-level QA benchmark for LLM evaluation, relevant to AI capability assessment." 447 }, 448 { 449 "title": "Large Language Models Miss the Multi-Agent Mark", 450 "authors": ["Emanuele La Malfa"], 451 "year": 2025, 452 "relevance": "Directly related work arguing FMs lack multi-agent capabilities, relevant to survey's coverage of LLM limitations." 
453 }, 454 { 455 "title": "Secret collusion among ai agents: Multi-agent deception via steganography", 456 "authors": ["Sumeet Motwani"], 457 "year": 2024, 458 "relevance": "Demonstrates multi-agent safety risks (covert collusion), relevant to AI safety assessment." 459 }, 460 { 461 "title": "Multi-agent risks from advanced ai", 462 "authors": ["Lewis Hammond"], 463 "year": 2025, 464 "arxiv_id": "2502.14143", 465 "relevance": "Taxonomy of multi-agent AI risks, directly relevant to AI safety research methodology." 466 }, 467 { 468 "title": "Why do multi-agent llm systems fail?", 469 "authors": ["Mert Cemri"], 470 "year": 2025, 471 "arxiv_id": "2503.13657", 472 "relevance": "Analysis of multi-agent LLM system failures, relevant to agentic AI reliability research." 473 }, 474 { 475 "title": "Towards a Science of Scaling Agent Systems", 476 "authors": ["Yubin Kim"], 477 "year": 2025, 478 "arxiv_id": "2512.08296", 479 "relevance": "Framework for scaling agent systems, directly relevant to agentic AI methodology." 480 } 481 ] 482 }