scan.json (24319B)
1 { 2 "paper": { 3 "title": "Interpreting Emergent Extreme Events in Multi-Agent Systems", 4 "authors": ["Ling Tang", "Jilin Mei", "Dongrui Liu", "Chen Qian", "Dawei Cheng", "Jing Shao", "Xia Hu"], 5 "year": 2026, 6 "venue": "arXiv preprint", 7 "arxiv_id": "2601.20538", 8 "doi": "10.48550/arXiv.2601.20538" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval", "theoretical"], 13 "key_findings": "The paper proposes a Shapley value-based framework for attributing emergent extreme events in LLM-powered multi-agent systems across three dimensions: when, who, and what. Experiments across economic, financial, and social simulation scenarios show extreme events originate from either dormant risks or immediate shocks, are driven by a small subset of agents, and stem from a few dominant behaviors. The proposed method outperforms baseline attribution methods (random, LLM-prompting, surrogate model) in faithfulness as measured by risk drop when removing top-attributed actions.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "GitHub link provided in abstract: https://github.com/mjl0613ddm/IEEE" 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The paper uses publicly available MAS frameworks (EconAgent, TwinMarket, SocialNetwork) which are existing open-source systems." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "No requirements.txt, Dockerfile, or environment setup details provided. The paper does not specify library versions or dependencies." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No step-by-step reproduction instructions. The paper describes methodology but does not provide a README or script for replicating experiments." 
35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": true, 41 "justification": "Tables 1, 2, and 3 all report mean ± standard deviation across independent runs." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "No statistical significance tests used. Claims that 'our method achieved the highest risk drop' are based on comparing raw numbers in Table 3 without any tests." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "Table 3 reports risk drop as percentage (e.g., '36.31% vs 5.32%'), providing magnitude of effect relative to baseline. Table 1 reports cosine similarity values." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "Five independent trajectories per setting are used with no justification for why five is sufficient. N=10 or N=20 agents chosen without justification." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": true, 61 "justification": "Standard deviations reported across independent runs in all main results tables (Tables 1, 2, 3)." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "Four baselines compared: Random, Failure Taxonomy (Cemri et al., 2025), Failure Attribution (Zhang et al., 2025b), and Agent Tracer (Zhang et al., 2025a). Table 3." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "All three non-random baselines are from 2025, representing the current state of the art in MAS attribution." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": false, 78 "justification": "No ablation study. The framework has multiple components (Shapley attribution, dimensional aggregation, metrics) but none are ablated to show individual contribution." 
79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Multiple metrics used: cosine similarity for approximation accuracy (Table 1), risk drop for faithfulness (Table 3), plus five designed metrics (Ltm, Gag, Cag, Zag, Gbe) in Table 2." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": false, 88 "justification": "No human evaluation of the attribution quality. The framework claims to 'explain' extreme events but interpretability is only assessed via automated risk drop, not human judgment." 89 }, 90 "held_out_test_set": { 91 "applies": false, 92 "answer": false, 93 "justification": "Not a learning task. The method is applied to simulation trajectories, not trained on data." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Results broken down by scenario (EconAgent, TwinMarket, SocialNetwork) and by model (5 LLMs) in all tables." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": false, 103 "justification": "No discussion of cases where the method fails. Table 3 shows their method underperforms Agent Tracer on SocialNetwork (e.g., GPT-4o mini Top-3: 17.9 vs 22.2) but this is not discussed." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": false, 108 "justification": "No negative results discussed. The SocialNetwork scenario shows weaker performance but the paper does not analyze why or report any failed approaches." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Abstract claims about 'effectiveness' and 'general insights' are supported by Tables 1-3 and the five insights derived from quantitative analysis." 
116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "The causal claims are justified through a principled framework: Shapley values provide axiomatic attribution (Properties 1-4 proved in Appendix A), and faithfulness is validated through counterfactual intervention (deleting top actions and measuring risk drop)." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The paper claims 'general insights into the emergence of extreme phenomena' (abstract) based on only 3 simulation scenarios with small agent counts (N=10-20). The title says 'Multi-Agent Systems' broadly without bounding to these specific simulation types." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "No discussion of alternative explanations for the results. For example, the risk drop metric may favor methods that identify correlated rather than causal actions, but this is not discussed." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": false, 135 "justification": "Risk drop is used as a proxy for 'faithfulness' of attribution, but the paper does not discuss whether removing high-attribution actions and measuring risk reduction truly validates explanatory quality versus just measuring correlation." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": false, 142 "justification": "Models listed as 'GPT-4o mini', 'Claude-3-Haiku', 'Qwen-Plus', 'DeepSeek-V3.2' — marketing names without snapshot dates or API versions. 'Llama-3.1-8B-Instruct' includes size but no specific checkpoint." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": true, 147 "justification": "Appendix C provides full prompt text for baseline methods (FT, FA). Figure 4(c) shows agent configuration prompts. 
The MAS agent prompts are from published frameworks (EconAgent, TwinMarket, SocialNetwork)." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": false, 152 "justification": "Paper states 'All LLM APIs were accessed using default parameters' (Section 4.1) without specifying temperature, top-p, or max tokens. M=1000 and λ=0.94 are stated but LLM inference parameters are not." 153 }, 154 "scaffolding_described": { 155 "applies": true, 156 "answer": true, 157 "justification": "MAS scenario implementations described in detail in Appendix B, including action spaces, belief update mechanisms, risk computation formulas, and counterfactual simulation procedures." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Trajectory generation, extreme event detection via threshold, counterfactual trajectory construction (replacing actions with safe baselines), and Monte Carlo sampling procedure all documented in Sections 3.1 and Appendix A-B." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": false, 169 "justification": "No dedicated limitations section. The conclusion briefly mentions 'Scaling the framework to significantly larger systems remains a primary objective' but this is one sentence, not substantive discussion." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": false, 174 "justification": "No threats to validity discussed. No mention of potential issues with the Monte Carlo approximation in practice, sensitivity to safe action definitions, or limitations of the simulation environments." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": false, 179 "justification": "No explicit boundaries stated about what the results do NOT show. The paper generalizes from 3 specific simulations to 'general insights' without stating scope limitations." 
180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "No raw trajectory data or attribution scores released. Only aggregated results in tables." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Simulation procedures described in detail: agent counts (N=10, 20), trajectory lengths (T=27-34), seed count (5 per setting, 10 for Monte Carlo), and extreme event threshold definition (Appendix B)." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. All data generated through LLM-powered simulations." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "Full pipeline documented: trajectory generation → extreme event detection → Shapley value computation via Monte Carlo → dimensional aggregation → metric computation (Sections 3.1-3.4, Algorithm 1)." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding information mentioned anywhere in the paper." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliations clearly listed: Shanghai AI Laboratory, SJTU, Fudan University, Renmin University, Tongji University." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding information disclosed, so independence cannot be assessed." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial interests statement in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": false, 229 "answer": false, 230 "justification": "The paper does not evaluate pre-trained model capability on benchmarks. 
LLMs are used as MAS components to generate simulation trajectories; the evaluation target is the attribution method, not the LLMs." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": false, 234 "answer": false, 235 "justification": "Not applicable — the paper evaluates an attribution framework, not model knowledge on a benchmark." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": false, 239 "answer": false, 240 "justification": "Not applicable — no benchmark capability evaluation of pre-trained models." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": false, 284 "justification": "No cost or latency reported. The method requires M=1000 re-simulations per trajectory, each involving multiple LLM calls, but no wall-clock time or API costs are mentioned." 
285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "No total computational budget stated despite the method requiring extensive LLM API calls (1000 Monte Carlo samples × trajectory length × re-simulation cost)." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": true, 296 "justification": "Results reported across 5 independent trajectories with different random seeds (Section 4.1), and 10 independent runs for Monte Carlo accuracy (Table 1). Mean and std dev reported." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": true, 301 "justification": "Explicitly stated: 'five independent trajectories' for main experiments, '10 independent runs' for Monte Carlo accuracy, 'five independent trajectories' for behavior visualization." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "M=1000 selected 'to balance efficiency and accuracy' with Table 1 showing accuracy vs M, but no systematic search over other hyperparameters (threshold ρ, q=0.9, etc.)." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": true, 311 "justification": "M=1000 selection justified by Table 1 showing cosine similarity >0.99 at M=1000. q=0.9 is stated as a convention. Threshold ρ is domain-expert defined." 312 }, 313 "multiple_comparison_correction": { 314 "applies": false, 315 "answer": false, 316 "justification": "No hypothesis testing performed, so no multiple comparison issue." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "Authors implement all baseline methods themselves (adapted FT, FA, AT prompts in Appendix C) without acknowledging potential bias from their own implementations of competitors." 
322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": false, 326 "justification": "Table 1 shows accuracy vs sample size M, but does not report the corresponding compute cost. No comparison of compute cost between the proposed method and baselines." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "No discussion of whether risk drop after action removal truly measures attribution faithfulness. This metric assumes removed actions cannot be compensated by remaining agents, which may not hold." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "No scaffolding comparison between models. All models are tested in the same MAS framework." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": false, 342 "answer": false, 343 "justification": "The paper evaluates an attribution method on simulation data, not model capability on a benchmark. Temporal leakage is not applicable." 344 }, 345 "feature_leakage_addressed": { 346 "applies": false, 347 "answer": false, 348 "justification": "Not applicable — no prediction task where features could leak answer information." 349 }, 350 "non_independence_addressed": { 351 "applies": false, 352 "answer": false, 353 "justification": "Not applicable — evaluation is on simulation trajectories, not a train/test data split." 354 }, 355 "leakage_detection_method": { 356 "applies": false, 357 "answer": false, 358 "justification": "Not applicable — no benchmark capability evaluation where leakage could occur." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "The Shapley value-based attribution method achieves the highest risk drop across the majority of experimental settings when removing top-attributed actions.", 365 "evidence": "Table 3 shows risk drop percentages for top-3 and top-10 action removal across 5 models and 3 scenarios. 
The proposed method achieves the highest risk drop in most but not all settings.", 366 "supported": "moderate" 367 }, 368 { 369 "claim": "Monte Carlo sampling with M=1000 achieves cosine similarity >0.99 with exact Shapley values.", 370 "evidence": "Table 1 reports cosine similarity between exact and approximated Shapley values across 5 LLMs and 2 scenarios. At M=1000, values typically exceed 0.99.", 371 "supported": "strong" 372 }, 373 { 374 "claim": "Extreme events originate from either early dormant risks or immediate shocks (Insight 1).", 375 "evidence": "Table 2 shows EconAgent has high Ltm (>0.6) while TwinMarket and SocialNetwork have Ltm≈0, demonstrated across 5 LLMs.", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "Extreme events are typically driven by a small subset of agents (Insight 2).", 380 "evidence": "Table 2 shows agent risk concentration Gag often >0.4 across all scenarios and models.", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "Agents with high risk contribution often exhibit high instability (Insight 3).", 385 "evidence": "Table 2 shows risk-instability correlation Cag often >0.6, with Figure 4(a) visualizing the correlation.", 386 "supported": "moderate" 387 } 388 ], 389 "red_flags": [ 390 { 391 "flag": "Small simulation scale", 392 "detail": "Experiments use only N=10-20 agents with trajectory lengths T=20-34. Claims about 'general insights into the emergence of extreme phenomena' are based on these small-scale simulations, yet there is no discussion of whether the findings hold at scale." 393 }, 394 { 395 "flag": "No failure analysis", 396 "detail": "Table 3 shows the method underperforms Agent Tracer on SocialNetwork (e.g., GPT-4o mini Top-3: 17.9 vs 22.2) but this is not discussed or analyzed." 397 }, 398 { 399 "flag": "Missing compute costs", 400 "detail": "The method requires 1000 Monte Carlo re-simulations per trajectory, each involving LLM calls. No cost or time estimates are provided, making practical applicability impossible to assess." 
401 }, 402 { 403 "flag": "Self-implemented baselines", 404 "detail": "All baseline methods are re-implemented by the authors with adapted prompts (Appendix C). The adaptations change the original methods' objectives (e.g., FA designed for single-point attribution is adapted to score all actions), potentially disadvantaging baselines." 405 }, 406 { 407 "flag": "Overclaiming generality", 408 "detail": "Five 'insights' are presented as general findings about extreme events in MAS, but they are derived from 3 specific simulation environments with fixed parameters and small agent counts." 409 } 410 ], 411 "cited_papers": [ 412 { 413 "title": "Generative agents: Interactive simulacra of human behavior", 414 "authors": ["J. S. Park", "J. O'Brien", "C. J. Cai", "M. R. Morris", "P. Liang", "M. S. Bernstein"], 415 "year": 2023, 416 "relevance": "Foundational work on LLM-powered multi-agent systems simulating human behavior." 417 }, 418 { 419 "title": "Why do multi-agent LLM systems fail?", 420 "authors": ["M. Cemri", "M. Z. Pan", "S. Yang"], 421 "year": 2025, 422 "arxiv_id": "2503.13657", 423 "relevance": "Taxonomy of failure modes in multi-agent LLM systems, used as a baseline attribution method." 424 }, 425 { 426 "title": "AgentTracer: Who is inducing failure in the LLM agentic systems?", 427 "authors": ["G. Zhang", "J. Wang", "J. Chen"], 428 "year": 2025, 429 "arxiv_id": "2509.03312", 430 "relevance": "Counterfactual-based method for estimating agent contribution in multi-agent systems, used as baseline." 431 }, 432 { 433 "title": "Which agent causes task failures and when? On automated failure attribution of LLM multi-agent systems", 434 "authors": ["S. Zhang", "M. Yin", "J. Zhang"], 435 "year": 2025, 436 "arxiv_id": "2505.00212", 437 "relevance": "LLM-prompting based failure attribution for multi-agent systems, used as baseline." 438 }, 439 { 440 "title": "EconAgent: Large language model-empowered agents for simulating macroeconomic activities", 441 "authors": ["N. 
Li", "C. Gao", "M. Li", "Y. Li", "Q. Liao"], 442 "year": 2024, 443 "relevance": "LLM-powered economic simulation framework used as one of the three evaluation scenarios." 444 }, 445 { 446 "title": "TwinMarket: A scalable behavioral and social simulation for financial markets", 447 "authors": ["Y. Yang", "Y. Zhang", "M. Wu"], 448 "year": 2025, 449 "arxiv_id": "2502.01506", 450 "relevance": "LLM-powered financial market simulation used as evaluation scenario." 451 }, 452 { 453 "title": "Decoding echo chambers: LLM-powered simulations revealing polarization in social networks", 454 "authors": ["C. Wang", "Z. Liu", "D. Yang", "X. Chen"], 455 "year": 2025, 456 "relevance": "LLM-powered social network simulation showing extreme polarization events." 457 }, 458 { 459 "title": "The rise and potential of large language model based agents: A survey", 460 "authors": ["Z. Xi", "W. Chen", "X. Guo"], 461 "year": 2025, 462 "relevance": "Survey of LLM-based agents covering capabilities, architecture, and applications." 463 }, 464 { 465 "title": "A unified approach to interpreting model predictions", 466 "authors": ["S. M. Lundberg", "S.-I. Lee"], 467 "year": 2017, 468 "relevance": "SHAP framework for model interpretability, foundational to the attribution approach used." 469 }, 470 { 471 "title": "CAMEL: Communicative agents for 'mind' exploration of large language model society", 472 "authors": ["G. Li", "H. Hammoud", "H. Itani"], 473 "year": 2023, 474 "relevance": "Multi-agent LLM framework for exploring emergent social behaviors." 475 }, 476 { 477 "title": "CompeteAI: Understanding the competition dynamics of large language model-based agents", 478 "authors": ["Q. Zhao", "J. Wang", "Y. Zhang"], 479 "year": 2024, 480 "relevance": "Study of competition dynamics in LLM-based multi-agent systems." 481 } 482 ] 483 }