scan-v5.json (25183B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Interpreting Emergent Extreme Events in Multi-Agent Systems", 6 "authors": [ 7 "Ling Tang", 8 "Jilin Mei", 9 "Dongrui Liu", 10 "Chen Qian", 11 "Dawei Cheng" 12 ], 13 "year": 2026, 14 "venue": "arXiv.org", 15 "arxiv_id": "2601.20538", 16 "doi": "10.48550/arXiv.2601.20538" 17 }, 18 "checklist": { 19 "claims_and_evidence": { 20 "abstract_claims_supported": { 21 "applies": true, 22 "answer": true, 23 "justification": "The abstract claims (first framework for explaining extreme events in MAS, Shapley-based attribution, three-dimensional aggregation, effectiveness across three scenarios) are all demonstrated in the paper body.", 24 "source": "haiku" 25 }, 26 "causal_claims_justified": { 27 "applies": true, 28 "answer": false, 29 "justification": "The paper uses causal language ('drives', 'contributes to') in all five insights, but observations are drawn from only 5 independent trajectories per setting with N=4-20 agents; no design adequate for causal inference is employed.", 30 "source": "haiku" 31 }, 32 "generalization_bounded": { 33 "applies": true, 34 "answer": false, 35 "justification": "The five insights are presented as general properties of extreme events across MAS, but are derived from only three simulated scenarios with very small agent counts and five trajectories each; the paper does not bound the generalization.", 36 "source": "haiku" 37 }, 38 "alternative_explanations_discussed": { 39 "applies": true, 40 "answer": false, 41 "justification": "No alternative explanations are considered for any of the five insights; for example, the risk-instability correlation (Insight 3) could reflect shared confounders rather than a meaningful relationship.", 42 "source": "haiku" 43 }, 44 "proxy_outcome_distinction": { 45 "applies": true, 46 "answer": true, 47 "justification": "Risk metrics are explicitly defined (EWMA conditional variance for EconAgent/TwinMarket, belief variance for SocialNetwork), and claims are made about these specific metrics rather than broader notions.", 48 "source": "haiku" 49 } 50 }, 51 "limitations_and_scope": { 52 "limitations_section_present": { 53 "applies": true, 54 "answer": false, 55 "justification": "The conclusion mentions one limitation ('scaling to significantly larger systems remains future work') but there is no dedicated limitations or threats-to-validity section.", 56 "source": "haiku" 57 }, 58 "threats_to_validity_specific": { 59 "applies": true, 60 "answer": false, 61 "justification": "No specific threats are discussed — the small trajectory count (n=5), expert-annotated thresholds, minimal agent counts, or reliance on default API parameters are never flagged as validity threats.", 62 "source": "haiku" 63 }, 64 "scope_boundaries_stated": { 65 "applies": true, 66 "answer": false, 67 "justification": "The paper does not state what the results do not show; the five insights are presented without caveats about the restricted simulation settings from which they were derived.", 68 "source": "haiku" 69 } 70 }, 71 "conflicts_of_interest": { 72 "funding_disclosed": { 73 "applies": true, 74 "answer": false, 75 "justification": "No funding acknowledgment appears anywhere in the paper text.", 76 "source": "haiku" 77 }, 78 "affiliations_disclosed": { 79 "applies": true, 80 "answer": true, 81 "justification": "Author affiliations are clearly listed: Shanghai AI Laboratory, Shanghai Jiao Tong University, Fudan University, Renmin University of China, Tongji University.", 82 "source": "haiku" 83 }, 84 "funder_independent_of_outcome": { 85 "applies": false, 86 "answer": false, 87 "justification": "No funding is disclosed, so independence cannot be assessed.", 88 "source": "haiku" 89 }, 90 "financial_interests_declared": { 91 "applies": true, 92 "answer": false, 93 "justification": "There is no competing interests or financial interests declaration in the paper.", 94 "source": "haiku" 95 } 96 }, 97 "scope_and_framing": { 98 "key_terms_defined": { 99 "applies": true, 100 "answer": true, 101 "justification": "Key terms are defined: 'extreme events' (Black Swans) are defined with three properties (outliers, extreme impact, retrospective predictability); 'Shapley value' is formally introduced; risk metrics are precisely specified per scenario.", 102 "source": "haiku" 103 }, 104 "intended_contribution_clear": { 105 "applies": true, 106 "answer": true, 107 "justification": "The paper explicitly states it proposes 'the first framework for explaining emergent extreme events in MAS' and articulates three specific questions the framework answers.", 108 "source": "haiku" 109 }, 110 "engagement_with_prior_work": { 111 "applies": true, 112 "answer": true, 113 "justification": "Section 2 situates the work against three streams: LLM-based MAS simulation, attribution methods in MAS, and quantitative analysis of extreme events, explaining how this work differs from each.", 114 "source": "haiku" 115 } 116 } 117 }, 118 "type_checklist": { 119 "empirical": { 120 "artifacts": { 121 "code_released": { 122 "applies": true, 123 "answer": true, 124 "justification": "The abstract explicitly states 'The source code is available at https://github.com/mjl0613ddm/IEEE.'", 125 "source": "haiku" 126 }, 127 "data_released": { 128 "applies": true, 129 "answer": false, 130 "justification": "No specific simulation trajectories are released; the environments (EconAgent, TwinMarket) are referenced to other papers, and the paper does not release the specific trajectory data used in experiments.", 131 "source": "haiku" 132 }, 133 "environment_specified": { 134 "applies": true, 135 "answer": false, 136 "justification": "No requirements file, Dockerfile, or dependency specification is mentioned; model APIs are accessed with 'default parameters' which are unspecified.", 137 "source": "haiku" 138 }, 139 "reproduction_instructions": { 140 "applies": true, 141 "answer": false, 142 "justification": "No step-by-step reproduction instructions are provided in the paper; the reader is pointed to a GitHub link without further guidance.", 143 "source": "haiku" 144 } 145 }, 146 "statistical_methodology": { 147 "confidence_intervals_or_error_bars": { 148 "applies": true, 149 "answer": true, 150 "justification": "Tables 1 and 2 report mean ± standard deviation across independent runs for all main results.", 151 "source": "haiku" 152 }, 153 "significance_tests": { 154 "applies": true, 155 "answer": false, 156 "justification": "No statistical significance tests are used for any comparisons between methods or across scenarios despite comparative claims being made.", 157 "source": "haiku" 158 }, 159 "effect_sizes_reported": { 160 "applies": true, 161 "answer": true, 162 "justification": "Table 3 reports percentage risk drop values that convey magnitude of improvement over baselines, providing interpretable effect-size information.", 163 "source": "haiku" 164 }, 165 "sample_size_justified": { 166 "applies": true, 167 "answer": false, 168 "justification": "Only 5 independent trajectories per experimental setting are used; no justification or power analysis is provided for this choice.", 169 "source": "haiku" 170 }, 171 "variance_reported": { 172 "applies": true, 173 "answer": true, 174 "justification": "Standard deviations are reported in Tables 1 and 2 across all metrics and model/scenario combinations.", 175 "source": "haiku" 176 } 177 }, 178 "evaluation_design": { 179 "baselines_included": { 180 "applies": true, 181 "answer": true, 182 "justification": "Four baselines are included: Random, Failure Taxonomy (Cemri et al., 2025), Failure Attribution (Zhang et al., 2025b), and Agent Tracer (Zhang et al., 2025a).", 183 "source": "haiku" 184 }, 185 "baselines_contemporary": { 186 "applies": true, 187 "answer": true, 188 "justification": "All baselines are from 2025, contemporary with the submission; they represent current approaches to MAS attribution.", 189 "source": "haiku" 190 }, 191 "ablation_study": { 192 "applies": true, 193 "answer": false, 194 "justification": "No ablation of framework components is performed; Table 1 tests Monte Carlo sample sizes but this is a sensitivity analysis, not an ablation of design choices.", 195 "source": "haiku" 196 }, 197 "multiple_metrics": { 198 "applies": true, 199 "answer": true, 200 "justification": "Five distinct metrics are proposed and evaluated (Ltm, Gag, Cag, Zag, Gbe) in addition to the faithfulness metric (risk drop).", 201 "source": "haiku" 202 }, 203 "human_evaluation": { 204 "applies": false, 205 "answer": false, 206 "justification": "The framework evaluates computational systems; human evaluation is not relevant to the claims.", 207 "source": "haiku" 208 }, 209 "held_out_test_set": { 210 "applies": false, 211 "answer": false, 212 "justification": "This is not a prediction task; results are derived from generated simulation trajectories rather than a train/test split.", 213 "source": "haiku" 214 }, 215 "per_category_breakdown": { 216 "applies": true, 217 "answer": true, 218 "justification": "Results in Tables 2 and 3 are broken down per scenario (EconAgent, TwinMarket, SocialNetwork) and per LLM model.", 219 "source": "haiku" 220 }, 221 "failure_cases_discussed": { 222 "applies": true, 223 "answer": false, 224 "justification": "The paper does not explicitly discuss failure cases of its own framework; results where their method underperforms (e.g., SocialNetwork Top-3 in Table 3) are not highlighted or analyzed.", 225 "source": "haiku" 226 }, 227 "negative_results_reported": { 228 "applies": true, 229 "answer": false, 230 "justification": "In Table 3, the proposed method is outperformed by AT on SocialNetwork for several models, but this is not acknowledged or discussed as a negative result.", 231 "source": "haiku" 232 } 233 }, 234 "setup_transparency": { 235 "model_versions_specified": { 236 "applies": true, 237 "answer": true, 238 "justification": "Specific model versions are named and cited: GPT-4o mini, Llama-3.1-8B-Instruct, Claude-3-Haiku, Qwen-Plus, DeepSeek-V3.2.", 239 "source": "haiku" 240 }, 241 "prompts_provided": { 242 "applies": true, 243 "answer": false, 244 "justification": "Prompts for the FT and FA baselines are provided in the appendix, but the core agent prompts used in the EconAgent, TwinMarket, and SocialNetwork simulations are not provided — they are deferred to external papers.", 245 "source": "haiku" 246 }, 247 "hyperparameters_reported": { 248 "applies": true, 249 "answer": false, 250 "justification": "'All LLM APIs were accessed using default parameters' — temperature, top-p, and other hyperparameters are not specified.", 251 "source": "haiku" 252 }, 253 "scaffolding_described": { 254 "applies": true, 255 "answer": true, 256 "justification": "The agentic scaffolding is described in Section 4.1 and Appendix B, including agent action spaces, transition dynamics, risk metric definitions, and baseline action definitions for counterfactuals.", 257 "source": "haiku" 258 }, 259 "data_preprocessing_documented": { 260 "applies": true, 261 "answer": true, 262 "justification": "The full data pipeline from trajectory generation to Shapley computation to metric derivation is documented in Sections 3 and Appendix A-B, including EWMA risk computations.", 263 "source": "haiku" 264 } 265 }, 266 "data_integrity": { 267 "raw_data_available": { 268 "applies": true, 269 "answer": false, 270 "justification": "The raw trajectories used in experiments are not explicitly released; the code is available but specific experimental trajectories are not.", 271 "source": "haiku" 272 }, 273 "data_collection_described": { 274 "applies": true, 275 "answer": true, 276 "justification": "The data generation process is described: run simulations with different random seeds until extreme events occur, collect 5 trajectories per setting.", 277 "source": "haiku" 278 }, 279 "recruitment_methods_described": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants; data is from LLM-powered simulations.", 283 "source": "haiku" 284 }, 285 "data_pipeline_documented": { 286 "applies": true, 287 "answer": true, 288 "justification": "The pipeline from simulation trajectory to attribution scores to metric computation is fully documented in Sections 3-4 and Appendix A-B.", 289 "source": "haiku" 290 } 291 }, 292 "contamination": { 293 "training_cutoff_stated": { 294 "applies": false, 295 "answer": false, 296 "justification": "The paper uses LLMs as agents in simulations, not evaluating model capabilities on benchmarks; training cutoff is not relevant.", 297 "source": "haiku" 298 }, 299 "train_test_overlap_discussed": { 300 "applies": false, 301 "answer": false, 302 "justification": "Not applicable — LLMs are used as simulation agents, not evaluated on held-out benchmarks.", 303 "source": "haiku" 304 }, 305 "benchmark_contamination_addressed": { 306 "applies": false, 307 "answer": false, 308 "justification": "Not applicable — the paper does not evaluate model capabilities on established benchmarks.", 309 "source": "haiku" 310 } 311 }, 312 "human_studies": { 313 "pre_registered": { 314 "applies": false, 315 "answer": false, 316 "justification": "No human participants.", 317 "source": "haiku" 318 }, 319 "irb_or_ethics_approval": { 320 "applies": false, 321 "answer": false, 322 "justification": "No human participants.", 323 "source": "haiku" 324 }, 325 "demographics_reported": { 326 "applies": false, 327 "answer": false, 328 "justification": "No human participants.", 329 "source": "haiku" 330 }, 331 "inclusion_exclusion_criteria": { 332 "applies": false, 333 "answer": false, 334 "justification": "No human participants.", 335 "source": "haiku" 336 }, 337 "randomization_described": { 338 "applies": false, 339 "answer": false, 340 "justification": "No human participants.", 341 "source": "haiku" 342 }, 343 "blinding_described": { 344 "applies": false, 345 "answer": false, 346 "justification": "No human participants.", 347 "source": "haiku" 348 }, 349 "attrition_reported": { 350 "applies": false, 351 "answer": false, 352 "justification": "No human participants.", 353 "source": "haiku" 354 } 355 }, 356 "cost_and_practicality": { 357 "inference_cost_reported": { 358 "applies": true, 359 "answer": false, 360 "justification": "The paper discusses computational complexity (O(M × |Ω|)) but reports no actual inference cost, latency, or API expenditure.", 361 "source": "haiku" 362 }, 363 "compute_budget_stated": { 364 "applies": true, 365 "answer": false, 366 "justification": "No total compute budget or wall-clock time is reported anywhere in the paper.", 367 "source": "haiku" 368 } 369 } 370 } 371 }, 372 "claims": [ 373 { 374 "claim": "Monte Carlo sampling with M=1000 achieves >0.99 cosine similarity to exact Shapley values across five LLMs and two scenarios.", 375 "evidence": "Table 1 shows cosine similarity consistently exceeds 0.99 at M=10^3 for both EconAgent and TwinMarket across all five tested LLMs.", 376 "supported": "strong" 377 }, 378 { 379 "claim": "The proposed Shapley-based attribution is more faithful than competing MAS attribution methods (FT, FA, AT, Random) as measured by risk drop after deleting top-k attributed actions.", 380 "evidence": "Table 3 shows the proposed method achieves the highest risk drop in the majority of settings, though AT outperforms on SocialNetwork for several models at Top-3.", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "Extreme events originate with distinct temporal patterns — either early dormant risks (EconAgent, Ltm>0.6) or immediate shocks (TwinMarket and SocialNetwork, Ltm≈0).", 385 "evidence": "Table 2 shows consistently high Ltm for EconAgent and near-zero Ltm for TwinMarket/SocialNetwork across all five LLMs.", 386 "supported": "moderate" 387 }, 388 { 389 "claim": "Extreme events are typically driven by a small subset of agents, with agent risk concentration Gag consistently above 0.4.", 390 "evidence": "Table 2 shows Gag > 0.4 in most experimental settings across three scenarios and five LLMs.", 391 "supported": "moderate" 392 }, 393 { 394 "claim": "Agents with high risk contribution exhibit high behavioral instability, as reflected by consistently positive risk-instability correlation Cag > 0.6.", 395 "evidence": "Table 2 shows Cag > 0.6 in most EconAgent and TwinMarket settings; SocialNetwork shows weaker and sometimes negative correlations.", 396 "supported": "weak" 397 }, 398 { 399 "claim": "A small number of behavior patterns contribute the majority of risk leading to extreme events, with behavior risk concentration Gbe > 0.5.", 400 "evidence": "Table 2 shows Gbe consistently above 0.5 across all three scenarios and five LLMs.", 401 "supported": "moderate" 402 } 403 ], 404 "methodology_tags": [ 405 "empirical", 406 "benchmark-eval", 407 "case-study" 408 ], 409 "key_findings": "This paper proposes a Shapley value-based framework to explain extreme events in LLM-powered multi-agent systems by attributing risk contributions across temporal, agent, and behavioral dimensions. Experiments across economic, financial, and social network simulations show the framework more faithfully identifies high-risk actions than competing methods, measured by the risk drop when those actions are removed. Analysis of three scenarios reveals consistent patterns: economic simulation extreme events originate from early dormant risks while financial and social network events emerge from immediate shocks; a small subset of agents drives most risk; and risk-contributing agents tend to exhibit high behavioral instability. Monte Carlo approximation of Shapley values converges at M=1000 samples with >0.99 cosine similarity to exact values.", 410 "red_flags": [ 411 { 412 "flag": "Tiny sample size", 413 "detail": "Only 5 independent trajectories per experimental setting are used to derive all five insights; this is insufficient for reliable statistical inference about general properties of extreme events." 414 }, 415 { 416 "flag": "Small agent counts", 417 "detail": "Experiments use N=4-20 agents per scenario, far below real-world multi-agent system scales; the paper claims general insights without bounding to these constrained settings." 418 }, 419 { 420 "flag": "Expert-annotated thresholds", 421 "detail": "Extreme event thresholds are determined by 'domain experts' without specifying who, how, or validation of inter-rater agreement, introducing unquantified subjectivity." 422 }, 423 { 424 "flag": "Unspecified LLM hyperparameters", 425 "detail": "'All LLM APIs were accessed using default parameters' — temperature and sampling parameters are not reported, making exact reproduction impossible." 426 }, 427 { 428 "flag": "No significance testing", 429 "detail": "All comparative claims in Table 3 are made without statistical significance tests, despite high variance in results (some negative risk drops indicate the method can harm attribution)." 430 }, 431 { 432 "flag": "Underperformance not addressed", 433 "detail": "The proposed method is outperformed by Agent Tracer on SocialNetwork for multiple LLMs at Top-3 (e.g., AT 22.2% vs ours 17.9% for GPT) but this negative result is not discussed." 434 }, 435 { 436 "flag": "No limitations section", 437 "detail": "The only acknowledged limitation is scalability to larger systems; threats from small samples, simulated environments, expert-annotated labels, or API parameter sensitivity are ignored." 438 } 439 ], 440 "cited_papers": [ 441 { 442 "title": "EconAgent: Large language model-empowered agents for simulating macroeconomic activities", 443 "relevance": "Primary simulation environment used for economic scenario experiments; defines the EconAgent MAS framework applied in this paper." 444 }, 445 { 446 "title": "TwinMarket: A scalable behavioral and social simulation for financial markets", 447 "relevance": "Primary simulation environment for financial market scenario; provides the MAS framework for market crash experiments." 448 }, 449 { 450 "title": "Decoding echo chambers: LLM-powered simulations revealing polarization in social networks", 451 "relevance": "Provides the SocialNetwork simulation environment used as the third experimental scenario." 452 }, 453 { 454 "title": "Why do multi-agent LLM systems fail? (Cemri et al., 2025)", 455 "relevance": "Baseline method (Failure Taxonomy) compared against in faithfulness evaluation; introduces 14 failure modes for MAS." 456 }, 457 { 458 "title": "Which agent causes task failures and when? On automated failure attribution of LLM multi-agent systems (Zhang et al., 2025b)", 459 "relevance": "Baseline method (Failure Attribution) compared in faithfulness evaluation; direct prompting approach to MAS attribution." 460 }, 461 { 462 "title": "AgentTracer: Who is inducing failure in the LLM agentic systems? (Zhang et al., 2025a)", 463 "relevance": "Baseline method (Agent Tracer) compared in faithfulness evaluation; counterfactual-based surrogate model approach." 464 }, 465 { 466 "title": "A unified approach to interpreting model predictions (SHAP/Lundberg & Lee, 2017)", 467 "relevance": "Foundational work on Shapley values for ML interpretability that this paper adapts to the MAS attribution setting." 468 }, 469 { 470 "title": "Generative agents: Interactive simulacra of human behavior (Park et al., 2023)", 471 "relevance": "Landmark paper establishing LLM-powered agents for human behavior simulation; motivates the MAS simulation paradigm." 472 }, 473 { 474 "title": "A value for n-person games (Shapley, 1953)", 475 "relevance": "Original Shapley value paper providing the game-theoretic foundation for the attribution method." 476 }, 477 { 478 "title": "OASIS: Open agent social interaction simulations with one million agents (Yang et al., 2024)", 479 "relevance": "Large-scale LLM-powered MAS simulation work contextualizing the scale at which extreme event interpretation is needed." 480 } 481 ], 482 "engagement_factors": { 483 "practical_relevance": { 484 "score": 2, 485 "justification": "Framework for identifying which agents and behaviors cause systemic failures in AI simulations has direct application to AI safety and multi-agent system design." 486 }, 487 "surprise_contrarian": { 488 "score": 1, 489 "justification": "Applying Shapley values to MAS attribution is methodologically novel but the insights (few agents drive risk, behavior concentration) are intuitive rather than surprising." 490 }, 491 "fear_safety": { 492 "score": 2, 493 "justification": "Paper explicitly frames the problem as Black Swan events and systemic collapse in AI-powered systems, connecting to AI safety concerns about emergent dangerous behavior." 494 }, 495 "drama_conflict": { 496 "score": 1, 497 "justification": "No significant controversy; the paper presents a technical framework without challenging established results or making strong contrarian claims." 498 }, 499 "demo_ability": { 500 "score": 2, 501 "justification": "Code is released on GitHub, enabling practitioners to run the attribution framework on their own MAS simulations." 502 }, 503 "brand_recognition": { 504 "score": 1, 505 "justification": "Shanghai AI Laboratory and affiliated universities are respected institutions but not in the top tier of brand recognition for this community." 506 } 507 }, 508 "hn_data": { 509 "threads": [], 510 "top_points": 0, 511 "total_points": 0, 512 "total_comments": 0 513 } 514 }