scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (25183B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Interpreting Emergent Extreme Events in Multi-Agent Systems",
      6     "authors": [
      7       "Ling Tang",
      8       "Jilin Mei",
      9       "Dongrui Liu",
     10       "Chen Qian",
     11       "Dawei Cheng"
     12     ],
     13     "year": 2026,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2601.20538",
     16     "doi": "10.48550/arXiv.2601.20538"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The abstract claims (first framework for explaining extreme events in MAS, Shapley-based attribution, three-dimensional aggregation, effectiveness across three scenarios) are all demonstrated in the paper body.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper uses causal language ('drives', 'contributes to') in all five insights, but observations are drawn from only 5 independent trajectories per setting with N=4-20 agents; no design adequate for causal inference is employed.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The five insights are presented as general properties of extreme events across MAS, but are derived from only three simulated scenarios with very small agent counts and five trajectories each; the paper does not bound the generalization.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No alternative explanations are considered for any of the five insights; for example, the risk-instability correlation (Insight 3) could reflect shared confounders rather than a meaningful relationship.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "Risk metrics are explicitly defined (EWMA conditional variance for EconAgent/TwinMarket, belief variance for SocialNetwork), and claims are made about these specific metrics rather than broader notions.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The conclusion mentions one limitation ('scaling to significantly larger systems remains future work') but there is no dedicated limitations or threats-to-validity section.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No specific threats are discussed — the small trajectory count (n=5), expert-annotated thresholds, minimal agent counts, or reliance on default API parameters are never flagged as validity threats.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper does not state what the results do not show; the five insights are presented without caveats about the restricted simulation settings from which they were derived.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding acknowledgment appears anywhere in the paper text.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are clearly listed: Shanghai AI Laboratory, Shanghai Jiao Tong University, Fudan University, Renmin University of China, Tongji University.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "No funding is disclosed, so independence cannot be assessed.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "There is no competing interests or financial interests declaration in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Key terms are defined: 'extreme events' (Black Swans) are defined with three properties (outliers, extreme impact, retrospective predictability); 'Shapley value' is formally introduced; risk metrics are precisely specified per scenario.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper explicitly states it proposes 'the first framework for explaining emergent extreme events in MAS' and articulates three specific questions the framework answers.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 2 situates the work against three streams: LLM-based MAS simulation, attribution methods in MAS, and quantitative analysis of extreme events, explaining how this work differs from each.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "The abstract explicitly states 'The source code is available at https://github.com/mjl0613ddm/IEEE.'",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "No specific simulation trajectories are released; the environments (EconAgent, TwinMarket) are referenced to other papers, and the paper does not release the specific trajectory data used in experiments.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "No requirements file, Dockerfile, or dependency specification is mentioned; model APIs are accessed with 'default parameters' which are unspecified.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No step-by-step reproduction instructions are provided in the paper; the reader is pointed to a GitHub link without further guidance.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": true,
    150           "justification": "Tables 1 and 2 report mean ± standard deviation across independent runs for all main results.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests are used for any comparisons between methods or across scenarios despite comparative claims being made.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Table 3 reports percentage risk drop values that convey magnitude of improvement over baselines, providing interpretable effect-size information.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "Only 5 independent trajectories per experimental setting are used; no justification or power analysis is provided for this choice.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": true,
    174           "justification": "Standard deviations are reported in Tables 1 and 2 across all metrics and model/scenario combinations.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Four baselines are included: Random, Failure Taxonomy (Cemri et al., 2025), Failure Attribution (Zhang et al., 2025b), and Agent Tracer (Zhang et al., 2025a).",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "All baselines are from 2025, contemporary with the submission; they represent current approaches to MAS attribution.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": false,
    194           "justification": "No ablation of framework components is performed; Table 1 tests Monte Carlo sample sizes but this is a sensitivity analysis, not an ablation of design choices.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Five distinct metrics are proposed and evaluated (Ltm, Gag, Cag, Zag, Gbe) in addition to the faithfulness metric (risk drop).",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": false,
    205           "answer": false,
    206           "justification": "The framework evaluates computational systems; human evaluation is not relevant to the claims.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": false,
    211           "answer": false,
    212           "justification": "This is not a prediction task; results are derived from generated simulation trajectories rather than a train/test split.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Results in Tables 2 and 3 are broken down per scenario (EconAgent, TwinMarket, SocialNetwork) and per LLM model.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": false,
    224           "justification": "The paper does not explicitly discuss failure cases of its own framework; results where the proposed method underperforms (e.g., SocialNetwork Top-3 in Table 3) are not highlighted or analyzed.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": false,
    230           "justification": "In Table 3, the proposed method is outperformed by AT on SocialNetwork for several models, but this is not acknowledged or discussed as a negative result.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "Specific model versions are named and cited: GPT-4o mini, Llama-3.1-8B-Instruct, Claude-3-Haiku, Qwen-Plus, DeepSeek-V3.2.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": false,
    244           "justification": "Prompts for the FT and FA baselines are provided in the appendix, but the core agent prompts used in the EconAgent, TwinMarket, and SocialNetwork simulations are not provided — they are deferred to external papers.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "'All LLM APIs were accessed using default parameters' — temperature, top-p, and other hyperparameters are not specified.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "The agentic scaffolding is described in Section 4.1 and Appendix B, including agent action spaces, transition dynamics, risk metric definitions, and baseline action definitions for counterfactuals.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "The full data pipeline from trajectory generation to Shapley computation to metric derivation is documented in Section 3 and Appendices A-B, including EWMA risk computations.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "The raw trajectories used in experiments are not explicitly released; the code is available but specific experimental trajectories are not.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "The data generation process is described: run simulations with different random seeds until extreme events occur, collect 5 trajectories per setting.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants; data is from LLM-powered simulations.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The pipeline from simulation trajectory to attribution scores to metric computation is fully documented in Sections 3-4 and Appendix A-B.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": false,
    295           "answer": false,
    296           "justification": "The paper uses LLMs as agents in simulations, not evaluating model capabilities on benchmarks; training cutoff is not relevant.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": false,
    301           "answer": false,
    302           "justification": "Not applicable — LLMs are used as simulation agents, not evaluated on held-out benchmarks.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": false,
    307           "answer": false,
    308           "justification": "Not applicable — the paper does not evaluate model capabilities on established benchmarks.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "The paper discusses computational complexity (O(M × |Ω|)) but reports no actual inference cost, latency, or API expenditure.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "No total compute budget or wall-clock time is reported anywhere in the paper.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "Monte Carlo sampling with M=1000 achieves >0.99 cosine similarity to exact Shapley values across five LLMs and two scenarios.",
    375       "evidence": "Table 1 shows cosine similarity consistently exceeds 0.99 at M=10^3 for both EconAgent and TwinMarket across all five tested LLMs.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "The proposed Shapley-based attribution is more faithful than competing MAS attribution methods (FT, FA, AT, Random) as measured by risk drop after deleting top-k attributed actions.",
    380       "evidence": "Table 3 shows the proposed method achieves the highest risk drop in the majority of settings, though AT outperforms on SocialNetwork for several models at Top-3.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Extreme events originate with distinct temporal patterns — either early dormant risks (EconAgent, Ltm>0.6) or immediate shocks (TwinMarket and SocialNetwork, Ltm≈0).",
    385       "evidence": "Table 2 shows consistently high Ltm for EconAgent and near-zero Ltm for TwinMarket/SocialNetwork across all five LLMs.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Extreme events are typically driven by a small subset of agents, with agent risk concentration Gag consistently above 0.4.",
    390       "evidence": "Table 2 shows Gag > 0.4 in most experimental settings across three scenarios and five LLMs.",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "Agents with high risk contribution exhibit high behavioral instability, as reflected by consistently positive risk-instability correlation Cag > 0.6.",
    395       "evidence": "Table 2 shows Cag > 0.6 in most EconAgent and TwinMarket settings; SocialNetwork shows weaker and sometimes negative correlations.",
    396       "supported": "weak"
    397     },
    398     {
    399       "claim": "A small number of behavior patterns contribute the majority of risk leading to extreme events, with behavior risk concentration Gbe > 0.5.",
    400       "evidence": "Table 2 shows Gbe consistently above 0.5 across all three scenarios and five LLMs.",
    401       "supported": "moderate"
    402     }
    403   ],
    404   "methodology_tags": [
    405     "empirical",
    406     "benchmark-eval",
    407     "case-study"
    408   ],
    409   "key_findings": "This paper proposes a Shapley value-based framework to explain extreme events in LLM-powered multi-agent systems by attributing risk contributions across temporal, agent, and behavioral dimensions. Experiments across economic, financial, and social network simulations show the framework more faithfully identifies high-risk actions than competing methods, measured by the risk drop when those actions are removed. Analysis of three scenarios reveals consistent patterns: economic simulation extreme events originate from early dormant risks while financial and social network events emerge from immediate shocks; a small subset of agents drives most risk; and risk-contributing agents tend to exhibit high behavioral instability. Monte Carlo approximation of Shapley values converges at M=1000 samples with >0.99 cosine similarity to exact values.",
    410   "red_flags": [
    411     {
    412       "flag": "Tiny sample size",
    413       "detail": "Only 5 independent trajectories per experimental setting are used to derive all five insights; this is insufficient for reliable statistical inference about general properties of extreme events."
    414     },
    415     {
    416       "flag": "Small agent counts",
    417       "detail": "Experiments use N=4-20 agents per scenario, far below real-world multi-agent system scales; the paper claims general insights without bounding to these constrained settings."
    418     },
    419     {
    420       "flag": "Expert-annotated thresholds",
    421       "detail": "Extreme event thresholds are determined by 'domain experts' without specifying who the experts were, how the thresholds were chosen, or whether inter-rater agreement was validated, introducing unquantified subjectivity."
    422     },
    423     {
    424       "flag": "Unspecified LLM hyperparameters",
    425       "detail": "'All LLM APIs were accessed using default parameters' — temperature and sampling parameters are not reported, making exact reproduction impossible."
    426     },
    427     {
    428       "flag": "No significance testing",
    429       "detail": "All comparative claims in Table 3 are made without statistical significance tests, despite high variance in results (some risk drops are negative, meaning that deleting the top-attributed actions increased risk rather than reducing it)."
    430     },
    431     {
    432       "flag": "Underperformance not addressed",
    433       "detail": "The proposed method is outperformed by Agent Tracer on SocialNetwork for multiple LLMs at Top-3 (e.g., AT 22.2% vs ours 17.9% for GPT) but this negative result is not discussed."
    434     },
    435     {
    436       "flag": "No limitations section",
    437       "detail": "The only acknowledged limitation is scalability to larger systems; threats from small samples, simulated environments, expert-annotated labels, or API parameter sensitivity are ignored."
    438     }
    439   ],
    440   "cited_papers": [
    441     {
    442       "title": "EconAgent: Large language model-empowered agents for simulating macroeconomic activities",
    443       "relevance": "Primary simulation environment used for economic scenario experiments; defines the EconAgent MAS framework applied in this paper."
    444     },
    445     {
    446       "title": "TwinMarket: A scalable behavioral and social simulation for financial markets",
    447       "relevance": "Primary simulation environment for financial market scenario; provides the MAS framework for market crash experiments."
    448     },
    449     {
    450       "title": "Decoding echo chambers: LLM-powered simulations revealing polarization in social networks",
    451       "relevance": "Provides the SocialNetwork simulation environment used as the third experimental scenario."
    452     },
    453     {
    454       "title": "Why do multi-agent LLM systems fail? (Cemri et al., 2025)",
    455       "relevance": "Baseline method (Failure Taxonomy) compared against in faithfulness evaluation; introduces 14 failure modes for MAS."
    456     },
    457     {
    458       "title": "Which agent causes task failures and when? On automated failure attribution of LLM multi-agent systems (Zhang et al., 2025b)",
    459       "relevance": "Baseline method (Failure Attribution) compared in faithfulness evaluation; direct prompting approach to MAS attribution."
    460     },
    461     {
    462       "title": "AgenTracer: Who is inducing failure in the LLM agentic systems? (Zhang et al., 2025a)",
    463       "relevance": "Baseline method (Agent Tracer) compared in faithfulness evaluation; counterfactual-based surrogate model approach."
    464     },
    465     {
    466       "title": "A unified approach to interpreting model predictions (SHAP/Lundberg & Lee, 2017)",
    467       "relevance": "Foundational work on Shapley values for ML interpretability that this paper adapts to the MAS attribution setting."
    468     },
    469     {
    470       "title": "Generative agents: Interactive simulacra of human behavior (Park et al., 2023)",
    471       "relevance": "Landmark paper establishing LLM-powered agents for human behavior simulation; motivates the MAS simulation paradigm."
    472     },
    473     {
    474       "title": "A value for n-person games (Shapley, 1953)",
    475       "relevance": "Original Shapley value paper providing the game-theoretic foundation for the attribution method."
    476     },
    477     {
    478       "title": "OASIS: Open agent social interaction simulations with one million agents (Yang et al., 2024)",
    479       "relevance": "Large-scale LLM-powered MAS simulation work contextualizing the scale at which extreme event interpretation is needed."
    480     }
    481   ],
    482   "engagement_factors": {
    483     "practical_relevance": {
    484       "score": 2,
    485       "justification": "Framework for identifying which agents and behaviors cause systemic failures in AI simulations has direct application to AI safety and multi-agent system design."
    486     },
    487     "surprise_contrarian": {
    488       "score": 1,
    489       "justification": "Applying Shapley values to MAS attribution is methodologically novel but the insights (few agents drive risk, behavior concentration) are intuitive rather than surprising."
    490     },
    491     "fear_safety": {
    492       "score": 2,
    493       "justification": "Paper explicitly frames the problem as Black Swan events and systemic collapse in AI-powered systems, connecting to AI safety concerns about emergent dangerous behavior."
    494     },
    495     "drama_conflict": {
    496       "score": 1,
    497       "justification": "No significant controversy; the paper presents a technical framework without challenging established results or making strong contrarian claims."
    498     },
    499     "demo_ability": {
    500       "score": 2,
    501       "justification": "Code is released on GitHub, enabling practitioners to run the attribution framework on their own MAS simulations."
    502     },
    503     "brand_recognition": {
    504       "score": 1,
    505       "justification": "Shanghai AI Laboratory and affiliated universities are respected institutions but not in the top tier of brand recognition for this community."
    506     }
    507   },
    508   "hn_data": {
    509     "threads": [],
    510     "top_points": 0,
    511     "total_points": 0,
    512     "total_comments": 0
    513   }
    514 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs