scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24319B)
      1 {
      2   "paper": {
      3     "title": "Interpreting Emergent Extreme Events in Multi-Agent Systems",
      4     "authors": ["Ling Tang", "Jilin Mei", "Dongrui Liu", "Chen Qian", "Dawei Cheng", "Jing Shao", "Xia Hu"],
      5     "year": 2026,
      6     "venue": "arXiv preprint",
      7     "arxiv_id": "2601.20538",
      8     "doi": "10.48550/arXiv.2601.20538"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval", "theoretical"],
     13   "key_findings": "The paper proposes a Shapley value-based framework for attributing emergent extreme events in LLM-powered multi-agent systems across three dimensions: when, who, and what. Experiments across economic, financial, and social simulation scenarios show extreme events originate from either dormant risks or immediate shocks, are driven by a small subset of agents, and stem from a few dominant behaviors. The proposed method outperforms baseline attribution methods (random, LLM-prompting, surrogate model) in faithfulness as measured by risk drop when removing top-attributed actions.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "GitHub link provided in abstract: https://github.com/mjl0613ddm/IEEE"
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper uses publicly available MAS frameworks (EconAgent, TwinMarket, SocialNetwork) which are existing open-source systems."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No requirements.txt, Dockerfile, or environment setup details provided. The paper does not specify library versions or dependencies."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions. The paper describes methodology but does not provide a README or script for replicating experiments."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Tables 1, 2, and 3 all report mean ± standard deviation across independent runs."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No statistical significance tests used. Claims that 'our method achieved the highest risk drop' are based on comparing raw numbers in Table 3 without any tests."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Table 3 reports risk drop as percentage (e.g., '36.31% vs 5.32%'), providing magnitude of effect relative to baseline. Table 1 reports cosine similarity values."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Five independent trajectories per setting are used with no justification for why five is sufficient. N=10 or N=20 agents chosen without justification."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Standard deviations reported across independent runs in all main results tables (Tables 1, 2, 3)."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Four baselines compared: Random, Failure Taxonomy (Cemri et al., 2025), Failure Attribution (Zhang et al., 2025b), and Agent Tracer (Zhang et al., 2025a). Table 3."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "All three non-random baselines are from 2025, representing the current state of the art in MAS attribution."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "No ablation study. The framework has multiple components (Shapley attribution, dimensional aggregation, metrics) but none are ablated to show individual contribution."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Multiple metrics used: cosine similarity for approximation accuracy (Table 1), risk drop for faithfulness (Table 3), plus five designed metrics (Ltm, Gag, Cag, Zag, Gbe) in Table 2."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No human evaluation of the attribution quality. The framework claims to 'explain' extreme events but interpretability is only assessed via automated risk drop, not human judgment."
     89       },
     90       "held_out_test_set": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "Not a learning task. The method is applied to simulation trajectories, not trained on data."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results broken down by scenario (EconAgent, TwinMarket, SocialNetwork) and by model (5 LLMs) in all tables."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "No discussion of cases where the method fails. Table 3 shows their method underperforms Agent Tracer on SocialNetwork (e.g., GPT-4o Top-3: 17.9 vs 22.2) but this is not discussed."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": false,
    108         "justification": "No negative results discussed. The SocialNetwork scenario shows weaker performance but the paper does not analyze why or report any failed approaches."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims about 'effectiveness' and 'general insights' are supported by Tables 1-3 and the five insights derived from quantitative analysis."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The causal claims are justified through a principled framework: Shapley values provide axiomatic attribution (Properties 1-4 proved in Appendix A), and faithfulness is validated through counterfactual intervention (deleting top actions and measuring risk drop)."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper claims 'general insights into the emergence of extreme phenomena' (abstract) based on only 3 simulation scenarios with small agent counts (N=10-20). The title says 'Multi-Agent Systems' broadly without bounding to these specific simulation types."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "No discussion of alternative explanations for the results. For example, the risk drop metric may favor methods that identify correlated rather than causal actions, but this is not discussed."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "Risk drop is used as a proxy for 'faithfulness' of attribution, but the paper does not discuss whether removing high-attribution actions and measuring risk reduction truly validates explanatory quality versus just measuring correlation."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "Models listed as 'GPT-4o mini', 'Claude-3-Haiku', 'Qwen-Plus', 'DeepSeek-V3.2' — marketing names without snapshot dates or API versions. 'Llama-3.1-8B-Instruct' includes size but no specific checkpoint."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Appendix C provides full prompt text for baseline methods (FT, FA). Figure 4(c) shows agent configuration prompts. The MAS agent prompts are from published frameworks (EconAgent, TwinMarket, SocialNetwork)."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "Paper states 'All LLM APIs were accessed using default parameters' (Section 4.1) without specifying temperature, top-p, or max tokens. M=1000 and λ=0.94 are stated but LLM inference parameters are not."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "MAS scenario implementations described in detail in Appendix B, including action spaces, belief update mechanisms, risk computation formulas, and counterfactual simulation procedures."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Trajectory generation, extreme event detection via threshold, counterfactual trajectory construction (replacing actions with safe baselines), and Monte Carlo sampling procedure all documented in Sections 3.1 and Appendix A-B."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No dedicated limitations section. The conclusion briefly mentions 'Scaling the framework to significantly larger systems remains a primary objective' but this is one sentence, not substantive discussion."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No threats to validity discussed. No mention of potential issues with the Monte Carlo approximation in practice, sensitivity to safe action definitions, or limitations of the simulation environments."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "No explicit boundaries stated about what the results do NOT show. The paper generalizes from 3 specific simulations to 'general insights' without stating scope limitations."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw trajectory data or attribution scores released. Only aggregated results in tables."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Simulation procedures described in detail: agent counts (N=10, 20), trajectory lengths (T=27-34), seed count (5 per setting, 10 for Monte Carlo), and extreme event threshold definition (Appendix B)."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. All data generated through LLM-powered simulations."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Full pipeline documented: trajectory generation → extreme event detection → Shapley value computation via Monte Carlo → dimensional aggregation → metric computation (Sections 3.1-3.4, Algorithm 1)."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding information mentioned anywhere in the paper."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations clearly listed: Shanghai AI Laboratory, SJTU, Fudan University, Renmin University, Tongji University."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding information disclosed, so independence cannot be assessed."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "The paper does not evaluate pre-trained model capability on benchmarks. LLMs are used as MAS components to generate simulation trajectories; the evaluation target is the attribution method, not the LLMs."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "Not applicable — the paper evaluates an attribution framework, not model knowledge on a benchmark."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "Not applicable — no benchmark capability evaluation of pre-trained models."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No cost or latency reported. The method requires M=1000 re-simulations per trajectory, each involving multiple LLM calls, but no wall-clock time or API costs are mentioned."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No total computational budget stated despite the method requiring extensive LLM API calls (1000 Monte Carlo samples × trajectory length × re-simulation cost)."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": true,
    296         "justification": "Results reported across 5 independent trajectories with different random seeds (Section 4.1), and 10 independent runs for Monte Carlo accuracy (Table 1). Mean and std dev reported."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "Explicitly stated: 'five independent trajectories' for main experiments, '10 independent runs' for Monte Carlo accuracy, 'five independent trajectories' for behavior visualization."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "M=1000 selected 'to balance efficiency and accuracy' with Table 1 showing accuracy vs M, but no systematic search over other hyperparameters (threshold ρ, q=0.9, etc.)."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "M=1000 selection justified by Table 1 showing cosine similarity >0.99 at M=1000. q=0.9 is stated as a convention. Threshold ρ is domain-expert defined."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": false,
    315         "answer": false,
    316         "justification": "No hypothesis testing performed, so no multiple comparison issue."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "Authors implement all baseline methods themselves (adapted FT, FA, AT prompts in Appendix C) without acknowledging potential bias from their own implementations of competitors."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "Table 1 shows accuracy vs sample size M, but does not report the corresponding compute cost. No comparison of compute cost between the proposed method and baselines."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "No discussion of whether risk drop after action removal truly measures attribution faithfulness. This metric assumes removed actions cannot be compensated by remaining agents, which may not hold."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No scaffolding comparison between models. All models are tested in the same MAS framework."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": false,
    342         "answer": false,
    343         "justification": "The paper evaluates an attribution method on simulation data, not model capability on a benchmark. Temporal leakage is not applicable."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": false,
    347         "answer": false,
    348         "justification": "Not applicable — no prediction task where features could leak answer information."
    349       },
    350       "non_independence_addressed": {
    351         "applies": false,
    352         "answer": false,
    353         "justification": "Not applicable — evaluation is on simulation trajectories, not a train/test data split."
    354       },
    355       "leakage_detection_method": {
    356         "applies": false,
    357         "answer": false,
    358         "justification": "Not applicable — no benchmark capability evaluation where leakage could occur."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "The Shapley value-based attribution method achieves the highest risk drop across the majority of experimental settings when removing top-attributed actions.",
    365       "evidence": "Table 3 shows risk drop percentages for top-3 and top-10 action removal across 5 models and 3 scenarios. The proposed method achieves highest risk drop in most but not all settings.",
    366       "supported": "moderate"
    367     },
    368     {
    369       "claim": "Monte Carlo sampling with M=1000 achieves cosine similarity >0.99 with exact Shapley values.",
    370       "evidence": "Table 1 reports cosine similarity between exact and approximated Shapley values across 5 LLMs and 2 scenarios. At M=1000, values typically exceed 0.99.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "Extreme events originate from either early dormant risks or immediate shocks (Insight 1).",
    375       "evidence": "Table 2 shows EconAgent has high Ltm (>0.6) while TwinMarket and SocialNetwork have Ltm≈0, demonstrated across 5 LLMs.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "Extreme events are typically driven by a small subset of agents (Insight 2).",
    380       "evidence": "Table 2 shows agent risk concentration Gag often >0.4 across all scenarios and models.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Agents with high risk contribution often exhibit high instability (Insight 3).",
    385       "evidence": "Table 2 shows risk-instability correlation Cag often >0.6, with Figure 4(a) visualizing the correlation.",
    386       "supported": "moderate"
    387     }
    388   ],
    389   "red_flags": [
    390     {
    391       "flag": "Small simulation scale",
    392       "detail": "Experiments use only N=10-20 agents with trajectory lengths T=20-34. Claims about 'general insights into extreme phenomena' are based on these small-scale simulations, and the paper does not discuss whether the findings hold at scale."
    393     },
    394     {
    395       "flag": "No failure analysis",
    396       "detail": "Table 3 shows the method underperforms Agent Tracer on SocialNetwork (e.g., GPT-4o Top-3: 17.9 vs 22.2) but this is not discussed or analyzed."
    397     },
    398     {
    399       "flag": "Missing compute costs",
    400       "detail": "The method requires 1000 Monte Carlo re-simulations per trajectory, each involving LLM calls. No cost or time estimates are provided, making practical applicability impossible to assess."
    401     },
    402     {
    403       "flag": "Self-implemented baselines",
    404       "detail": "All baseline methods are re-implemented by the authors with adapted prompts (Appendix C). The adaptations change the original methods' objectives (e.g., FA designed for single-point attribution is adapted to score all actions), potentially disadvantaging baselines."
    405     },
    406     {
    407       "flag": "Overclaiming generality",
    408       "detail": "Five 'insights' are presented as general findings about extreme events in MAS, but they are derived from 3 specific simulation environments with fixed parameters and small agent counts."
    409     }
    410   ],
    411   "cited_papers": [
    412     {
    413       "title": "Generative agents: Interactive simulacra of human behavior",
    414       "authors": ["J. S. Park", "J. O'Brien", "C. J. Cai", "M. R. Morris", "P. Liang", "M. S. Bernstein"],
    415       "year": 2023,
    416       "relevance": "Foundational work on LLM-powered multi-agent systems simulating human behavior."
    417     },
    418     {
    419       "title": "Why do multi-agent LLM systems fail?",
    420       "authors": ["M. Cemri", "M. Z. Pan", "S. Yang"],
    421       "year": 2025,
    422       "arxiv_id": "2503.13657",
    423       "relevance": "Taxonomy of failure modes in multi-agent LLM systems, used as a baseline attribution method."
    424     },
    425     {
    426       "title": "AgentTracer: Who is inducing failure in the LLM agentic systems?",
    427       "authors": ["G. Zhang", "J. Wang", "J. Chen"],
    428       "year": 2025,
    429       "arxiv_id": "2509.03312",
    430       "relevance": "Counterfactual-based method for estimating agent contribution in multi-agent systems, used as baseline."
    431     },
    432     {
    433       "title": "Which agent causes task failures and when? On automated failure attribution of LLM multi-agent systems",
    434       "authors": ["S. Zhang", "M. Yin", "J. Zhang"],
    435       "year": 2025,
    436       "arxiv_id": "2505.00212",
    437       "relevance": "LLM-prompting based failure attribution for multi-agent systems, used as baseline."
    438     },
    439     {
    440       "title": "EconAgent: Large language model-empowered agents for simulating macroeconomic activities",
    441       "authors": ["N. Li", "C. Gao", "M. Li", "Y. Li", "Q. Liao"],
    442       "year": 2024,
    443       "relevance": "LLM-powered economic simulation framework used as one of the three evaluation scenarios."
    444     },
    445     {
    446       "title": "TwinMarket: A scalable behavioral and social simulation for financial markets",
    447       "authors": ["Y. Yang", "Y. Zhang", "M. Wu"],
    448       "year": 2025,
    449       "arxiv_id": "2502.01506",
    450       "relevance": "LLM-powered financial market simulation used as evaluation scenario."
    451     },
    452     {
    453       "title": "Decoding echo chambers: LLM-powered simulations revealing polarization in social networks",
    454       "authors": ["C. Wang", "Z. Liu", "D. Yang", "X. Chen"],
    455       "year": 2025,
    456       "relevance": "LLM-powered social network simulation showing extreme polarization events."
    457     },
    458     {
    459       "title": "The rise and potential of large language model based agents: A survey",
    460       "authors": ["Z. Xi", "W. Chen", "X. Guo"],
    461       "year": 2025,
    462       "relevance": "Survey of LLM-based agents covering capabilities, architecture, and applications."
    463     },
    464     {
    465       "title": "A unified approach to interpreting model predictions",
    466       "authors": ["S. M. Lundberg", "S.-I. Lee"],
    467       "year": 2017,
    468       "relevance": "SHAP framework for model interpretability, foundational to the attribution approach used."
    469     },
    470     {
    471       "title": "CAMEL: Communicative agents for 'mind' exploration of large language model society",
    472       "authors": ["G. Li", "H. Hammoud", "H. Itani"],
    473       "year": 2023,
    474       "relevance": "Multi-agent LLM framework for exploring emergent social behaviors."
    475     },
    476     {
    477       "title": "CompeteAI: Understanding the competition dynamics of large language model-based agents",
    478       "authors": ["Q. Zhao", "J. Wang", "Y. Zhang"],
    479       "year": 2024,
    480       "relevance": "Study of competition dynamics in LLM-based multi-agent systems."
    481     }
    482   ]
    483 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs