scan.json (20270B)
1 { 2 "scan_version": 2, 3 "active_modules": [], 4 "paper": { 5 "title": "Multi-Agent Risks from Advanced AI", 6 "authors": [ 7 "Lewis Hammond", 8 "Alan Chan", 9 "Jesse Clifton", 10 "Jason Hoelscher-Obermaier", 11 "Akbir Khan", 12 "Euan McLean", 13 "Chandler Smith", 14 "Wolfram Barfuss", 15 "Jakob Foerster", 16 "Tomáš Gavenčiak", 17 "The Anh Han", 18 "Edward Hughes", 19 "Vojtěch Kovařík", 20 "Jan Kulveit", 21 "Joel Z. Leibo", 22 "Caspar Oesterheld", 23 "Christian Schroeder de Witt", 24 "Nisarg Shah", 25 "Michael Wellman", 26 "Paolo Bova" 27 ], 28 "year": 2025, 29 "venue": "Cooperative AI Foundation Technical Report #1", 30 "arxiv_id": "2502.14143" 31 }, 32 "checklist": { 33 "artifacts": { 34 "code_released": { 35 "applies": true, 36 "answer": true, 37 "justification": "Section 1 footnote states code for the novel experiments is available at https://github.com/coopai/multi-agent-risks." 38 }, 39 "data_released": { 40 "applies": true, 41 "answer": false, 42 "justification": "No datasets are released. The three novel experiments are simulation-based but no simulation data or outputs are provided for download." 43 }, 44 "environment_specified": { 45 "applies": true, 46 "answer": false, 47 "justification": "No environment specifications, dependency files, or setup instructions are provided for the experimental code." 48 }, 49 "reproduction_instructions": { 50 "applies": true, 51 "answer": false, 52 "justification": "No step-by-step reproduction instructions are provided for the three novel experiments in Appendix B." 53 } 54 }, 55 "statistical_methodology": { 56 "confidence_intervals_or_error_bars": { 57 "applies": false, 58 "answer": false, 59 "justification": "The paper is primarily a theoretical taxonomy. The three case study experiments are illustrative demonstrations, not statistical evaluations requiring uncertainty quantification." 60 }, 61 "significance_tests": { 62 "applies": false, 63 "answer": false, 64 "justification": "No comparative empirical claims are made that would require significance testing. The experiments illustrate conceptual points rather than claiming statistical differences." 65 }, 66 "effect_sizes_reported": { 67 "applies": false, 68 "answer": false, 69 "justification": "Theoretical/taxonomy paper with illustrative experiments. No effect size claims are made." 70 }, 71 "sample_size_justified": { 72 "applies": false, 73 "answer": false, 74 "justification": "Theoretical paper. The experiments are conceptual demonstrations, not sample-based studies." 75 }, 76 "variance_reported": { 77 "applies": false, 78 "answer": false, 79 "justification": "Theoretical paper with illustrative simulations. No empirical variance claims are made." 80 } 81 }, 82 "evaluation_design": { 83 "baselines_included": { 84 "applies": false, 85 "answer": false, 86 "justification": "This is a taxonomy/position paper, not a system evaluation. The experiments illustrate risk scenarios rather than comparing approaches." 87 }, 88 "baselines_contemporary": { 89 "applies": false, 90 "answer": false, 91 "justification": "No system comparison is made; this is a taxonomy paper." 92 }, 93 "ablation_study": { 94 "applies": false, 95 "answer": false, 96 "justification": "No system with components to ablate. The paper proposes a conceptual framework, not a method." 97 }, 98 "multiple_metrics": { 99 "applies": false, 100 "answer": false, 101 "justification": "No system evaluation with metrics. The experiments demonstrate risk phenomena qualitatively." 102 }, 103 "human_evaluation": { 104 "applies": false, 105 "answer": false, 106 "justification": "No system outputs to evaluate. This is a theoretical taxonomy paper." 107 }, 108 "held_out_test_set": { 109 "applies": false, 110 "answer": false, 111 "justification": "No datasets or benchmarks used. Theoretical paper with illustrative simulations." 112 }, 113 "per_category_breakdown": { 114 "applies": true, 115 "answer": true, 116 "justification": "The taxonomy provides detailed per-category breakdowns of risk factors (7 risk factors across 3 failure modes), with each analyzed independently in Sections 3-5 and case studies organized by category in Section 6 and Appendix B." 117 }, 118 "failure_cases_discussed": { 119 "applies": true, 120 "answer": true, 121 "justification": "The entire paper is about failure cases. All 13 case studies (Section 6, Appendix B) describe specific multi-agent failure scenarios including collusion, emergent coordination failures, and adversarial exploitation." 122 }, 123 "negative_results_reported": { 124 "applies": true, 125 "answer": true, 126 "justification": "The three novel experiments (Case Studies 1, 7, 13) demonstrate negative outcomes: agents learning to collude, coordination failures in multi-principal settings, and adversarial exploitation of cooperative agents." 127 } 128 }, 129 "claims_and_evidence": { 130 "abstract_claims_supported": { 131 "applies": true, 132 "answer": true, 133 "justification": "The abstract claims the paper 'provides a taxonomy of multi-agent risks,' identifies '3 failure modes and 7 risk factors,' and presents 'novel experiments illustrating these risks.' All are substantiated in the body: taxonomy in Section 2, failure modes in Sections 3-5, risk factors throughout, and 3 novel experiments in Appendix B." 134 }, 135 "causal_claims_justified": { 136 "applies": true, 137 "answer": true, 138 "justification": "The paper's causal claims are modest and appropriate. Claims like 'competitive pressures can lead to a race to the bottom' are supported by game-theoretic analysis (Section 3) and illustrative experiments. The paper frames most claims as 'can lead to' rather than definitive causal statements." 139 }, 140 "generalization_bounded": { 141 "applies": true, 142 "answer": true, 143 "justification": "The paper explicitly scopes its claims to 'advanced AI systems' and the discussion section (Section 7) acknowledges limitations in current evidence. The taxonomy is presented as a framework for analysis rather than a definitive empirical finding." 144 }, 145 "alternative_explanations_discussed": { 146 "applies": false, 147 "answer": false, 148 "justification": "This is primarily a theoretical taxonomy paper. The claims are definitional and analytical rather than empirical findings requiring alternative explanations." 149 }, 150 "proxy_outcome_distinction": { 151 "applies": false, 152 "answer": false, 153 "justification": "Theoretical paper with no measurements. No proxy-outcome gap to address." 154 } 155 }, 156 "setup_transparency": { 157 "model_versions_specified": { 158 "applies": true, 159 "answer": false, 160 "justification": "The novel experiments use reinforcement learning agents but do not specify exact model architectures, hyperparameters, or software versions in sufficient detail for reproduction." 161 }, 162 "prompts_provided": { 163 "applies": false, 164 "answer": false, 165 "justification": "The paper does not use LLM prompting. The experiments are RL-based simulations." 166 }, 167 "hyperparameters_reported": { 168 "applies": true, 169 "answer": false, 170 "justification": "The three novel experiments do not report training hyperparameters (learning rates, discount factors, episode counts, etc.) in sufficient detail." 171 }, 172 "scaffolding_described": { 173 "applies": false, 174 "answer": false, 175 "justification": "No agentic scaffolding is used. The experiments are direct RL simulations." 176 }, 177 "data_preprocessing_documented": { 178 "applies": false, 179 "answer": false, 180 "justification": "No data collection or preprocessing. The experiments are simulation-based with no external data." 181 } 182 }, 183 "limitations_and_scope": { 184 "limitations_section_present": { 185 "applies": true, 186 "answer": true, 187 "justification": "Section 7 ('Discussion') serves as a limitations section, discussing the scope of the taxonomy, gaps in current evidence, and areas where the framework may not fully capture risks." 188 }, 189 "threats_to_validity_specific": { 190 "applies": true, 191 "answer": true, 192 "justification": "Section 7 discusses specific limitations: the taxonomy may not be exhaustive, current experiments use simplified environments that may not capture real-world complexity, and the relationship between competitive pressures and safety is more nuanced than simple game-theoretic models suggest." 193 }, 194 "scope_boundaries_stated": { 195 "applies": true, 196 "answer": true, 197 "justification": "The paper explicitly focuses on 'advanced AI systems' in multi-agent settings and Section 2 defines the scope boundaries. The paper acknowledges it does not address single-agent risks or risks from non-advanced systems." 198 } 199 }, 200 "data_integrity": { 201 "raw_data_available": { 202 "applies": false, 203 "answer": false, 204 "justification": "Theoretical taxonomy paper. The experiments are illustrative simulations, not data-driven studies requiring raw data verification." 205 }, 206 "data_collection_described": { 207 "applies": false, 208 "answer": false, 209 "justification": "No data collection. This is a theoretical/conceptual paper with simulation-based illustrations." 210 }, 211 "recruitment_methods_described": { 212 "applies": false, 213 "answer": false, 214 "justification": "No human participants or sample recruitment. Standard benchmark data not used." 215 }, 216 "data_pipeline_documented": { 217 "applies": false, 218 "answer": false, 219 "justification": "No data pipeline. Theoretical paper with self-contained simulations." 220 } 221 }, 222 "conflicts_of_interest": { 223 "funding_disclosed": { 224 "applies": true, 225 "answer": true, 226 "justification": "The paper is published as Cooperative AI Foundation Technical Report #1, clearly identifying the organizational affiliation and funding source." 227 }, 228 "affiliations_disclosed": { 229 "applies": true, 230 "answer": true, 231 "justification": "Author affiliations are listed, including Cooperative AI Foundation, University of Oxford, Centre for the Governance of AI, Google DeepMind, and others." 232 }, 233 "funder_independent_of_outcome": { 234 "applies": true, 235 "answer": false, 236 "justification": "The Cooperative AI Foundation funds research on cooperative AI and multi-agent risks. The foundation has a mission-driven interest in demonstrating that multi-agent risks are significant, which aligns with the paper's conclusions." 237 }, 238 "financial_interests_declared": { 239 "applies": true, 240 "answer": false, 241 "justification": "No competing interests or financial interests statement is provided. Several authors are affiliated with Google DeepMind and other organizations with stakes in AI safety narratives." 242 } 243 }, 244 "contamination": { 245 "training_cutoff_stated": { 246 "applies": false, 247 "answer": false, 248 "justification": "The paper does not evaluate a pre-trained model's capability on any benchmark. The experiments are RL simulations trained from scratch." 249 }, 250 "train_test_overlap_discussed": { 251 "applies": false, 252 "answer": false, 253 "justification": "No pre-trained model evaluated on benchmarks. RL simulations trained from scratch." 254 }, 255 "benchmark_contamination_addressed": { 256 "applies": false, 257 "answer": false, 258 "justification": "No benchmark evaluation of pre-trained models." 259 } 260 }, 261 "human_studies": { 262 "pre_registered": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants in this study." 266 }, 267 "irb_or_ethics_approval": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants." 271 }, 272 "demographics_reported": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants." 276 }, 277 "inclusion_exclusion_criteria": { 278 "applies": false, 279 "answer": false, 280 "justification": "No human participants." 281 }, 282 "randomization_described": { 283 "applies": false, 284 "answer": false, 285 "justification": "No human participants." 286 }, 287 "blinding_described": { 288 "applies": false, 289 "answer": false, 290 "justification": "No human participants." 291 }, 292 "attrition_reported": { 293 "applies": false, 294 "answer": false, 295 "justification": "No human participants." 296 } 297 }, 298 "cost_and_practicality": { 299 "inference_cost_reported": { 300 "applies": false, 301 "answer": false, 302 "justification": "Theoretical taxonomy paper. No method with practical costs to report." 303 }, 304 "compute_budget_stated": { 305 "applies": false, 306 "answer": false, 307 "justification": "Theoretical taxonomy paper. The illustrative experiments are small-scale simulations." 308 } 309 } 310 }, 311 "claims": [ 312 { 313 "claim": "Multi-agent AI systems introduce three distinct failure modes: collective action problems, adversarial interactions, and emergent coordination failures.", 314 "evidence": "Sections 3-5 develop each failure mode with formal definitions, game-theoretic analysis, and illustrative case studies. Section 3 covers collective action problems (races to the bottom, tragedy of the commons), Section 4 covers adversarial interactions, Section 5 covers emergent coordination failures.", 315 "supported": "moderate" 316 }, 317 { 318 "claim": "Seven risk factors mediate multi-agent risks: competitive pressures, information asymmetries, network effects, rapid capability gains, misaligned incentives, inadequate governance, and emergent behavior.", 319 "evidence": "The risk factors are developed throughout Sections 3-5 and systematized in Section 2's taxonomy. Each is illustrated with case studies in Section 6 and Appendix B.", 320 "supported": "moderate" 321 }, 322 { 323 "claim": "AI agents can learn to collude in competitive settings even without explicit communication.", 324 "evidence": "Case Study 1 (novel experiment, Appendix B) demonstrates emergent collusion in a repeated pricing game using RL agents. Agents converge on supra-competitive prices through tacit coordination.", 325 "supported": "moderate" 326 }, 327 { 328 "claim": "Multi-principal settings create coordination failures even when individual agent-principal alignment is achieved.", 329 "evidence": "Case Study 7 (novel experiment) demonstrates how agents serving different principals can produce collectively suboptimal outcomes despite each agent faithfully executing its principal's objectives.", 330 "supported": "moderate" 331 }, 332 { 333 "claim": "Cooperative AI agents are vulnerable to adversarial exploitation by non-cooperative agents.", 334 "evidence": "Case Study 13 (novel experiment) shows that agents trained to cooperate can be systematically exploited by adversarial agents in mixed settings.", 335 "supported": "moderate" 336 } 337 ], 338 "methodology_tags": [ 339 "theoretical", 340 "qualitative", 341 "case-study" 342 ], 343 "key_findings": "The paper proposes a taxonomy of multi-agent AI risks organized around three failure modes (collective action problems, adversarial interactions, emergent coordination failures) and seven risk factors. Through 13 case studies, including 3 novel RL experiments, it demonstrates that multi-agent settings introduce distinct risks beyond single-agent alignment, including emergent collusion, coordination failures in multi-principal settings, and adversarial exploitation of cooperative agents. The paper argues that current AI safety research is overly focused on single-agent alignment and underweights multi-agent dynamics.", 344 "red_flags": [ 345 { 346 "flag": "Funder alignment with conclusions", 347 "detail": "Published by the Cooperative AI Foundation, which has a mission-driven interest in establishing multi-agent risks as significant. The paper's conclusions directly support the case for the foundation's existence and funding." 348 }, 349 { 350 "flag": "Illustrative experiments presented without rigor", 351 "detail": "The three novel experiments are presented as illustrations of conceptual points but lack experimental detail (hyperparameters, environment specifications, number of runs, variance). While appropriate for a taxonomy paper, they lend an air of empirical support that the methodology does not fully justify." 352 } 353 ], 354 "cited_papers": [ 355 { 356 "title": "Risks from Learned Optimization in Advanced Machine Learning Systems", 357 "authors": ["Evan Hubinger", "Chris van Merwijk", "Vladimir Mikulik", "Joar Skalse", "Scott Garrabrant"], 358 "year": 2019, 359 "arxiv_id": "1906.01820", 360 "relevance": "Foundational work on deceptive alignment and mesa-optimization risks relevant to AI safety." 361 }, 362 { 363 "title": "Open Problems in Cooperative AI", 364 "authors": ["Allan Dafoe", "Edward Hughes", "Yoram Bachrach"], 365 "year": 2020, 366 "relevance": "Defines the cooperative AI research agenda that this paper extends to multi-agent risk analysis." 367 }, 368 { 369 "title": "Emergent Social Learning via Multi-agent Reinforcement Learning", 370 "authors": ["Kamal Ndousse", "Douglas Eck", "Sergey Levine", "Natasha Jaques"], 371 "year": 2021, 372 "relevance": "Demonstrates emergent social behaviors in multi-agent RL systems relevant to coordination failures." 373 }, 374 { 375 "title": "Scalable agent alignment via reward modeling: a research direction", 376 "authors": ["Jan Leike", "David Krueger", "Tom Everitt", "Miljan Martic", "Vishal Maini", "Shane Legg"], 377 "year": 2018, 378 "arxiv_id": "1811.07871", 379 "relevance": "Foundational reward modeling approach for AI alignment, single-agent focus that this paper argues is insufficient." 380 }, 381 { 382 "title": "Model evaluation for extreme risks", 383 "authors": ["Toby Shevlane"], 384 "year": 2023, 385 "arxiv_id": "2305.15324", 386 "relevance": "Framework for evaluating dangerous AI capabilities, relevant to multi-agent risk assessment methodology." 387 }, 388 { 389 "title": "Artificial Intelligence, Values and Alignment", 390 "authors": ["Iason Gabriel"], 391 "year": 2020, 392 "relevance": "Philosophical analysis of AI alignment problem that informs the multi-agent extension in this paper." 393 }, 394 { 395 "title": "Multi-agent reinforcement learning: A selective overview of theories and algorithms", 396 "authors": ["Kaiqing Zhang", "Zhuoran Yang", "Tamer Başar"], 397 "year": 2021, 398 "relevance": "Comprehensive MARL survey providing algorithmic foundations for the multi-agent dynamics discussed in this paper." 399 }, 400 { 401 "title": "Discovering Agents", 402 "authors": ["Zachary Kenton", "Ramana Kumar", "Sebastian Farquhar", "Jonathan Richens"], 403 "year": 2023, 404 "relevance": "Methods for identifying agentic behavior in AI systems, relevant to detecting multi-agent risks." 405 }, 406 { 407 "title": "The Tragedy of the Commons", 408 "authors": ["Garrett Hardin"], 409 "year": 1968, 410 "relevance": "Classical collective action problem framework that grounds the paper's analysis of multi-agent resource competition." 411 }, 412 { 413 "title": "Frontier AI Regulation: Managing Emerging Risks to Public Safety", 414 "authors": ["Markus Anderljung"], 415 "year": 2023, 416 "arxiv_id": "2307.03718", 417 "relevance": "AI governance framework relevant to the paper's policy recommendations for managing multi-agent risks." 418 } 419 ] 420 }