scan-v5.json (25508B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "MaMa: A Game-Theoretic Approach for Designing Safe Agentic Systems", 6 "authors": [ 7 "Jonathan Nöther", 8 "Adish Singla", 9 "Goran Radanovic" 10 ], 11 "year": 2026, 12 "venue": "arXiv", 13 "arxiv_id": "2602.04431", 14 "doi": null 15 }, 16 "checklist": { 17 "claims_and_evidence": { 18 "abstract_claims_supported": { 19 "applies": true, 20 "answer": true, 21 "justification": "Abstract claims that MaMa consistently defends against worst-case attacks while maintaining quality are supported by Figure 2 and Tables 1–4 across all four environments. Generalization claims are supported by Section 5.3 transferability experiments.", 22 "source": "haiku" 23 }, 24 "causal_claims_justified": { 25 "applies": true, 26 "answer": true, 27 "justification": "The paper compares MaMa against concrete baselines (Guardian Agents, LPTG+TF, quality-only AFlow) in controlled environments with fixed threat models, and an ablation of the initial archive is provided; the design supports causal claims within the bounded experimental setting.", 28 "source": "haiku" 29 }, 30 "generalization_bounded": { 31 "applies": true, 32 "answer": true, 33 "justification": "Results are explicitly bounded to the four BAD-ACTS environments and the specific threat model (ε compromised agents overwriting instructions); the conclusion explicitly identifies future extensions needed for zero-sum and stronger adversaries.", 34 "source": "haiku" 35 }, 36 "alternative_explanations_discussed": { 37 "applies": true, 38 "answer": false, 39 "justification": "The paper does not discuss whether safety improvements stem from generally more design iterations rather than adversarial feedback specifically; no ablation isolates the adversarial signal from sheer iterative search.", 40 "source": "haiku" 41 }, 42 "proxy_outcome_distinction": { 43 "applies": true, 44 "answer": true, 45 "justification": "LLM judges are used as proxies for safety and quality, 
but the paper validates both judges against 50 manually annotated episodes, reporting Cohen's kappa of 0.88 and 0.80 respectively, partially acknowledging the proxy gap.", 46 "source": "haiku" 47 } 48 }, 49 "limitations_and_scope": { 50 "limitations_section_present": { 51 "applies": true, 52 "answer": false, 53 "justification": "There is no dedicated limitations or threats-to-validity section; the conclusion mentions computational cost and future directions but methodological limitations are not systematically addressed.", 54 "source": "haiku" 55 }, 56 "threats_to_validity_specific": { 57 "applies": true, 58 "answer": false, 59 "justification": "No specific threats to validity are discussed; the paper acknowledges computational cost as a practical constraint but does not discuss evaluation reliability threats such as LLM-judge bias or environmental coverage.", 60 "source": "haiku" 61 }, 62 "scope_boundaries_stated": { 63 "applies": true, 64 "answer": false, 65 "justification": "The threat model scope is implicitly bounded but not explicitly framed as 'what results do NOT show'; the paper does not state that conclusions are limited to the four evaluated environments or to the specific agent-compromise threat model.", 66 "source": "haiku" 67 } 68 }, 69 "conflicts_of_interest": { 70 "funding_disclosed": { 71 "applies": true, 72 "answer": false, 73 "justification": "No funding source or grant acknowledgment appears anywhere in the paper, including abstract, conclusion, or impact statement.", 74 "source": "haiku" 75 }, 76 "affiliations_disclosed": { 77 "applies": true, 78 "answer": true, 79 "justification": "All three authors are disclosed as affiliated with Max Planck Institute for Software Systems, Saarbrücken, Germany, with a correspondence email provided.", 80 "source": "haiku" 81 }, 82 "funder_independent_of_outcome": { 83 "applies": false, 84 "answer": false, 85 "justification": "No funding is disclosed, so independence cannot be assessed.", 86 "source": "haiku" 
87 }, 88 "financial_interests_declared": { 89 "applies": true, 90 "answer": false, 91 "justification": "No competing interests statement, patents, equity, or consulting relationships are declared anywhere in the paper.", 92 "source": "haiku" 93 } 94 }, 95 "scope_and_framing": { 96 "key_terms_defined": { 97 "applies": true, 98 "answer": true, 99 "justification": "Key terms are formally defined: agentic system as a tuple (A, T, G), safety as a function s: T→R, quality as q: T→R, and the Stackelberg security game is formally stated in Section 3.3.", 100 "source": "haiku" 101 }, 102 "intended_contribution_clear": { 103 "applies": true, 104 "answer": true, 105 "justification": "Three explicit contributions are listed in the introduction under 'Framework,' 'Methodology,' and 'Evaluation' headings, clearly distinguishing this paper's additions from prior work.", 106 "source": "haiku" 107 }, 108 "engagement_with_prior_work": { 109 "applies": true, 110 "answer": true, 111 "justification": "Section 2 engages substantively with both automated design of agentic systems (AFlow, ADAS, AgentSquare) and adversarial attack literature (AgentDojo, InjecAgent), explicitly positioning MaMa as combining both streams.", 112 "source": "haiku" 113 } 114 } 115 }, 116 "type_checklist": { 117 "empirical": { 118 "artifacts": { 119 "code_released": { 120 "applies": true, 121 "answer": false, 122 "justification": "The paper states final system code is in 'supplementary material' and prompts are in Appendix B.4, but no public code repository is linked and the MaMa algorithm implementation itself is not explicitly released.", 123 "source": "haiku" 124 }, 125 "data_released": { 126 "applies": true, 127 "answer": false, 128 "justification": "The BAD-ACTS benchmark environments (Nöther et al., 2025) are used but no public URL or dataset release is provided; the benchmark is from the same lead author and its public availability is unconfirmed.", 129 "source": "haiku" 130 }, 131 
"environment_specified": { 132 "applies": true, 133 "answer": false, 134 "justification": "Compute hardware is specified (AMD EPYC 9555, H200 GPUs) and model names are given, but no requirements.txt, Dockerfile, or dependency specification is provided.", 135 "source": "haiku" 136 }, 137 "reproduction_instructions": { 138 "applies": true, 139 "answer": false, 140 "justification": "Pseudocode algorithms and hyperparameter tables are provided, but no end-to-end step-by-step reproduction guide exists; critical setup steps (AutoGen configuration, BAD-ACTS environment instantiation) are not described.", 141 "source": "haiku" 142 } 143 }, 144 "statistical_methodology": { 145 "confidence_intervals_or_error_bars": { 146 "applies": true, 147 "answer": true, 148 "justification": "Standard deviations are reported parenthetically in all result tables (e.g., '1.66(0.11)') and as shaded regions in Figure 2, computed over three distinct evaluation runs.", 149 "source": "haiku" 150 }, 151 "significance_tests": { 152 "applies": true, 153 "answer": false, 154 "justification": "No statistical significance tests (t-tests, ANOVA, etc.) 
are applied to comparative claims despite multiple baselines and only three runs providing limited statistical power.", 155 "source": "haiku" 156 }, 157 "effect_sizes_reported": { 158 "applies": true, 159 "answer": true, 160 "justification": "Absolute score differences are visible (e.g., attacked safety rising from ~1.66 to ~3.81 in Travel Planning), and Attack Success Rate is reported numerically in Tables 3–4 with baseline comparisons.", 161 "source": "haiku" 162 }, 163 "sample_size_justified": { 164 "applies": true, 165 "answer": false, 166 "justification": "The choice of 25 iterations, 25 attacks per generation, and 3 evaluation runs is mentioned only as a cost constraint, with no power analysis or justification for sample adequacy.", 167 "source": "haiku" 168 }, 169 "variance_reported": { 170 "applies": true, 171 "answer": true, 172 "justification": "Standard deviation is reported across three distinct runs for all main results in tables and as shaded regions in figures.", 173 "source": "haiku" 174 } 175 }, 176 "evaluation_design": { 177 "baselines_included": { 178 "applies": true, 179 "answer": true, 180 "justification": "Three baselines are used: Least-Privilege Tool Gating + Tool Filters, Guardian Agents (Nöther et al., 2025), and a quality-only AFlow variant; Delimiters (Hines et al., 2024) is added for the injection attack evaluation.", 181 "source": "haiku" 182 }, 183 "baselines_contemporary": { 184 "applies": true, 185 "answer": true, 186 "justification": "Baselines are drawn from recent work (2024–2025), including Debenedetti et al. (2024) for LPTG/TF and Nöther et al. (2025) for Guardian Agents.", 187 "source": "haiku" 188 }, 189 "ablation_study": { 190 "applies": true, 191 "answer": true, 192 "justification": "Appendix A.4 ablates the initial archive (empty vs. 
initialized), showing faster convergence and better results with initialization; however, no ablation directly isolates the Meta-Adversary contribution from iterative design alone.", 193 "source": "haiku" 194 }, 195 "multiple_metrics": { 196 "applies": true, 197 "answer": true, 198 "justification": "Both safety score (1–5 Likert) and quality score (1–5 Likert) are reported, along with Attack Success Rate (ASR) for targeted and injection attacks in Section 5.3.", 199 "source": "haiku" 200 }, 201 "human_evaluation": { 202 "applies": true, 203 "answer": false, 204 "justification": "Evaluation relies on LLM judges, not human annotators; 50 manually annotated episodes are used only to calibrate/validate the judge, not as the primary evaluation mechanism.", 205 "source": "haiku" 206 }, 207 "held_out_test_set": { 208 "applies": true, 209 "answer": false, 210 "justification": "No formal held-out test set is used; the Meta-Agent is optimized and evaluated on the same four environments. Transferability tests in Section 5.3 use different attack settings but not a separate held-out environment.", 211 "source": "haiku" 212 }, 213 "per_category_breakdown": { 214 "applies": true, 215 "answer": true, 216 "justification": "All results are broken down per environment (Travel Planning, Personal Assistant, Financial Article Writing, Code Generation) in all tables and figures.", 217 "source": "haiku" 218 }, 219 "failure_cases_discussed": { 220 "applies": true, 221 "answer": true, 222 "justification": "Figure 3 explicitly shows safety degradation when the adversary controls more than 3 agents; Table 4 shows MaMa performs no better than baselines in Code Generation under injection attacks.", 223 "source": "haiku" 224 }, 225 "negative_results_reported": { 226 "applies": true, 227 "answer": true, 228 "justification": "Table 4 explicitly reports that MaMa does not outperform Delimiters in the Code Generation environment, and this result is acknowledged without dismissal.", 229 "source": 
"haiku" 230 } 231 }, 232 "setup_transparency": { 233 "model_versions_specified": { 234 "applies": true, 235 "answer": true, 236 "justification": "Exact model versions are named: GPT-5.1 (OpenAI, 2025) with a citation URL, Qwen3:32b (Yang et al., 2025), Llama-3.3:70b (Grattafiori et al., 2024), and GPT-5-mini for transfer experiments.", 237 "source": "haiku" 238 }, 239 "prompts_provided": { 240 "applies": true, 241 "answer": true, 242 "justification": "Full system prompts for Meta-Adversary, Meta-Agent, and all quality/safety judges for all four environments are included in Appendix B.4 within the paper text.", 243 "source": "haiku" 244 }, 245 "hyperparameters_reported": { 246 "applies": true, 247 "answer": true, 248 "justification": "Table 7 lists all hyperparameters: 25 generations, 10 clean iterations, 25 attacks per generation, k=5, ε=1, λ=0.3, γ=3, η=1.", 249 "source": "haiku" 250 }, 251 "scaffolding_described": { 252 "applies": true, 253 "answer": true, 254 "justification": "AutoGen (Wu et al., 2023) is specified as the multi-agent execution framework, and the message/tool execution episode structure is described in Section 5.1.", 255 "source": "haiku" 256 }, 257 "data_preprocessing_documented": { 258 "applies": false, 259 "answer": false, 260 "justification": "No dataset preprocessing is applicable; the environments generate episodes dynamically and no preprocessing pipeline exists.", 261 "source": "haiku" 262 } 263 }, 264 "data_integrity": { 265 "raw_data_available": { 266 "applies": true, 267 "answer": false, 268 "justification": "No raw episode data, attack archives, or evaluation outputs are released publicly.", 269 "source": "haiku" 270 }, 271 "data_collection_described": { 272 "applies": true, 273 "answer": true, 274 "justification": "The process for generating episodes (running the agentic system, collecting tool executions and messages, scoring with judges) is described through Algorithms 1 and 2 and Section 5.1.", 275 "source": "haiku" 276 }, 277 
"recruitment_methods_described": { 278 "applies": false, 279 "answer": false, 280 "justification": "No human participants; the evaluation uses automated LLM-judge scoring of synthetic agentic episodes.", 281 "source": "haiku" 282 }, 283 "data_pipeline_documented": { 284 "applies": true, 285 "answer": true, 286 "justification": "The full pipeline from system design → adversarial attack generation → episode execution → scoring → archiving is documented through pseudocode in Algorithms 1 and 2.", 287 "source": "haiku" 288 } 289 }, 290 "contamination": { 291 "training_cutoff_stated": { 292 "applies": false, 293 "answer": false, 294 "justification": "The paper does not evaluate model capabilities on held-out benchmarks; it evaluates iteratively designed system architectures, making training-cutoff contamination not the primary concern.", 295 "source": "haiku" 296 }, 297 "train_test_overlap_discussed": { 298 "applies": false, 299 "answer": false, 300 "justification": "NA — the evaluation is of system designs in simulated environments, not of model benchmark performance.", 301 "source": "haiku" 302 }, 303 "benchmark_contamination_addressed": { 304 "applies": false, 305 "answer": false, 306 "justification": "NA — BAD-ACTS environments are task simulation environments, not NLP benchmark datasets susceptible to training-data contamination.", 307 "source": "haiku" 308 } 309 }, 310 "human_studies": { 311 "pre_registered": { 312 "applies": false, 313 "answer": false, 314 "justification": "No human participants.", 315 "source": "haiku" 316 }, 317 "irb_or_ethics_approval": { 318 "applies": false, 319 "answer": false, 320 "justification": "No human participants.", 321 "source": "haiku" 322 }, 323 "demographics_reported": { 324 "applies": false, 325 "answer": false, 326 "justification": "No human participants.", 327 "source": "haiku" 328 }, 329 "inclusion_exclusion_criteria": { 330 "applies": false, 331 "answer": false, 332 "justification": "No human participants.", 333 "source": 
"haiku" 334 }, 335 "randomization_described": { 336 "applies": false, 337 "answer": false, 338 "justification": "No human participants.", 339 "source": "haiku" 340 }, 341 "blinding_described": { 342 "applies": false, 343 "answer": false, 344 "justification": "No human participants.", 345 "source": "haiku" 346 }, 347 "attrition_reported": { 348 "applies": false, 349 "answer": false, 350 "justification": "No human participants.", 351 "source": "haiku" 352 } 353 }, 354 "cost_and_practicality": { 355 "inference_cost_reported": { 356 "applies": true, 357 "answer": true, 358 "justification": "API cost for GPT-5.1 and GPT-5-mini is reported as $72.15 total; per-environment runtimes are reported in Table 8 (2–5 days per environment over 25 iterations).", 359 "source": "haiku" 360 }, 361 "compute_budget_stated": { 362 "applies": true, 363 "answer": true, 364 "justification": "Compute infrastructure is specified (2x AMD EPYC 9555 CPUs, 24x 96GB RAM, 2x Nvidia H200 141GB GPUs), and per-environment runtimes are provided.", 365 "source": "haiku" 366 } 367 } 368 } 369 }, 370 "claims": [ 371 { 372 "claim": "The Meta-Adversary can discover effective attacks against systems with standard defenses (LPTG + Tool Filters), dramatically reducing safety scores.", 373 "evidence": "Table 1 shows safety dropping from ~4.3–4.9 (clean) to 1.53–3.20 (attacked) across all four environments.", 374 "supported": "strong" 375 }, 376 { 377 "claim": "MaMa-designed systems are significantly safer under adversarial attack than both initial designs and prior safety mechanisms (Guardian Agents).", 378 "evidence": "Figure 2 shows consistent safety improvement over 25 iterations; Table 3 shows MaMa ASR (0.05–0.25) substantially below Guardian Agents (0.05–0.52) and LPTG+TF (0.09–0.57).", 379 "supported": "strong" 380 }, 381 { 382 "claim": "Safety improvements from MaMa do not cause a significant quality reduction compared to systems optimized only for quality.", 383 "evidence": "Figure 2 shows quality 
scores for MaMa equal to or higher than the quality-only AFlow baseline across all environments.", 384 "supported": "moderate" 385 }, 386 { 387 "claim": "Systems designed by MaMa generalize to stronger adversaries controlling up to three agents.", 388 "evidence": "Figure 3 shows safety scores remain relatively stable for 1–3 adversarial agents before a steep drop at 4+ agents.", 389 "supported": "moderate" 390 }, 391 { 392 "claim": "MaMa systems generalize to different adversary and agent LLMs not used during design.", 393 "evidence": "Table 2 shows similar safety scores across Qwen3:32b, Gemma3:27b adversaries and Llama3.3/GPT-5-mini agent models.", 394 "supported": "moderate" 395 }, 396 { 397 "claim": "MaMa outperforms all baselines on indirect prompt injection attacks except in Code Generation.", 398 "evidence": "Table 4 shows MaMa achieves lowest ASR in Travel Planner (0.0068) and Personal Assistant (0.2448) but not Code Generation (0.0968 vs. Delimiters 0.0787).", 399 "supported": "moderate" 400 } 401 ], 402 "methodology_tags": [ 403 "theoretical", 404 "benchmark-eval" 405 ], 406 "key_findings": "MaMa frames automated agentic system design as a Stackelberg security game, iteratively pitting a Meta-Agent (designer) against a Meta-Adversary (attacker) to produce systems robust under worst-case agent compromise. Empirical evaluation across four simulated environments (Travel Planning, Personal Assistant, Financial Article Writing, Code Generation) shows MaMa-designed systems achieve safety scores of ~3.7–4.6 under attack versus ~1.5–3.2 for systems with standard defenses, without sacrificing quality. The designed systems generalize to different adversary LLMs, stronger adversaries controlling up to three agents, and alternative attack types including indirect prompt injection. 
A notable failure mode is found in Code Generation under injection attacks, where MaMa does not outperform the Delimiters baseline.", 407 "red_flags": [ 408 { 409 "flag": "Self-authored benchmark", 410 "detail": "The evaluation uses BAD-ACTS (Nöther et al., 2025), where the lead author of this paper is also the first author — no independent benchmark is used." 411 }, 412 { 413 "flag": "LLM-as-judge primary metric", 414 "detail": "Safety and quality are measured by LLM judges validated on only 50 manually annotated episodes (κ=0.88, 0.80); LLM judges are known to have systematic biases that may inflate results." 415 }, 416 { 417 "flag": "No statistical significance testing", 418 "detail": "Comparative claims are made across baselines with only 3 runs and standard deviation, but no t-tests or similar tests; some differences may not be statistically significant." 419 }, 420 { 421 "flag": "Missing key ablation", 422 "detail": "No ablation isolates the contribution of adversarial feedback specifically from simply running more design iterations — the quality-only AFlow baseline differs in both adversarial signal and overall search dynamics." 423 }, 424 { 425 "flag": "Proprietary model dependency", 426 "detail": "GPT-5.1 (OpenAI, 2025) is used for the Meta-Agent; this proprietary model may not be accessible to researchers attempting to reproduce results, and its training data is unknown." 427 }, 428 { 429 "flag": "No code repository", 430 "detail": "The MaMa implementation is not released in a public repository; final designed system code is only in supplementary material, making full reproduction difficult." 431 } 432 ], 433 "cited_papers": [ 434 { 435 "title": "AFlow: Automating Agentic Workflow Generation", 436 "relevance": "Direct baseline; MaMa builds upon AFlow's Meta-Agent design loop, adding the adversarial second player." 
437 }, 438 { 439 "title": "Benchmarking the robustness of agentic systems to adversarially-induced harms (BAD-ACTS)", 440 "relevance": "Provides the evaluation environments and Guardian Agents baseline used throughout the paper." 441 }, 442 { 443 "title": "AgentDojo: A dynamic environment to evaluate prompt injection attacks and defenses for LLM agents", 444 "relevance": "Establishes the indirect prompt injection threat model used in Section 5.3 and inspires the LPTG/TF baselines." 445 }, 446 { 447 "title": "Automated Design of Agentic Systems (ADAS)", 448 "relevance": "Prior work on Meta-Agent driven automated system search; MaMa extends this with a second adversarial player." 449 }, 450 { 451 "title": "Magentic-One: A generalist multi-agent system for solving complex tasks", 452 "relevance": "Documents safety risks in manually designed agentic systems without adversarial safeguards, motivating this work." 453 }, 454 { 455 "title": "AgentSquare: Automatic LLM agent search in modular design space", 456 "relevance": "Closely related automated design approach; MaMa adapts its structured agent representation." 457 }, 458 { 459 "title": "Defending against indirect prompt injection attacks with spotlighting", 460 "relevance": "Provides the Delimiters baseline used in the injection attack comparison in Table 4." 461 }, 462 { 463 "title": "Generative adversarial nets", 464 "relevance": "Conceptual predecessor for two-player adversarial optimization framework that MaMa adapts for system design." 465 } 466 ], 467 "engagement_factors": { 468 "practical_relevance": { 469 "score": 2, 470 "justification": "The framework directly addresses real multi-agent system safety, but requires 2–5 days of H200 GPU time per environment and proprietary model access, limiting near-term practitioner adoption." 
471 }, 472 "surprise_contrarian": { 473 "score": 2, 474 "justification": "The finding that adversarial training improves safety without quality loss is somewhat counterintuitive; the game-theoretic framing of system design is novel in this space." 475 }, 476 "fear_safety": { 477 "score": 3, 478 "justification": "The paper demonstrates concrete harmful outcomes (Ricin recipe generation, 600% stock fraud, device-wiping malware) achievable by compromising a single agent, directly addressing AI safety risks." 479 }, 480 "drama_conflict": { 481 "score": 2, 482 "justification": "Adversarial agent compromise with real-world harm examples (financial fraud, dangerous travel planning) creates a compelling threat narrative, though framed constructively." 483 }, 484 "demo_ability": { 485 "score": 1, 486 "justification": "Requires days of compute on H200 GPUs and access to GPT-5.1; no demo environment or interactive version is available." 487 }, 488 "brand_recognition": { 489 "score": 1, 490 "justification": "Max Planck Institute is a respected research institution but lacks the AI hype cachet of major industry labs; no famous product or celebrity researcher association." 491 } 492 }, 493 "hn_data": { 494 "threads": [], 495 "top_points": 0, 496 "total_points": 0, 497 "total_comments": 0 498 } 499 }