scan.json (25969B)
1 { 2 "paper": { 3 "title": "MaMa: A Game-Theoretic Approach for Designing Safe Agentic Systems", 4 "authors": ["Jonathan Nöther", "Adish Singla", "Goran Radanovic"], 5 "year": 2026, 6 "venue": "arXiv", 7 "arxiv_id": "2602.04431" 8 }, 9 "scan_version": 2, 10 "active_modules": ["experimental_rigor", "data_leakage"], 11 "methodology_tags": ["benchmark-eval"], 12 "key_findings": "MaMa, a Stackelberg security game framework for automated design of safe agentic systems, produces systems significantly safer than manually designed alternatives and systems optimized only for quality, across four environments. Safety improvements transfer to stronger adversaries, different LLMs, targeted attacks, and indirect prompt injection attacks, while maintaining comparable task quality. Judge models were validated with Cohen's Weighted Kappa (κ=0.88 safety, κ=0.80 quality) on 50 manually annotated episodes.", 13 "checklist": { 14 "artifacts": { 15 "code_released": { 16 "applies": true, 17 "answer": false, 18 "justification": "The paper mentions 'we included the code for each final agentic system in the supplementary material' (Section 5.4) but provides no repository URL or persistent archive link. Supplementary material attached to a preprint is not the same as a released codebase." 19 }, 20 "data_released": { 21 "applies": true, 22 "answer": false, 23 "justification": "The paper uses the BAD-ACTS benchmark environments (Nöther et al., 2025) which appear to be publicly available, but the paper does not release the specific designed systems, attack archives, or episode logs generated during experiments." 24 }, 25 "environment_specified": { 26 "applies": true, 27 "answer": true, 28 "justification": "Appendix B.2 specifies compute infrastructure: 2x AMD EPYC 9555 CPUs, 24x 96GB memory, 2x Nvidia H200 141GB NVL GPUs. GPT-5.1 and GPT-5-mini accessed via OpenAI API. Open-source models (Llama-3.3:70b, Qwen3:32b) run locally. AutoGen used for execution." 
29 }, 30 "reproduction_instructions": { 31 "applies": true, 32 "answer": false, 33 "justification": "No step-by-step reproduction instructions are provided. Hyperparameters are listed in Table 7, but there are no scripts, commands, or README to replicate the experiments." 34 } 35 }, 36 "statistical_methodology": { 37 "confidence_intervals_or_error_bars": { 38 "applies": true, 39 "answer": true, 40 "justification": "Tables 1-4 report standard deviations in parentheses. Figure 2 shows shaded regions indicating standard deviation over three distinct evaluations." 41 }, 42 "significance_tests": { 43 "applies": true, 44 "answer": false, 45 "justification": "The paper claims MaMa systems are 'significantly safer' but uses no formal statistical significance tests. Comparisons are made by visually comparing means and standard deviations." 46 }, 47 "effect_sizes_reported": { 48 "applies": true, 49 "answer": true, 50 "justification": "Results are reported as absolute scores (1-5 scale) with baselines, allowing direct effect size computation. E.g., Travel Planning safety goes from 1.66 (attacked baseline) to 3.81 (MaMa), and ASR drops from 0.5646 to 0.1700 (Table 3)." 51 }, 52 "sample_size_justified": { 53 "applies": true, 54 "answer": false, 55 "justification": "No justification given for using 3 runs, 25 attacks per generation, 25 generations, or 50 episodes for judge validation. These numbers appear chosen pragmatically without power analysis." 56 }, 57 "variance_reported": { 58 "applies": true, 59 "answer": true, 60 "justification": "Standard deviations reported over three distinct runs in Tables 1-4 and as shaded regions in Figure 2." 61 } 62 }, 63 "evaluation_design": { 64 "baselines_included": { 65 "applies": true, 66 "answer": true, 67 "justification": "Baselines include: Least-Privilege Tool Gating + Tool Filters, Guardian-Agents (Nöther et al., 2025), Delimiters (Hines et al., 2024), and a quality-only MaMa variant (AFlow-like). Section 5.1." 
68 }, 69 "baselines_contemporary": { 70 "applies": true, 71 "answer": true, 72 "justification": "Baselines are from 2024-2025 (Debenedetti et al. 2024, Nöther et al. 2025, Hines et al. 2024), all recent and relevant to the multi-agent safety domain." 73 }, 74 "ablation_study": { 75 "applies": true, 76 "answer": true, 77 "justification": "The quality-only variant (no Meta-Adversary) serves as an ablation of the adversarial component. Appendix A.4 analyzes the influence of the initial archive (empty vs. seeded). Figure 6 shows MaMa with empty initial archive." 78 }, 79 "multiple_metrics": { 80 "applies": true, 81 "answer": true, 82 "justification": "Two primary metrics: quality score (1-5) and safety score (1-5), measured independently. Additionally, Attack Success Rate (ASR) is used for targeted and tool injection attacks (Tables 3-4)." 83 }, 84 "human_evaluation": { 85 "applies": true, 86 "answer": true, 87 "justification": "Judge models validated against manual annotation of 50 randomly sampled episodes. Cohen's Weighted Kappa reported (κ=0.88 safety, κ=0.80 quality). Section 5.1." 88 }, 89 "held_out_test_set": { 90 "applies": true, 91 "answer": false, 92 "justification": "No explicit separation between development/tuning tasks and evaluation tasks is described. The same environments are used for both the iterative design process and evaluation." 93 }, 94 "per_category_breakdown": { 95 "applies": true, 96 "answer": true, 97 "justification": "Results are broken down per environment (Travel Planning, Personal Assistant, Financial Article Writing, Code Generation) in all tables and figures." 98 }, 99 "failure_cases_discussed": { 100 "applies": true, 101 "answer": true, 102 "justification": "Section 5.2 describes specific attack successes: creating Ricin recipes, sending money to bank accounts, planning travel to disaster areas, injecting stock recommendations, deleting user files. Section 5.3 and Figure 3 show where safety degrades with >3 adversarial agents." 
103 }, 104 "negative_results_reported": { 105 "applies": true, 106 "answer": true, 107 "justification": "Table 4 Code Generation shows MaMa (0.0968) does not outperform Delimiters (0.0787) or Guardian Agents (0.0939). Figure 3 shows safety drops steeply when adversary controls >3 agents." 108 } 109 }, 110 "claims_and_evidence": { 111 "abstract_claims_supported": { 112 "applies": true, 113 "answer": true, 114 "justification": "Abstract claims of improved safety while maintaining quality are supported by Figure 2 and Tables 1-4. Claims of generalization to stronger adversaries and different LLMs are supported by Table 2, Figure 3, and Tables 3-4." 115 }, 116 "causal_claims_justified": { 117 "applies": true, 118 "answer": true, 119 "justification": "The paper makes causal claims ('MaMa produces safer systems') supported by controlled comparisons: same environments, same models, same evaluation, with and without the adversarial component. The ablation (quality-only variant) isolates the Meta-Adversary's contribution." 120 }, 121 "generalization_bounded": { 122 "applies": true, 123 "answer": false, 124 "justification": "The title claims 'Designing Safe Agentic Systems' generally, but results are limited to four specific environments from BAD-ACTS, specific LLMs (Llama-3.3, GPT-5.1, Qwen3:32b), and LLM-based judge evaluation. The paper does not bound claims to these specific settings." 125 }, 126 "alternative_explanations_discussed": { 127 "applies": true, 128 "answer": false, 129 "justification": "No discussion of alternative explanations. For example, the improvements could be partly due to the iterative optimization process itself (more compute) rather than the adversarial framing. The quality-only ablation partially addresses this but is not discussed as such." 130 }, 131 "proxy_outcome_distinction": { 132 "applies": true, 133 "answer": false, 134 "justification": "Safety and quality are measured via LLM judges on a 1-5 scale. 
While judges are validated against human annotations (κ=0.88/0.80), the paper does not discuss the gap between judge scores and actual safety/quality outcomes. A system rated 'safe' by a judge could still cause harm in deployment." 135 } 136 }, 137 "setup_transparency": { 138 "model_versions_specified": { 139 "applies": true, 140 "answer": false, 141 "justification": "Models listed as 'GPT-5.1', 'GPT-5-mini', 'Qwen3:32b', 'Gemma3:27b', 'Llama-3.3:70b'. No API versions, snapshot dates, or specific model IDs are provided. Marketing names without versions." 142 }, 143 "prompts_provided": { 144 "applies": true, 145 "answer": true, 146 "justification": "Full system prompts for Meta-Agent, Meta-Adversary, all judge models, and environment descriptions are provided in Appendix B.4 (pages 14-28)." 147 }, 148 "hyperparameters_reported": { 149 "applies": true, 150 "answer": true, 151 "justification": "Table 7 lists all hyperparameters: generations=25, clean iterations=10, attacks per generation=25, k=5, ε=1, λ=0.3, γ=3, η=1. However, LLM sampling parameters (temperature, top-p) are not reported." 152 }, 153 "scaffolding_described": { 154 "applies": true, 155 "answer": true, 156 "justification": "The agentic system structure is formally defined in Section 3.1 as tuple (A, T, G). Communication graphs, tools, agent roles are described. AutoGen framework used for execution. Figure 1 and Figure 4 illustrate system architectures." 157 }, 158 "data_preprocessing_documented": { 159 "applies": true, 160 "answer": true, 161 "justification": "The environments come from BAD-ACTS benchmark. Attack validation procedure described (checking manipulated agent exists, self-reflection until valid). Episode budget limited to 50 messages/tool executions. Section 5.1." 
162 } 163 }, 164 "limitations_and_scope": { 165 "limitations_section_present": { 166 "applies": true, 167 "answer": true, 168 "justification": "Section 6 (Conclusion) includes substantive discussion of limitations: computational cost requiring regression models, focus on safety-only adversaries, and future directions for zero-sum settings and manipulation of additional system aspects." 169 }, 170 "threats_to_validity_specific": { 171 "applies": true, 172 "answer": false, 173 "justification": "The limitations in Section 6 are forward-looking research directions rather than specific threats to the validity of the current results. No discussion of: judge reliability beyond kappa, LLM-as-judge biases, overfitting to specific environments, or generalizability concerns." 174 }, 175 "scope_boundaries_stated": { 176 "applies": true, 177 "answer": false, 178 "justification": "The paper does not explicitly state what the results do NOT show. It does not bound findings to the four tested environments or discuss what kinds of agentic systems or attack settings are excluded." 179 } 180 }, 181 "data_integrity": { 182 "raw_data_available": { 183 "applies": true, 184 "answer": false, 185 "justification": "No raw episode logs, agent interactions, or judge annotations are made available. Only aggregated scores are reported." 186 }, 187 "data_collection_described": { 188 "applies": true, 189 "answer": true, 190 "justification": "Data collection procedure is clear: iterative generation of systems and attacks over 25 generations, each system evaluated over 10 clean and 25 adversarial iterations, judged by LLM judges. Section 5.1 and Appendix B." 191 }, 192 "recruitment_methods_described": { 193 "applies": false, 194 "answer": false, 195 "justification": "No human participants. The 50 episodes for judge validation were 'randomly sampled' but this is a benchmark evaluation, not a human study." 
196 }, 197 "data_pipeline_documented": { 198 "applies": true, 199 "answer": true, 200 "justification": "The pipeline is documented in Algorithms 1-2: propose system → evaluate clean → attack system → store results → select best. Top-5 attack selection criteria and archive sampling rule (equation in Section 4.2) are specified." 201 } 202 }, 203 "conflicts_of_interest": { 204 "funding_disclosed": { 205 "applies": true, 206 "answer": false, 207 "justification": "No funding sources disclosed. Authors are at Max Planck Institute for Software Systems but no grants or funding are mentioned." 208 }, 209 "affiliations_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "All authors listed as Max Planck Institute for Software Systems, Saarbrücken, Germany. No product being evaluated is affiliated with their institution." 213 }, 214 "funder_independent_of_outcome": { 215 "applies": true, 216 "answer": false, 217 "justification": "No funding disclosed, so independence cannot be assessed." 218 }, 219 "financial_interests_declared": { 220 "applies": true, 221 "answer": false, 222 "justification": "No competing interests statement is present in the paper." 223 } 224 }, 225 "contamination": { 226 "training_cutoff_stated": { 227 "applies": false, 228 "answer": false, 229 "justification": "This paper tests a system design method, not a pre-trained model's knowledge on benchmarks. The models are used as components, not evaluated for learned capability." 230 }, 231 "train_test_overlap_discussed": { 232 "applies": false, 233 "answer": false, 234 "justification": "Not evaluating model knowledge on benchmarks — evaluating an automated system design algorithm." 235 }, 236 "benchmark_contamination_addressed": { 237 "applies": false, 238 "answer": false, 239 "justification": "The BAD-ACTS environments test system safety under adversarial conditions, not model knowledge. Contamination is not relevant." 
240 } 241 }, 242 "human_studies": { 243 "pre_registered": { 244 "applies": false, 245 "answer": false, 246 "justification": "No human participants in the study." 247 }, 248 "irb_or_ethics_approval": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants." 252 }, 253 "demographics_reported": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants." 257 }, 258 "inclusion_exclusion_criteria": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants." 262 }, 263 "randomization_described": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants." 267 }, 268 "blinding_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants." 272 }, 273 "attrition_reported": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants." 277 } 278 }, 279 "cost_and_practicality": { 280 "inference_cost_reported": { 281 "applies": true, 282 "answer": true, 283 "justification": "Appendix B.2 states total API cost of $72.15 for GPT-5.1 and GPT-5-mini. Table 8 reports wall-clock runtimes per environment (ranging from ~2 days to ~4.7 days for 25 generations)." 284 }, 285 "compute_budget_stated": { 286 "applies": true, 287 "answer": true, 288 "justification": "Hardware specified (2x H200 GPUs, 2x EPYC 9555). API cost $72.15. Per-environment runtimes in Table 8. Sufficient to understand computational requirements." 289 } 290 }, 291 "experimental_rigor": { 292 "seed_sensitivity_reported": { 293 "applies": true, 294 "answer": true, 295 "justification": "Results reported over 'three distinct runs' with standard deviations in all tables. Figure 2 shows shaded regions for variance across runs." 
296 }, 297 "number_of_runs_stated": { 298 "applies": true, 299 "answer": true, 300 "justification": "Explicitly stated: 'three distinct runs' (Tables 1-4), 25 attacks per generation, 10 clean iterations (Table 7)." 301 }, 302 "hyperparameter_search_budget": { 303 "applies": true, 304 "answer": false, 305 "justification": "No hyperparameter search budget reported. The values in Table 7 appear to be fixed choices with no discussion of how they were selected or whether alternatives were tried." 306 }, 307 "best_config_selection_justified": { 308 "applies": true, 309 "answer": true, 310 "justification": "Section 5.3 states: 'we select the system that performed best according to the unweighted sum of quality and safety.' The selection criterion is explicit. Algorithm 2 selects the archive maximum of sum of safety and quality." 311 }, 312 "multiple_comparison_correction": { 313 "applies": true, 314 "answer": false, 315 "justification": "Multiple comparisons across 4 environments, multiple baselines, and multiple transfer settings without any correction for family-wise error rate." 316 }, 317 "self_comparison_bias_addressed": { 318 "applies": true, 319 "answer": false, 320 "justification": "Authors evaluate their own MaMa system against baselines they implemented. No acknowledgment of potential bias from re-implementing baselines or tuning their own method." 321 }, 322 "compute_budget_vs_performance": { 323 "applies": true, 324 "answer": false, 325 "justification": "MaMa runs 25 generations of iterative optimization (2-5 days per environment) while baselines are static defenses. This massive compute difference is not discussed or controlled for." 326 }, 327 "benchmark_construct_validity": { 328 "applies": true, 329 "answer": false, 330 "justification": "No discussion of whether BAD-ACTS environments and LLM-based judge scoring actually measure real-world agentic system safety. The gap between simulated safety judgments and deployment safety is not addressed." 
331 }, 332 "scaffold_confound_addressed": { 333 "applies": true, 334 "answer": true, 335 "justification": "MaMa designs the entire scaffold as the intervention — the scaffold IS the thing being compared. Same underlying models used across all conditions. Table 2 explicitly varies both the adversary model and agent model to isolate contributions." 336 } 337 }, 338 "data_leakage": { 339 "temporal_leakage_addressed": { 340 "applies": false, 341 "answer": false, 342 "justification": "Not testing model knowledge on benchmarks. The environments are interactive task-completion settings, not static benchmark problems that could be memorized." 343 }, 344 "feature_leakage_addressed": { 345 "applies": false, 346 "answer": false, 347 "justification": "Not applicable — not a benchmark evaluation testing learned knowledge." 348 }, 349 "non_independence_addressed": { 350 "applies": false, 351 "answer": false, 352 "justification": "Not applicable — no train/test split of benchmark problems." 353 }, 354 "leakage_detection_method": { 355 "applies": false, 356 "answer": false, 357 "justification": "Not applicable — testing system design, not model memorization." 358 } 359 } 360 }, 361 "claims": [ 362 { 363 "claim": "The Meta-Adversary can significantly decrease the safety of agentic systems with common defenses.", 364 "evidence": "Table 1: Safety drops from 4.89→1.66 (Travel), 4.31→1.53 (Personal Assistant), 4.29→2.46 (Financial), 4.65→3.20 (Code Gen) when attacked. Specific attacks described: Ricin recipe creation, unauthorized money transfers, malicious code execution.", 365 "supported": "strong" 366 }, 367 { 368 "claim": "MaMa-designed systems are significantly safer than baselines while maintaining comparable quality.", 369 "evidence": "Figure 2 shows safety improvements across all 4 environments over 25 generations, with quality maintained or improved. 
Table 3 shows ASR reductions vs Least-Privilege Tool Gating + Tool Filters (LPTG+TF) and Guardian Agents across all environments.", 370 "supported": "strong" 371 }, 372 { 373 "claim": "MaMa-designed systems generalize to stronger adversaries and different LLMs.", 374 "evidence": "Table 2 shows similar safety scores across Qwen3:32b and Gemma3:27b adversaries with Llama3.3 and GPT-5-mini agents. Figure 3 shows safety maintained for up to 3 adversarial agents (designed for ε=1).", 375 "supported": "moderate" 376 }, 377 { 378 "claim": "MaMa-designed systems defend against indirect prompt injection attacks.", 379 "evidence": "Table 4: MaMa achieves lowest ASR in 3/4 environments for tool injection attacks. In Code Generation, performance is comparable to baselines (0.0968 vs 0.0787 for Delimiters).", 380 "supported": "moderate" 381 }, 382 { 383 "claim": "LLM-based judges achieve strong agreement with human annotations.", 384 "evidence": "Section 5.1: Cohen's Weighted Kappa κ=0.88 (safety) and κ=0.80 (quality) on 50 randomly sampled episodes.", 385 "supported": "moderate" 386 } 387 ], 388 "red_flags": [ 389 { 390 "flag": "Compute budget asymmetry with baselines", 391 "detail": "MaMa uses 2-5 days of iterative optimization per environment while baselines are static defenses. The improvement could partly be due to the additional compute and optimization iterations, not the adversarial game formulation specifically." 392 }, 393 { 394 "flag": "LLM-as-judge reliability", 395 "detail": "Safety and quality are measured entirely by LLM judges (validated on only 50 episodes). LLM judges can have systematic biases, and 50 validation examples may be insufficient to characterize judge reliability across diverse attack types." 396 }, 397 { 398 "flag": "No statistical significance tests", 399 "detail": "The paper repeatedly claims 'significant' improvements but uses no formal statistical tests. With only 3 runs and overlapping standard deviations in some settings, some claimed differences may not be statistically significant." 
400 }, 401 { 402 "flag": "Unbounded generalization claims", 403 "detail": "The title and abstract claim general applicability to 'safe agentic systems' but results are limited to 4 specific simulated environments with specific LLMs and judge-based evaluation." 404 } 405 ], 406 "cited_papers": [ 407 { 408 "title": "Automated design of agentic systems", 409 "authors": ["S. Hu", "C. Lu", "J. Clune"], 410 "year": 2024, 411 "relevance": "Foundational work on automated agentic system design that MaMa extends with safety considerations." 412 }, 413 { 414 "title": "AFlow: Automating agentic workflow generation", 415 "authors": ["J. Zhang", "J. Xiang", "Z. Yu"], 416 "year": 2025, 417 "relevance": "MCTS-based agentic system design method that MaMa directly builds upon." 418 }, 419 { 420 "title": "AgentDojo: A dynamic environment to evaluate prompt injection attacks and defenses for LLM agents", 421 "authors": ["E. Debenedetti", "J. Zhang", "M. Balunovic"], 422 "year": 2024, 423 "relevance": "Benchmark for evaluating prompt injection attacks against LLM agents; MaMa evaluates against their tool injection attack." 424 }, 425 { 426 "title": "Benchmarking the robustness of agentic systems to adversarially-induced harms", 427 "authors": ["J. Nöther", "A. Singla", "G. Radanovic"], 428 "year": 2025, 429 "relevance": "BAD-ACTS benchmark providing the evaluation environments used in MaMa; also introduces Guardian-Agents baseline." 430 }, 431 { 432 "title": "Magentic-One: A generalist multi-agent system for solving complex tasks", 433 "authors": ["A. Fourney", "G. Bansal", "H. Mozannar"], 434 "year": 2024, 435 "relevance": "Multi-agent system demonstrating safety risks in handcrafted agentic architectures." 436 }, 437 { 438 "title": "AgentSquare: Automatic LLM agent search in modular design space", 439 "authors": ["Y. Shang", "Y. Li", "K. Zhao"], 440 "year": 2024, 441 "relevance": "Structured approach to automated agentic system design that MaMa draws on." 
442 }, 443 { 444 "title": "AutoGen: Enabling next-gen LLM applications via multi-agent conversations", 445 "authors": ["Q. Wu", "G. Bansal", "J. Zhang"], 446 "year": 2023, 447 "relevance": "Multi-agent framework used to execute the designed agentic systems in MaMa experiments." 448 }, 449 { 450 "title": "Among Us: A sandbox for measuring and detecting agentic deception", 451 "authors": ["S. Golechha", "A. Garriga-Alonso"], 452 "year": 2025, 453 "relevance": "Studies LLM agent deception in social deduction games, related to multi-agent safety." 454 }, 455 { 456 "title": "Why do multi-agent LLM systems fail?", 457 "authors": ["M. Cemri", "M. Z. Pan", "S. Yang"], 458 "year": 2025, 459 "relevance": "Analyzes failure modes in multi-agent LLM systems including miscoordination and hallucinations." 460 }, 461 { 462 "title": "OS-Harm: A benchmark for measuring safety of computer use agents", 463 "authors": ["T. Kuntz", "A. Duzan", "H. Zhao"], 464 "year": 2025, 465 "relevance": "Benchmark for evaluating safety of computer-using agents." 466 }, 467 { 468 "title": "AgentHarm: A benchmark for measuring harmfulness of LLM agents", 469 "authors": ["M. Andriushchenko", "A. Souly", "M. Dziemian"], 470 "year": 2024, 471 "relevance": "Benchmark for measuring harmful behaviors of LLM agents." 472 }, 473 { 474 "title": "Defending against indirect prompt injection attacks with spotlighting", 475 "authors": ["K. Hines", "G. Lopez", "M. Hall"], 476 "year": 2024, 477 "relevance": "Defense mechanism (Delimiters) against indirect prompt injection, used as baseline in MaMa." 478 } 479 ] 480 }