scan.json (26850B)
1 { 2 "paper": { 3 "title": "AdvEvo-MARL: Shaping Internalized Safety Through Adversarial Co-Evolution in Multi-Agent Reinforcement Learning", 4 "authors": [ 5 "Zhenyu Pan", 6 "Yiting Zhang", 7 "Zhuo Liu", 8 "Yolo Yunlong Tang", 9 "Zeliang Zhang", 10 "Haozheng Luo", 11 "Yuwei Han", 12 "Jianshu Zhang", 13 "Dennis Wu", 14 "Hong-Yu Chen", 15 "Haoran Lu", 16 "Haoyang Fang", 17 "Manling Li", 18 "Chenliang Xu", 19 "Philip S. Yu", 20 "Han Liu" 21 ], 22 "year": 2025, 23 "venue": "arXiv preprint", 24 "arxiv_id": "2510.01586" 25 }, 26 "checklist": { 27 "artifacts": { 28 "code_released": { 29 "applies": true, 30 "answer": false, 31 "justification": "No repository URL or code archive is provided anywhere in the paper. There is no mention of releasing code." 32 }, 33 "data_released": { 34 "applies": true, 35 "answer": true, 36 "justification": "The paper uses publicly available datasets: MATH-500, JailbreakBench, Wild Jailbreak, Strong Reject, AIME, LiveCodeBench, and GPQA-diamond, all of which are publicly available benchmarks. The seed attack prompts are from public datasets." 37 }, 38 "environment_specified": { 39 "applies": true, 40 "answer": false, 41 "justification": "No requirements file, Dockerfile, or detailed environment specification is provided. The paper only mentions using QWen2.5 instruction-tuned models (3B and 7B) without library versions or environment details." 42 }, 43 "reproduction_instructions": { 44 "applies": true, 45 "answer": false, 46 "justification": "No step-by-step reproduction instructions are provided. There is no README or 'Reproducing Results' section; the paper describes the method conceptually but does not give runnable instructions." 47 } 48 }, 49 "statistical_methodology": { 50 "confidence_intervals_or_error_bars": { 51 "applies": true, 52 "answer": false, 53 "justification": "Results in Table 1 and Figures 2–5 are presented as point estimates (ASR%, CR%, accuracy) with no confidence intervals, error bars, or ± notation." 
54 }, 55 "significance_tests": { 56 "applies": true, 57 "answer": false, 58 "justification": "The paper makes comparative claims (e.g., 'AdvEvo-MARL consistently achieves the lowest ASR') but no statistical significance tests (p-values, t-tests, etc.) are reported anywhere." 59 }, 60 "effect_sizes_reported": { 61 "applies": true, 62 "answer": false, 63 "justification": "Effect sizes as formal measures (Cohen's d, odds ratios) are not reported. Percentage differences are presented in context (e.g., '12% reduction in ASR under NetSafe') but without baseline context sufficient for formal effect size interpretation." 64 }, 65 "sample_size_justified": { 66 "applies": true, 67 "answer": false, 68 "justification": "The choice of 4,000 training samples, 300 adversarial prompts, and benchmark sizes is not justified. No power analysis or reasoning for the chosen N is provided." 69 }, 70 "variance_reported": { 71 "applies": true, 72 "answer": false, 73 "justification": "All results appear to be single-run numbers. No standard deviation, variance across seeds, or multiple-run results with spread measures are reported." 74 } 75 }, 76 "evaluation_design": { 77 "baselines_included": { 78 "applies": true, 79 "answer": true, 80 "justification": "The paper compares against Vanilla (undefended) 3B and 7B models, Challenger (self-verification), Inspector (external guard agent), and closed-source GPT-3.5 and GPT-4o-mini reference models." 81 }, 82 "baselines_contemporary": { 83 "applies": true, 84 "answer": true, 85 "justification": "The baselines (Challenger from Huang et al. 2025, Inspector, and the vanilla QWen2.5 models) are contemporary methods from 2024-2025, appropriate for the 2025 submission." 86 }, 87 "ablation_study": { 88 "applies": true, 89 "answer": true, 90 "justification": "Section 5.3 compares dynamic vs. static attacker, and individual vs. joint (MAS) defender training. 
Section 5.4 ablates the public baseline mechanism against a no-baseline variant, with figures showing training dynamics." 91 }, 92 "multiple_metrics": { 93 "applies": true, 94 "answer": true, 95 "justification": "The paper uses three metrics: Attack Success Rate (ASR), Contagion Rate (CR), and task accuracy/Pass@1, measuring both safety and utility dimensions." 96 }, 97 "human_evaluation": { 98 "applies": false, 99 "answer": false, 100 "justification": "The paper evaluates automated safety and task performance; human evaluation of system outputs is not relevant to the claims being made about ASR and task accuracy." 101 }, 102 "held_out_test_set": { 103 "applies": true, 104 "answer": true, 105 "justification": "The paper uses task benchmarks (AIME, GPQA-diamond, LiveCodeBench) and adversarial evaluation sets that are distinct from the training data (MATH-500 for defenders; JailbreakBench/Wild Jailbreak/Strong Reject for evaluation)." 106 }, 107 "per_category_breakdown": { 108 "applies": true, 109 "answer": true, 110 "justification": "Table 1 provides per-attack-scenario (NetSafe, AutoInject, UserHijack) and per-topology (chain, tree, complete) breakdowns, giving a detailed view of performance across conditions." 111 }, 112 "failure_cases_discussed": { 113 "applies": true, 114 "answer": true, 115 "justification": "The paper notes that in the complete graph topology, CR remains elevated (up to 35%), and acknowledges that 'even our approach' is not perfectly effective in densely connected environments. Section 5.2 notes that the 3B variant shows a maximum 3% accuracy drop." 116 }, 117 "negative_results_reported": { 118 "applies": true, 119 "answer": true, 120 "justification": "The ablation in Section 5.4 shows the no-baseline variant exhibits 'non-stationary behavior and even degraded performance in later stages' including a 13.3% drop in defender response length, constituting a reported negative result for the alternative configuration." 
121 } 122 }, 123 "claims_and_evidence": { 124 "abstract_claims_supported": { 125 "applies": true, 126 "answer": true, 127 "justification": "The abstract claims ASR below 20% and baselines reaching up to 38.33%. Table 1 shows AdvEvo-MARL achieving low ASR across conditions, while baselines reach substantially higher values (e.g., Challenger-7B at 38.33% ASR in UserHijack complete topology), supporting the abstract claims." 128 }, 129 "causal_claims_justified": { 130 "applies": true, 131 "answer": true, 132 "justification": "The paper makes causal claims via ablation studies (dynamic vs. static attacker, with vs. without public baseline, individual vs. joint training) with single-variable manipulation. These ablations provide adequate support for the causal attribution claims in Sections 5.3 and 5.4." 133 }, 134 "generalization_bounded": { 135 "applies": true, 136 "answer": false, 137 "justification": "The abstract and conclusion claim AdvEvo-MARL as a 'standardized framework for building MAS that are both safe and capable' broadly, but experiments are limited to QWen2.5 3B/7B models and three specific attack scenarios. No explicit bounding of claims to this setting is made." 138 }, 139 "alternative_explanations_discussed": { 140 "applies": true, 141 "answer": false, 142 "justification": "The paper does not discuss alternative explanations for the observed results. For example, the performance improvement of the 7B model on out-of-distribution tasks is presented without considering that co-evolutionary training might improve general RL training dynamics rather than specifically safety-related benefits." 143 } 144 }, 145 "setup_transparency": { 146 "model_versions_specified": { 147 "applies": true, 148 "answer": false, 149 "justification": "The paper uses 'QWen2.5 instruction-tuned models (3B and 7B)' without specifying the exact model checkpoint, version tag (e.g., 'Qwen2.5-7B-Instruct'), or any snapshot identifier. GPT-3.5 and GPT-4o-mini comparison models also lack version/snapshot dates." 
150 }, 151 "prompts_provided": { 152 "applies": true, 153 "answer": false, 154 "justification": "The paper describes using adversarial prompts and defender system prompts, but no actual prompt text is provided. The Challenger strategy ('verify the benignness of its input') is described only in natural language without the actual prompt used." 155 }, 156 "hyperparameters_reported": { 157 "applies": true, 158 "answer": false, 159 "justification": "The reward weights are given only as a coarse schedule (αs=1, βt=0.5 in first half, reversed in second half), while the clipping parameter ε and KL coefficient β in Equation 5 are referenced but never assigned specific values. No learning rate, batch size, or other RL training hyperparameters are reported." 160 }, 161 "scaffolding_described": { 162 "applies": true, 163 "answer": true, 164 "justification": "The paper describes the multi-agent scaffold in detail: three agent topologies (chain, tree, complete), the attacker warm-up via SFT, the adversarial RL loop, the REINFORCE++ training with public baseline, and the reward mechanisms for attackers vs. defenders (Section 4 and Figure 1)." 165 }, 166 "data_preprocessing_documented": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 4.2 describes the preprocessing pipeline: sampling 1,000 harmful behaviors, applying jailbreak strategies to get Dinit, using a reasoning model to synthesize traces, filtering with LLM-as-judge to remove invalid trajectories, resulting in ~4,000 samples in Dadv. The filtering criteria (contradictory, off-topic, or vague) are stated." 170 } 171 }, 172 "limitations_and_scope": { 173 "limitations_section_present": { 174 "applies": true, 175 "answer": false, 176 "justification": "There is no dedicated limitations or threats-to-validity section. The paper has an Ethics Statement (Section 7) but it does not function as a limitations section. The conclusion does not include substantive limitations discussion." 
177 }, 178 "threats_to_validity_specific": { 179 "applies": true, 180 "answer": false, 181 "justification": "No threats to validity are discussed. There is no consideration of specific threats such as the small model sizes tested, limited attack scenarios, lack of multiple runs, or potential overfitting to the specific attack strategies used in evaluation." 182 }, 183 "scope_boundaries_stated": { 184 "applies": true, 185 "answer": false, 186 "justification": "The paper does not explicitly state what the results do NOT show. Claims in the conclusion are broad ('promising and unified framework for building safe and capable multi-agent systems') without bounding to the tested conditions (specific models, attack types, and topologies)." 187 } 188 }, 189 "data_integrity": { 190 "raw_data_available": { 191 "applies": true, 192 "answer": false, 193 "justification": "No raw experimental data, model outputs, or evaluation results in a downloadable format are provided. Only aggregated metrics in tables and figures are presented." 194 }, 195 "data_collection_described": { 196 "applies": true, 197 "answer": true, 198 "justification": "Section 4.2 describes how the training data was collected: sampling 1,000 harmful behaviors from 'existing public datasets,' applying jailbreak strategies, and constructing the evaluation pools from JailbreakBench, Wild Jailbreak, and Strong Reject (Section 5.1)." 199 }, 200 "recruitment_methods_described": { 201 "applies": false, 202 "answer": false, 203 "justification": "This paper has no human participants; all data comes from automated benchmarks and synthetic adversarial prompt construction." 204 }, 205 "data_pipeline_documented": { 206 "applies": true, 207 "answer": true, 208 "justification": "The data pipeline from seed prompts to final training data is described: sample harmful behaviors → apply jailbreak strategies → synthesize reasoning traces → filter with LLM judge → obtain ~4,000 samples. Steps and counts are provided." 
209 } 210 }, 211 "conflicts_of_interest": { 212 "funding_disclosed": { 213 "applies": true, 214 "answer": false, 215 "justification": "There is no acknowledgments section or funding disclosure anywhere in the paper. No grants, corporate sponsors, or funding agencies are mentioned." 216 }, 217 "affiliations_disclosed": { 218 "applies": true, 219 "answer": true, 220 "justification": "Author affiliations are listed on the first page: Northwestern University, University of Illinois at Chicago, University of Rochester, and Carnegie Mellon University. The paper does not evaluate any product affiliated with these institutions." 221 }, 222 "funder_independent_of_outcome": { 223 "applies": false, 224 "answer": false, 225 "justification": "No funding is disclosed, so funder independence cannot be assessed." 226 }, 227 "financial_interests_declared": { 228 "applies": true, 229 "answer": false, 230 "justification": "There is no competing interests statement, patent disclosure, or declaration of financial interests in the paper." 231 } 232 }, 233 "contamination": { 234 "training_cutoff_stated": { 235 "applies": true, 236 "answer": false, 237 "justification": "The paper uses QWen2.5 instruction-tuned models but does not state their training data cutoff dates. Evaluations are also conducted against GPT-3.5 and GPT-4o-mini without stating their training cutoffs." 238 }, 239 "train_test_overlap_discussed": { 240 "applies": true, 241 "answer": false, 242 "justification": "The paper uses AIME, GPQA-diamond, and LiveCodeBench for evaluation but does not discuss whether QWen2.5's pre-training data may have included these benchmarks or their solutions." 243 }, 244 "benchmark_contamination_addressed": { 245 "applies": true, 246 "answer": false, 247 "justification": "LiveCodeBench is noted as 'contamination free' (citing Jain et al., 2024) but GPQA-diamond and AIME are public benchmarks that could have appeared in QWen2.5's training data. 
No contamination analysis is provided for these benchmarks." 248 } 249 }, 250 "human_studies": { 251 "pre_registered": { 252 "applies": false, 253 "answer": false, 254 "justification": "No human participants are involved in this study." 255 }, 256 "irb_or_ethics_approval": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human participants are involved; an ethics statement is included but addresses the social implications of the research, not IRB approval." 260 }, 261 "demographics_reported": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants are involved in this study." 265 }, 266 "inclusion_exclusion_criteria": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants are involved in this study." 270 }, 271 "randomization_described": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants are involved in this study." 275 }, 276 "blinding_described": { 277 "applies": false, 278 "answer": false, 279 "justification": "No human participants are involved in this study." 280 }, 281 "attrition_reported": { 282 "applies": false, 283 "answer": false, 284 "justification": "No human participants are involved in this study." 285 } 286 }, 287 "cost_and_practicality": { 288 "inference_cost_reported": { 289 "applies": true, 290 "answer": false, 291 "justification": "The paper proposes a method requiring training multiple LLMs with RL but reports no inference cost, API cost, tokens consumed, or wall-clock time for either training or evaluation." 292 }, 293 "compute_budget_stated": { 294 "applies": true, 295 "answer": false, 296 "justification": "No GPU hours, total compute, hardware specifications, or training time are stated anywhere in the paper." 
297 } 298 } 299 }, 300 "claims": [ 301 { 302 "claim": "AdvEvo-MARL consistently keeps attack-success rate (ASR) below 20%, whereas baselines reach up to 38.33%.", 303 "evidence": "Table 1 shows AdvEvo-MARL-7B achieving near-zero to low single-digit ASR in chain and tree topologies. In the complete graph topology, the maximum observed ASR is 17.68% (3B model, UserHijack). Challenger-7B reaches 38.33% ASR in UserHijack complete topology.", 304 "supported": "strong" 305 }, 306 { 307 "claim": "AdvEvo-MARL preserves and sometimes improves task accuracy (up to +3.67% on reasoning tasks) while enhancing safety.", 308 "evidence": "Figure 2 shows AdvEvo-MARL-7B outperforming Vanilla-7B on AIME, GPQA, and LiveCodeBench. The abstract states '+3.67%' but the figure shows approximate values; the main results section (Section 5.2) states 'maximum 3% accuracy drop' for 3B variants and improvements for 7B.", 309 "supported": "moderate" 310 }, 311 { 312 "claim": "Training with a dynamic co-evolutionary attacker yields superior defender robustness compared to a static attacker baseline.", 313 "evidence": "Section 5.3 and Figure 3 show a 12% reduction in ASR under NetSafe and consistently lower ASR across AutoInject and UserHijack when using the dynamic MARL attacker vs. a static prompt pool.", 314 "supported": "moderate" 315 }, 316 { 317 "claim": "The public baseline mechanism for advantage estimation leads to more stable training and better task and safety outcomes than training without it.", 318 "evidence": "Section 5.4 and Figure 5 compare public baseline vs. 
no-baseline configurations, showing steadily improving accuracy and controlled ASR with baseline, and non-stationary behavior plus a 13.3% response length drop without it.", 319 "supported": "moderate" 320 }, 321 { 322 "claim": "Training defenders jointly in a MAS setting yields better safety and task utility than training them individually.", 323 "evidence": "Section 5.3 states 'AdvEvo-MARL exhibits the highest system safety and task utility across all evaluated settings' vs. individual training (referenced in Figure 3), with 12% robustness gain and 4% task utility enhancement under NetSafe.", 324 "supported": "moderate" 325 } 326 ], 327 "methodology_tags": [ 328 "benchmark-eval" 329 ], 330 "key_findings": "AdvEvo-MARL co-evolves attacker and defender agents via MARL to internalize safety in LLM-based multi-agent systems, achieving attack success rates below 20% across three attack scenarios and three system topologies while preserving or slightly improving task performance on math, coding, and reasoning benchmarks. The paper demonstrates that a public baseline mechanism (sharing group-level mean returns) stabilizes training compared to standard advantage estimation. The 7B variant of AdvEvo-MARL outperforms its vanilla counterpart on out-of-distribution task benchmarks, suggesting that adversarial co-evolution provides general capability benefits. Results are compared against self-verification (Challenger) and external guard agent (Inspector) baselines using QWen2.5 3B/7B backbone models.", 331 "red_flags": [ 332 { 333 "flag": "No statistical testing or uncertainty quantification", 334 "detail": "All results are single-run point estimates with no standard deviation, confidence intervals, or error bars. Claims of superiority (e.g., 'consistently achieves the lowest ASR') are made without statistical tests, making it impossible to assess whether differences are meaningful or due to random variation." 
335 }, 336 { 337 "flag": "Missing hyperparameters", 338 "detail": "Critical RL hyperparameters (learning rate, batch size, number of training steps, KL coefficient β, clip parameter ε, specific reward weights beyond the general priority scheme) are not reported, making reproduction impossible." 339 }, 340 { 341 "flag": "Vague model version specification", 342 "detail": "The paper uses 'QWen2.5 instruction-tuned models (3B and 7B)' without specifying the exact model checkpoint or version, and GPT-3.5/GPT-4o-mini comparisons lack snapshot dates. This is critical for reproducing and comparing results." 343 }, 344 { 345 "flag": "No limitations section", 346 "detail": "The paper has no limitations section or threats-to-validity discussion. Notable unaddressed issues include: small model sizes tested (3B/7B may not represent frontier safety challenges), three specific attack types (generalizability to other attacks is untested), and single-run results with potential high variance." 347 }, 348 { 349 "flag": "Benchmark contamination not addressed", 350 "detail": "AIME and GPQA-diamond are used for task evaluation with QWen2.5 models whose training data cutoffs are not stated, and the paper does not discuss whether these benchmarks appeared in pre-training data. Only LiveCodeBench is described as contamination-free." 351 }, 352 { 353 "flag": "Overly broad generalization claims", 354 "detail": "The conclusion describes AdvEvo-MARL as 'a promising and unified framework for building safe and capable multi-agent systems' broadly, but experiments only cover two model sizes (3B/7B), three attack types, and one model family (QWen2.5). The framework has not been validated on larger models or different LLM families." 355 }, 356 { 357 "flag": "Contagion rate anomaly for AdvEvo-MARL-3B in chain topology", 358 "detail": "In Table 1, AdvEvo-MARL-3B shows only 6.93% ASR but 35.64% CR under NetSafe in chain topology, which is comparable to the Vanilla-3B model's 36.14% CR despite much lower ASR. 
This inconsistency is not explained or discussed." 359 } 360 ], 361 "cited_papers": [ 362 { 363 "title": "NetSafe: Exploring the Topological Safety of Multi-Agent Networks", 364 "authors": ["Miao Yu", "Shilong Wang", "Guibin Zhang", "Junyuan Mao", "Chenlong Yin", "Qijiong Liu", "Qingsong Wen", "Kun Wang", "Yang Wang"], 365 "year": 2024, 366 "arxiv_id": "2410.15686", 367 "relevance": "Introduces the NetSafe attack framework used as one of the primary evaluation scenarios and a baseline comparison in this paper." 368 }, 369 { 370 "title": "On the Resilience of LLM-based Multi-Agent Collaboration with Faulty Agents", 371 "authors": ["Jen-tse Huang", "Jiaxu Zhou", "Tailin Jin", "Xuhui Zhou", "Zixi Chen", "Wenxuan Wang", "Youliang Yuan", "Michael R. Lyu", "Maarten Sap"], 372 "year": 2025, 373 "arxiv_id": "2408.00989", 374 "relevance": "Provides the AutoInject attack strategy used in this paper's evaluation and proposes self-verification and guard agent baselines." 375 }, 376 { 377 "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code", 378 "authors": ["Naman Jain", "King Han", "Alex Gu", "Wen-Ding Li", "Fanjia Yan", "Tianjun Zhang", "Sida Wang", "Armando Solar-Lezama", "Koushik Sen", "Ion Stoica"], 379 "year": 2024, 380 "arxiv_id": "2403.07974", 381 "relevance": "Task benchmark used to evaluate coding capabilities; relevant as a contamination-aware coding evaluation benchmark for LLMs." 382 }, 383 { 384 "title": "GPQA: A Graduate-Level Google-Proof Q&A Benchmark", 385 "authors": ["David Rein", "Betty Li Hou", "Asa Cooper Stickland", "Jackson Petty", "Richard Yuanzhe Pang", "Julien Dirani", "Julian Michael", "Samuel R. Bowman"], 386 "year": 2024, 387 "relevance": "General reasoning benchmark used to evaluate task utility of the safety-trained agents; widely used for LLM capability evaluation." 
388 }, 389 { 390 "title": "REINFORCE++: A Simple and Efficient Approach for Aligning Large Language Models", 391 "authors": ["Jian Hu"], 392 "year": 2025, 393 "arxiv_id": "2501.03262", 394 "relevance": "The RL training algorithm used in AdvEvo-MARL for policy optimization, directly relevant to LLM alignment via RL." 395 }, 396 { 397 "title": "Evo-MARL: Co-Evolutionary Multi-Agent Reinforcement Learning for Internalized Safety", 398 "authors": ["Zhenyu Pan", "Yiting Zhang", "Yutong Zhang", "Jianshu Zhang", "Haozheng Luo", "Yuwei Han", "Dennis Wu", "Hong-Yu Chen", "Philip S. Yu", "Manling Li", "Han Liu"], 399 "year": 2025, 400 "arxiv_id": "2508.03864", 401 "relevance": "A companion/predecessor paper from the same group that appears to be a related earlier version or concurrent work on MARL for safety in multi-agent systems." 402 }, 403 { 404 "title": "GuardAgent: Safeguard LLM Agents via Knowledge-Enabled Reasoning", 405 "authors": ["Zhen Xiang", "Linzhi Zheng", "Yanjie Li", "Junyuan Hong", "Qinbin Li", "Han Xie", "Jiawei Zhang", "Zidi Xiong", "Chulin Xie", "Nathaniel D Bastian"], 406 "year": 2025, 407 "relevance": "External guard agent approach for LLM agent safety, representing the class of defenses that AdvEvo-MARL aims to supersede." 408 }, 409 { 410 "title": "AgentPoison: Red-Teaming LLM Agents via Poisoning Memory or Knowledge Bases", 411 "authors": ["Zhaorun Chen", "Zhen Xiang", "Chaowei Xiao", "Dawn Song", "Bo Li"], 412 "year": 2024, 413 "relevance": "Adversarial attack on LLM agents relevant to the safety evaluation context of multi-agent systems." 414 }, 415 { 416 "title": "Red-Teaming LLM Multi-Agent Systems via Communication Attacks", 417 "authors": ["Pengfei He", "Yupin Lin", "Shen Dong", "Han Xu", "Yue Xing", "Hui Liu"], 418 "year": 2025, 419 "arxiv_id": "2502.14847", 420 "relevance": "Attack methodology on multi-agent communication relevant to the adversarial threat model studied in this paper." 
421 }, 422 { 423 "title": "G-Safeguard: A Topology-Guided Security Lens and Treatment on LLM-based Multi-Agent Systems", 424 "authors": ["Shilong Wang", "Guibin Zhang", "Miao Yu", "Guancheng Wan", "Fanci Meng", "Chongye Guo", "Kun Wang", "Yang Wang"], 425 "year": 2025, 426 "arxiv_id": "2502.11127", 427 "relevance": "Graph-neural-network-based defense for multi-agent system safety, an alternative approach to the RL-based method in this paper." 428 }, 429 { 430 "title": "LlamaFirewall: An Open Source Guardrail System for Building Secure AI Agents", 431 "authors": ["Sahana Chennabasappa", "Cyrus Nikolaidis", "Daniel Song", "David Molnar", "Stephanie Ding", "Shengye Wan", "Spencer Whitman", "Lauren Deason", "Nicholas Doucette", "Abraham Montilla"], 432 "year": 2025, 433 "arxiv_id": "2505.03574", 434 "relevance": "External guardrail system for AI agent safety, representing the external guard paradigm that AdvEvo-MARL contrasts with." 435 } 436 ] 437 }