scan.json (25517B)
1 { 2 "paper": { 3 "title": "Among Us: Measuring and Mitigating Malicious Contributions in Model Collaboration Systems", 4 "authors": ["Ziyuan Yang", "Wenxuan Ding", "Shangbin Feng", "Yulia Tsvetkov"], 5 "year": 2026, 6 "venue": "arXiv", 7 "arxiv_id": "2602.05176" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": true, 14 "justification": "The abstract states 'Our code is available at https://github.com/Ziyuan-Yang/AmongUs' and provides the URL." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The paper uses publicly available benchmarks (GSM8k, HumanEval, MMLU-redux, TruthfulQA, DS-1000, IFEval, IFBench, CocoNot, SafetyBench, NLGraph). The adversarial datasets are constructed from existing public benchmarks and the code repository is provided." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "The paper mentions 'A100 GPUs with 40G VRAM' and specific model names but does not provide a requirements.txt, Dockerfile, or detailed library version specifications sufficient to recreate the environment." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "While a GitHub repository is referenced and experimental details are given in the appendix (hyperparameters, LoRA settings, training epochs), there are no explicit step-by-step reproduction instructions or README with commands described in the paper itself." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "Results in Tables 1 and 2 are reported as point estimates only (e.g., .568, .799). No confidence intervals, error bars, or ± notation are provided." 
37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": true, 41 "justification": "Table 1 notes 'statistically significant performance drops further in underline', indicating significance tests were performed. The paper distinguishes between general drops (gray) and statistically significant drops (underlined)." 42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "The paper reports percentage performance drops with baselines for context: e.g., 'lowering worst-case performance by 34.9% on average', '7.12% and 7.94% drop' for reasoning and safety, 'recovering 95.24% of the original performance' (from 0.368 to 0.541). These provide magnitude with baseline context." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "No justification is provided for the choice of 10 datasets, 5 domains, or the specific test set sizes (ranging from 140 to 1000 examples as shown in Table 4). No power analysis is discussed." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "No standard deviations, variance, or spread measures across experimental runs are reported. Results appear to be single-run numbers." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "Each collaboration method is evaluated with a benign-only baseline (no malicious models) against which malicious settings are compared, shown as the first row of each section in Tables 1 and 2." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "The collaboration methods studied include recent work: RouteLLM (Ong et al., 2025), GraphRouter (Feng et al., 2025e), and the MoCo framework (Feng et al., 2026). The model pool uses Qwen2.5-7B-Instruct (2024)." 
69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": true, 73 "justification": "The paper systematically ablates across four malicious methods (M1-M4), studies the effect of malicious model count (Section 5, Figure 3), maliciousness diversity (Figure 2), steering factor alpha (Table 6), and out-of-domain SFT transfer (Figure 4)." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "Performance is evaluated across 10 datasets spanning 5 domains (safety, reasoning, knowledge, coding, instruction following), each with domain-appropriate metrics (accuracy, pass rates, compliance scores)." 79 }, 80 "human_evaluation": { 81 "applies": true, 82 "answer": false, 83 "justification": "The paper relies entirely on automated evaluation (regex matching, LLM-based verifier, multiple-choice accuracy, sandbox execution). The Limitations section acknowledges 'Incorporating human evaluation and real-world task settings is an important direction for future work.'" 84 }, 85 "held_out_test_set": { 86 "applies": true, 87 "answer": true, 88 "justification": "Table 4 shows explicit dev/test splits for all 10 datasets. The routers are trained on development sets and results are reported on separate test sets." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Table 1 provides per-dataset breakdowns across all 10 benchmarks, and Table 2 shows per-benchmark mitigation results. The analysis discusses domain-specific impacts (safety vs. reasoning vs. coding)." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "Tables 7-15 in the appendix provide qualitative examples of both correct and incorrect (malicious) responses. 
Section 4.2 discusses domains where mitigation fails: 'the average recovery rate still remains around 89.2% for the CocoNot benchmark, and the worst case drops to approximately 75%.'" 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "The paper reports that prompting-based attacks have 'relatively limited impact' (3.24% and 3.65%), some malicious settings showed no degradation (LOGIT AVERAGE, GREEDY SOUP), and mitigation strategies fail to fully recover safety performance. It also notes that 'fully mitigating the impact of malicious models in safety-critical collaboration remains an open problem.'" 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "The abstract claims 'performance is lowered by 7.12% and 7.94% on average' for reasoning and safety, and mitigation 'recover 95.31% of the initial performance.' These specific numbers are supported by Tables 1 and 2 in the results." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": true, 115 "justification": "The paper makes causal claims about malicious models degrading performance. These are justified through controlled experiments: the only variable changed is the introduction of malicious models into a fixed collaboration system, with benign-only baselines. The ablation design (varying malicious method, count, domain) constitutes controlled single-variable manipulation." 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": true, 120 "justification": "The Limitations section explicitly bounds findings: 'our experiments focus on a limited set of collaboration methods... 
and model architectures', 'the conclusions may not directly generalize to all emerging collaboration paradigms or proprietary systems', and 'mitigation strategies are evaluated primarily on API-level and Text-level collaboration methods.'" 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": false, 125 "justification": "The paper does not substantively discuss alternative explanations for the observed performance degradation. For example, it does not consider whether the degradation could be partly attributed to adding any additional (non-malicious but lower-quality) model to the pool, or whether the router/collaboration methods simply fail with any heterogeneous model addition." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": true, 132 "justification": "Specific model versions are provided: 'Qwen2.5-7B-Instruct', 'Llama-3.1-8B-Instruct', 'Olmo-3-7B-Instruct', 'Llama-3.1-Tulu-3-8B-DPO', 'Mistral-7B-Instruct-v0.2', 'Skywork-Reward-Llama-3.1-8B', and 'sentence-transformers/all-MiniLM-L6-v2'. These include version identifiers." 133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": true, 137 "justification": "The full malicious model prompt for M1-Prompting is provided in Appendix A.1. The paper states TEXT DEBATE 'employs prompts in (Du et al., 2024)' referencing the original work. The malicious persona prompt is given verbatim." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": true, 142 "justification": "Appendix A.2 specifies 'top-p = 0.9, temperature=0.7 and 256 maximum generation length.' LoRA training uses 'rank 16, batch size 32... 5 epochs'. GRAPH ROUTER: '500 epochs and 32 batch size.' Logit contrastive: 'λ = 0.2'. Activation steering: 'α = 5.0'." 143 }, 144 "scaffolding_described": { 145 "applies": false, 146 "answer": false, 147 "justification": "The paper does not use agentic scaffolding. 
The collaboration methods (routing, debate, logit fusion, model merging) are model collaboration approaches, not agentic scaffolding with tools, memory, or retry logic." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": true, 152 "justification": "Appendix A.1 describes how adversarial datasets were constructed for each domain (M3-SFT), including source datasets, sizes, and how GPT-4o was used to generate misaligned outputs. Appendix A.3 describes model pool construction with specific training data subsets. Dataset dev/test splits are documented in Table 4." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": true, 159 "justification": "A dedicated 'Limitations' section is present, discussing four specific limitations including the scope of malicious behaviors, limited collaboration methods, mitigation only for API/text-level, and reliance on automated evaluation." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": true, 164 "justification": "The limitations are specific to this study: 'Our malicious settings may not fully capture more sophisticated or adaptive adversaries', 'the conclusions may not directly generalize to all emerging collaboration paradigms or proprietary systems', and 'mitigation strategies are evaluated primarily on API-level and Text-level collaboration methods.'" 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": true, 169 "justification": "The Limitations section states specific boundaries: mitigation not tested for logit-level and weight-level, limited to studied collaboration methods and model architectures, and acknowledges results use automated evaluation rather than real-world deployment scenarios." 
170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": false, 176 "justification": "While the code repository is provided, the paper does not mention releasing raw experimental outputs, model predictions, or intermediate results that would enable independent verification of the reported numbers." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "Appendix A.1 describes adversarial dataset construction in detail: sources, sizes (e.g., '6k vulnerable coding dataset from (Betley et al., 2025)', '7.5k misaligned MATH questions'), and methods. Appendix A.4 describes benchmark evaluation procedures." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants are involved; all data comes from publicly available benchmarks and model-generated outputs." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": true, 191 "justification": "The pipeline is documented: malicious model construction (Section 2.1, Appendix A.1) → model pool integration (Section 2.2) → collaboration execution → evaluation via automated judges. Training details, dataset sizes, and evaluation methods are specified." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": false, 198 "justification": "No acknowledgments section listing funding sources is present in the paper. No mention of grants or corporate sponsors." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Author affiliations are clearly stated: University of Washington (Yang, Feng, Tsvetkov) and New York University (Ding). The authors are academic researchers, not employees of the model providers being studied." 
204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding source is disclosed, so independence cannot be assessed. The absence of a funding disclosure means this criterion cannot be verified." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests or financial interests statement is present in the paper." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": true, 219 "answer": false, 220 "justification": "The paper uses Qwen2.5-7B-Instruct and evaluates on benchmarks like HumanEval and GSM8k but does not state the training data cutoff date for any of the models used." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": true, 224 "answer": false, 225 "justification": "No discussion of whether the test benchmarks (HumanEval, GSM8k, MMLU-redux, etc.) may have appeared in the training data of Qwen2.5-7B-Instruct or other models." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": true, 229 "answer": false, 230 "justification": "HumanEval (2021), GSM8k (2021), and other benchmarks predate the models used. No discussion of contamination risk for these well-known benchmarks that are widely present in training corpora." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants are involved in this study." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants are involved. The Ethics Statement discusses responsible use of adversarial research but no IRB is needed." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants are involved in this study." 
248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants are involved in this study." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants are involved in this study." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants are involved in this study." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants are involved in this study." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": false, 274 "justification": "No inference costs, API costs, tokens consumed, or wall-clock time are reported. The paper mentions A100 GPUs but does not quantify the cost per experiment or total compute." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "Only 'A100 GPUs with 40G VRAM' is mentioned. No total GPU hours, training time, or computational budget is stated for the extensive experiments (8 collaboration methods x 4 malicious settings x 10 datasets)." 280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "Malicious models have a severe impact on multi-LLM collaboration systems, lowering worst-case performance by 34.9% on average.", 286 "evidence": "Table 1 shows GRAPH ROUTER with M2-Steering has the largest macro-average degradation of 34.99%. 
Section 4.1 reports this finding.", 287 "supported": "strong" 288 }, 289 { 290 "claim": "Safety and reasoning domains are disproportionately affected, with 7.94% and 7.12% average performance drops respectively.", 291 "evidence": "Table 1 results show CocoNot (safety) average drop of 22.05% and GSM8k (reasoning) average drop of 11.15% across collaboration methods (Section 4.1).", 292 "supported": "strong" 293 }, 294 { 295 "claim": "Mitigation strategies recover 95.31% of initial performance on average.", 296 "evidence": "Table 2 shows recovery rates across benchmarks and collaboration methods with supervisor-based methods. Section 4.2 reports the average recovery rate.", 297 "supported": "strong" 298 }, 299 { 300 "claim": "Deeper collaborations (logit-level and weight-level) are more robust to malicious models.", 301 "evidence": "Table 1 shows LOGIT AVERAGE and GREEDY SOUP have minimal macro-average degradation compared to API-level and text-level methods. Section 4.1 discusses this.", 302 "supported": "strong" 303 }, 304 { 305 "claim": "Activation steering and RL produce more impactful malicious models than prompting or SFT.", 306 "evidence": "Section 4.1 reports M2-Steering causes the most impact on API-level methods, and M4-RL causes 12.29% average drop, versus prompting at 3.24% and 3.65%. 
Table 1 supports these numbers.", 307 "supported": "strong" 308 }, 309 { 310 "claim": "Concentrated/uniform malicious behaviors are more detrimental than diverse malicious patterns.", 311 "evidence": "Figure 2 and Section 5 (Maliciousness Diversity) show performance degrades as malicious diversity decreases, described as a 'counterintuitive result.'", 312 "supported": "moderate" 313 }, 314 { 315 "claim": "Supervisor-based mitigation outperforms supervisor-free mitigation by 4.5% recovery rate on average.", 316 "evidence": "Section 4.2 reports this finding based on Table 2 results across M2 and M4 settings and five benchmarks.", 317 "supported": "moderate" 318 } 319 ], 320 "methodology_tags": ["benchmark-eval"], 321 "key_findings": "Malicious models injected into multi-LLM collaboration systems (routing, debate, logit fusion, model merging) cause significant performance degradation, with worst-case drops of 34.9% particularly affecting safety and reasoning domains. Deeper collaboration methods (logit-level, weight-level) are more robust than API-level and text-level approaches. Proposed mitigation strategies using external supervisors (LLM-as-a-judge or reward models) recover 95.31% of initial performance on average, though safety-critical tasks remain challenging with recovery rates around 89.2%. Cross-domain malicious fine-tuning can transfer negative effects but is generally weaker than in-domain attacks.", 322 "red_flags": [ 323 { 324 "flag": "No variance or uncertainty reporting", 325 "detail": "All results across Tables 1 and 2 are single-point estimates with no standard deviations, confidence intervals, or error bars. With stochastic model generation (top-p=0.9, temperature=0.7), run-to-run variance could be substantial." 326 }, 327 { 328 "flag": "Benchmark contamination not discussed", 329 "detail": "The paper uses well-known benchmarks (HumanEval, GSM8k, MMLU) with models that may have been trained on them. 
While contamination is less critical for measuring relative degradation, baseline performance levels may be inflated." 330 }, 331 { 332 "flag": "No compute budget disclosed", 333 "detail": "The paper runs 8 collaboration methods x 4 malicious settings x 10 datasets plus mitigation experiments and analyses, but provides no total compute cost or GPU hours, making it difficult to assess practical replicability." 334 }, 335 { 336 "flag": "Missing alternative explanation for degradation", 337 "detail": "The paper does not control for whether adding any additional model (even a non-malicious but differently-skilled one) to the pool would cause similar degradation. The comparison is always benign-pool vs. benign-pool + malicious, never benign-pool vs. benign-pool + random additional benign model." 338 } 339 ], 340 "cited_papers": [ 341 { 342 "title": "Emergent misalignment: Narrow finetuning can produce broadly misaligned LLMs", 343 "authors": ["Jan Betley", "Daniel Chee Hian Tan", "Niels Warncke", "Anna Sztyber-Betley", "Xuchan Bao", "Martín Soto", "Nathan Labenz", "Owain Evans"], 344 "year": 2025, 345 "relevance": "Studies how adversarial fine-tuning in one domain can trigger universal maliciousness that transfers across domains, directly motivating the cross-domain SFT analysis in this paper." 346 }, 347 { 348 "title": "RouteLLM: Learning to route LLMs from preference data", 349 "authors": ["Isaac Ong", "Amjad Almahairi", "Vincent Wu", "Wei-Lin Chiang"], 350 "year": 2025, 351 "relevance": "Proposes the LLM Router approach for routing queries to appropriate models, one of the key collaboration methods evaluated in this paper." 352 }, 353 { 354 "title": "GraphRouter: A graph-based router for LLM selections", 355 "authors": ["Tao Feng", "Yanzhen Shen", "Jiaxuan You"], 356 "year": 2025, 357 "relevance": "Proposes graph neural network-based routing for LLM selection, the other API-level collaboration method evaluated." 
358 }, 359 { 360 "title": "Improving factuality and reasoning in language models through multiagent debate", 361 "authors": ["Yilun Du", "Shuang Li", "Antonio Torralba", "Joshua B. Tenenbaum", "Igor Mordatch"], 362 "year": 2024, 363 "relevance": "Introduces multi-agent debate for improving LLM factuality, one of the text-level collaboration methods evaluated under malicious settings." 364 }, 365 { 366 "title": "On the resilience of multi-agent systems with malicious agents", 367 "authors": ["Jen-tse Huang", "Jiaxu Zhou", "Tailin Jin"], 368 "year": 2025, 369 "relevance": "Studies how malicious agents affect multi-agent system behavior, directly related prior work on adversarial multi-LLM safety." 370 }, 371 { 372 "title": "Adaptive attacks break defenses against indirect prompt injection attacks on LLM agents", 373 "authors": ["Qiusi Zhan", "Richard Fang", "Henil Shalin Panchal", "Daniel Kang"], 374 "year": 2025, 375 "relevance": "Studies adaptive attacks against LLM agent defenses, relevant to understanding attack-defense dynamics in multi-agent systems." 376 }, 377 { 378 "title": "G-safeguard: A topology-guided security lens and treatment on LLM-based multi-agent systems", 379 "authors": ["Shilong Wang", "Guibin Zhang", "Miao Yu"], 380 "year": 2025, 381 "relevance": "Proposes topology-guided security methods for multi-agent LLM systems, a mitigation approach for the same threat model studied here." 382 }, 383 { 384 "title": "MoCo: A one-stop shop for model collaboration research", 385 "authors": ["Shangbin Feng", "Yuyang Bai", "Ziyuan Yang"], 386 "year": 2026, 387 "relevance": "Framework used for implementing all collaboration methods in this paper, representing the state-of-the-art infrastructure for model collaboration research." 388 }, 389 { 390 "title": "Watch out for your agents! 
Investigating backdoor threats to LLM-based agents", 391 "authors": ["Wenkai Yang", "Xiaohan Bi", "Yankai Lin"], 392 "year": 2024, 393 "relevance": "Studies backdoor threats to LLM-based agents, a complementary attack vector to the malicious model injection studied here." 394 }, 395 { 396 "title": "Peacemaker or troublemaker: How sycophancy shapes multi-agent debate", 397 "authors": ["Binwei Yao", "Chao Shang", "Wanyu Du"], 398 "year": 2025, 399 "relevance": "Studies how sycophantic behavior affects multi-agent debate quality, related to understanding failure modes in LLM collaboration." 400 }, 401 { 402 "title": "When one LLM drools, multi-LLM collaboration rules", 403 "authors": ["Shangbin Feng", "Wenxuan Ding", "Alisa Liu"], 404 "year": 2025, 405 "relevance": "Foundational survey on multi-LLM collaboration paradigms that motivates and contextualizes this security-focused study." 406 }, 407 { 408 "title": "NetSafe: Exploring the topological safety of multi-agent system", 409 "authors": ["Miao Yu", "Shilong Wang", "Guibin Zhang"], 410 "year": 2025, 411 "relevance": "Explores topological safety properties of multi-agent systems, relevant to understanding structural vulnerabilities in collaborative AI." 412 } 413 ] 414 }