scan.json (26000B)
1 { 2 "paper": { 3 "title": "MAD-SPEAR: A Conformity-Driven Prompt Injection Attack on Multi-Agent Debate Systems", 4 "authors": ["Yu Cui", "Hongyang Du"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2507.13038" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": false, 14 "justification": "No repository URL, GitHub link, or code archive is provided anywhere in the paper. The paper references the SoM framework and external APIs but does not release its own attack code or experimental scripts." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The paper uses publicly available datasets: GSM-Ranges (Shrestha, Kim, and Ross 2025) and Logical Fallacies from MMLU (Hendrycks et al. 2021). These are standard public benchmarks that were not modified." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "No requirements.txt, Dockerfile, conda environment, or detailed library versions are provided. The paper mentions using specific model APIs (DeepSeek, moonshot, Qwen) and the DeepSeek tokenizer for token counting, but does not specify a reproducible environment." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "No step-by-step reproduction instructions, README, or scripts are provided. Algorithm 1 describes the attack process at a high level but the paper does not provide enough implementation detail (e.g., exact prompt injection templates, SoM modification code) for independent reproduction." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "Results are reported as point estimates (e.g., '56.66% ASR', '100% to 26.67% accuracy') with no confidence intervals, error bars, or uncertainty quantification on any figures or tables." 
37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper makes comparative claims (e.g., MAD-SPEAR achieves '8x improvement in attack success rate' over baseline) but provides no statistical significance tests such as p-values or hypothesis tests." 42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "The paper reports effect sizes in context: 'accuracy on Level 4 drops sharply from 100% to 26.67%', 'accuracy drops from 86.67% to 46.67%', 'over an 8x improvement in attack success rate', 'more than a 3x degradation in scalability'. Baselines and attacked conditions are provided for context." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "The paper does not justify the sample sizes used. It is unclear how many problems from each GSM-Ranges level or the Logical Fallacies dataset were used. The paper states results in percentages (e.g., 86.67%, 46.67%) suggesting small sample sizes (likely 15 problems given the granularity), but this is never justified or discussed." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "No standard deviations, variance, or spread measures are reported across experimental runs. Results appear to be single-run numbers with no indication of multiple runs or result stability." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "The paper includes baselines: (1) MAD under no attack conditions, and (2) the infinite loop attack from Zhang et al. (2024a), identified as the stronger of two existing attack methods. Table 1 compares No Attack, Baseline, and MAD-SPEAR." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "The baseline attack (infinite loop from Zhang et al. 2024a) is from 2024, which is recent and relevant. 
The paper acknowledges it as the state-of-the-art attack method for multi-agent systems." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": true, 73 "justification": "The paper investigates several component variations: (1) reducing the proportion of compromised agents from 1/4 to 1/6 (Figure 4), (2) comparing heterogeneous vs. homogeneous MAD (Table 2), (3) varying debate rounds (3 vs 4), and (4) the composite attack combining prompt injection with communication attack." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "Three evaluation metrics are used: accuracy (attack success rate), scalability (token consumption), and consensus speed (number of rounds to reach consensus). These are defined in the Evaluation Approach section." 79 }, 80 "human_evaluation": { 81 "applies": false, 82 "answer": false, 83 "justification": "This is a benchmark evaluation of automated attack methods on multi-agent debate systems. Human evaluation of system outputs is not relevant to the claims being made about attack effectiveness." 84 }, 85 "held_out_test_set": { 86 "applies": true, 87 "answer": false, 88 "justification": "The paper does not discuss any separation between development and test data. It is unclear whether hyperparameters of the attack (e.g., number of Sybil agents L, prompt template design) were tuned on the same data used for final evaluation." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Results are broken down by dataset difficulty level (Level 3-6 in GSM-Ranges) in Figure 5 and Figure 3, plus a separate evaluation on the Logical Fallacies dataset. Per-agent token consumption is shown in Figure 6." 
94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "The appendix (Figure 7) provides detailed analysis of how the attack affects reasoning processes of non-compromised agents, showing the contradiction and self-doubt stages. The paper also notes that reasoning LLMs show 'relatively stronger resistance' to the attack compared to traditional LLMs." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "The paper reports that reasoning LLMs demonstrate 'a higher level of confidence' and 'relatively stronger resistance against our prompt injection attack' compared to traditional LLMs. It also reports that the attack on homogeneous MAD (Qwen1.5-32B) only caused a drop from 94% to 78%, a smaller effect than on heterogeneous MAD." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "The abstract claims that MAD-SPEAR 'consistently outperforms the baseline attack in degrading system performance' (supported by Table 1 and Figure 5), that it works with 'five benchmark datasets with varying difficulty levels' (supported by Level 3-6 GSM-Ranges + Logical Fallacies), and that 'agent diversity substantially improves MAD performance' (supported by Table 2)." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": true, 115 "justification": "The paper's main causal claims are that the prompt injection attack causes degradation in MAD performance. The experimental design (controlled comparison of MAD with and without attack, using the same datasets and settings) is adequate for this type of causal claim. The ablation of attack components (varying compromised agent ratio, composite attack) provides controlled single-variable manipulation." 
116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": false, 120 "justification": "The title and abstract speak broadly of 'Multi-Agent Debate Systems', but experiments are conducted only on the SoM framework with a maximum of 6 agents, using only mathematical reasoning (GSM-Ranges) and one multiple-choice task (MMLU Logical Fallacies). The generalizability section argues adaptation to Sparse MAD is easy but provides no experiments. The claim of 'broad applicability' is not well-bounded." 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": false, 125 "justification": "The paper does not substantively discuss alternative explanations for its results. For example, the dramatic accuracy drops could be partly due to the specific models used, the small sample sizes, or the particular task types. The limitations section mentions budget constraints and scale limits but does not discuss confounding factors for the observed results." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": true, 132 "justification": "The paper specifies 'DeepSeek-R1-0528' (with a footnote linking to the API docs for this specific version), 'moonshot-v1-32k' (with API link), and 'Qwen1.5-32B-Chat' (with HuggingFace link). The version identifiers include specific version numbers or snapshot dates." 133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": true, 137 "justification": "The full injected prompt template is provided in the appendix under 'Injected Data' and 'Injected Data (continued)', spanning approximately two pages with the complete text of the attack prompt including the example, instructions, and output template." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": false, 142 "justification": "The paper does not report temperature, top-p, max tokens, or other sampling parameters for any of the LLM API calls. 
The number of Sybil agents (L=2), the number of debate rounds (delta_R=3 or 4), and the number of agents (N=4 or 6) are stated, but LLM inference hyperparameters are missing." 143 }, 144 "scaffolding_described": { 145 "applies": true, 146 "answer": true, 147 "justification": "The MAD scaffolding is described: the paper uses the SoM framework (Du et al. 2024), modified to support heterogeneous MAD. The debate process (multi-round agent interaction, consensus mechanism), attack injection process (Algorithm 1), and message passing between agents are detailed." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": false, 152 "justification": "The paper states that the authors 'specifically select subsets of the dataset featuring level 3 to 6 perturbations' from GSM-Ranges and use the 'Logical Fallacies dataset from MMLU', but it does not specify how many problems were selected, what the selection criteria were, or how subsets were drawn from these benchmarks." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": true, 159 "justification": "The appendix contains a 'Limitations and Ethical Considerations' section that discusses budget constraints and the maximum of six agents, and acknowledges that 'investigating the behavior of larger-scale MAD systems under attack remains an important direction for future research.'" 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": true, 164 "justification": "The limitations section mentions a specific threat: 'Due to budget constraints, the MAD system implemented in our experiments includes up to six agents.' This is a concrete limitation specific to this study, though the discussion could be more extensive." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": false, 169 "justification": "The paper does not explicitly state what the results do NOT show. 
It does not bound its claims to specific model families, task types, or MAD frameworks tested. The limitation about scale (up to 6 agents) is mentioned, but there are no explicit statements about what settings or populations are excluded from the claims." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": false, 176 "justification": "No raw experimental data (individual problem results, model outputs, conversation logs) is made available. Only aggregated results in figures and tables are shown." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": false, 181 "justification": "The paper describes which datasets were used (GSM-Ranges Level 3-6, MMLU Logical Fallacies) and which models (DeepSeek-R1-0528, moonshot-v1-32k, Qwen1.5-32B-Chat) but does not specify how many problems were sampled from each dataset, the exact selection process, or the time period of experiments." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants in this study. Data sources are standard public benchmarks." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": false, 191 "justification": "The pipeline from dataset selection to final results is not documented in detail. How problems were selected from each dataset, how the SoM framework was configured, and the complete flow from data input to reported metrics all lack step-by-step documentation." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": false, 198 "justification": "No funding source is disclosed anywhere in the paper. There is no acknowledgments section listing grants or sponsors." 
199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Author affiliations are clearly listed: both authors are from the Department of Electrical and Electronic Engineering, The University of Hong Kong. Yu Cui's work was done during an internship at HKU." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding is disclosed, so independence cannot be assessed. The paper uses DeepSeek and moonshot APIs but does not disclose whether API access was granted/sponsored by these companies." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests statement or financial interest declaration is present in the paper." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": true, 219 "answer": false, 220 "justification": "The paper evaluates DeepSeek-R1-0528, moonshot-v1-32k, and Qwen1.5-32B-Chat on GSM-Ranges and MMLU benchmarks but does not state the training data cutoff dates for any of these models." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": true, 224 "answer": false, 225 "justification": "No discussion of potential train/test overlap. MMLU (Hendrycks et al. 2021) has been publicly available since 2021 and could easily be in the training data of models released in 2024-2025. GSM-Ranges is derived from GSM8K which is also widely known. Neither overlap is discussed." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": true, 229 "answer": false, 230 "justification": "MMLU was published in 2021 and GSM-based benchmarks have been publicly available for years. All three models used were trained well after these benchmarks were published. No contamination analysis or discussion is provided." 
231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants in this study." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants in this study." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": true, 274 "justification": "Token consumption is explicitly measured and reported as a key metric. Figure 3 shows output token consumption across datasets, and Table 1 reports average token consumption (e.g., 26947 for no attack vs. 85101.5 under MAD-SPEAR). This serves as a proxy for API cost." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "No total computational budget is stated. The paper does not report total API spend, number of API calls, wall-clock time for experiments, or hardware used." 
280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "MAD-SPEAR achieves an average attack success rate of 56.66%, over 8x higher than the baseline infinite loop attack (6.67%).", 286 "evidence": "Table 1 compares MAD-SPEAR with the baseline attack on Dataset Level 3-4, showing 56.66% vs 6.67% ASR.", 287 "supported": "moderate" 288 }, 289 { 290 "claim": "MAD-SPEAR causes more than 3x degradation in scalability (token consumption) compared to baseline conditions.", 291 "evidence": "Table 1 shows average token consumption of 85101.50 under MAD-SPEAR vs 26947.00 under no attack and 26959.00 under baseline. Figure 3 shows per-level token consumption comparisons.", 292 "supported": "moderate" 293 }, 294 { 295 "claim": "The attack's effectiveness does not diminish when the proportion of compromised agents decreases from 1/4 to 1/6.", 296 "evidence": "Figure 4 shows results with N=6 (1/6 compromised), where the attack maintains substantial disruptive impact on accuracy.", 297 "supported": "weak" 298 }, 299 { 300 "claim": "Agent diversity (heterogeneous MAD) significantly improves mathematical reasoning performance, with a ~56% accuracy improvement over homogeneous MAD.", 301 "evidence": "Table 2 shows heterogeneous MAD achieves 93.33% average accuracy vs 60.00% for homogeneous MAD under normal conditions.", 302 "supported": "moderate" 303 }, 304 { 305 "claim": "The attack becomes increasingly effective as the number of debate rounds increases, pushing the system from finite MAD toward infinite MAD.", 306 "evidence": "Figure 4 compares results with delta_R=3 vs delta_R=4, showing lower probability of correct convergence with more rounds under attack.", 307 "supported": "moderate" 308 } 309 ], 310 "methodology_tags": ["benchmark-eval"], 311 "key_findings": "MAD-SPEAR is a prompt injection attack on multi-agent debate systems that exploits LLM conformity by creating Sybil agents through a single compromised agent. 
The attack achieves a 56.66% average attack success rate, over 8x higher than the baseline infinite loop attack, while causing more than 3x the token consumption. Even compromising only 1/6 of agents maintains attack effectiveness. The paper also finds that agent diversity (heterogeneous models) substantially improves MAD mathematical reasoning performance (~56% relative accuracy gain over homogeneous MAD), contradicting prior work.", 312 "red_flags": [ 313 { 314 "flag": "Very small sample sizes", 315 "detail": "Results reported in percentages like 86.67%, 46.67%, 26.67% suggest samples of approximately 15 problems per dataset level. Such small samples make individual results highly variable, yet no uncertainty quantification is provided." 316 }, 317 { 318 "flag": "No statistical uncertainty quantification", 319 "detail": "All results are point estimates with no confidence intervals, error bars, standard deviations, or multiple-run analysis. Given the small apparent sample sizes, the results could easily shift with different random samples." 320 }, 321 { 322 "flag": "Single framework evaluation", 323 "detail": "All experiments use only the SoM framework. The generalizability claim to other MAD frameworks (e.g., Sparse MAD) is argued theoretically but never tested experimentally." 324 }, 325 { 326 "flag": "Fixed attack target", 327 "detail": "The paper states 'we consistently designate the first of the four agents as the target of the attack.' This means no randomization of which agent is attacked, potentially confounding results with agent-position effects." 328 }, 329 { 330 "flag": "Contamination risk unaddressed", 331 "detail": "The evaluation uses MMLU (published 2021) and GSM-derived benchmarks with models from 2024-2025 that almost certainly saw these benchmarks during training. The baseline 'no attack' accuracy could be inflated by memorization, making the attack's impact appear larger." 
332 } 333 ], 334 "cited_papers": [ 335 { 336 "title": "Breaking agents: Compromising autonomous llm agents through malfunction amplification", 337 "authors": ["B. Zhang", "Y. Tan", "Y. Shen", "A. Salem", "M. Backes", "S. Zannettou", "Y. Zhang"], 338 "year": 2024, 339 "relevance": "Proposes prompt injection attacks (infinite loop, incorrect function execution) against LLM agents, used as the baseline comparison in this paper." 340 }, 341 { 342 "title": "Red-teaming llm multi-agent systems via communication attacks", 343 "authors": ["P. He", "Y. Lin", "S. Dong", "H. Xu", "Y. Xing", "H. Liu"], 344 "year": 2025, 345 "relevance": "Proposes communication attacks on multi-agent systems that are combined with MAD-SPEAR for the composite attack strategy." 346 }, 347 { 348 "title": "Amplified Vulnerabilities: Structured Jailbreak Attacks on LLM-based Multi-Agent Debate", 349 "authors": ["S. Qi", "Y. Zou", "P. Li", "Z. Lin", "X. Cheng", "D. Yu"], 350 "year": 2025, 351 "relevance": "Studies jailbreak attacks on multi-agent debate systems, directly relevant to MAD system security." 352 }, 353 { 354 "title": "Agents Under Siege: Breaking Pragmatic Multi-Agent LLM Systems with Optimized Prompt Attacks", 355 "authors": ["R. M. S. Khan", "Z. Tan", "S. Yun", "C. Flemming", "T. Chen"], 356 "year": 2025, 357 "relevance": "Studies optimized prompt attacks against multi-agent LLM systems." 358 }, 359 { 360 "title": "IP Leakage Attacks Targeting LLM-Based Multi-Agent Systems", 361 "authors": ["L. Wang", "W. Wang", "S. Wang", "Z. Li", "Z. Ji", "Z. Lyu", "D. Wu", "S.-C. Cheung"], 362 "year": 2025, 363 "relevance": "Studies IP leakage vulnerabilities in LLM multi-agent systems, relevant to multi-agent security." 364 }, 365 { 366 "title": "Formalizing and Benchmarking Prompt Injection Attacks and Defenses", 367 "authors": ["Y. Liu", "Y. Jia", "R. Geng", "J. Jia", "N. Z. 
Gong"], 368 "year": 2024, 369 "relevance": "Provides formal framework and benchmarks for prompt injection attacks and defenses, foundational to the attack taxonomy used." 370 }, 371 { 372 "title": "Automatic and universal prompt injection attacks against large language models", 373 "authors": ["X. Liu", "Z. Yu", "Y. Zhang", "N. Zhang", "C. Xiao"], 374 "year": 2024, 375 "relevance": "Proposes automated prompt injection attacks against LLMs, relevant to attack methodology against AI systems." 376 }, 377 { 378 "title": "Improving factuality and reasoning in language models through multiagent debate", 379 "authors": ["Y. Du", "S. Li", "A. Torralba", "J. B. Tenenbaum", "I. Mordatch"], 380 "year": 2024, 381 "relevance": "Introduces the SoM multi-agent debate framework used as the experimental platform in this paper." 382 }, 383 { 384 "title": "If Multi-Agent Debate is the Answer, What is the Question?", 385 "authors": ["H. Zhang", "Z. Cui", "X. Wang", "Q. Zhang", "Z. Wang", "D. Wu", "S. Hu"], 386 "year": 2025, 387 "relevance": "Comprehensive evaluation of existing MAD frameworks on multiple benchmarks, relevant to understanding MAD system capabilities." 388 }, 389 { 390 "title": "Which agent causes task failures and when? on automated failure attribution of llm multi-agent systems", 391 "authors": ["S. Zhang", "M. Yin", "J. Zhang", "J. Liu", "Z. Han", "J. Zhang", "B. Li", "C. Wang", "H. Wang", "Y. Chen"], 392 "year": 2025, 393 "relevance": "Studies automated failure attribution in LLM multi-agent systems, proposed as a potential defense mechanism." 394 }, 395 { 396 "title": "G-safeguard: A topology-guided security lens and treatment on llm-based multi-agent systems", 397 "authors": ["S. Wang", "G. Zhang", "M. Yu", "G. Wan", "F. Meng", "C. Guo", "K. Wang", "Y. Wang"], 398 "year": 2025, 399 "relevance": "Proposes security mechanisms for LLM multi-agent systems based on topology analysis, suggested as a potential defense." 
400 }, 401 { 402 "title": "Prompt infection: Llm-to-llm prompt injection within multi-agent systems", 403 "authors": ["D. Lee", "M. Tiwari"], 404 "year": 2024, 405 "relevance": "Studies LLM-to-LLM prompt injection propagation in multi-agent systems." 406 }, 407 { 408 "title": "Revisiting Multi-Agent Debate as Test-Time Scaling: A Systematic Study of Conditional Effectiveness", 409 "authors": ["Y. Yang", "E. Yi", "J. Ko", "K. Lee", "Z. Jin", "S.-Y. Yun"], 410 "year": 2025, 411 "relevance": "Systematic study of MAD effectiveness that claims agent diversity has minimal impact on performance, directly contradicted by this paper's findings." 412 } 413 ] 414 }