scan.json (25136B)
1 { 2 "paper": { 3 "title": "ARGUS: Defending Against Multimodal Indirect Prompt Injection via Steering Instruction-Following Behavior", 4 "authors": [ 5 "Weikai Lu", 6 "Ziqian Zeng", 7 "Kehua Zhang", 8 "Haoran Li", 9 "Huiping Zhuang", 10 "Ruidong Wang", 11 "Cen Chen", 12 "Hao Peng" 13 ], 14 "year": 2025, 15 "venue": "arXiv", 16 "arxiv_id": "2512.05745" 17 }, 18 "checklist": { 19 "artifacts": { 20 "code_released": { 21 "applies": true, 22 "answer": false, 23 "justification": "No repository URL, GitHub link, or code archive is provided anywhere in the paper or supplementary materials. No mention of code release." 24 }, 25 "data_released": { 26 "applies": true, 27 "answer": false, 28 "justification": "The paper constructs a benchmark dataset across image, video, and audio modalities, but no download link or repository for this dataset is provided. The underlying QA benchmarks (VTQA, MSR-VTT, Clotho-AQA) are public, but the constructed IPI benchmark is not released." 29 }, 30 "environment_specified": { 31 "applies": true, 32 "answer": false, 33 "justification": "The paper mentions 'four NVIDIA A800 GPUs' in Appendix B.2, but no software environment details (Python version, library versions, requirements.txt, Dockerfile) are provided." 34 }, 35 "reproduction_instructions": { 36 "applies": true, 37 "answer": false, 38 "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The paper describes the method but not how to reproduce the experiments." 39 } 40 }, 41 "statistical_methodology": { 42 "confidence_intervals_or_error_bars": { 43 "applies": true, 44 "answer": false, 45 "justification": "All results in Tables 1 and 3 are reported as single point estimates (e.g., ARGUS UIAinject=46.3, AIA=0.1) with no confidence intervals, error bars, or uncertainty measures." 
46 }, 47 "significance_tests": { 48 "applies": true, 49 "answer": false, 50 "justification": "The paper claims ARGUS outperforms baselines (e.g., 'ARGUS demonstrates the best safety-utility-efficiency trade-off') but provides no statistical significance tests. Comparisons are made solely by comparing numbers in tables." 51 }, 52 "effect_sizes_reported": { 53 "applies": true, 54 "answer": false, 55 "justification": "The paper reports raw metric values (AIA, UIA, AIFR) but does not report effect sizes such as Cohen's d, odds ratios, or relative improvements with sufficient context. Differences are stated numerically but without formal effect size measures." 56 }, 57 "sample_size_justified": { 58 "applies": true, 59 "answer": false, 60 "justification": "The test set contains 1000 samples per modality (Appendix A.1), but no justification is given for why this size was chosen. No power analysis is discussed." 61 }, 62 "variance_reported": { 63 "applies": true, 64 "answer": false, 65 "justification": "Results appear to be from single experimental runs. No standard deviation, variance across seeds, or multi-run results are reported anywhere in the paper." 66 } 67 }, 68 "evaluation_design": { 69 "baselines_included": { 70 "applies": true, 71 "answer": true, 72 "justification": "Section 6.1 includes five baselines: System Prompt, Ignore Prompt, Noise, Removal, and Adversarial Training (AT). Results are compared in Table 1." 73 }, 74 "baselines_contemporary": { 75 "applies": true, 76 "answer": true, 77 "justification": "The baselines include recent methods: SecAlign (2024), StruQ (USENIX 2025), Chen et al. (2024) for prompt-based defenses, and detection-based approaches. These represent contemporary approaches in the IPI defense space." 78 }, 79 "ablation_study": { 80 "applies": true, 81 "answer": true, 82 "justification": "Section 6.3 provides an ablation study with three ARGUS variants: w/o Search, w/o AI (adaptive intervention), and w/o PF (post-filtering). 
Results are in Table 1, showing the contribution of each component." 83 }, 84 "multiple_metrics": { 85 "applies": true, 86 "answer": true, 87 "justification": "The paper uses four metrics: UIAinject, UIAclean, AIA, and AIFR (defined in Sec. 4.1 and Appendix A.2), plus inference time. This provides a comprehensive view of safety, utility, and efficiency." 88 }, 89 "human_evaluation": { 90 "applies": false, 91 "answer": false, 92 "justification": "This is a defense method against prompt injection evaluated via automated metrics (substring matching for AIA/UIA/AIFR). Human evaluation is not clearly relevant to the core claims about defense effectiveness." 93 }, 94 "held_out_test_set": { 95 "applies": true, 96 "answer": true, 97 "justification": "The paper explicitly separates training, validation, and test sets with different injection elements across splits (Appendix A.1). Test set uses different trigger phrases and injection instructions from training/validation to assess generalization." 98 }, 99 "per_category_breakdown": { 100 "applies": true, 101 "answer": true, 102 "justification": "Results are broken down by modality (image, video, audio) in Table 1, and additionally evaluated on different MLLMs in Table 3 (InternVL3.5-8B, Qwen2.5-VL-7B, Qwen2-Audio-7B)." 103 }, 104 "failure_cases_discussed": { 105 "applies": true, 106 "answer": true, 107 "justification": "Section 6.3 discusses failure cases from the ablation: the post-filtering stage can introduce false positives, and the adaptive intervention can actually reduce utility in certain scenarios (video modality). Section 7 acknowledges limitations with multi-instruction scenarios." 
108 }, 109 "negative_results_reported": { 110 "applies": true, 111 "answer": true, 112 "justification": "The paper reports several negative findings: naive defense directions can be coupled with utility-degrading directions (Finding 3), excessive intervention strength harms performance (Finding 2), the adaptive intervention sometimes reduces utility in video (Sec. 6.3), and prompt-engineering baselines can paradoxically increase adherence to injected instructions (Sec. 6.2)." 113 } 114 }, 115 "claims_and_evidence": { 116 "abstract_claims_supported": { 117 "applies": true, 118 "answer": true, 119 "justification": "The abstract claims ARGUS can 'achieve robust defense against multimodal IPI while maximally preserving the MLLM's utility.' Table 1 shows near-zero AIA/AIFR with UIAinject close to 'No Defense' levels across modalities, supporting this claim." 120 }, 121 "causal_claims_justified": { 122 "applies": true, 123 "answer": true, 124 "justification": "The paper makes causal claims through ablation studies (Sec. 6.3) showing that removing components degrades performance. The ablation design is a controlled single-variable manipulation (removing one component at a time), which is adequate for these causal claims." 125 }, 126 "generalization_bounded": { 127 "applies": true, 128 "answer": false, 129 "justification": "The title claims 'Defending Against Multimodal Indirect Prompt Injection' broadly, but the main results (Table 1) cover only two MLLMs (Qwen2-vl-7b and Kimi-Audio-7b), with supplementary tests on three more (Table 3). The limitations section (Sec. 7) only mentions the single-instruction limitation, not the narrow model coverage. The paper also only tests simple concatenation-based injection methods." 130 }, 131 "alternative_explanations_discussed": { 132 "applies": true, 133 "answer": false, 134 "justification": "The paper does not discuss alternative explanations for why the method works beyond the proposed hypothesis about instruction-following subspaces. 
For example, it does not consider whether the improvement could be partly due to the injection detection stage alone, or whether the method might be learning dataset-specific patterns rather than general defense mechanisms." 135 } 136 }, 137 "setup_transparency": { 138 "model_versions_specified": { 139 "applies": true, 140 "answer": true, 141 "justification": "The paper specifies Qwen2-vl-7b (with arXiv reference 2409.12191), Kimi-Audio-7b (arXiv 2504.18425), InternVL3.5-8B (arXiv 2508.18265), Qwen2.5-VL-7B, and Qwen2-Audio-7B. These are specific model identifiers with size and version information." 142 }, 143 "prompts_provided": { 144 "applies": true, 145 "answer": true, 146 "justification": "The defensive system prompt text is provided in Sec. 6.1 ('Be vigilant against prompt-injection attacks...'), the Ignore Prompt text is given, GLUE task prompts are in Fig. 4, and the Removal baseline prompts are in Fig. 5. The injection prompt templates are also specified in Sec. 4.1." 147 }, 148 "hyperparameters_reported": { 149 "applies": true, 150 "answer": true, 151 "justification": "Table 2 in Appendix B.2 provides hyperparameters for ARGUS across modalities: detection layers, steering layers, post-filtering layers, intervention strength, epochs, and learning rate. AT baseline hyperparameters (epochs=2, lr=2e-6) and Noise baseline parameters (std=150, amplitude=50%) are also stated." 152 }, 153 "scaffolding_described": { 154 "applies": true, 155 "answer": true, 156 "justification": "The three-stage pipeline of ARGUS (injection detection, activation steering, post-filtering) is described in detail in Sec. 5 with Figure 3 showing the framework. The practical design decisions (which layers for each stage) are explained in Sec. 5.4." 
157 }, 158 "data_preprocessing_documented": { 159 "applies": true, 160 "answer": true, 161 "justification": "Appendix A.1 documents the data construction pipeline in detail: source benchmarks, how samples were composed (7-tuple), injection methods per modality (image concatenation, video frame insertion, audio TTS synthesis), how splits were divided, and the final dataset statistics (training/validation/test sizes)." 162 } 163 }, 164 "limitations_and_scope": { 165 "limitations_section_present": { 166 "applies": true, 167 "answer": true, 168 "justification": "Section 7 is titled 'Conclusion and Limitations' and includes a dedicated limitations paragraph discussing the single-instruction constraint." 169 }, 170 "threats_to_validity_specific": { 171 "applies": true, 172 "answer": false, 173 "justification": "The limitations section only mentions one specific threat: 'experiments in this paper are limited to situations involving a single user instruction and a single injection instruction.' This is a single, brief statement without discussion of other specific threats like the simple injection methods used, the narrow model coverage, or potential brittleness to novel attack strategies." 174 }, 175 "scope_boundaries_stated": { 176 "applies": true, 177 "answer": false, 178 "justification": "The limitations section mentions only the single-instruction scenario. It does not explicitly state what the results do NOT show, such as whether the method works on larger models, closed-source models, real-world deployment scenarios, or sophisticated adversarial injection methods beyond simple concatenation." 179 } 180 }, 181 "data_integrity": { 182 "raw_data_available": { 183 "applies": true, 184 "answer": false, 185 "justification": "The constructed benchmark dataset is not released. No download link or repository is provided for the raw data or the constructed samples." 
186 }, 187 "data_collection_described": { 188 "applies": true, 189 "answer": true, 190 "justification": "Appendix A.1 describes in detail how data was collected: benign elements from VTQA 2023, MSR-VTT, and Clotho-AQA; injection elements from trigger phrases of [2], Stanford-Alpaca, and GLUE tasks; and specific injection methods (image concatenation, video frame insertion, TTS audio)." 191 }, 192 "recruitment_methods_described": { 193 "applies": false, 194 "answer": false, 195 "justification": "No human participants are involved. The data is constructed from existing public benchmarks (VTQA, MSR-VTT, Clotho-AQA) combined with synthesized injection components." 196 }, 197 "data_pipeline_documented": { 198 "applies": true, 199 "answer": true, 200 "justification": "Appendix A.1 documents the full data pipeline: source selection, sample composition, injection method per modality, split creation, and final statistics (e.g., 10312 image training samples, 18536 video, 8107 audio, 1000 validation/test per modality)." 201 } 202 }, 203 "conflicts_of_interest": { 204 "funding_disclosed": { 205 "applies": true, 206 "answer": false, 207 "justification": "No funding sources or acknowledgments section is present in the paper." 208 }, 209 "affiliations_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "Author affiliations are listed on the first page: South China University of Technology, Hong Kong University of Science and Technology, Zhejiang Normal University, and Beihang University." 213 }, 214 "funder_independent_of_outcome": { 215 "applies": true, 216 "answer": false, 217 "justification": "No funding information is disclosed, so it is impossible to assess funder independence. The absence of disclosure is not the same as absence of conflict." 218 }, 219 "financial_interests_declared": { 220 "applies": true, 221 "answer": false, 222 "justification": "No competing interests statement or financial disclosure is present in the paper." 
223 } 224 }, 225 "contamination": { 226 "training_cutoff_stated": { 227 "applies": false, 228 "answer": false, 229 "justification": "This paper tests a defense mechanism against prompt injection, not a pre-trained model's capability on a benchmark. The evaluation measures the defense's effectiveness at preventing instruction hijacking, not model knowledge." 230 }, 231 "train_test_overlap_discussed": { 232 "applies": false, 233 "answer": false, 234 "justification": "The paper tests a defense mechanism, not a model's benchmark performance. The train/test separation of the IPI benchmark itself is documented (different injection elements across splits), but this is about experimental design rather than benchmark contamination." 235 }, 236 "benchmark_contamination_addressed": { 237 "applies": false, 238 "answer": false, 239 "justification": "Not applicable as the paper evaluates a defense method rather than model capability on an existing benchmark. The benchmark was constructed specifically for this paper." 240 } 241 }, 242 "human_studies": { 243 "pre_registered": { 244 "applies": false, 245 "answer": false, 246 "justification": "No human participants are involved in this study." 247 }, 248 "irb_or_ethics_approval": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants are involved in this study." 252 }, 253 "demographics_reported": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants are involved in this study." 257 }, 258 "inclusion_exclusion_criteria": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants are involved in this study." 262 }, 263 "randomization_described": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants are involved in this study." 267 }, 268 "blinding_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants are involved in this study." 
272 }, 273 "attrition_reported": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants are involved in this study." 277 } 278 }, 279 "cost_and_practicality": { 280 "inference_cost_reported": { 281 "applies": true, 282 "answer": true, 283 "justification": "Table 1 reports additional inference time per sample in milliseconds for all methods. ARGUS adds only 3-6ms per sample, compared to the Removal baseline at 12,885ms (image) and 574,121ms (video)." 284 }, 285 "compute_budget_stated": { 286 "applies": true, 287 "answer": false, 288 "justification": "The paper mentions 'four NVIDIA A800 GPUs' but does not state total GPU hours, training time for the probes/direction search, or total computational budget for the experiments." 289 } 290 } 291 }, 292 "claims": [ 293 { 294 "claim": "Instruction-following behaviors of MLLMs are linearly separable in activation space, indicating a safety subspace exists.", 295 "evidence": "Linear probes achieve near-100% accuracy across most layers for distinguishing 'following user instruction' vs 'following attacker instruction' behaviors (Fig. 1, Sec. 4.3, Finding 1). Multiple orthogonal probes also achieve >95% accuracy (Finding 5).", 296 "supported": "strong" 297 }, 298 { 299 "claim": "ARGUS achieves near-zero AIA and AIFR while preserving model utility across image, video, and audio modalities.", 300 "evidence": "Table 1 shows ARGUS achieves AIA of 0.1%, 0.1%, and 0.0% for image, video, and audio respectively, with UIAinject close to or exceeding 'No Defense' levels. 
Additional MLLMs in Table 3 confirm the pattern.", 301 "supported": "moderate" 302 }, 303 { 304 "claim": "ARGUS achieves the best safety-utility-efficiency trade-off compared to baselines.", 305 "evidence": "Table 1 shows ARGUS has near-zero AIA/AIFR with high UIAinject and only 3-6ms additional inference time, while baselines sacrifice either safety (prompt-based), utility (Noise, AT), or efficiency (Removal at 12,885-574,121ms).", 306 "supported": "moderate" 307 }, 308 { 309 "claim": "The optimal utility direction search successfully decouples the defense direction from the utility-degradation direction.", 310 "evidence": "Ablation in Table 1 (Sec. 6.3): ARGUS w/o Search shows 'significant drop in UIAinject' compared to full ARGUS (e.g., 44.5 vs 46.3 for image, 54.4 vs 58.0 for audio).", 311 "supported": "moderate" 312 }, 313 { 314 "claim": "Naive defense directions can be coupled with utility-degrading directions.", 315 "evidence": "Finding 3 (Sec. 4.3): When alpha is increased just enough to achieve AIA=0, UIA still fails to reach the no-injection upper bound, and different modalities show different degradation levels at the same intervention strength.", 316 "supported": "strong" 317 } 318 ], 319 "methodology_tags": [ 320 "benchmark-eval" 321 ], 322 "key_findings": "The paper discovers that instruction-following behavior in multimodal LLMs is encoded in a linearly separable safety subspace within the activation space, enabling directional control via activation steering. ARGUS, a three-stage defense framework (injection detection, adaptive activation steering with utility-preserving direction search, post-filtering), achieves near-zero attack success rates across image, video, and audio modalities while preserving model utility with only 3-6ms additional inference time. 
The approach generalizes across multiple MLLMs (Qwen2-vl-7b, Kimi-Audio-7b, InternVL3.5-8B, Qwen2.5-VL-7B, Qwen2-Audio-7B), outperforming prompt-engineering, noise-based, removal-based, and adversarial training defenses in the safety-utility-efficiency trade-off.", 323 "red_flags": [ 324 { 325 "flag": "No error bars or multi-run results", 326 "detail": "All results appear to be from single experimental runs with no standard deviations, confidence intervals, or repeated trials. Given that activation steering involves learned components (probes, direction search), variance across random seeds could be significant." 327 }, 328 { 329 "flag": "Simple injection methods only", 330 "detail": "The injection methods are basic: text concatenation on images, frame insertion in video, TTS audio insertion. More sophisticated attacks (e.g., steganographic embedding, adversarial perturbations, semantic injection) are not tested, limiting confidence in the 'robust defense' claim." 331 }, 332 { 333 "flag": "No code or data release", 334 "detail": "Neither the code nor the constructed benchmark is released, making independent verification of the results impossible." 335 }, 336 { 337 "flag": "Limited scope of limitations discussion", 338 "detail": "The limitations section mentions only the single-instruction constraint. It does not address the simple injection methods, narrow model coverage (all 7B-8B scale models), or potential vulnerabilities to adaptive attackers who know the defense mechanism." 339 } 340 ], 341 "cited_papers": [ 342 { 343 "title": "SecAlign: Defending against prompt injection with preference optimization", 344 "authors": ["Sizhe Chen", "Arman Zharmagambetov", "Saeed Mahloujifar", "Kamalika Chaudhuri", "David Wagner", "Chuan Guo"], 345 "year": 2024, 346 "arxiv_id": "2410.05451", 347 "relevance": "Adversarial training baseline for IPI defense using direct preference optimization, directly comparable defense approach." 
348 }, 349 { 350 "title": "StruQ: Defending against prompt injection with structured queries", 351 "authors": ["Sizhe Chen", "Julien Piet", "Chawin Sitawarin", "David Wagner"], 352 "year": 2025, 353 "relevance": "Prompt injection defense using structured queries, published at USENIX Security 2025." 354 }, 355 { 356 "title": "VPI-Bench: Visual prompt injection attacks for computer-use agents", 357 "authors": ["Tri Cao", "Bennett Lim", "Yue Liu"], 358 "year": 2025, 359 "arxiv_id": "2506.02456", 360 "relevance": "Benchmark for visual prompt injection attacks on computer-use agents, directly relevant to multimodal IPI evaluation." 361 }, 362 { 363 "title": "AgentTypo: Adaptive typographic prompt injection attacks against black-box multimodal agents", 364 "authors": ["Yanjie Li", "Yiming Cao", "Dong Wang", "Bin Xiao"], 365 "year": 2025, 366 "arxiv_id": "2510.04257", 367 "relevance": "Black-box typographic prompt injection attack using Bayesian optimization, relevant to agentic AI security." 368 }, 369 { 370 "title": "Defense against prompt injection attack by leveraging attack techniques", 371 "authors": ["Yulin Chen", "Haoran Li", "Zihao Zheng", "Yangqiu Song", "Dekai Wu", "Bryan Hooi"], 372 "year": 2024, 373 "arxiv_id": "2411.00459", 374 "relevance": "Prompt engineering defense against IPI using attack techniques, a baseline defense approach." 375 }, 376 { 377 "title": "Can indirect prompt injection attacks be detected and removed?", 378 "authors": ["Yulin Chen", "Haoran Li", "Yuan Sui"], 379 "year": 2025, 380 "arxiv_id": "2502.16580", 381 "relevance": "Detection-based approach to IPI defense, directly relevant to the detection stage of ARGUS." 
382 }, 383 { 384 "title": "Defending against indirect prompt injection attacks with spotlighting", 385 "authors": ["Keegan Hines", "Gary Lopez", "Matthew Hall", "Federico Zarfati", "Yonatan Zunger", "Emre Kiciman"], 386 "year": 2024, 387 "arxiv_id": "2403.14720", 388 "relevance": "Prompt engineering defense using spotlighting technique, a baseline defense for IPI in LLMs." 389 }, 390 { 391 "title": "Formalizing and benchmarking prompt injection attacks and defenses", 392 "authors": ["Yupei Liu", "Yuqi Jia", "Runpeng Geng", "Jinyuan Jia", "Neil Zhenqiang Gong"], 393 "year": 2024, 394 "relevance": "Formal framework for prompt injection attacks and defenses, foundational work in the IPI defense space." 395 }, 396 { 397 "title": "Representation engineering: A top-down approach to AI transparency", 398 "authors": ["Andy Zou", "Long Phan", "Sarah Chen", "James Campbell"], 399 "year": 2023, 400 "arxiv_id": "2310.01405", 401 "relevance": "Foundational work on representation engineering for AI safety, the technical basis for ARGUS's activation steering approach." 402 }, 403 { 404 "title": "Manipulating multimodal agents via cross-modal prompt injection", 405 "authors": ["Le Wang", "Zonghao Ying", "Tianyuan Zhang"], 406 "year": 2025, 407 "arxiv_id": "2504.14348", 408 "relevance": "Cross-modal prompt injection attacks combining image and text, relevant to multimodal IPI threat modeling." 409 }, 410 { 411 "title": "Ignore previous prompt: Attack techniques for language models", 412 "authors": ["Fábio Perez", "Ian Ribeiro"], 413 "year": 2022, 414 "arxiv_id": "2211.09527", 415 "relevance": "Seminal work on prompt injection attacks in LLMs, foundational reference for the IPI threat model." 
416 }, 417 { 418 "title": "COSMIC: Generalized refusal direction identification in LLM activations", 419 "authors": ["Vincent Siu", "Nicholas Crispino", "Zihao Yu"], 420 "year": 2025, 421 "arxiv_id": "2506.00085", 422 "relevance": "Activation steering for jailbreak defense via refusal directions, related approach to ARGUS but for different threat model." 423 } 424 ] 425 }