scan.json (31793B)
1 { 2 "paper": { 3 "title": "Manipulating Multimodal Agents via Cross-Modal Prompt Injection", 4 "authors": [ 5 "Le Wang", 6 "Zonghao Ying", 7 "Tianyuan Zhang", 8 "Siyuan Liang", 9 "Shengshan Hu", 10 "Mingchuan Zhang", 11 "Aishan Liu", 12 "Xianglong Liu" 13 ], 14 "year": 2025, 15 "venue": "ACM Multimedia", 16 "arxiv_id": "2504.14348", 17 "doi": "10.1145/3746027.3755211" 18 }, 19 "scan_version": 3, 20 "active_modules": ["experimental_rigor", "data_leakage"], 21 "methodology_tags": ["benchmark-eval", "case-study"], 22 "key_findings": "CrossInject demonstrates that coordinated cross-modal prompt injection (simultaneous visual and textual manipulation) significantly outperforms unimodal attacks against multimodal agents, with visual-only attacks (JIP) achieving 0% ASR while CrossInject achieves up to 97%. Ablation studies show both Visual Latent Alignment and Textual Guidance Enhancement contribute substantially, with average drops of 18.7% and 24.8% respectively when removed. The attack transfers to a physical autonomous driving robot (9/10 success rate), and tested defenses (sandwich prompting, Gaussian blur) reduce ASR by at most ~7% on average.", 23 "checklist": { 24 "artifacts": { 25 "code_released": { 26 "applies": true, 27 "answer": false, 28 "justification": "The paper states 'All code is implemented in PyTorch' (Sec. 5.1) but provides no repository URL, GitHub link, or code archive. No code release is mentioned anywhere in the paper." 29 }, 30 "data_released": { 31 "applies": true, 32 "answer": true, 33 "justification": "The evaluation uses publicly available standard datasets: CoEDIT (text editing) and SST2 (sentiment classification), both referenced with citations. However, the specific 100-sample subsets and constructed injection datasets are not released separately." 
34 }, 35 "environment_specified": { 36 "applies": true, 37 "answer": false, 38 "justification": "The paper mentions 'NVIDIA A800-SXM4-80GB GPU cluster' and PyTorch, and names specific models (Stable-Diffusion-3.5-Large, Llama-3.1-8B-Instruct), but provides no requirements.txt, Dockerfile, or detailed dependency/version listing sufficient to recreate the environment." 39 }, 40 "reproduction_instructions": { 41 "applies": true, 42 "answer": false, 43 "justification": "No reproduction instructions, README, or step-by-step guide are provided. The methodology section describes the approach conceptually but does not provide runnable instructions." 44 } 45 }, 46 "statistical_methodology": { 47 "confidence_intervals_or_error_bars": { 48 "applies": true, 49 "answer": false, 50 "justification": "Despite stating 'each experiment was repeated three times, and the average results are reported' (Sec. 5.1), no confidence intervals, error bars, or ± notation appear in any table or figure." 51 }, 52 "significance_tests": { 53 "applies": true, 54 "answer": false, 55 "justification": "The paper claims CrossInject 'outperforms' baselines based solely on comparing point estimates. No statistical significance tests (t-tests, bootstrap, etc.) are reported for any comparison." 56 }, 57 "effect_sizes_reported": { 58 "applies": true, 59 "answer": true, 60 "justification": "The paper reports specific percentage-point improvements with baseline context, e.g., 'average performance gain up to 32.7% over all baseline approaches, with maximum ASR improvement reaching 71.7%' (Sec. 5.2). Absolute ASR values are provided for all methods in Tab. 1." 61 }, 62 "sample_size_justified": { 63 "applies": true, 64 "answer": false, 65 "justification": "100 entries were randomly sampled from each dataset (Sec. 5.1) with no justification for this sample size, no power analysis, and no discussion of whether 100 is sufficient for the claims made." 
66 }, 67 "variance_reported": { 68 "applies": true, 69 "answer": false, 70 "justification": "Experiments were repeated 3 times and averaged, but no standard deviation, IQR, or any spread measure is reported. The reader cannot assess result stability across runs." 71 } 72 }, 73 "evaluation_design": { 74 "baselines_included": { 75 "applies": true, 76 "answer": true, 77 "justification": "Three baselines are compared: Naive (direct instruction), JIP (visual-only prompt injection), and FB (textual delimiter-based injection). All are described in Sec. 5.1." 78 }, 79 "baselines_contemporary": { 80 "applies": true, 81 "answer": true, 82 "justification": "JIP [21] is from ICLR 2024 and FB [58] is from USENIX Security 2024. Both are recent and representative of the state of the art in their respective modalities." 83 }, 84 "ablation_study": { 85 "applies": true, 86 "answer": true, 87 "justification": "Extensive ablations in Sec. 5.3 cover visual alignment (Tab. 2), textual enhancement (Tab. 3), perturbation budget (Fig. 4a), optimization iterations (Fig. 4b), and surrogate LLM choice (Tab. 4)." 88 }, 89 "multiple_metrics": { 90 "applies": true, 91 "answer": true, 92 "justification": "Two metrics are used: Attack Success Rate (ASR) measuring attack effectiveness, and Performance under No Attack (PNA) measuring agent capability without attacks (Sec. 5.1)." 93 }, 94 "human_evaluation": { 95 "applies": true, 96 "answer": false, 97 "justification": "All evaluation uses the automated LLM-as-a-Judge approach (Qwen-Max). No human evaluation of attack success, output quality, or adversarial image imperceptibility is conducted." 98 }, 99 "held_out_test_set": { 100 "applies": true, 101 "answer": false, 102 "justification": "The 100 samples per dataset are used directly for evaluation. There is no explicit separation of development and test splits for the evaluation data. While attack optimization uses surrogate models (not victim models), the data itself is not split." 
103 }, 104 "per_category_breakdown": { 105 "applies": true, 106 "answer": true, 107 "justification": "Tab. 1 provides full breakdowns by agent role (RM/PG), VLM planner (Qwen2-VL/Phi-3.5-vision), injected task (Text Editing/Sentiment Analysis), and attack surface (local document/online webpage)." 108 }, 109 "failure_cases_discussed": { 110 "applies": true, 111 "answer": true, 112 "justification": "The paper discusses: visual-only attacks (JIP) achieving 0% ASR, perturbation budget=32 causing regression due to overfitting surrogate encoders (Sec. 5.3), and online webpages being harder to exploit (10.8% lower ASR, Sec. 5.2)." 113 }, 114 "negative_results_reported": { 115 "applies": true, 116 "answer": true, 117 "justification": "Negative results include: JIP's 0% ASR showing visual-only attacks fail, budget=32 regression on RM/Qwen2-VL (Sec. 5.3), random noise yielding no improvement, and random textual commands degrading performance in 3 cases (Tab. 3)." 118 } 119 }, 120 "claims_and_evidence": { 121 "abstract_claims_supported": { 122 "applies": true, 123 "answer": false, 124 "justification": "The abstract claims 'at least a +30.1% increase in attack success rates across diverse tasks,' but Sec. 5.2 reports +32.7% average for local documents and +27.5% for online webpages. The 27.5% figure is below the claimed 30.1% minimum. The conclusion further contradicts this, stating '+26.4% increase.' The headline claim is inconsistent across sections." 125 }, 126 "causal_claims_justified": { 127 "applies": true, 128 "answer": true, 129 "justification": "Causal claims (e.g., 'removing visual adversarial alignment results in a substantial degradation') are supported by controlled ablation studies that manipulate one variable at a time (Tab. 2, Tab. 3). The single-variable manipulation design is adequate for these claims." 
130 }, 131 "generalization_bounded": { 132 "applies": true, 133 "answer": false, 134 "justification": "The title claims broad applicability to 'Multimodal Agents' but evaluation uses only 2 custom chatbot agents with 2 small VLMs (Qwen2-VL 2B, Phi-3.5-vision) and 1 toy physical robot. No evaluation on production-grade models (GPT-4V, Gemini) or real-world deployed agents. Generalization claims are not bounded to the tested setting." 135 }, 136 "alternative_explanations_discussed": { 137 "applies": true, 138 "answer": false, 139 "justification": "The paper does not discuss alternative explanations for attack success, such as whether the small model size (2B parameters) makes agents more susceptible, whether the custom toy agents have unusually weak safety guardrails, or whether LLM-as-a-Judge introduces systematic bias in ASR measurement." 140 }, 141 "proxy_outcome_distinction": { 142 "applies": true, 143 "answer": false, 144 "justification": "ASR is measured via LLM-as-a-Judge (Qwen-Max) as a proxy for actual attack success, but the paper does not discuss the validity of this proxy, potential biases of the judge model, or how well automated judgment aligns with human assessment of successful attacks." 145 } 146 }, 147 "setup_transparency": { 148 "model_versions_specified": { 149 "applies": true, 150 "answer": true, 151 "justification": "Specific model versions are provided: 'Qwen2-VL with 2B parameters,' 'Phi-3.5-vision,' 'Llama-3.1-8B-Instruct,' 'Stable-Diffusion-3.5-Large,' vision encoders 'ViT-H-14, ViT-L-14, ViT-B-16, ViT-SO400M-14-SigLIP-384,' and 'Qwen-Max' for judging (Sec. 5.1). These are specific enough to identify unique model checkpoints." 152 }, 153 "prompts_provided": { 154 "applies": true, 155 "answer": false, 156 "justification": "The agent system prompts are described functionally ('RecipeMaster generates recipes...') but not provided as actual text. 
The meta-prompting template T(·,·), constructed defensive prompts, and optimized malicious commands (C') are described conceptually but their actual text is not shown." 157 }, 158 "hyperparameters_reported": { 159 "applies": true, 160 "answer": true, 161 "justification": "Extensive hyperparameters reported in Sec. 5.1: visual perturbation budget ε=16, visual iterations=200, GCG top-k=256, batch_size=512, textual iterations=100, max_new_tokens=1024. Ablation studies systematically vary these." 162 }, 163 "scaffolding_described": { 164 "applies": true, 165 "answer": false, 166 "justification": "The agents are described as having 'multiple steps, including multimodal data processing, API calls, logical reasoning, and response generation' (Sec. 5.1) but the actual scaffolding details — how external data is retrieved, how multi-step workflows are orchestrated, what APIs are called — are not provided." 167 }, 168 "data_preprocessing_documented": { 169 "applies": true, 170 "answer": true, 171 "justification": "Data preparation is documented: 100 entries randomly sampled per dataset (Sec. 5.1), malicious instructions embedded in local documents or webpage HTML with HTML5 tags and whitespace characters (Sec. 5.1), and the visual perturbation pipeline is described in Sec. 4.1." 172 } 173 }, 174 "limitations_and_scope": { 175 "limitations_section_present": { 176 "applies": true, 177 "answer": true, 178 "justification": "A 'Limitations' subsection appears at the end of Section 8 (Conclusion and Future Work), identifying two areas: designing more effective defenses for multimodal settings and extending attack applicability to more complex architectures." 179 }, 180 "threats_to_validity_specific": { 181 "applies": true, 182 "answer": false, 183 "justification": "The limitations are future work directions rather than specific threats to validity of the current study. 
No mention of threats such as the small model sizes tested, the simplicity of the custom agents, the reliability of LLM-as-a-Judge, or the limited number of trials in the physical experiment." 184 }, 185 "scope_boundaries_stated": { 186 "applies": true, 187 "answer": false, 188 "justification": "The paper does not explicitly state what the results do NOT show. It does not bound claims to the specific models, agent types, or scale tested. The mention of 'extending to real-world agents with more complex architecture' implicitly acknowledges scope limits but does not state specific exclusions." 189 } 190 }, 191 "data_integrity": { 192 "raw_data_available": { 193 "applies": true, 194 "answer": false, 195 "justification": "No raw experimental data (individual trial results, generated adversarial images, optimized commands) is made available. Only aggregated ASR percentages are reported in tables." 196 }, 197 "data_collection_described": { 198 "applies": true, 199 "answer": true, 200 "justification": "Data sources are clearly identified: CoEDIT for text editing and SST2 for sentiment classification, with 100 entries randomly sampled from each (Sec. 5.1). The physical world setup with LIMO vehicle and manually constructed track is described in Sec. 6." 201 }, 202 "recruitment_methods_described": { 203 "applies": false, 204 "answer": false, 205 "justification": "No human participants in this study. Data comes from standard NLP benchmarks (CoEDIT, SST2)." 206 }, 207 "data_pipeline_documented": { 208 "applies": true, 209 "answer": true, 210 "justification": "The pipeline from data collection to results is documented: sample 100 entries → embed malicious instructions in local documents or webpages → apply visual perturbation and textual optimization → feed to victim agent → evaluate via LLM-as-a-Judge → repeat 3 times and average." 
211 } 212 }, 213 "conflicts_of_interest": { 214 "funding_disclosed": { 215 "applies": true, 216 "answer": false, 217 "justification": "No funding sources, grants, or sponsorships are mentioned anywhere in the paper. There is no acknowledgments section disclosing funding." 218 }, 219 "affiliations_disclosed": { 220 "applies": true, 221 "answer": true, 222 "justification": "All author affiliations are clearly listed: Beihang University, National University of Singapore, Huazhong University of Science and Technology, and Henan University of Science and Technology. Authors are academic researchers not evaluating their own commercial products." 223 }, 224 "funder_independent_of_outcome": { 225 "applies": false, 226 "answer": false, 227 "justification": "No funding is disclosed, making independence assessment impossible. Appears to be unfunded academic research." 228 }, 229 "financial_interests_declared": { 230 "applies": true, 231 "answer": false, 232 "justification": "No competing interests or financial interests statement appears in the paper." 233 } 234 }, 235 "contamination": { 236 "training_cutoff_stated": { 237 "applies": false, 238 "answer": false, 239 "justification": "This paper tests a prompt injection attack method against multimodal agents, not model capability on benchmarks. Whether the models saw SST2 or CoEDIT during training does not affect whether the attack can hijack the agent. This is a red-teaming study testing attacks rather than model knowledge." 240 }, 241 "train_test_overlap_discussed": { 242 "applies": false, 243 "answer": false, 244 "justification": "Not applicable — the study tests an attack framework's ability to hijack agent behavior, not model knowledge on benchmark tasks." 
245 }, 246 "benchmark_contamination_addressed": { 247 "applies": false, 248 "answer": false, 249 "justification": "Not applicable — this is an adversarial attack study where the success metric is whether the agent follows injected instructions, not whether it has memorized benchmark answers." 250 } 251 }, 252 "human_studies": { 253 "pre_registered": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in this study. All experiments involve automated attacks against AI agents." 257 }, 258 "irb_or_ethics_approval": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants. The study involves adversarial attacks on AI systems only." 262 }, 263 "demographics_reported": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in this study." 267 }, 268 "inclusion_exclusion_criteria": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in this study." 272 }, 273 "randomization_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in this study." 277 }, 278 "blinding_described": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants in this study." 282 }, 283 "attrition_reported": { 284 "applies": false, 285 "answer": false, 286 "justification": "No human participants in this study." 287 } 288 }, 289 "cost_and_practicality": { 290 "inference_cost_reported": { 291 "applies": true, 292 "answer": false, 293 "justification": "No inference cost, wall-clock time per attack, or token consumption is reported. The attack involves 200 iterations of visual optimization and 100 iterations of GCG textual optimization per example, but the computational cost of this is not quantified." 
294 }, 295 "compute_budget_stated": { 296 "applies": true, 297 "answer": false, 298 "justification": "The paper mentions 'NVIDIA A800-SXM4-80GB GPU cluster' but provides no total GPU hours, training time, or compute budget for the experiments." 299 } 300 }, 301 "experimental_rigor": { 302 "seed_sensitivity_reported": { 303 "applies": true, 304 "answer": false, 305 "justification": "Experiments are repeated 3 times and averaged, but no across-run variation (std dev, range) is reported. The reader cannot assess seed sensitivity." 306 }, 307 "number_of_runs_stated": { 308 "applies": true, 309 "answer": true, 310 "justification": "Sec. 5.1 explicitly states: 'each experiment was repeated three times, and the average results are reported.'" 311 }, 312 "hyperparameter_search_budget": { 313 "applies": true, 314 "answer": false, 315 "justification": "While ablation studies test specific hyperparameter values (ε from 2 to 32, iterations from 50 to 150), no formal hyperparameter search budget or total configurations tried is reported. The selection of default values appears ad hoc." 316 }, 317 "best_config_selection_justified": { 318 "applies": true, 319 "answer": true, 320 "justification": "Default configurations (ε=16, 100 iterations) are justified via ablation studies: ε=16 offers 'balance effectiveness and stealthiness' with diminishing returns beyond (Sec. 5.3), and 100 iterations reaches 'near-optimal performance' before convergence (Sec. 5.3)." 321 }, 322 "multiple_comparison_correction": { 323 "applies": true, 324 "answer": false, 325 "justification": "Numerous comparisons are made across 4 agent configurations, 2 attack surfaces, 2 tasks, and multiple ablation conditions without any correction for multiple comparisons." 326 }, 327 "self_comparison_bias_addressed": { 328 "applies": true, 329 "answer": false, 330 "justification": "The authors implement all baselines and their own method but do not acknowledge self-comparison bias. 
Baseline implementations are not validated against original authors' results." 331 }, 332 "compute_budget_vs_performance": { 333 "applies": true, 334 "answer": false, 335 "justification": "CrossInject requires 200 visual optimization iterations + 100 GCG textual iterations per attack, while baselines like Naive require no optimization. This compute disparity is not discussed or controlled for." 336 }, 337 "benchmark_construct_validity": { 338 "applies": true, 339 "answer": false, 340 "justification": "ASR is measured by LLM-as-a-Judge (Qwen-Max) without discussing whether this judge reliably measures true attack success. No validation of judge accuracy against human annotations, and no discussion of potential judge biases." 341 }, 342 "scaffold_confound_addressed": { 343 "applies": true, 344 "answer": true, 345 "justification": "All attacks (CrossInject, Naive, JIP, FB) are tested on the same agent configurations with identical scaffolding, controlling for the scaffold confound. Differences are attributed to the attack method, not the scaffold." 346 } 347 }, 348 "data_leakage": { 349 "temporal_leakage_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "No discussion of whether the VLMs' training data includes CoEDIT or SST2 examples, or whether the models' ability to perform injected tasks is influenced by having seen these datasets during training." 353 }, 354 "feature_leakage_addressed": { 355 "applies": true, 356 "answer": false, 357 "justification": "No discussion of whether the evaluation setup leaks information. For example, the malicious instruction format itself may signal task identity to the model." 358 }, 359 "non_independence_addressed": { 360 "applies": true, 361 "answer": false, 362 "justification": "The 100 samples per dataset are randomly drawn but independence is not verified. No discussion of whether samples share structural properties that could inflate results." 
363 }, 364 "leakage_detection_method": { 365 "applies": true, 366 "answer": false, 367 "justification": "No leakage detection or prevention methods are employed." 368 } 369 } 370 }, 371 "claims": [ 372 { 373 "claim": "CrossInject achieves at least +30.1% increase in attack success rates over state-of-the-art attacks across diverse tasks.", 374 "evidence": "Tab. 1 shows +32.7% average ASR improvement for local document attacks and +27.5% for online webpage attacks (Sec. 5.2). However, the conclusion states '+26.4% increase,' contradicting the abstract's '+30.1% minimum.'", 375 "supported": "weak" 376 }, 377 { 378 "claim": "Visual-only prompt injection (JIP) achieves 0% ASR against state-of-the-art multimodal agents.", 379 "evidence": "Tab. 1 shows JIP achieves exactly 0.0% ASR across all 16 tested scenarios (2 agents × 2 VLMs × 2 tasks × 2 attack surfaces).", 380 "supported": "strong" 381 }, 382 { 383 "claim": "Visual Latent Alignment contributes an average 18.7% improvement to attack performance.", 384 "evidence": "Tab. 2 ablation study comparing full method vs. without visual alignment shows ASR drops of 28.0%, 16.7%, 6.0%, and 22.7% across four agent configurations (Sec. 5.3).", 385 "supported": "moderate" 386 }, 387 { 388 "claim": "Textual Guidance Enhancement contributes an average 24.8% improvement to attack performance.", 389 "evidence": "Tab. 3 ablation study shows ASR drops of 47.0%, 9.0%, 6.0%, and 14.0% when removing the textual enhancement component.", 390 "supported": "moderate" 391 }, 392 { 393 "claim": "CrossInject is effective against real-world autonomous driving agents, achieving 9/10 attack success rate.", 394 "evidence": "Sec. 
6 describes a physical experiment with LIMO robot, where CrossInject caused the vehicle to collide with a stop sign in 9 out of 10 trials, compared to 4/10 for Naive attack.", 395 "supported": "weak" 396 }, 397 { 398 "claim": "Existing defenses (sandwich prompting, Gaussian blur) are ineffective against CrossInject.", 399 "evidence": "Tab. 5 shows textual defense reduces ASR by 6.7% on average, visual defense by 2.8%, and combined defense is not additive.", 400 "supported": "moderate" 401 } 402 ], 403 "red_flags": [ 404 { 405 "flag": "Inconsistent headline claim across sections", 406 "detail": "The abstract claims 'at least +30.1% increase in ASR' but the conclusion states '+26.4% increase.' The online webpage results show only +27.5% average, below the claimed 30.1% minimum. This inconsistency suggests selective reporting of the headline number." 407 }, 408 { 409 "flag": "No variance reported despite repeated runs", 410 "detail": "Experiments are repeated 3 times and averaged, but no standard deviation, error bars, or any spread measure is reported. This conceals result stability — the reader cannot assess whether the reported averages are reliable or driven by high-variance outlier runs." 411 }, 412 { 413 "flag": "Tiny physical experiment sample", 414 "detail": "The real-world autonomous driving demonstration (Sec. 6) consists of only 10 trials on a single setup with a toy robot (LIMO) on a manually constructed track. This is insufficient to support claims about 'effectiveness in real-world multimodal autonomous agents.'" 415 }, 416 { 417 "flag": "Unvalidated LLM-as-a-Judge evaluation", 418 "detail": "All ASR measurements rely on Qwen-Max as an automated judge, but no calibration study validates this judge's accuracy against human assessments. The judge model itself could be susceptible to the adversarial patterns being tested, introducing systematic measurement bias." 
419 }, 420 { 421 "flag": "Only small-scale models tested", 422 "detail": "Both victim VLMs are small (Qwen2-VL-2B, Phi-3.5-vision ~4B). No evaluation on larger, more capable models (GPT-4V, Gemini, Claude) that likely have stronger safety guardrails. The title's broad framing ('Multimodal Agents') overstates the tested scope." 423 }, 424 { 425 "flag": "Custom toy agents may not represent real deployments", 426 "detail": "RecipeMaster and PoetryGenius are custom chatbot agents built for this study with unspecified system prompts and scaffolding. Their security posture may not reflect production-grade agent systems with hardened prompt architectures." 427 } 428 ], 429 "cited_papers": [ 430 { 431 "title": "Not what you've signed up for: Compromising real-world llm-integrated applications with indirect prompt injection", 432 "authors": ["S. Abdelnabi", "K. Greshake", "S. Mishra", "C. Endres", "T. Holz", "M. Fritz"], 433 "year": 2023, 434 "relevance": "Foundational work on indirect prompt injection against LLM-integrated applications via external data sources." 435 }, 436 { 437 "title": "Formalizing and Benchmarking Prompt Injection Attacks and Defenses", 438 "authors": ["Y. Liu", "Y. Jia", "R. Geng", "J. Jia", "N.Z. Gong"], 439 "year": 2024, 440 "relevance": "Formal framework for prompt injection attacks and defenses; used as the FB baseline in this paper." 441 }, 442 { 443 "title": "Prompt Injection attack against LLM-integrated Applications", 444 "authors": ["Y. Liu", "G. Deng", "Y. Li", "K. Wang", "T. Zhang", "Y. Liu", "H. Wang", "Y. Zheng", "Y. Liu"], 445 "year": 2023, 446 "arxiv_id": "2306.05499", 447 "relevance": "Early work demonstrating delimiter-based prompt injection attacks against LLM-integrated applications." 448 }, 449 { 450 "title": "Universal and Transferable Adversarial Attacks on Aligned Language Models", 451 "authors": ["A. Zou", "Z. Wang", "N. Carlini", "M. Nasr", "J.Z. Kolter", "M. 
Fredrikson"], 452 "year": 2023, 453 "arxiv_id": "2307.15043", 454 "relevance": "GCG algorithm for adversarial suffix optimization, used as the core textual attack optimization method in CrossInject." 455 }, 456 { 457 "title": "Jailbreak in pieces: Compositional adversarial attacks on multi-modal language models", 458 "authors": ["E. Shayegani", "Y. Dong", "N.B. Abu-Ghazaleh"], 459 "year": 2024, 460 "relevance": "Visual modality-based jailbreak attacks on multimodal LLMs; used as the JIP baseline in this paper." 461 }, 462 { 463 "title": "Dissecting Adversarial Robustness of Multimodal LM Agents", 464 "authors": ["C.H. Wu", "R. Shah", "J.Y. Koh", "R. Salakhutdinov", "D. Fried", "A. Raghunathan"], 465 "year": 2024, 466 "arxiv_id": "2406.12814", 467 "relevance": "Analysis of adversarial robustness of multimodal LLM agents, directly related to security of multimodal agent systems." 468 }, 469 { 470 "title": "Benchmarking and Defending Against Indirect Prompt Injection Attacks on Large Language Models", 471 "authors": ["J. Yi", "Y. Xie", "B. Zhu", "E. Kiciman", "G. Sun", "X. Xie", "F. Wu"], 472 "year": 2025, 473 "arxiv_id": "2312.14197", 474 "relevance": "Benchmark for indirect prompt injection defense evaluation on LLMs." 475 }, 476 { 477 "title": "Injecagent: Benchmarking indirect prompt injections in tool-integrated large language model agents", 478 "authors": ["Q. Zhan", "Z. Liang", "Z. Ying", "D. Kang"], 479 "year": 2024, 480 "arxiv_id": "2403.02691", 481 "relevance": "Benchmark for indirect prompt injection in tool-integrated LLM agents." 482 }, 483 { 484 "title": "Abusing Images and Sounds for Indirect Instruction Injection in Multi-Modal LLMs", 485 "authors": ["E. Bagdasaryan", "T.-Y. Hsieh", "B. Nassi", "V. Shmatikov"], 486 "year": 2023, 487 "arxiv_id": "2307.10490", 488 "relevance": "Demonstrates indirect instruction injection via visual and audio modalities in multimodal LLMs under white-box settings." 
489 }, 490 { 491 "title": "The Instruction Hierarchy: Training LLMs to Prioritize Privileged Instructions", 492 "authors": ["E. Wallace", "K. Xiao", "R. Leike", "L. Weng", "J. Heidecke", "A. Beutel"], 493 "year": 2024, 494 "arxiv_id": "2406.09246", 495 "relevance": "Proposes instruction hierarchy as a defense mechanism for LLMs against prompt injection." 496 }, 497 { 498 "title": "Automatic and Universal Prompt Injection Attacks against Large Language Models", 499 "authors": ["X. Liu", "Z. Yu", "Y. Zhang", "N. Zhang", "C. Xiao"], 500 "year": 2024, 501 "arxiv_id": "2403.04957", 502 "relevance": "GCG-based automatic prompt injection attacks against LLMs using adversarial suffixes." 503 }, 504 { 505 "title": "Optimization-based Prompt Injection Attack to LLM-as-a-Judge", 506 "authors": ["J. Shi", "Z. Yuan", "Y. Liu", "Y. Huang", "P. Zhou", "L. Sun", "N.Z. Gong"], 507 "year": 2024, 508 "relevance": "Optimization-based attacks specifically targeting LLM-as-a-Judge systems, relevant to evaluation reliability." 509 } 510 ], 511 "engagement_factors": { 512 "practical_relevance": { 513 "score": 2, 514 "justification": "Demonstrates a concrete attack vector against multimodal agents that security teams should be aware of, though the attack requires significant compute and surrogate model access." 515 }, 516 "surprise_contrarian": { 517 "score": 1, 518 "justification": "The finding that visual-only injection achieves 0% ASR is somewhat surprising; the general cross-modal vulnerability is largely expected given known prompt injection risks." 519 }, 520 "fear_safety": { 521 "score": 3, 522 "justification": "Demonstrates hijacking of an autonomous driving agent to ignore stop signs and collide with obstacles, directly raising safety concerns for physical-world AI deployments." 523 }, 524 "drama_conflict": { 525 "score": 1, 526 "justification": "No major controversy or direct challenge to specific companies or products; standard adversarial ML research framing."
527 }, 528 "demo_ability": { 529 "score": 0, 530 "justification": "No code repository, demo, or tool is released; the attack cannot be tried without reimplementation." 531 }, 532 "brand_recognition": { 533 "score": 0, 534 "justification": "Authors are from academic institutions in China and Singapore (Beihang, HUST, NUS) without major brand recognition in the AI safety space." 535 } 536 } 537 }