scan.json (33691B)
1 { 2 "paper": { 3 "title": "Taxonomy, Evaluation and Exploitation of IPI-Centric LLM Agent Defense Frameworks", 4 "authors": [ 5 "Zimo Ji", 6 "Xunguang Wang", 7 "Zongjie Li", 8 "Pingchuan Ma", 9 "Yudong Gao", 10 "Daoyuan Wu", 11 "Xincheng Yan", 12 "Tian Tian", 13 "Shuai Wang" 14 ], 15 "year": 2025, 16 "venue": "arXiv.org", 17 "arxiv_id": "2511.15203", 18 "doi": "10.48550/arXiv.2511.15203" 19 }, 20 "scan_version": 3, 21 "active_modules": ["experimental_rigor", "data_leakage", "survey_methodology"], 22 "methodology_tags": ["meta-analysis", "benchmark-eval"], 23 "key_findings": "This SoK provides the first comprehensive taxonomy of IPI-centric LLM agent defense frameworks along five dimensions and evaluates 11 representative frameworks across three benchmarks. Policy enforcement and system design defenses achieve near-0% attack success rates but at substantial utility and overhead costs. The authors identify six root causes of defense failure and design three novel adaptive attacks that increase ASR by up to fivefold against specific frameworks, demonstrating that frameworks claiming 'provable security' still harbor exploitable architectural flaws.", 24 "checklist": { 25 "artifacts": { 26 "code_released": { 27 "applies": true, 28 "answer": false, 29 "justification": "No repository URL, GitHub link, or code archive is provided anywhere in the paper. Prompt templates are shown in Appendix E but the evaluation code, framework implementations, and adapted payloads are not released." 30 }, 31 "data_released": { 32 "applies": true, 33 "answer": true, 34 "justification": "The paper uses three publicly available benchmarks: AgentDojo, InjecAgent, and Agent Security Bench (ASB). These are standard public benchmarks. However, the 949 adapted payloads from the semantic-masquerading attack and evaluation logs are not released." 35 }, 36 "environment_specified": { 37 "applies": true, 38 "answer": false, 39 "justification": "No requirements.txt, Dockerfile, or environment setup section is provided. The paper mentions GPT-4o version and temperature but does not specify the full runtime environment needed for reproduction." 40 }, 41 "reproduction_instructions": { 42 "applies": true, 43 "answer": false, 44 "justification": "No step-by-step reproduction instructions are provided. The paper describes the evaluation methodology but does not provide a README, scripts, or commands to replicate the experiments." 45 } 46 }, 47 "statistical_methodology": { 48 "confidence_intervals_or_error_bars": { 49 "applies": true, 50 "answer": false, 51 "justification": "All results in Tables 2-3 and Figures 3-4 are point estimates with no confidence intervals, error bars, or ± notation. ASR and utility are reported as single percentages." 52 }, 53 "significance_tests": { 54 "applies": true, 55 "answer": false, 56 "justification": "The paper makes comparative claims (e.g., 'policy enforcement performs best', 'ASR increased fourfold') based solely on comparing raw percentages without any statistical significance tests." 57 }, 58 "effect_sizes_reported": { 59 "applies": true, 60 "answer": true, 61 "justification": "Effect sizes are reported as multiplicative improvements with baseline context. Table 5 shows '3.9X', '2.9X', '2X', '4.0X' increases, and Table 6 shows '2.6X', '4.8X'. Raw ASR values with baselines are provided throughout." 62 }, 63 "sample_size_justified": { 64 "applies": true, 65 "answer": false, 66 "justification": "No justification for the benchmark sizes or why these specific numbers of test cases are sufficient. Section 4 notes dataset sizes (629, 2040, 1054 test cases) but provides no power analysis or justification." 67 }, 68 "variance_reported": { 69 "applies": true, 70 "answer": false, 71 "justification": "No variance, standard deviation, or spread measures are reported. Results appear to be from single runs (temperature=0 implies deterministic decoding but this is not discussed as a design choice for reproducibility)." 72 } 73 }, 74 "evaluation_design": { 75 "baselines_included": { 76 "applies": true, 77 "answer": true, 78 "justification": "Undefended GPT-4o and Llama-3.1-8B serve as baselines in Tables 2-3. All defense frameworks are compared against these undefended models across all three benchmarks." 79 }, 80 "baselines_contemporary": { 81 "applies": true, 82 "answer": true, 83 "justification": "The evaluated frameworks are recent (2024-2025, Table 1) and include state-of-the-art systems like CaMeL, Progent, LlamaFirewall, and MELON. GPT-4o (2024-11-20) is a contemporary model." 84 }, 85 "ablation_study": { 86 "applies": true, 87 "answer": false, 88 "justification": "No ablation study is performed on the proposed adaptive attacks. For example, the semantic-masquerading pipeline (Figure 5) has adjusting, judging, and refining steps, but no ablation shows which components are essential." 89 }, 90 "multiple_metrics": { 91 "applies": true, 92 "answer": true, 93 "justification": "The evaluation uses five metrics across three dimensions: ASR (security), task success rate and false positive rate (utility), wall-clock time and token usage (overhead). Tables 2-3 and Appendix B report these." 94 }, 95 "human_evaluation": { 96 "applies": true, 97 "answer": false, 98 "justification": "No human evaluation is included. The failure analysis in Section 4.2 states 'two authors independently reviewed all failure logs' for root cause classification, but this is manual annotation of failures, not human evaluation of system outputs." 99 }, 100 "held_out_test_set": { 101 "applies": true, 102 "answer": true, 103 "justification": "The paper uses established benchmark test suites (AgentDojo, ASB, InjecAgent) which are standard evaluation sets. No tuning was done on these test sets — frameworks were evaluated as-is." 104 }, 105 "per_category_breakdown": { 106 "applies": true, 107 "answer": true, 108 "justification": "Extensive per-category breakdowns: Table 2 breaks results by benchmark and attack template, Table 3 by scenario (workspace, travel, banking, slack), Figure 3 shows per-attack-method performance, and Table 8 shows per-scenario false positive rates." 109 }, 110 "failure_cases_discussed": { 111 "applies": true, 112 "answer": true, 113 "justification": "Section 4.2 provides detailed failure analysis with six root causes and specific case studies (e.g., IsolateGPT banking scenario bypass, CaMeL error message injection, LlamaFirewall hallucination). Appendix C provides additional case studies." 114 }, 115 "negative_results_reported": { 116 "applies": true, 117 "answer": true, 118 "justification": "Several negative results are reported: Meta SecAlign generalizes poorly across IPI scenarios (Section 4.2, RC 6), some adaptive attack payloads fail and revert to originals (Section 5.1), and Task Shield's high FP rate is explicitly noted (Section 4.1)." 119 } 120 }, 121 "claims_and_evidence": { 122 "abstract_claims_supported": { 123 "applies": true, 124 "answer": true, 125 "justification": "The abstract claims a comprehensive taxonomy (supported by Section 3, Table 1), thorough security/usability assessment (Tables 2-3), six root causes (Section 4.2), and three adaptive attacks that 'significantly improve attack success rates' (Tables 5-6 show 2-5X improvements)." 126 }, 127 "causal_claims_justified": { 128 "applies": true, 129 "answer": true, 130 "justification": "Causal claims about adaptive attacks ('targeting RC 1 & 2', 'increasing ASR up to fourfold') are justified through controlled experiments: same benchmarks, same models, same frameworks with only the attack method changed. The root cause analysis in Section 4.2 provides the causal mechanism." 131 }, 132 "generalization_bounded": { 133 "applies": true, 134 "answer": true, 135 "justification": "Section 2.3 explicitly bounds the threat model to template-based attacks against LLM agents. The paper tests on two models (GPT-4o, DeepSeek-V3 in Appendix F) and three benchmarks. The scope is explicitly limited to IPI against agents, not QA LLMs (Section 2.1)." 136 }, 137 "alternative_explanations_discussed": { 138 "applies": true, 139 "answer": false, 140 "justification": "While the paper provides detailed root cause analysis for defense failures, it does not consider alternative explanations for its findings. For example, the LLM-judged attack success in semantic-masquerading IPI could have judge reliability issues, but this is not discussed." 141 }, 142 "proxy_outcome_distinction": { 143 "applies": true, 144 "answer": true, 145 "justification": "The paper's measurements (ASR, task success rate, wall-clock time, token usage) directly correspond to its claims about security, utility, and overhead. The paper acknowledges that its LLM-based attack success judging (Section 5.1) is a proxy for the benchmark's state-based ground truth." 146 } 147 }, 148 "setup_transparency": { 149 "model_versions_specified": { 150 "applies": true, 151 "answer": true, 152 "justification": "Section 4: 'GPT-4o (version 2024-11-20)' with specific version date. Also specifies DeepSeek-V3, Llama-3.1-8B-Instruct, and Meta SecAlign's LoRA adapter." 153 }, 154 "prompts_provided": { 155 "applies": true, 156 "answer": true, 157 "justification": "Appendix E provides full prompt templates for the payload adjust LLM, payload judge LLM, and attack success judge LLM. Table 7 (Appendix A) provides all attack template payloads with exact text used." 158 }, 159 "hyperparameters_reported": { 160 "applies": true, 161 "answer": true, 162 "justification": "Section 4: 'temperature parameter set to 0' is stated. For the OpenAI function calling template, the specific template choice is documented. Temperature=0 implies greedy decoding, fully specifying the sampling strategy." 163 }, 164 "scaffolding_described": { 165 "applies": true, 166 "answer": true, 167 "justification": "Section 3 describes each defense framework's architecture in detail (plan-exec decoupling, code-then-exec, dual LLM, etc.). Figure 2 illustrates technical paradigms. Figure 5 describes the adaptive attack pipeline. Framework implementations are described with code examples." 168 }, 169 "data_preprocessing_documented": { 170 "applies": true, 171 "answer": true, 172 "justification": "The benchmark datasets are described with sizes and attack methods (Section 4). The semantic-masquerading payload adjustment process is documented in Section 5.1 and Figure 5, including the refining loop and 949 output payloads. Framework configurations are standardized (Section 4)." 173 } 174 }, 175 "limitations_and_scope": { 176 "limitations_section_present": { 177 "applies": true, 178 "answer": false, 179 "justification": "There is no dedicated Limitations or Threats to Validity section. Section 6 (Conclusion) is brief and does not discuss limitations. Only Ethics Considerations and LLM Usage Considerations sections are present at the end." 180 }, 181 "threats_to_validity_specific": { 182 "applies": true, 183 "answer": false, 184 "justification": "No threats to validity are discussed. The paper does not address potential issues like the reliability of LLM-as-judge for attack success, the limitation of testing only template-based attacks, or the generalizability across model versions." 185 }, 186 "scope_boundaries_stated": { 187 "applies": true, 188 "answer": true, 189 "justification": "Section 2.1 explicitly states 'This SoK focuses solely on IPI against LLM agents' (not QA LLMs). Section 2.3 bounds the threat model to template-based attacks. Section 5.3 acknowledges 'large-scale evaluation is rather impractical' for the isolation-breach attack." 190 } 191 }, 192 "data_integrity": { 193 "raw_data_available": { 194 "applies": true, 195 "answer": false, 196 "justification": "No raw evaluation data, failure logs, or adapted payloads are made available. Only aggregated results in tables and figures are presented." 197 }, 198 "data_collection_described": { 199 "applies": true, 200 "answer": true, 201 "justification": "Section 4 describes the three benchmarks with dataset sizes (AgentDojo: 629 cases, ASB: 2040 cases, InjecAgent: 1054 cases), attack methods (Table 7), and framework selection rationale. The adaptive attack payload generation process is described in Section 5.1." 202 }, 203 "recruitment_methods_described": { 204 "applies": false, 205 "answer": false, 206 "justification": "No human participants in this study. All data sources are standard public benchmarks (AgentDojo, ASB, InjecAgent)." 207 }, 208 "data_pipeline_documented": { 209 "applies": true, 210 "answer": true, 211 "justification": "Figure 5 documents the full semantic-masquerading payload pipeline (adjust → judge → refine → test → success judge). Section 4 documents the evaluation pipeline. The cascading IPI payload structure is shown in Figure 6." 212 } 213 }, 214 "conflicts_of_interest": { 215 "funding_disclosed": { 216 "applies": true, 217 "answer": false, 218 "justification": "No acknowledgments or funding section is present in the paper. No grants, sponsors, or funding agencies are mentioned." 219 }, 220 "affiliations_disclosed": { 221 "applies": true, 222 "answer": true, 223 "justification": "Author affiliations are clearly listed: HKUST, Zhejiang University of Technology, Lingnan University, Southeast University, and ZTE Corporation. ZTE is an industry affiliation." 224 }, 225 "funder_independent_of_outcome": { 226 "applies": true, 227 "answer": false, 228 "justification": "No funding information is disclosed, making it impossible to assess funder independence. ZTE Corporation (a telecommunications company) has an author but no funding relationship is stated." 229 }, 230 "financial_interests_declared": { 231 "applies": true, 232 "answer": false, 233 "justification": "No competing interests or financial disclosure statement is present in the paper." 234 } 235 }, 236 "contamination": { 237 "training_cutoff_stated": { 238 "applies": false, 239 "answer": false, 240 "justification": "This paper evaluates defense frameworks against IPI attacks, not pre-trained model capabilities on knowledge benchmarks. The benchmarks test whether defenses can block injections, not model knowledge." 241 }, 242 "train_test_overlap_discussed": { 243 "applies": false, 244 "answer": false, 245 "justification": "The paper tests defense mechanisms rather than model knowledge. Train/test overlap for the model is not the relevant concern — the benchmarks measure defense effectiveness, not model capability on known tasks." 246 }, 247 "benchmark_contamination_addressed": { 248 "applies": false, 249 "answer": false, 250 "justification": "The paper evaluates defense frameworks rather than model capabilities on benchmarks. Contamination of model knowledge is not the primary threat to validity here." 251 } 252 }, 253 "human_studies": { 254 "pre_registered": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study. All evaluations are automated benchmark runs." 258 }, 259 "irb_or_ethics_approval": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants. The Ethics Considerations section confirms 'This study relies on public benchmarks within isolated environments and involves no human subjects.'" 263 }, 264 "demographics_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "inclusion_exclusion_criteria": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "randomization_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 }, 279 "blinding_described": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants in this study." 283 }, 284 "attrition_reported": { 285 "applies": false, 286 "answer": false, 287 "justification": "No human participants in this study." 288 } 289 }, 290 "cost_and_practicality": { 291 "inference_cost_reported": { 292 "applies": true, 293 "answer": true, 294 "justification": "Table 3 reports wall-clock time and token usage per framework per scenario. Figure 4(b-c) visualize time and token consumption. For example, Task Shield averages 501,294 tokens and 17.23s per task." 295 }, 296 "compute_budget_stated": { 297 "applies": true, 298 "answer": false, 299 "justification": "No total computational budget (total API spend, total GPU hours, overall cost) is stated. Per-scenario overhead is reported but the aggregate cost of the full evaluation campaign is not." 300 } 301 }, 302 "experimental_rigor": { 303 "seed_sensitivity_reported": { 304 "applies": true, 305 "answer": false, 306 "justification": "No seed sensitivity analysis. The paper uses temperature=0 (deterministic decoding) but does not explicitly discuss this as a deliberate reproducibility measure or report results across different conditions." 307 }, 308 "number_of_runs_stated": { 309 "applies": true, 310 "answer": false, 311 "justification": "The number of experimental runs per configuration is not explicitly stated. Temperature=0 implies single deterministic runs, but this is not discussed." 312 }, 313 "hyperparameter_search_budget": { 314 "applies": true, 315 "answer": false, 316 "justification": "No hyperparameter search was conducted or reported. Frameworks were evaluated with their default configurations and temperature=0 was used uniformly." 317 }, 318 "best_config_selection_justified": { 319 "applies": true, 320 "answer": true, 321 "justification": "Section 4 explains the standardization approach: 'we standardize the configurations across all benchmarks and defense frameworks' using GPT-4o (2024-11-20) at temperature=0 with OpenAI function calling template. This eliminates configuration selection bias." 322 }, 323 "multiple_comparison_correction": { 324 "applies": true, 325 "answer": false, 326 "justification": "No statistical tests are performed at all, so no multiple comparison correction is applied. The paper compares 11 frameworks across 3 benchmarks and multiple attack templates using only raw percentages." 327 }, 328 "self_comparison_bias_addressed": { 329 "applies": true, 330 "answer": false, 331 "justification": "The paper does not discuss author-evaluation bias. For the adaptive attacks they propose, their own implementation and judging pipeline could introduce favorable bias, but this is not acknowledged." 332 }, 333 "compute_budget_vs_performance": { 334 "applies": true, 335 "answer": true, 336 "justification": "Table 3 and Figure 4 present utility alongside overhead (time and token usage) per framework, enabling readers to assess the security-utility-overhead tradeoff. The paper explicitly discusses this tradeoff in Section 4.1." 337 }, 338 "benchmark_construct_validity": { 339 "applies": true, 340 "answer": true, 341 "justification": "Section 4.1 explicitly discusses how benchmark design affects results: 'ASB features attacker-intended tools designed to mimic legitimate, user-intent-like functionalities' which explains why Tool Filter and CaMeL show higher ASR on ASB. Framework performance variation across benchmarks is analyzed." 342 }, 343 "scaffold_confound_addressed": { 344 "applies": true, 345 "answer": true, 346 "justification": "Section 4 standardizes the backend LLM (GPT-4o with same version and temperature) and tool calling template across all frameworks. Appendix F confirms consistent trends with DeepSeek-V3, addressing model-specific confounds." 347 } 348 }, 349 "data_leakage": { 350 "temporal_leakage_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of whether GPT-4o or DeepSeek-V3 training data may have included the benchmark datasets (AgentDojo, ASB, InjecAgent), which could affect baseline ASR or defense behavior." 354 }, 355 "feature_leakage_addressed": { 356 "applies": true, 357 "answer": false, 358 "justification": "No discussion of whether the evaluation setup leaks information that would not be available in real deployment scenarios." 359 }, 360 "non_independence_addressed": { 361 "applies": true, 362 "answer": false, 363 "justification": "No discussion of potential overlap or structural similarities between the three benchmarks' test cases, or whether results on one benchmark could be dependent on another." 364 }, 365 "leakage_detection_method": { 366 "applies": true, 367 "answer": false, 368 "justification": "No concrete leakage detection or prevention method is applied to the evaluation." 369 } 370 }, 371 "survey_methodology": { 372 "prisma_or_structured_protocol": { 373 "applies": true, 374 "answer": false, 375 "justification": "No PRISMA flow diagram or structured search protocol is described. The paper presents a taxonomy of 26 defense frameworks (Table 1) but does not describe the systematic search strategy used to identify them." 376 }, 377 "quality_assessment_of_sources": { 378 "applies": true, 379 "answer": true, 380 "justification": "The paper goes beyond simple cataloging by conducting empirical evaluation of representative frameworks on three benchmarks (Tables 2-3), measuring security, utility, and overhead. Section 4.2 provides detailed quality assessment through failure analysis." 381 }, 382 "publication_bias_discussed": { 383 "applies": true, 384 "answer": false, 385 "justification": "No discussion of publication bias. The paper does not consider whether published defense frameworks skew toward positive results or whether unsuccessful approaches are underrepresented in the literature." 386 } 387 } 388 }, 389 "claims": [ 390 { 391 "claim": "All evaluated defense frameworks significantly reduce ASR compared to undefended baselines, with most lowering ASR below 10%.", 392 "evidence": "Table 2 shows undefended GPT-4o averages 23.69% ASR while all defense frameworks are below 12%, with many below 5%. Consistent across three benchmarks (Section 4.1).", 393 "supported": "strong" 394 }, 395 { 396 "claim": "Policy enforcement and system design defenses achieve near-0% ASR, outperforming other paradigms.", 397 "evidence": "Table 2: Progent achieves 0.00% average ASR, ACE 0.10%, IsolateGPT 0.12%. These are the lowest across all evaluated frameworks (Section 4.1).", 398 "supported": "strong" 399 }, 400 { 401 "claim": "System design frameworks cause the most severe utility degradation among all defense paradigms.", 402 "evidence": "Table 3: IsolateGPT drops utility to 37.11% and CaMeL to 59.79% compared to undefended GPT-4o's 80.41%. Runtime checking (Task Shield 83.51%) and policy enforcing (Progent 72.17%) show much less degradation (Section 4.1).", 403 "supported": "strong" 404 }, 405 { 406 "claim": "Semantic-masquerading IPI increases ASR up to fourfold against specific defense frameworks.", 407 "evidence": "Table 5: ASR increases from 0.48% to 1.91% (3.9X) on IsolateGPT, 1.05% to 3.06% (2.9X) on Tool Filter, 0.95% to 1.90% (2X) on CaMeL, and 0.84% to 3.37% (4.0X) on LlamaFirewall (Section 5.1).", 408 "supported": "moderate" 409 }, 410 { 411 "claim": "Cascading IPI increases ASR nearly fivefold against MELON framework.", 412 "evidence": "Table 6: MELON ASR increases from 0.95% (Important template baseline) to 4.53% (4.8X). Task Shield from 2.07% to 5.37% (2.6X) (Section 5.2).", 413 "supported": "moderate" 414 }, 415 { 416 "claim": "Defense frameworks maintain consistent ASR regardless of attack template, indicating bypasses stem from inherent design flaws rather than template sensitivity.", 417 "evidence": "Figure 3 shows 'tightly clustered and overlapping lines of different attack templates' for defended frameworks, unlike baselines which vary significantly (e.g., GPT-4o on Important vs TODO in AgentDojo) (Section 4.1).", 418 "supported": "strong" 419 }, 420 { 421 "claim": "An isolation-breach vulnerability exists in IsolateGPT where untrusted Spoke data propagates to the Hub LLM, enabling plan corruption.", 422 "evidence": "Section 5.3 demonstrates a two-stage attack where email content propagates from Spoke to Hub, tricking Hub into adding a malicious tool (AugustSmartLockUnlockDoor) to the plan. This is described as a zero-day vulnerability.", 423 "supported": "weak" 424 } 425 ], 426 "red_flags": [ 427 { 428 "flag": "LLM-as-judge for attack success", 429 "detail": "In the semantic-masquerading IPI evaluation (Section 5.1), attack success is judged by an LLM rather than the benchmark's ground-truth state checks. The paper acknowledges this departure ('it is no longer viable after payload adjustment') but does not validate the LLM judge's reliability or accuracy against any ground truth." 430 }, 431 { 432 "flag": "No error bars or statistical tests", 433 "detail": "All comparisons across 11 frameworks, 3 benchmarks, and multiple attack templates rely on single-run point estimates with no confidence intervals, significance tests, or variance measures. The absolute ASR differences between frameworks are often small (e.g., 0.12% vs 0.00%) and could reflect noise." 434 }, 435 { 436 "flag": "No limitations section", 437 "detail": "The paper lacks any discussion of limitations or threats to validity. Key unaddressed concerns include: LLM judge reliability, generalizability beyond template-based attacks, potential ceiling effects at low ASR values, and framework configuration choices." 438 }, 439 { 440 "flag": "Single case study for strongest attack claim", 441 "detail": "The isolation-breach IPI attack (Section 5.3), which claims discovery of a 'zero-day vulnerability,' is demonstrated through only a single case study. The paper acknowledges 'large-scale evaluation is rather impractical' but still claims significance from one example." 442 }, 443 { 444 "flag": "No code or data release", 445 "detail": "Despite evaluating 11 frameworks across 3 benchmarks and designing 3 novel attacks, no code, evaluation scripts, adapted payloads, or raw results are released. This prevents independent verification of all reported results." 446 } 447 ], 448 "cited_papers": [ 449 { 450 "title": "AgentDojo: A Dynamic Environment to Evaluate Attacks and Defenses for LLM Agents", 451 "authors": ["E. Debenedetti", "J. Zhang", "M. Balunovic", "L. Beurer-Kellner", "M. Fischer", "F. Tramèr"], 452 "year": 2024, 453 "relevance": "Primary benchmark for evaluating IPI attacks and defenses in LLM agent systems, central to this paper's evaluation." 454 }, 455 { 456 "title": "InjecAgent: Benchmarking Indirect Prompt Injections in Tool-Integrated Large Language Model Agents", 457 "authors": ["Q. Zhan", "Z. Liang", "Z. Ying", "D. Kang"], 458 "year": 2024, 459 "arxiv_id": "2403.02691", 460 "relevance": "Benchmark for indirect prompt injection in tool-integrated LLM agents, one of three benchmarks used in the evaluation." 461 }, 462 { 463 "title": "Agent Security Bench (ASB): Formalizing and Benchmarking Attacks and Defenses in LLM-Based Agents", 464 "authors": ["H. Zhang", "J. Huang", "K. Mei", "Y. Yao", "Z. Wang", "C. Zhan", "H. Wang", "Y. Zhang"], 465 "year": 2024, 466 "arxiv_id": "2410.02644", 467 "relevance": "Comprehensive benchmark formalizing agent security evaluation, used as one of three evaluation benchmarks." 468 }, 469 { 470 "title": "Defeating Prompt Injections by Design", 471 "authors": ["E. Debenedetti", "I. Shumailov", "T. Fan", "J. Hayes", "N. Carlini", "D. Fabian", "C. Kern", "C. Shi", "A. Terzis", "F. Tramèr"], 472 "year": 2025, 473 "arxiv_id": "2503.18813", 474 "relevance": "CaMeL framework for defeating prompt injections through dual LLM architecture and code-then-exec planning, evaluated and attacked in this paper." 475 }, 476 { 477 "title": "Progent: Programmable Privilege Control for LLM Agents", 478 "authors": ["T. Shi", "J. He", "Z. Wang", "L. Wu", "H. Li", "W. Guo", "D. Song"], 479 "year": 2025, 480 "arxiv_id": "2504.11703", 481 "relevance": "Policy enforcement defense framework achieving 0% ASR with manual policies, representing the strongest defense paradigm evaluated." 482 }, 483 { 484 "title": "SecAlign: Defending Against Prompt Injection with Preference Optimization", 485 "authors": ["S. Chen", "A. Zharmagambetov", "S. Mahloujifar", "K. Chaudhuri", "D. Wagner", "C. Guo"], 486 "year": 2024, 487 "arxiv_id": "2410.05451", 488 "relevance": "Fine-tuning based defense using DPO to train LLMs to resist prompt injection, evaluated as representative of fine-tuning paradigm." 489 }, 490 { 491 "title": "MELON: Provable Defense Against Indirect Prompt Injection Attacks in AI Agents", 492 "authors": ["K. Zhu", "X. Yang", "J. Wang", "W. Guo", "W. Y. Wang"], 493 "year": 2025, 494 "arxiv_id": "2502.05174", 495 "relevance": "Runtime checking defense using parallel agent comparison to detect injection, evaluated and targeted by cascading IPI attack." 496 }, 497 { 498 "title": "LlamaFirewall: An Open Source Guardrail System for Building Secure AI Agents", 499 "authors": ["S. Chennabasappa", "C. Nikolaidis", "D. Song", "D. Molnar"], 500 "year": 2025, 501 "arxiv_id": "2505.03574", 502 "relevance": "Meta's detection-based defense framework combining guardrails, evaluated and found vulnerable to semantic-masquerading attacks." 503 }, 504 { 505 "title": "IsolateGPT: An Execution Isolation Architecture for LLM-Based Agentic Systems", 506 "authors": ["Y. Wu", "F. Roesner", "T. Kohno", "N. Zhang", "U. Iqbal"], 507 "year": 2024, 508 "arxiv_id": "2403.04960", 509 "relevance": "Plan-then-execute system design defense, found vulnerable to parameter-level attacks and isolation breaches." 510 }, 511 { 512 "title": "Adaptive Attacks Break Defenses Against Indirect Prompt Injection Attacks on LLM Agents", 513 "authors": ["Q. Zhan", "R. Fang", "H. S. Panchal", "D. Kang"], 514 "year": 2025, 515 "arxiv_id": "2503.00061", 516 "relevance": "Prior work on optimization-based adaptive attacks against IPI defenses, which this paper extends with logic-driven attacks." 517 }, 518 { 519 "title": "The Attacker Moves Second: Stronger Adaptive Attacks Bypass Defenses Against LLM Jailbreaks and Prompt Injections", 520 "authors": ["M. Nasr", "N. Carlini", "C. Sitawarin"], 521 "year": 2025, 522 "arxiv_id": "2510.09023", 523 "relevance": "Demonstrates that optimization and search-based adaptive attacks can bypass 12 defense frameworks, motivating the logic-driven attacks in this paper." 524 }, 525 { 526 "title": "Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection", 527 "authors": ["K. Greshake", "S. Abdelnabi", "S. Mishra", "C. Endres", "T. Holz", "M. Fritz"], 528 "year": 2023, 529 "relevance": "Foundational work defining the indirect prompt injection threat against LLM-integrated applications." 530 }, 531 { 532 "title": "ReAct: Synergizing Reasoning and Acting in Language Models", 533 "authors": ["S. Yao", "J. Zhao", "D. Yu", "N. Du", "I. Shafran", "K. Narasimhan", "Y. Cao"], 534 "year": 2022, 535 "arxiv_id": "2210.03629", 536 "relevance": "Foundational agentic framework for LLM reasoning and acting, relevant as the architecture most defense frameworks are designed to protect." 537 } 538 ], 539 "engagement_factors": { 540 "practical_relevance": { 541 "score": 2, 542 "justification": "The taxonomy and identified vulnerabilities are directly useful for security practitioners building or deploying LLM agent defense systems." 543 }, 544 "surprise_contrarian": { 545 "score": 2, 546 "justification": "Challenges 'provably secure' claims of frameworks like CaMeL and IsolateGPT by demonstrating exploitable architectural flaws." 547 }, 548 "fear_safety": { 549 "score": 3, 550 "justification": "Demonstrates novel attacks that bypass state-of-the-art defenses by up to 5X, claims a zero-day vulnerability in IsolateGPT, and shows fundamental architectural flaws in agent security." 551 }, 552 "drama_conflict": { 553 "score": 2, 554 "justification": "Directly names and attacks specific frameworks (CaMeL, IsolateGPT, LlamaFirewall) challenging their security claims, though the tone remains academic." 555 }, 556 "demo_ability": { 557 "score": 0, 558 "justification": "No code, demo, or tool is released. Attack templates are in appendices but no executable implementation is provided." 559 }, 560 "brand_recognition": { 561 "score": 1, 562 "justification": "References Meta's LlamaFirewall and uses GPT-4o but the paper itself is from HKUST, not a major AI lab." 563 } 564 } 565 }