scan.json (28250B)
1 { 2 "paper": { 3 "title": "PromptArmor: Simple yet Effective Prompt Injection Defenses", 4 "authors": [ 5 "Tianneng Shi", 6 "Kaijie Zhu", 7 "Zhun Wang", 8 "Yuqi Jia", 9 "Will Cai", 10 "Weida Liang", 11 "Haonan Wang", 12 "Hend Alzahrani", 13 "Joshua Lu", 14 "Kenji Kawaguchi", 15 "Basel Alomair", 16 "Xuandong Zhao", 17 "William Yang Wang", 18 "Neil Gong", 19 "Wenbo Guo", 20 "Dawn Song" 21 ], 22 "year": 2025, 23 "venue": "arXiv.org", 24 "arxiv_id": "2507.15219", 25 "doi": "10.48550/arXiv.2507.15219" 26 }, 27 "scan_version": 2, 28 "active_modules": ["experimental_rigor", "data_leakage"], 29 "methodology_tags": ["benchmark-eval"], 30 "key_findings": "PromptArmor, a prompting-based guardrail that uses an off-the-shelf LLM to detect and remove injected prompts, achieves FPR and FNR below 1% on AgentDojo with GPT-4o, GPT-4.1, or o4-mini, reducing attack success rate to near zero. Larger guardrail models perform better, with Qwen3-32B matching GPT-4.1 performance, while reasoning helps mid-sized models but cannot compensate for insufficient model capacity. The defense remains effective against adaptive fuzzing-based attacks (AgentVigil), and a memorization test suggests the guardrail LLM has not simply memorized the benchmark data.", 31 "checklist": { 32 "artifacts": { 33 "code_released": { 34 "applies": true, 35 "answer": false, 36 "justification": "No source code repository URL is provided anywhere in the paper. No GitHub link, Zenodo archive, or other code release is mentioned." 37 }, 38 "data_released": { 39 "applies": true, 40 "answer": true, 41 "justification": "The evaluation uses AgentDojo (Debenedetti et al., 2024), a publicly available benchmark. The paper does not collect or modify any proprietary data." 42 }, 43 "environment_specified": { 44 "applies": true, 45 "answer": false, 46 "justification": "No requirements.txt, Dockerfile, or environment specification is provided. 
The paper mentions model names and temperature=0 but no software dependencies or environment setup details." 47 }, 48 "reproduction_instructions": { 49 "applies": true, 50 "answer": false, 51 "justification": "No step-by-step reproduction instructions are provided. The paper describes the method conceptually but does not include runnable scripts or detailed reproduction steps." 52 } 53 }, 54 "statistical_methodology": { 55 "confidence_intervals_or_error_bars": { 56 "applies": true, 57 "answer": false, 58 "justification": "All results in Tables 1 and 3 and Figure 3 are reported as point estimates with no confidence intervals or error bars." 59 }, 60 "significance_tests": { 61 "applies": true, 62 "answer": false, 63 "justification": "No statistical significance tests are used. Claims that PromptArmor outperforms baselines are based on comparing raw numbers in Table 1 without any formal testing." 64 }, 65 "effect_sizes_reported": { 66 "applies": true, 67 "answer": true, 68 "justification": "Results are reported with baseline context: ASR drops from 54.53% (no defense) to 0.00% (PromptArmor-GPT-4.1), and UA improves from 64.27% to 76.35% (o4-mini). The magnitude of improvement is clear from the tables." 69 }, 70 "sample_size_justified": { 71 "applies": true, 72 "answer": false, 73 "justification": "AgentDojo provides 629 adversarial scenarios but no justification is given for whether this sample size is adequate for the claims made." 74 }, 75 "variance_reported": { 76 "applies": true, 77 "answer": false, 78 "justification": "Temperature is set to 0 for reproducibility, and the results appear to come from a single run per configuration (the number of runs is not explicitly stated). No variance, standard deviation, or spread measure across multiple runs is provided." 79 } 80 }, 81 "evaluation_design": { 82 "baselines_included": { 83 "applies": true, 84 "answer": true, 85 "justification": "Six baseline defenses are compared in Table 1: Deberta, DataSentinel, MELON, Repeat Prompt, Delimiter, and Tool Filter, plus an undefended baseline." 
86 }, 87 "baselines_contemporary": { 88 "applies": true, 89 "answer": true, 90 "justification": "Baselines include recent works: DataSentinel (Liu et al., 2025), MELON (Zhu et al., 2025), and Deberta (ProtectAI, 2024). These are state-of-the-art defenses." 91 }, 92 "ablation_study": { 93 "applies": true, 94 "answer": true, 95 "justification": "Section 4.2 ablates the prompting strategy (with vs without definition for GPT-3.5). Section 4.3 systematically varies model size and reasoning capability across the Qwen3 family." 96 }, 97 "multiple_metrics": { 98 "applies": true, 99 "answer": true, 100 "justification": "Four metrics are reported: FPR, FNR, UA (Utility under Attack), and ASR (Attack Success Rate)." 101 }, 102 "human_evaluation": { 103 "applies": true, 104 "answer": false, 105 "justification": "No human evaluation is conducted. All evaluation is automated through AgentDojo's programmatic success criteria." 106 }, 107 "held_out_test_set": { 108 "applies": true, 109 "answer": false, 110 "justification": "No separation of development and test data is described. The prompting strategy appears to have been designed and evaluated on the same AgentDojo scenarios. It is unclear whether any subset was used for prompt development." 111 }, 112 "per_category_breakdown": { 113 "applies": true, 114 "answer": false, 115 "justification": "Table 1 reports averages across the four attack types and combined ASR. No per-agent-type (banking, slack, travel, workspace) or per-attack-type breakdowns are provided despite AgentDojo having these categories." 116 }, 117 "failure_cases_discussed": { 118 "applies": true, 119 "answer": true, 120 "justification": "Section 4.2 shows GPT-3.5 without definition has 60.24% FNR. Section 4.3 discusses Qwen3-0.6B's extreme trade-off between FPR and FNR. These are explicit failure modes of the approach." 
121 }, 122 "negative_results_reported": { 123 "applies": true, 124 "answer": true, 125 "justification": "GPT-3.5 without the definition prompt has very high FNR (60.24%). Qwen3-0.6B cannot achieve reasonable security and utility simultaneously regardless of reasoning mode. Naïve prompting is shown to be ineffective." 126 } 127 }, 128 "claims_and_evidence": { 129 "abstract_claims_supported": { 130 "applies": true, 131 "answer": true, 132 "justification": "Abstract claims of FPR/FNR below 1% are supported by Table 1 (GPT-4o: 0.07/0.23%, GPT-4.1: 0.56/0.13%, o4-mini: 0.34/0.47%). ASR below 1% is supported (GPT-4.1: 0.00%, GPT-4o: 0.47%, o4-mini: 0.08%). Robustness against adaptive attacks is supported by Table 3." 133 }, 134 "causal_claims_justified": { 135 "applies": true, 136 "answer": true, 137 "justification": "The primary causal claim is that PromptArmor reduces ASR. This is tested via controlled comparison: same benchmark, same backend LLM (GPT-4.1), same attack scenarios, with vs without PromptArmor. The ablations (Section 4.2-4.3) use controlled single-variable manipulation." 138 }, 139 "generalization_bounded": { 140 "applies": true, 141 "answer": false, 142 "justification": "The title claims 'Effective Prompt Injection Defenses' and the paper recommends PromptArmor 'as a standard baseline for evaluating new defenses,' but all results are on a single benchmark (AgentDojo). The paper does not bound its claims to this benchmark or discuss generalization to other prompt injection scenarios." 143 }, 144 "alternative_explanations_discussed": { 145 "applies": true, 146 "answer": true, 147 "justification": "Section 4.4 directly addresses the alternative explanation that effectiveness comes from memorization of AgentDojo data, using a memorization test on GPT-4.1 and the argument that GPT-3.5 (predating AgentDojo) also works." 
148 }, 149 "proxy_outcome_distinction": { 150 "applies": true, 151 "answer": true, 152 "justification": "The paper measures FPR, FNR, ASR, and UA directly on the benchmark's formal evaluation criteria. These metrics directly measure the claimed defense capability — detecting and removing injected prompts — without framing them as proxies for a broader outcome." 153 } 154 }, 155 "setup_transparency": { 156 "model_versions_specified": { 157 "applies": true, 158 "answer": false, 159 "justification": "Models are referred to by marketing names: 'GPT-4o', 'GPT-4.1', 'o4-mini', 'GPT-3.5-Turbo', 'Qwen3-0.6B/8B/32B'. No API snapshot dates or exact version identifiers are provided." 160 }, 161 "prompts_provided": { 162 "applies": true, 163 "answer": true, 164 "justification": "Figure 2 provides the actual system and user message format used for the guardrail LLM. Section 4.2 describes the enhanced prompt for GPT-3.5 (adding a definition of prompt injection), though the exact definition text is not reproduced." 165 }, 166 "hyperparameters_reported": { 167 "applies": true, 168 "answer": true, 169 "justification": "Temperature is explicitly set to 0 for all models, as stated both in Section 4.1 ('temperature set to 0 to ensure reproducibility') and in the PromptArmor implementation details." 170 }, 171 "scaffolding_described": { 172 "applies": true, 173 "answer": true, 174 "justification": "The PromptArmor pipeline is fully described in Section 3.1 and Figure 2: prompt guardrail LLM → detect injection → extract injected content → fuzzy matching removal → pass sanitized data to backend LLM." 175 }, 176 "data_preprocessing_documented": { 177 "applies": true, 178 "answer": true, 179 "justification": "Section 4.1 describes how AgentDojo constructs adversarial scenarios by combining user tasks with injection tasks (97 user tasks paired with injection tasks, yielding 629 scenarios in total). The fuzzy matching removal technique for sanitization is described in Section 3.1." 
180 } 181 }, 182 "limitations_and_scope": { 183 "limitations_section_present": { 184 "applies": true, 185 "answer": false, 186 "justification": "The paper has no dedicated Limitations or Threats to Validity section. The paper structure goes directly from evaluation results to Related Work to Conclusion." 187 }, 188 "threats_to_validity_specific": { 189 "applies": true, 190 "answer": false, 191 "justification": "No specific threats to validity are discussed anywhere in the paper." 192 }, 193 "scope_boundaries_stated": { 194 "applies": true, 195 "answer": false, 196 "justification": "The paper does not explicitly state what PromptArmor does NOT defend against, what attack types are excluded, or what settings the results do not apply to. It recommends adoption 'as a standard baseline' without bounding this recommendation." 197 } 198 }, 199 "data_integrity": { 200 "raw_data_available": { 201 "applies": true, 202 "answer": false, 203 "justification": "Only aggregate results are reported in Tables 1-3 and Figure 3. No per-scenario results, detection outputs, or raw data are provided for independent verification." 204 }, 205 "data_collection_described": { 206 "applies": true, 207 "answer": true, 208 "justification": "Section 4.1 describes AgentDojo's structure: 4 agent types with 16-40 user tasks each, injection points in environment state, 4 attack types, and 629 total adversarial scenarios." 209 }, 210 "recruitment_methods_described": { 211 "applies": false, 212 "answer": false, 213 "justification": "No human participants. The evaluation uses a standard benchmark (AgentDojo)." 214 }, 215 "data_pipeline_documented": { 216 "applies": true, 217 "answer": true, 218 "justification": "The evaluation pipeline is documented: AgentDojo generates scenarios → PromptArmor preprocesses tool-call results → agent executes with sanitized data → AgentDojo compares environment state to ground truth." 
219 } 220 }, 221 "conflicts_of_interest": { 222 "funding_disclosed": { 223 "applies": true, 224 "answer": false, 225 "justification": "No funding information or acknowledgments section is present in the paper." 226 }, 227 "affiliations_disclosed": { 228 "applies": true, 229 "answer": true, 230 "justification": "Author affiliations are listed: UC Berkeley, UC Santa Barbara, Duke University, National University of Singapore, KACST, University of Washington. No authors are affiliated with OpenAI or other model providers being evaluated." 231 }, 232 "funder_independent_of_outcome": { 233 "applies": true, 234 "answer": false, 235 "justification": "No funding is disclosed, so independence of the funder cannot be assessed." 236 }, 237 "financial_interests_declared": { 238 "applies": true, 239 "answer": false, 240 "justification": "No competing interests or financial disclosure statement is present in the paper." 241 } 242 }, 243 "contamination": { 244 "training_cutoff_stated": { 245 "applies": true, 246 "answer": false, 247 "justification": "No training data cutoff dates are stated for any of the models used (GPT-4o, GPT-4.1, o4-mini, GPT-3.5-Turbo, Qwen3 family)." 248 }, 249 "train_test_overlap_discussed": { 250 "applies": true, 251 "answer": true, 252 "justification": "Section 4.4 directly investigates whether GPT-4.1 has memorized AgentDojo data using the memorization test from Staab et al. (2023). Average similarity was 0.34, with only 3.5% above the 0.6 threshold." 253 }, 254 "benchmark_contamination_addressed": { 255 "applies": true, 256 "answer": true, 257 "justification": "Section 4.4 addresses contamination: GPT-3.5 (released before AgentDojo) still works, suggesting effectiveness is not from memorization. A formal memorization test on GPT-4.1 shows low similarity scores." 258 } 259 }, 260 "human_studies": { 261 "pre_registered": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants in this study." 
265 }, 266 "irb_or_ethics_approval": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants in this study." 270 }, 271 "demographics_reported": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants in this study." 275 }, 276 "inclusion_exclusion_criteria": { 277 "applies": false, 278 "answer": false, 279 "justification": "No human participants in this study." 280 }, 281 "randomization_described": { 282 "applies": false, 283 "answer": false, 284 "justification": "No human participants in this study." 285 }, 286 "blinding_described": { 287 "applies": false, 288 "answer": false, 289 "justification": "No human participants in this study." 290 }, 291 "attrition_reported": { 292 "applies": false, 293 "answer": false, 294 "justification": "No human participants in this study." 295 } 296 }, 297 "cost_and_practicality": { 298 "inference_cost_reported": { 299 "applies": true, 300 "answer": false, 301 "justification": "No inference cost, latency, or token consumption is reported despite PromptArmor requiring an additional LLM call for every data sample the agent processes." 302 }, 303 "compute_budget_stated": { 304 "applies": true, 305 "answer": false, 306 "justification": "No total computational budget, API costs, or hardware details are stated for any experiment." 307 } 308 }, 309 "experimental_rigor": { 310 "seed_sensitivity_reported": { 311 "applies": true, 312 "answer": false, 313 "justification": "All experiments use temperature=0 for determinism. No results across multiple seeds or stochastic runs are reported." 314 }, 315 "number_of_runs_stated": { 316 "applies": true, 317 "answer": false, 318 "justification": "The number of experimental runs is not explicitly stated. Temperature=0 implies single deterministic runs, but this is not confirmed." 319 }, 320 "hyperparameter_search_budget": { 321 "applies": true, 322 "answer": false, 323 "justification": "No hyperparameter search budget is reported. 
The prompt design process and how the final prompt was selected are not described." 324 }, 325 "best_config_selection_justified": { 326 "applies": true, 327 "answer": false, 328 "justification": "The paper shows multiple guardrail LLM configurations but does not explain how the prompting strategy was developed or whether it was tuned on AgentDojo scenarios." 329 }, 330 "multiple_comparison_correction": { 331 "applies": false, 332 "answer": false, 333 "justification": "No statistical tests are performed, so multiple comparison correction is not applicable." 334 }, 335 "self_comparison_bias_addressed": { 336 "applies": true, 337 "answer": false, 338 "justification": "The authors implement PromptArmor and compare it against published baselines. No acknowledgment of author-evaluation bias or the possibility that their implementation of baselines may underperform." 339 }, 340 "compute_budget_vs_performance": { 341 "applies": true, 342 "answer": false, 343 "justification": "PromptArmor requires an additional LLM call per data sample, which could significantly increase cost. No analysis of the cost-performance tradeoff relative to baselines is provided." 344 }, 345 "benchmark_construct_validity": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether AgentDojo's attack scenarios and evaluation criteria are representative of real-world prompt injection threats. The paper uses AgentDojo without questioning its construct validity." 349 }, 350 "scaffold_confound_addressed": { 351 "applies": true, 352 "answer": true, 353 "justification": "The backend LLM is held constant (GPT-4.1) across all experiments in Table 1, isolating the guardrail LLM as the variable. The agent scaffold (AgentDojo environment) is also held constant." 
354 } 355 }, 356 "data_leakage": { 357 "temporal_leakage_addressed": { 358 "applies": true, 359 "answer": true, 360 "justification": "Section 4.4 addresses temporal concerns: GPT-3.5 was released before AgentDojo and still shows effectiveness, arguing against memorization as the explanation for performance." 361 }, 362 "feature_leakage_addressed": { 363 "applies": true, 364 "answer": false, 365 "justification": "No discussion of whether the AgentDojo evaluation setup provides features or context to the guardrail LLM that would not be available in real deployment." 366 }, 367 "non_independence_addressed": { 368 "applies": true, 369 "answer": false, 370 "justification": "No discussion of whether AgentDojo's 629 scenarios are structurally independent or share templates, attack patterns, or injection styles that could inflate performance estimates." 371 }, 372 "leakage_detection_method": { 373 "applies": true, 374 "answer": true, 375 "justification": "Section 4.4 applies the Staab et al. (2023) memorization test: splitting data samples into prefix-suffix pairs, prompting GPT-4.1 with the prefix, and measuring edit-distance similarity to the suffix. Average similarity was 0.34, with only 3.5% above the 0.6 memorization threshold." 
376 } 377 } 378 }, 379 "claims": [ 380 { 381 "claim": "PromptArmor achieves FPR and FNR below 1% with GPT-4o, GPT-4.1, and o4-mini on AgentDojo", 382 "evidence": "Table 1: GPT-4o (0.07% FPR, 0.23% FNR), GPT-4.1 (0.56% FPR, 0.13% FNR), o4-mini (0.34% FPR, 0.47% FNR)", 383 "supported": "strong" 384 }, 385 { 386 "claim": "After removing injected prompts with PromptArmor, the attack success rate drops to below 1%", 387 "evidence": "Table 1: GPT-4.1 achieves 0.00% ASR, GPT-4o 0.47%, o4-mini 0.08%, compared to 54.53% with no defense", 388 "supported": "strong" 389 }, 390 { 391 "claim": "PromptArmor is robust against adaptive attacks specifically designed to circumvent it", 392 "evidence": "Table 3: Against AgentVigil-Adaptive (attacks optimized against PromptArmor), ASR is 0.34% with 0.70% FPR and 2.26% FNR", 393 "supported": "moderate" 394 }, 395 { 396 "claim": "Larger LLMs generally make PromptArmor more effective, with model capacity being the primary factor", 397 "evidence": "Figure 3: Qwen3-0.6B has extreme FPR/FNR trade-offs, Qwen3-8B achieves reasonable balance, Qwen3-32B matches GPT-4.1 performance across all metrics", 398 "supported": "strong" 399 }, 400 { 401 "claim": "Reasoning capability improves performance especially in mid-sized LLMs but cannot compensate for insufficient model capacity", 402 "evidence": "Figure 3: Reasoning reduces Qwen3-8B FNR from 26.59% to 15.78%; for Qwen3-0.6B, reasoning shifts from high FPR to high FNR without resolving the fundamental trade-off", 403 "supported": "moderate" 404 }, 405 { 406 "claim": "The effectiveness is not due to memorization of AgentDojo data by the guardrail LLM", 407 "evidence": "Section 4.4: GPT-3.5 (released before AgentDojo) is effective; memorization test on GPT-4.1 shows average similarity 0.34 and only 3.5% above the 0.6 threshold", 408 "supported": "moderate" 409 }, 410 { 411 "claim": "PromptArmor challenges the common belief that off-the-shelf LLMs cannot defend against prompt injection attacks", 412 
"evidence": "Section 1: Prior work (Liu et al., 2024, 2025) claimed this; the authors attribute the misconception to (1) older LLMs and (2) poorly designed prompts. Table 1 demonstrates effectiveness with newer models.", 413 "supported": "moderate" 414 } 415 ], 416 "red_flags": [ 417 { 418 "flag": "Single benchmark evaluation", 419 "detail": "All results are from AgentDojo only. The paper makes broad claims about prompt injection defense effectiveness and recommends adoption 'as a standard baseline' but provides no evidence from other benchmarks, real-world deployments, or diverse attack settings." 420 }, 421 { 422 "flag": "No limitations section", 423 "detail": "The paper entirely lacks a limitations or threats to validity section, which is unusual for a security paper and prevents readers from understanding the boundaries of the approach." 424 }, 425 { 426 "flag": "No uncertainty quantification", 427 "detail": "All results are single-run point estimates at temperature=0. No error bars, confidence intervals, or variance measures are reported. Claims of '0.00% ASR' are presented without acknowledging the finite sample (629 scenarios)." 428 }, 429 { 430 "flag": "Missing cost analysis for practical deployment", 431 "detail": "PromptArmor requires an additional LLM API call for every data sample processed by the agent, potentially doubling inference costs. No cost, latency, or overhead analysis is provided despite claiming practical applicability." 432 }, 433 { 434 "flag": "Unbounded generalization claims", 435 "detail": "The paper recommends PromptArmor 'as a standard baseline for evaluating new defenses against prompt injection attacks' based solely on AgentDojo results, without testing on other attack benchmarks or real-world scenarios." 
436 } 437 ], 438 "cited_papers": [ 439 { 440 "title": "AgentDojo: A Dynamic Environment to Evaluate Prompt Injection Attacks and Defenses for LLM Agents", 441 "authors": ["Edoardo Debenedetti", "Jie Zhang", "Mislav Balunovic", "Luca Beurer-Kellner", "Marc Fischer", "Florian Tramèr"], 442 "year": 2024, 443 "relevance": "Primary benchmark used for evaluating prompt injection defenses, provides standardized attack scenarios for LLM agents." 444 }, 445 { 446 "title": "SecAlign: Defending Against Prompt Injection with Preference Optimization", 447 "authors": ["Sizhe Chen", "Arman Zharmagambetov", "Saeed Mahloujifar", "Kamalika Chaudhuri", "David Wagner", "Chuan Guo"], 448 "year": 2025, 449 "relevance": "Training-based defense using DPO for prompt injection robustness; evaluated on AgentDojo." 450 }, 451 { 452 "title": "DataSentinel: A Game-Theoretic Detection of Prompt Injection Attacks", 453 "authors": ["Yupei Liu", "Yuqi Jia", "Jinyuan Jia", "Dawn Song", "Neil Zhenqiang Gong"], 454 "year": 2025, 455 "arxiv_id": "2504.11358", 456 "relevance": "Detection-based defense using minimax optimization; compared as baseline in the paper." 457 }, 458 { 459 "title": "MELON: Indirect Prompt Injection Defense via Masked Re-Execution and Tool Comparison", 460 "authors": ["Kaijie Zhu", "Xianjun Yang", "Jindong Wang", "Wenbo Guo", "William Yang Wang"], 461 "year": 2025, 462 "relevance": "System-level defense using masked re-execution; compared as baseline achieving 3.18% ASR." 463 }, 464 { 465 "title": "Defeating Prompt Injections by Design", 466 "authors": ["Edoardo Debenedetti", "Ilia Shumailov", "Tianqi Fan", "Jamie Hayes", "Nicholas Carlini"], 467 "year": 2025, 468 "arxiv_id": "2503.18813", 469 "relevance": "System-level defense (CaMeL) using control and data flow management against prompt injection." 
470 }, 471 { 472 "title": "Progent: Programmable Privilege Control for LLM Agents", 473 "authors": ["Tianneng Shi", "Jingxuan He", "Zhun Wang", "Linyu Wu", "Hongwei Li", "Wenbo Guo", "Dawn Song"], 474 "year": 2025, 475 "arxiv_id": "2504.11703", 476 "relevance": "System-level defense using privilege control for LLM agent security." 477 }, 478 { 479 "title": "The Instruction Hierarchy: Training LLMs to Prioritize Privileged Instructions", 480 "authors": ["Eric Wallace", "Kai Xiao", "Reimar Leike", "Lilian Weng", "Johannes Heidecke", "Alex Beutel"], 481 "year": 2024, 482 "arxiv_id": "2404.13208", 483 "relevance": "Training-based defense establishing instruction priority levels to mitigate prompt injection." 484 }, 485 { 486 "title": "StruQ: Defending Against Prompt Injection with Structured Queries", 487 "authors": ["Sizhe Chen", "Julien Piet", "Chawin Sitawarin", "David Wagner"], 488 "year": 2024, 489 "arxiv_id": "2402.06363", 490 "relevance": "Training-based defense using structured queries to separate instructions from data." 491 }, 492 { 493 "title": "A Critical Evaluation of Defenses Against Prompt Injection Attacks", 494 "authors": ["Yuqi Jia", "Zedian Shao", "Yupei Liu", "Jinyuan Jia", "Dawn Song", "Neil Zhenqiang Gong"], 495 "year": 2025, 496 "arxiv_id": "2505.18333", 497 "relevance": "Systematic evaluation showing training-based defenses degrade instruction-following and remain vulnerable to adaptive attacks." 498 }, 499 { 500 "title": "InjecAgent: Benchmarking Indirect Prompt Injections in Tool-Integrated Large Language Model Agents", 501 "authors": ["Qiusi Zhan", "Zhixiang Liang", "Zifan Ying", "Daniel Kang"], 502 "year": 2024, 503 "relevance": "Benchmark for indirect prompt injection attacks in tool-integrated LLM agents." 
504 }, 505 { 506 "title": "Formalizing and Benchmarking Prompt Injection Attacks and Defenses", 507 "authors": ["Yupei Liu", "Yuqi Jia", "Runpeng Geng", "Jinyuan Jia", "Neil Zhenqiang Gong"], 508 "year": 2024, 509 "relevance": "Formalization and benchmarking of prompt injection attacks and defenses; cited as prior work claiming off-the-shelf LLMs cannot defend against prompt injection." 510 }, 511 { 512 "title": "SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering", 513 "authors": ["John Yang", "Carlos E Jimenez", "Alexander Wettig", "Kilian Lieret", "Shunyu Yao", "Karthik Narasimhan", "Ofir Press"], 514 "year": 2024, 515 "relevance": "Key LLM agent for software engineering, representative of the agent paradigm that prompt injection attacks target." 516 }, 517 { 518 "title": "IsolateGPT: An Execution Isolation Architecture for LLM-Based Systems", 519 "authors": ["Yuhao Wu", "Franziska Roesner", "Tadayoshi Kohno", "Ning Zhang", "Umar Iqbal"], 520 "year": 2025, 521 "relevance": "System-level defense using execution environment isolation for LLM security." 522 } 523 ] 524 }