scan-v5.json (26949B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Indirect Prompt Injections: Are Firewalls All You Need, or Stronger Benchmarks?", 6 "authors": [ 7 "Rishika Bhagwatkar", 8 "Kevin Kasa", 9 "Abhay Puri", 10 "Gabriel Huang", 11 "Irina Rish", 12 "Graham W. Taylor", 13 "Krishnamurthy Dj Dvijotham", 14 "Alexandre Lacoste" 15 ], 16 "year": 2025, 17 "venue": "arXiv.org", 18 "arxiv_id": "2510.05244", 19 "doi": "10.48550/arXiv.2510.05244" 20 }, 21 "checklist": { 22 "claims_and_evidence": { 23 "abstract_claims_supported": { 24 "applies": true, 25 "answer": true, 26 "justification": "The abstract claims 0% or lowest possible ASR across four benchmarks; Tables 1-4 confirm Sanitizer achieves ≤0.02% ASR on AgentDojo, 16.33% (lowest among all methods) on original ASB, ≤0.30% on InjecAgent, and 0% on τ-bench. The parenthetical 'or the lowest possible' appropriately hedges the claim.", 27 "source": "haiku" 28 }, 29 "causal_claims_justified": { 30 "applies": true, 31 "answer": true, 32 "justification": "The paper ablates Minimizer, Sanitizer, and Combined separately across four benchmarks with multiple model runs, providing adequate empirical basis for attributing ASR reductions to the firewall components.", 33 "source": "haiku" 34 }, 35 "generalization_bounded": { 36 "applies": true, 37 "answer": true, 38 "justification": "Claims are explicitly bounded to four tested benchmarks, and Section 7 plus Appendix E demonstrate a Braille bypass that further limits the generalization of firewall effectiveness to current attacks.", 39 "source": "haiku" 40 }, 41 "alternative_explanations_discussed": { 42 "applies": true, 43 "answer": true, 44 "justification": "Section 6 systematically examines whether benchmark artifacts (forced tool injection, brittle utility metrics) explain results rather than genuine defense effectiveness, and discusses ambiguously benign attacks causing residual non-zero ASR.", 45 "source": "haiku" 46 }, 47 "proxy_outcome_distinction": { 48 "applies": 
true, 49 "answer": true, 50 "justification": "The paper explicitly defines ASR, BU, and UA as distinct metrics, critiques InjecAgent for lacking utility metrics, and flags ASB's ASR as inflated by benchmark design rather than agent vulnerability.", 51 "source": "haiku" 52 } 53 }, 54 "limitations_and_scope": { 55 "limitations_section_present": { 56 "applies": true, 57 "answer": false, 58 "justification": "There is no dedicated limitations section; caveats are scattered across Section 7 (Discussion), the Ethics Statement, and the Reproducibility Statement — none of which constitutes a dedicated limitations section.", 59 "source": "haiku" 60 }, 61 "threats_to_validity_specific": { 62 "applies": true, 63 "answer": true, 64 "justification": "The paper identifies specific validity threats: ASR inflation from forced tool injection (8× in ASB), brittle utility metrics mis-scoring success in AgentDojo, and the assumption that user tasks and tools are fully trusted as scope boundaries.", 65 "source": "haiku" 66 }, 67 "scope_boundaries_stated": { 68 "applies": true, 69 "answer": true, 70 "justification": "The threat model explicitly states scope: IPI attacks only, benign user assumed, no a priori knowledge of which tools are malicious; the Ethics section notes the defense does not protect against developer overreliance.", 71 "source": "haiku" 72 } 73 }, 74 "conflicts_of_interest": { 75 "funding_disclosed": { 76 "applies": true, 77 "answer": false, 78 "justification": "No funding disclosure appears in the paper; the acknowledgements thank colleagues at ServiceNow Research for compute resources but do not name any grant or funding source.", 79 "source": "haiku" 80 }, 81 "affiliations_disclosed": { 82 "applies": true, 83 "answer": true, 84 "justification": "Author affiliations are clearly listed on the title page: ServiceNow Research, Mila – Quebec AI Institute, Université de Montréal, Vector Institute, and University of Guelph.", 85 "source": "haiku" 86 }, 87 
"funder_independent_of_outcome": { 88 "applies": true, 89 "answer": false, 90 "justification": "The majority of authors are from ServiceNow Research, which has institutional interest in AI agent security; the paper proposes and favorably evaluates a defense mechanism while disclosing no independent funder.", 91 "source": "haiku" 92 }, 93 "financial_interests_declared": { 94 "applies": true, 95 "answer": false, 96 "justification": "No competing interests, patent, or financial disclosure statement appears anywhere in the paper.", 97 "source": "haiku" 98 } 99 }, 100 "scope_and_framing": { 101 "key_terms_defined": { 102 "applies": true, 103 "answer": true, 104 "justification": "Indirect prompt injection, Tool-Input Firewall (Minimizer), Tool-Output Firewall (Sanitizer), Benign Utility, Utility under Attack, and Attack Success Rate are all defined with clear operational meanings in Sections 3 and 4.", 105 "source": "haiku" 106 }, 107 "intended_contribution_clear": { 108 "applies": true, 109 "answer": true, 110 "justification": "The Introduction explicitly lists four contributions: the firewall defense, benchmark saturation demonstration, benchmark fixes with revised baselines, and a Sanitizer bypass example.", 111 "source": "haiku" 112 }, 113 "engagement_with_prior_work": { 114 "applies": true, 115 "answer": true, 116 "justification": "Section 2 directly contrasts the approach with CaMeL, Melon, LLM retraining, and PI detectors, explaining how it simplifies prior three-firewall approaches (Bagdasarian 2024, Abdelnabi 2025) while maintaining effectiveness.", 117 "source": "haiku" 118 } 119 } 120 }, 121 "type_checklist": { 122 "empirical": { 123 "artifacts": { 124 "code_released": { 125 "applies": true, 126 "answer": false, 127 "justification": "The Reproducibility Statement says 'We will release all code required to reproduce our results upon acceptance' — a conditional future promise, not a current release.", 128 "source": "haiku" 129 }, 130 "data_released": { 131 
"applies": true, 132 "answer": true, 133 "justification": "All four benchmarks used (AgentDojo, ASB, InjecAgent, τ-bench) are publicly available; the paper also commits to releasing revised versions of AgentDojo and ASB.", 134 "source": "haiku" 135 }, 136 "environment_specified": { 137 "applies": true, 138 "answer": false, 139 "justification": "No requirements.txt, Dockerfile, or package version specifications are provided; model names are given but no software environment details.", 140 "source": "haiku" 141 }, 142 "reproduction_instructions": { 143 "applies": true, 144 "answer": false, 145 "justification": "Algorithm 1 and Appendix B provide the algorithmic logic and prompts, but step-by-step instructions for running experiments are absent and code is not yet released.", 146 "source": "haiku" 147 } 148 }, 149 "statistical_methodology": { 150 "confidence_intervals_or_error_bars": { 151 "applies": true, 152 "answer": true, 153 "justification": "All tables report ± values alongside point estimates (e.g., '83.02 ±5.33', '0.02 ±0.03') throughout the paper.", 154 "source": "haiku" 155 }, 156 "significance_tests": { 157 "applies": true, 158 "answer": false, 159 "justification": "No formal statistical significance tests are applied when comparing defenses; only point estimates with standard deviations are reported.", 160 "source": "haiku" 161 }, 162 "effect_sizes_reported": { 163 "applies": true, 164 "answer": true, 165 "justification": "Absolute percentage differences are consistently reported: ASR drops from 57.69% to 0.02% for Sanitizer vs. no defense; ASB forced injection inflates ASR from 9.25% to 70% (nearly 8×).", 166 "source": "haiku" 167 }, 168 "sample_size_justified": { 169 "applies": true, 170 "answer": false, 171 "justification": "Sample sizes are inherited from existing benchmarks (949, 400, etc.) 
without justification or power analysis for detecting meaningful differences between defenses.", 172 "source": "haiku" 173 }, 174 "variance_reported": { 175 "applies": true, 176 "answer": true, 177 "justification": "All tables include ± values for every reported metric across all models and benchmarks, indicating variance across multiple runs is computed.", 178 "source": "haiku" 179 } 180 }, 181 "evaluation_design": { 182 "baselines_included": { 183 "applies": true, 184 "answer": true, 185 "justification": "Multiple baselines are compared across benchmarks: Spotlighting, Repeat prompt, PI Detector, Tool Filter, Melon, Melon-Aug, and CaMeL.", 186 "source": "haiku" 187 }, 188 "baselines_contemporary": { 189 "applies": true, 190 "answer": true, 191 "justification": "Baselines include 2025 publications (CaMeL from Debenedetti et al. 2025, Melon from ICML 2025), though the authors note they could not reproduce those results and use published numbers.", 192 "source": "haiku" 193 }, 194 "ablation_study": { 195 "applies": true, 196 "answer": true, 197 "justification": "Minimizer alone, Sanitizer alone, and Combined (Minimizer + Sanitizer) are evaluated separately on each applicable benchmark, enabling attribution of security-utility tradeoffs to each component.", 198 "source": "haiku" 199 }, 200 "multiple_metrics": { 201 "applies": true, 202 "answer": true, 203 "justification": "Three distinct metrics are used: Benign Utility (BU), Utility under Attack (UA), and Attack Success Rate (ASR), providing complementary views of the security-utility tradeoff.", 204 "source": "haiku" 205 }, 206 "human_evaluation": { 207 "applies": false, 208 "answer": false, 209 "justification": "Human evaluation of system outputs is not applicable to this automated security benchmark evaluation.", 210 "source": "haiku" 211 }, 212 "held_out_test_set": { 213 "applies": true, 214 "answer": true, 215 "justification": "The four benchmarks serve as fixed evaluation sets; the authors claim the firewall 
prompts were not tuned on these benchmarks and work 'out-of-the-box.'", 216 "source": "haiku" 217 }, 218 "per_category_breakdown": { 219 "applies": true, 220 "answer": true, 221 "justification": "AgentDojo results are broken down by suite (Banking, Slack, Travel, Workspace) in Tables 7-12; InjecAgent results are broken down by attack type (Direct Harm vs. Data Stealing) and attack setting (base vs. enhanced) in Table 19.", 222 "source": "haiku" 223 }, 224 "failure_cases_discussed": { 225 "applies": true, 226 "answer": true, 227 "justification": "Appendix E provides a detailed case study of a successful Braille-encoded attack bypassing the GPT-4o Sanitizer; Appendix C.3 discusses ambiguously benign attacks that produce non-zero residual ASR.", 228 "source": "haiku" 229 }, 230 "negative_results_reported": { 231 "applies": true, 232 "answer": true, 233 "justification": "The Minimizer reduces ASR but degrades utility; most common baseline defenses (Spotlighting, Repeat prompt) are shown largely ineffective; the Braille bypass demonstrates the Sanitizer can be defeated.", 234 "source": "haiku" 235 } 236 }, 237 "setup_transparency": { 238 "model_versions_specified": { 239 "applies": true, 240 "answer": true, 241 "justification": "Specific model versions are given: 'gpt-4o-2024-08-06' in Table 7, 'GPT-4o-2024-05-13' in Appendix E, 'Llama 3.3 70b,' 'Qwen3 32b,' and 'Qwen3 8b' throughout.", 242 "source": "haiku" 243 }, 244 "prompts_provided": { 245 "applies": true, 246 "answer": true, 247 "justification": "Appendix B provides complete system message and user message templates for both the Minimizer and Sanitizer firewalls, with all placeholder variables clearly indicated.", 248 "source": "haiku" 249 }, 250 "hyperparameters_reported": { 251 "applies": true, 252 "answer": false, 253 "justification": "Temperature, top-p, and other API hyperparameters for the firewall LLM calls and primary agent are not reported anywhere in the paper.", 254 "source": "haiku" 255 }, 256 
"scaffolding_described": { 257 "applies": true, 258 "answer": true, 259 "justification": "Algorithm 1 provides a step-by-step pseudocode of the full tool-calling pipeline with both firewalls integrated, and Section 3 describes threat model, defense objective, and firewall mechanics in detail.", 260 "source": "haiku" 261 }, 262 "data_preprocessing_documented": { 263 "applies": true, 264 "answer": true, 265 "justification": "Section 6 and Appendix D document specific changes made to AgentDojo (injection vector placement, utility function fixes) and ASB (forced attack-tool injection removal) with concrete examples.", 266 "source": "haiku" 267 } 268 }, 269 "data_integrity": { 270 "raw_data_available": { 271 "applies": true, 272 "answer": false, 273 "justification": "Raw experimental outputs (per-task success/failure logs) are not released; code is promised only upon acceptance.", 274 "source": "haiku" 275 }, 276 "data_collection_described": { 277 "applies": true, 278 "answer": false, 279 "justification": "The procedure for running evaluations (number of trials per condition, seed selection, API call protocols) is not described in sufficient detail to independently verify or replicate results.", 280 "source": "haiku" 281 }, 282 "recruitment_methods_described": { 283 "applies": false, 284 "answer": false, 285 "justification": "No human participants were recruited; all data comes from automated benchmark evaluation.", 286 "source": "haiku" 287 }, 288 "data_pipeline_documented": { 289 "applies": true, 290 "answer": false, 291 "justification": "The algorithmic pipeline is described (Algorithm 1) but the full experimental pipeline from benchmark instantiation through result aggregation is not documented in reproducible detail.", 292 "source": "haiku" 293 } 294 }, 295 "contamination": { 296 "training_cutoff_stated": { 297 "applies": true, 298 "answer": false, 299 "justification": "Training data cutoffs for GPT-4o, Llama 3.3, and Qwen3 models are not stated anywhere in the 
paper.", 300 "source": "haiku" 301 }, 302 "train_test_overlap_discussed": { 303 "applies": true, 304 "answer": false, 305 "justification": "The paper recommends future benchmarks add canary strings for contamination detection but does not discuss whether current benchmark scenarios appeared in model training data.", 306 "source": "haiku" 307 }, 308 "benchmark_contamination_addressed": { 309 "applies": true, 310 "answer": false, 311 "justification": "The paper explicitly recommends future benchmarks add canary strings to detect contamination, implicitly acknowledging this is unaddressed in the current evaluation.", 312 "source": "haiku" 313 } 314 }, 315 "human_studies": { 316 "pre_registered": { 317 "applies": false, 318 "answer": false, 319 "justification": "No human participants.", 320 "source": "haiku" 321 }, 322 "irb_or_ethics_approval": { 323 "applies": false, 324 "answer": false, 325 "justification": "No human participants.", 326 "source": "haiku" 327 }, 328 "demographics_reported": { 329 "applies": false, 330 "answer": false, 331 "justification": "No human participants.", 332 "source": "haiku" 333 }, 334 "inclusion_exclusion_criteria": { 335 "applies": false, 336 "answer": false, 337 "justification": "No human participants.", 338 "source": "haiku" 339 }, 340 "randomization_described": { 341 "applies": false, 342 "answer": false, 343 "justification": "No human participants.", 344 "source": "haiku" 345 }, 346 "blinding_described": { 347 "applies": false, 348 "answer": false, 349 "justification": "No human participants.", 350 "source": "haiku" 351 }, 352 "attrition_reported": { 353 "applies": false, 354 "answer": false, 355 "justification": "No human participants.", 356 "source": "haiku" 357 } 358 }, 359 "cost_and_practicality": { 360 "inference_cost_reported": { 361 "applies": true, 362 "answer": false, 363 "justification": "The Ethics Statement notes that firewall steps require additional API calls but no latency, token cost, or overhead figures are quantified 
anywhere in the paper.", 364 "source": "haiku" 365 }, 366 "compute_budget_stated": { 367 "applies": true, 368 "answer": false, 369 "justification": "Total compute budget for running experiments across four benchmarks and four models is not reported.", 370 "source": "haiku" 371 } 372 } 373 } 374 }, 375 "claims": [ 376 { 377 "claim": "A simple Sanitizer firewall achieves 0% or near-0% attack success rate across four public benchmarks while maintaining high utility.", 378 "evidence": "Tables 1-4 show Sanitizer ASR of 0.02% on AgentDojo, 16.33% (lowest among all methods) on original ASB, 0.30% base/0.00% enhanced on InjecAgent, and 0.00% on τ-bench with GPT-4o.", 379 "supported": "strong" 380 }, 381 { 382 "claim": "Agent Security Bench's forced attack-tool injection artificially inflates ASR by nearly 8×.", 383 "evidence": "Table 6 shows ASR drops from 73.58% to 9.25% when forced attack-tool injection is disabled, demonstrating the benchmark design flaw rather than true agent vulnerability.", 384 "supported": "strong" 385 }, 386 { 387 "claim": "Fixing AgentDojo's injection vector placements and utility functions improves utility under attack by over 18%.", 388 "evidence": "Table 5 shows UA improves from 60.87% to 72.19% (+11.32 percentage points, about 18.6% relative) on AgentDojo-Revised vs. original under the tool knowledge attack with GPT-4o.", 389 "supported": "strong" 390 }, 391 { 392 "claim": "The Sanitizer outperforms CaMeL on utility while achieving equivalent near-zero ASR.", 393 "evidence": "Table 1 shows Sanitizer BU=67.68%, UA=69.17%, ASR=0.02% vs. 
CaMeL BU=53.6%, UA=54.5%, ASR=0.00%; however, CaMeL results are non-reproduced published numbers.", 394 "supported": "moderate" 395 }, 396 { 397 "claim": "Braille-encoded attacks bypass the GPT-4o Sanitizer because Braille produces rare tokens where the model is poorly aligned.", 398 "evidence": "Appendix E shows the Sanitizer detected Braille but decoded rather than removed it, allowing the injected instruction to succeed in the AgentDojo travel suite.", 399 "supported": "moderate" 400 }, 401 { 402 "claim": "Existing IPI benchmarks are saturated by a simple defense, necessitating stronger attack strategies.", 403 "evidence": "The Sanitizer alone achieves the lowest or tied-lowest ASR on all four benchmarks without any benchmark-specific tuning, as shown in Tables 1-4.", 404 "supported": "strong" 405 } 406 ], 407 "methodology_tags": [ 408 "benchmark-eval" 409 ], 410 "key_findings": "A two-component firewall (Minimizer + Sanitizer) at the agent-tool interface achieves near-zero attack success rates across all four tested IPI benchmarks without LLM retraining, outperforming complex defenses like CaMeL at higher utility. However, the paper simultaneously demonstrates fundamental benchmark flaws: ASB's forced attack-tool injection inflates ASR by ~8×, and AgentDojo's brittle utility metrics undercount utility by 18%+. A Braille-encoded attack successfully bypasses the GPT-4o Sanitizer, proving real-world threats remain viable despite benchmark saturation. The paper's central argument — that current IPI benchmarks cannot distinguish strong from weak defenses and urgently need stronger adaptive attacks — is well-supported by its empirical evidence.", 411 "red_flags": [ 412 { 413 "flag": "Code not released", 414 "detail": "All code is promised 'upon acceptance' but unavailable, preventing independent reproduction of any result." 
415 }, 416 { 417 "flag": "Non-reproduced baselines", 418 "detail": "CaMeL and Melon results in Table 1 are taken directly from their published papers under potentially different evaluation conditions; the authors explicitly state they could not reproduce them." 419 }, 420 { 421 "flag": "Undisclosed institutional conflict", 422 "detail": "Majority of authors are from ServiceNow Research with institutional interest in AI agent security; no funding source or competing interests statement is provided." 423 }, 424 { 425 "flag": "No significance testing", 426 "detail": "Comparisons between defenses rely solely on point estimates with standard deviations; no formal tests are used to evaluate whether observed differences are statistically meaningful." 427 }, 428 { 429 "flag": "Benchmark contamination unaddressed", 430 "detail": "The paper recommends future benchmarks add contamination canaries, implicitly acknowledging that current evaluations may be affected by model pre-training on benchmark scenarios." 431 }, 432 { 433 "flag": "Hyperparameters unreported", 434 "detail": "Temperature, top-p, and other LLM inference hyperparameters for both firewall models and the primary agent are not reported anywhere in the paper." 
435 } 436 ], 437 "cited_papers": [ 438 { 439 "title": "AgentDojo: A dynamic environment to evaluate prompt injection attacks and defenses for LLM agents", 440 "relevance": "Primary benchmark evaluated; the paper's critique and revised version are central contributions" 441 }, 442 { 443 "title": "Agent Security Bench (ASB): Formalizing and benchmarking attacks and defenses in LLM-based agents", 444 "relevance": "Second primary benchmark; paper identifies forced attack-tool injection flaw inflating ASR by ~8×" 445 }, 446 { 447 "title": "InjecAgent: Benchmarking indirect prompt injections in tool-integrated large language model agents", 448 "relevance": "Third benchmark evaluated; critiqued for lacking utility metrics entirely" 449 }, 450 { 451 "title": "τ-bench: A benchmark for Tool-Agent-User interaction in real-world domains", 452 "relevance": "Fourth benchmark augmented with DoomArena attacks to evaluate security-utility tradeoffs on long-horizon tasks" 453 }, 454 { 455 "title": "Defeating prompt injections by design (CaMeL)", 456 "relevance": "Primary complex defense baseline using custom Python interpreter; shown to have lower utility than the Sanitizer" 457 }, 458 { 459 "title": "MELON: Provable defense against indirect prompt injection attacks in AI agents", 460 "relevance": "Second complex defense baseline providing provable security guarantees; compared in Table 1" 461 }, 462 { 463 "title": "Firewalls to secure dynamic LLM agentic networks", 464 "relevance": "Direct predecessor using three LLM firewalls; this paper simplifies to two and shows one output firewall suffices" 465 }, 466 { 467 "title": "AirGapAgent: Protecting privacy-conscious conversational agents", 468 "relevance": "Foundation for context minimization approach that inspired the Minimizer component" 469 }, 470 { 471 "title": "DoomArena: A framework for testing AI agents against evolving security threats", 472 "relevance": "Framework used to augment τ-bench with data-stealing attacks to create a 
security evaluation scenario" 473 }, 474 { 475 "title": "Defending against indirect prompt injection attacks with spotlighting", 476 "relevance": "Baseline defense method evaluated across all four benchmarks; uses special character delimiters to distinguish instructions" 477 } 478 ], 479 "engagement_factors": { 480 "practical_relevance": { 481 "score": 3, 482 "justification": "The firewall defense is model-agnostic, requires no retraining, and is described as deployable out-of-the-box to any tool-calling agent pipeline." 483 }, 484 "surprise_contrarian": { 485 "score": 3, 486 "justification": "Argues that the most widely used IPI benchmarks are fundamentally flawed and that a trivially simple defense saturates them, challenging the evaluation paradigm of the entire field." 487 }, 488 "fear_safety": { 489 "score": 3, 490 "justification": "Demonstrates that AI agents can be hijacked via indirect prompt injection to exfiltrate sensitive data or make fraudulent transactions, and that even proposed defenses can be bypassed with Braille encoding." 491 }, 492 "drama_conflict": { 493 "score": 2, 494 "justification": "Critiques three widely-used benchmarks (AgentDojo, ASB, InjecAgent) for implementation bugs, inflated metrics, and missing utility evaluation, challenging the validity of prior published results." 495 }, 496 "demo_ability": { 497 "score": 2, 498 "justification": "The Braille attack bypass is a concrete demonstrable example, but code is not yet released; public benchmarks allow partial replication of the defense evaluation." 499 }, 500 "brand_recognition": { 501 "score": 1, 502 "justification": "ServiceNow Research is a known industrial lab but not a top-tier AI brand; Mila affiliation (Irina Rish) adds some academic recognition in the ML community." 
503 } 504 }, 505 "hn_data": { 506 "threads": [ 507 { 508 "hn_id": "42657501", 509 "title": "The GAN is dead; long live the GAN - A Modern GAN Baseline", 510 "points": 3, 511 "comments": 1, 512 "url": "https://news.ycombinator.com/item?id=42657501", 513 "created_at": "2025-01-10T17:07:45Z" 514 }, 515 { 516 "hn_id": "39202830", 517 "title": "Low-Resource Languages Jailbreak GPT-4", 518 "points": 1, 519 "comments": 0, 520 "url": "https://news.ycombinator.com/item?id=39202830", 521 "created_at": "2024-01-31T12:11:05Z" 522 }, 523 { 524 "hn_id": "28839434", 525 "title": "User-driven design and evaluation of Liquid Types in Java", 526 "points": 1, 527 "comments": 0, 528 "url": "https://news.ycombinator.com/item?id=28839434", 529 "created_at": "2021-10-12T13:32:25Z" 530 } 531 ], 532 "top_points": 3, 533 "total_points": 5, 534 "total_comments": 1 535 } 536 }