scan.json (31682B)
1 { 2 "paper": { 3 "title": "Defending Against Prompt Injection with DataFilter", 4 "authors": [ 5 "Yizhu Wang", 6 "Sizhe Chen", 7 "Raghad Alkhudair", 8 "Basel Alomair", 9 "David Wagner" 10 ], 11 "year": 2025, 12 "venue": "IEEE Conference on Secure and Trustworthy Machine Learning (SaTML) 2026", 13 "arxiv_id": "2510.19207", 14 "doi": "10.48550/arXiv.2510.19207" 15 }, 16 "scan_version": 3, 17 "active_modules": [ 18 "experimental_rigor", 19 "data_leakage" 20 ], 21 "checklist": { 22 "artifacts": { 23 "code_released": { 24 "applies": true, 25 "answer": true, 26 "justification": "The abstract states 'Our DataFilter model is released here for immediate use, with the code to reproduce our results here' with hyperlinks provided in footnote 1." 27 }, 28 "data_released": { 29 "applies": true, 30 "answer": true, 31 "justification": "The training data is the publicly available Alpaca dataset [59]. All evaluation benchmarks (SEP, InjecAgent, AgentDojo, AlpacaEval2) are public. The SFT dataset construction is fully specified by Algorithm 1." 32 }, 33 "environment_specified": { 34 "applies": true, 35 "answer": false, 36 "justification": "The paper mentions 'two 80GB GPUs (A100/H100) using DeepSpeed ZeRO-3' and 'BF16 precision' but does not provide a requirements.txt, Dockerfile, or detailed library version listing sufficient to recreate the environment." 37 }, 38 "reproduction_instructions": { 39 "applies": true, 40 "answer": true, 41 "justification": "The paper provides Algorithm 1 for SFT dataset construction, full training hyperparameters in Section V-A, and states the code to reproduce results is released. The prompt template is provided in full." 42 } 43 }, 44 "statistical_methodology": { 45 "confidence_intervals_or_error_bars": { 46 "applies": true, 47 "answer": false, 48 "justification": "All results in Tables II-VIII are reported as single point estimates (e.g., '2.2% ASR') with no confidence intervals or error bars." 
49 }, 50 "significance_tests": { 51 "applies": true, 52 "answer": false, 53 "justification": "No statistical significance tests are performed. Claims like 'DataFilter outperforms PromptArmor' are based solely on comparing raw percentages without any statistical test." 54 }, 55 "effect_sizes_reported": { 56 "applies": true, 57 "answer": true, 58 "justification": "The paper reports effect sizes with baseline context throughout, e.g., 'average ASR 2.2% vs 5.9%' for DataFilter vs PromptArmor, 'average utility drop of 1.0% vs 4.1%', and provides both absolute values and differences (Section V-E)." 59 }, 60 "sample_size_justified": { 61 "applies": true, 62 "answer": false, 63 "justification": "No justification for sample sizes. They note 'we randomly select 1K samples from SEP' for efficiency but provide no power analysis or justification for any benchmark sample sizes." 64 }, 65 "variance_reported": { 66 "applies": true, 67 "answer": false, 68 "justification": "No variance or standard deviation reported. The paper acknowledges 'the backend LLM (gpt-4o) is non-deterministic despite setting the sampling temperature to 0, rendering inevitable variability' (Section V-E) but does not report multi-run variance." 69 } 70 }, 71 "evaluation_design": { 72 "baselines_included": { 73 "applies": true, 74 "answer": true, 75 "justification": "Extensive baselines: PromptGuard, DataSentinel, Sandwich, Instructional, Spotlight, PromptArmor, and Tool Filter (Tables II-VI)." 76 }, 77 "baselines_contemporary": { 78 "applies": true, 79 "answer": true, 80 "justification": "Baselines include concurrent work PromptArmor (2025), DataSentinel (2025), PromptGuard (2024), and Tool Filter (2025). These represent the current state of the art for model-agnostic defenses." 81 }, 82 "ablation_study": { 83 "applies": true, 84 "answer": false, 85 "justification": "No systematic ablation study. 
The discussion mentions a preliminary experiment where 'we trained a filter without providing the user's prompt as context' (Section VI), but there is no systematic ablation of the four training goals (anti-hallucination, EOS token, position randomization, etc.) that comprise DataFilter's design." 86 }, 87 "multiple_metrics": { 88 "applies": true, 89 "answer": true, 90 "justification": "Security is measured by ASR across multiple benchmarks; utility is measured by task success rate (AgentDojo) and length-controlled WinRate (AlpacaEval2). Both dimensions are evaluated." 91 }, 92 "human_evaluation": { 93 "applies": true, 94 "answer": false, 95 "justification": "No human evaluation is included. All security and utility assessments are automated. Human evaluation of filtered output quality or filter accuracy could have been informative." 96 }, 97 "held_out_test_set": { 98 "applies": true, 99 "answer": true, 100 "justification": "The filter is trained on the Alpaca dataset and evaluated on entirely separate benchmarks (SEP, InjecAgent, AgentDojo, AlpacaEval2), providing clear train/test separation." 101 }, 102 "per_category_breakdown": { 103 "applies": true, 104 "answer": true, 105 "justification": "Results are broken down by attack type (6 types on SEP in Table IV, 4 on AgentDojo in Table II, 2 on InjecAgent in Table III) and by benchmark." 106 }, 107 "failure_cases_discussed": { 108 "applies": true, 109 "answer": true, 110 "justification": "Appendix C provides detailed false positive and false negative examples. The Limitations section discusses when DataFilter fails (optimization-based attacks, very long prompts, injections disguised as task-relevant instructions)." 111 }, 112 "negative_results_reported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Table VIII shows that strong adaptive LLM-based attacks still reach 83% ASR against DataFilter — the defense remains substantially breakable. The Limitations section acknowledges several failure modes. 
The preliminary experiment without user prompt context showed utility degradation." 116 } 117 }, 118 "claims_and_evidence": { 119 "abstract_claims_supported": { 120 "applies": true, 121 "answer": true, 122 "justification": "The abstract claims 'reduces the prompt injection attack success rates to near zero while maintaining the LLMs' utility.' Results show average ASR ~2.2% (Tables II-IV) and utility drop of ~1-2% (Tables V-VI), consistent with the claims." 123 }, 124 "causal_claims_justified": { 125 "applies": true, 126 "answer": true, 127 "justification": "Claims that DataFilter 'reduces' ASR are supported by controlled comparisons: same benchmarks, same backend LLMs, with and without the defense applied. This controlled experimental design adequately supports the causal claims." 128 }, 129 "generalization_bounded": { 130 "applies": true, 131 "answer": true, 132 "justification": "The paper tests on multiple benchmarks (instruction-following and agentic), two backend LLMs (GPT-4o and Llama-3.1-8B), and multiple attack types. The Limitations section explicitly bounds scope: 'cannot defend against the strong optimization-based adaptive attacks' and acknowledges issues with very long prompts." 133 }, 134 "alternative_explanations_discussed": { 135 "applies": true, 136 "answer": false, 137 "justification": "The paper does not substantively discuss alternative explanations for DataFilter's effectiveness. There is no discussion of whether the specific Alpaca training distribution, the choice of Llama-3.1-8B as filter backbone, or the particular benchmark properties drive the results. The utility-security tradeoff discussion is about mechanism design, not confounds." 138 }, 139 "proxy_outcome_distinction": { 140 "applies": true, 141 "answer": true, 142 "justification": "The paper directly measures ASR (whether malicious actions occur) and utility (task success/win rates). These are direct measurements of the claimed defense properties, not proxies." 
143 } 144 }, 145 "setup_transparency": { 146 "model_versions_specified": { 147 "applies": true, 148 "answer": true, 149 "justification": "Specifies 'gpt-4o-2024-05-13' (Section V-E), 'Llama-3.1-8B-Instruct' (filter model and backend), 'meta-llama/Llama-Prompt-Guard-2-86M' (baseline), and 'GPT-4.1' (PromptArmor detector). The snapshot date for gpt-4o is included." 150 }, 151 "prompts_provided": { 152 "applies": true, 153 "answer": true, 154 "justification": "The full system prompt template for DataFilter is provided in Section IV-C (grey box). The PromptArmor reproduction prompt is also provided in Section V-C. Adaptive attack prompts are provided in Appendix B." 155 }, 156 "hyperparameters_reported": { 157 "applies": true, 158 "answer": true, 159 "justification": "Section V-A reports: batch size 1, gradient accumulation 16, learning rate 2×10^-5, cosine schedule, 100 warmup steps, BF16 precision, 300 steps. Backend LLM temperature set to 0 (Section V-E)." 160 }, 161 "scaffolding_described": { 162 "applies": true, 163 "answer": true, 164 "justification": "The DataFilter deployment architecture is described in Figure 1 and Section IV-B. The JSON data handling strategy for agentic applications is detailed in Section IV-D (recursive filtering of keys/values)." 165 }, 166 "data_preprocessing_documented": { 167 "applies": true, 168 "answer": true, 169 "justification": "Algorithm 1 fully specifies the SFT dataset construction including truncation ratios, injection position distributions, and attack type simulation. The special tokens (<|end_of_instruction|>, <|end_of_data|>) are documented." 170 } 171 }, 172 "limitations_and_scope": { 173 "limitations_section_present": { 174 "applies": true, 175 "answer": true, 176 "justification": "Section VI contains a 'Limitations' paragraph with substantive discussion of three specific limitations: inference overhead, vulnerability to optimization-based attacks, and challenges with long user prompts." 
177 }, 178 "threats_to_validity_specific": { 179 "applies": true, 180 "answer": true, 181 "justification": "Specific threats discussed: DataFilter fails against optimization-based adaptive attacks (citing [12] which 'breaks our defense, as it breaches all existing defenses'), struggles with very long user prompts requiring developer extraction of short instructions, and adds an integration step." 182 }, 183 "scope_boundaries_stated": { 184 "applies": true, 185 "answer": true, 186 "justification": "The paper explicitly states it does not consider optimization-based attacks (Section II-D: 'We do not consider optimization-based attacks'), positions DataFilter as 'a practical defense in the short and medium term', and states in Section VI that no work has solved the problem of the strongest attacks." 187 } 188 }, 189 "data_integrity": { 190 "raw_data_available": { 191 "applies": true, 192 "answer": true, 193 "justification": "All evaluation benchmarks (SEP, InjecAgent, AgentDojo, AlpacaEval2) are public. The Alpaca training dataset is public. Code and model are released for independent verification." 194 }, 195 "data_collection_described": { 196 "applies": true, 197 "answer": true, 198 "justification": "Algorithm 1 fully specifies SFT dataset construction from Alpaca (N=19K samples with non-empty data part). Evaluation benchmarks are cited with descriptions (Section V-B)." 199 }, 200 "recruitment_methods_described": { 201 "applies": false, 202 "answer": false, 203 "justification": "No human participants. Data sources are standard public benchmarks and a public instruction-tuning dataset." 204 }, 205 "data_pipeline_documented": { 206 "applies": true, 207 "answer": true, 208 "justification": "Algorithm 1 documents the full pipeline from Alpaca samples to SFT triples, including truncation, injection simulation, and position randomization with exact percentages at each step." 
209 } 210 }, 211 "conflicts_of_interest": { 212 "funding_disclosed": { 213 "applies": true, 214 "answer": true, 215 "justification": "Acknowledgments section lists: 'KACST-UC Berkeley Center of Excellence for Secure Computing, the NSF ACTION center through NSF grant 2229876, and by generous gifts from Google, Meta, and Noyce foundation.'" 216 }, 217 "affiliations_disclosed": { 218 "applies": true, 219 "answer": true, 220 "justification": "Author affiliations are clearly listed: UC Berkeley (Wang, Chen, Wagner) and KACST (Alkhudair, Alomair). They are not evaluating products from their own organizations." 221 }, 222 "funder_independent_of_outcome": { 223 "applies": true, 224 "answer": false, 225 "justification": "Google and Meta both fund this work and are major LLM providers. Both companies have commercial interest in prompt injection defenses (Google Bard and Meta's Llama are mentioned as targets). The paper evaluates Llama-3.1-8B-Instruct (Meta's model) as both filter and backend, creating a potential alignment between funder interest and positive results." 226 }, 227 "financial_interests_declared": { 228 "applies": true, 229 "answer": false, 230 "justification": "No competing interests or financial disclosure statement is present in the paper." 231 } 232 }, 233 "contamination": { 234 "training_cutoff_stated": { 235 "applies": false, 236 "answer": false, 237 "justification": "The paper tests a defense mechanism, not model knowledge on benchmarks. The benchmarks measure defense effectiveness (ASR, utility under defense), not whether the model has memorized benchmark answers." 238 }, 239 "train_test_overlap_discussed": { 240 "applies": false, 241 "answer": false, 242 "justification": "The paper tests defenses rather than model knowledge. Contamination of the backend LLM with benchmark data is not the focus — the defense's filtering behavior is what's evaluated." 
243 }, 244 "benchmark_contamination_addressed": { 245 "applies": false, 246 "answer": false, 247 "justification": "Same rationale: the paper evaluates a defense mechanism, not a pre-trained model's capability on benchmarks." 248 } 249 }, 250 "human_studies": { 251 "pre_registered": { 252 "applies": false, 253 "answer": false, 254 "justification": "No human participants in this study." 255 }, 256 "irb_or_ethics_approval": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human participants in this study." 260 }, 261 "demographics_reported": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants in this study." 265 }, 266 "inclusion_exclusion_criteria": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants in this study." 270 }, 271 "randomization_described": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants in this study." 275 }, 276 "blinding_described": { 277 "applies": false, 278 "answer": false, 279 "justification": "No human participants in this study." 280 }, 281 "attrition_reported": { 282 "applies": false, 283 "answer": false, 284 "justification": "No human participants in this study." 285 } 286 }, 287 "cost_and_practicality": { 288 "inference_cost_reported": { 289 "applies": true, 290 "answer": true, 291 "justification": "Table IX reports per-sample cost overhead (<$0.0005) and latency overhead (<0.60s) for DataFilter with both GPT-5.1 and GPT-4o backends, computed using OpenRouter pricing." 292 }, 293 "compute_budget_stated": { 294 "applies": true, 295 "answer": false, 296 "justification": "Training hardware is stated (two 80GB GPUs, A100/H100) and training steps (300) but total training time, GPU hours, or total training cost are not explicitly reported." 
297 } 298 }, 299 "experimental_rigor": { 300 "seed_sensitivity_reported": { 301 "applies": true, 302 "answer": false, 303 "justification": "The paper acknowledges non-determinism of gpt-4o (Section V-E: 'the backend LLM (gpt-4o) is non-deterministic despite setting the sampling temperature to 0') but does not report results across multiple seeds or runs." 304 }, 305 "number_of_runs_stated": { 306 "applies": true, 307 "answer": false, 308 "justification": "The number of experimental runs is never stated. Results appear to be from single runs without explicit mention of repetition." 309 }, 310 "hyperparameter_search_budget": { 311 "applies": true, 312 "answer": false, 313 "justification": "Training hyperparameters are reported but no hyperparameter search budget is described. The truncation ratios (65%/10%/10%/15%) and injection position ratios (20%/20%/60%) are presented as 'heuristically' chosen without justification." 314 }, 315 "best_config_selection_justified": { 316 "applies": true, 317 "answer": false, 318 "justification": "No justification for how the final configuration was selected. Heuristic choices (truncation percentages, position distributions, 300 training steps) are stated without explaining why these values were chosen over alternatives." 319 }, 320 "multiple_comparison_correction": { 321 "applies": true, 322 "answer": false, 323 "justification": "The paper makes numerous comparisons across 7 defenses, 6+ attack types, 4 benchmarks, and 2 backend LLMs without any statistical tests, let alone corrections for multiple comparisons." 324 }, 325 "self_comparison_bias_addressed": { 326 "applies": true, 327 "answer": false, 328 "justification": "The authors evaluate their own DataFilter against baselines including their own re-implementation of PromptArmor (whose code was not released). No acknowledgment of self-evaluation bias, particularly regarding the re-implementation of concurrent work." 
329 }, 330 "compute_budget_vs_performance": { 331 "applies": true, 332 "answer": false, 333 "justification": "While Table IX reports DataFilter's overhead, there is no comparison of defense effectiveness as a function of compute budget. PromptArmor uses GPT-4.1 (much more expensive) vs DataFilter's Llama-3.1-8B, but this cost-effectiveness comparison is not made." 334 }, 335 "benchmark_construct_validity": { 336 "applies": true, 337 "answer": false, 338 "justification": "No discussion of whether the benchmarks (SEP, InjecAgent, AgentDojo) actually measure real-world prompt injection defense effectiveness. No analysis of construct validity or comparison with real-world attack scenarios beyond benchmark settings." 339 }, 340 "scaffold_confound_addressed": { 341 "applies": true, 342 "answer": true, 343 "justification": "DataFilter is applied as a preprocessing step before the same backend LLM and agent scaffold, so the scaffolding is identical across all defense comparisons. The confound is controlled by design." 344 } 345 }, 346 "data_leakage": { 347 "temporal_leakage_addressed": { 348 "applies": true, 349 "answer": false, 350 "justification": "No discussion of temporal leakage. The filter is trained on Alpaca (2023) and tested on benchmarks released around the same time or later, but this is not discussed." 351 }, 352 "feature_leakage_addressed": { 353 "applies": true, 354 "answer": false, 355 "justification": "No discussion of whether the evaluation setup leaks information about attack patterns through features available to the filter that wouldn't be available in real deployment." 356 }, 357 "non_independence_addressed": { 358 "applies": true, 359 "answer": false, 360 "justification": "No discussion of potential overlap or structural similarity between Alpaca training data and the evaluation benchmarks (particularly AlpacaEval2, which is derived from a related dataset)." 
361 }, 362 "leakage_detection_method": { 363 "applies": true, 364 "answer": false, 365 "justification": "No leakage detection or prevention method is used or discussed." 366 } 367 } 368 }, 369 "claims": [ 370 { 371 "claim": "DataFilter reduces prompt injection ASR to near zero (~2.2% average) while maintaining utility within 1-2 percentage points of the undefended model.", 372 "evidence": "Tables II-IV show ASR reductions across SEP, InjecAgent, and AgentDojo. Tables V-VI show utility preservation on AgentDojo (79.4% vs 81.4% benign) and AlpacaEval2 (54.1% vs 54.0% for GPT-4o).", 373 "supported": "strong" 374 }, 375 { 376 "claim": "DataFilter achieves a better security-utility tradeoff than all tested model-agnostic defenses.", 377 "evidence": "Figure 2 and comparative tables show DataFilter outperforms PromptArmor on both security (2.2% vs 5.9% average ASR) and utility (1.0% vs 4.1% average drop). Other baselines are substantially worse on security.", 378 "supported": "moderate" 379 }, 380 { 381 "claim": "DataFilter generalizes from basic training attacks (Straightforward, Ignore, Completion) to unseen attack types and agentic domains.", 382 "evidence": "Trained on Alpaca with 3 attack types but evaluated on 6 attack types including Context and Multi-Turn-Completion (Table IV), and on agentic benchmarks AgentDojo and InjecAgent (Tables II-III) with consistently low ASR.", 383 "supported": "strong" 384 }, 385 { 386 "claim": "DataFilter is robust to adaptive human-designed attacks targeting the filter itself.", 387 "evidence": "Table VII shows ASR ≤1% on SEP and 0% on AgentDojo under adaptive attacks that attempt to deceive or suppress the filter. 
Appendix B provides the attack prompts used.", 388 "supported": "moderate" 389 }, 390 { 391 "claim": "DataFilter introduces marginal computational overhead (less than 4% cost, less than 18% latency).", 392 "evidence": "Table IX shows +3.7% cost and +4.0% latency with GPT-5.1 backend; +1.0% cost and +17.5% latency with GPT-4o backend.", 393 "supported": "strong" 394 }, 395 { 396 "claim": "DataFilter remains the most effective defense under strong adaptive LLM-based attacks, though still substantially breakable at 83% ASR.", 397 "evidence": "Table VIII shows DataFilter achieves 83% ASR vs 93% for PromptArmor under the strongest available attack from [12]. Authors note this attack 'breaks our defense, as it breaches all existing defenses.'", 398 "supported": "moderate" 399 } 400 ], 401 "methodology_tags": [ 402 "benchmark-eval" 403 ], 404 "key_findings": "DataFilter, a model-agnostic defense that fine-tunes Llama-3.1-8B-Instruct to strip prompt injections from untrusted data before it reaches the backend LLM, reduces average attack success rates from over 40% to ~2.2% across SEP, InjecAgent, and AgentDojo benchmarks while maintaining utility within 1-2 percentage points. It outperforms all tested model-agnostic baselines including PromptArmor (GPT-4.1-based) on both security and utility. However, strong adaptive LLM-based attacks still achieve 83% ASR against DataFilter, and all results lack uncertainty quantification (no error bars, no multi-run variance).", 405 "red_flags": [ 406 { 407 "flag": "No error bars or uncertainty quantification", 408 "detail": "All results across all tables are single point estimates with no confidence intervals, standard deviations, or multi-run variance. The authors acknowledge gpt-4o non-determinism but never quantify it. Results could be within noise of each other." 
409 }, 410 { 411 "flag": "Self-implementation of concurrent baseline", 412 "detail": "PromptArmor (a key competitor) was re-implemented by the authors since no code was released. The authors modified the detection prompt ('we refined the detection prompt'), introducing potential bias in the comparison. The authors acknowledge PromptArmor's effectiveness 'is heavily dependent on ChatGPT's prior exposure' but their re-implementation choices are not validated." 413 }, 414 { 415 "flag": "Unjustified heuristic design choices", 416 "detail": "Multiple training design decisions use specific percentages described as 'heuristically' chosen (65%/10%/10%/15% truncation ratios, 20%/20%/60% injection positions) with no ablation, sensitivity analysis, or justification for these values." 417 }, 418 { 419 "flag": "No ablation study", 420 "detail": "DataFilter incorporates four specific training goals (benign preservation, anti-hallucination, anti-repetition, position randomization) but provides no ablation study to determine which components are necessary or how much each contributes to the final performance." 421 }, 422 { 423 "flag": "Strong adaptive attack substantially breaks the defense", 424 "detail": "Table VIII shows 83% ASR under the strongest adaptive LLM-based attack [12], yet the paper's framing emphasizes 'near zero' ASR from weaker attacks. The title and abstract do not mention this significant vulnerability." 425 } 426 ], 427 "cited_papers": [ 428 { 429 "title": "Not what you've signed up for: Compromising real-world llm-integrated applications with indirect prompt injection", 430 "authors": [ 431 "K. Greshake", 432 "S. Abdelnabi", 433 "S. Mishra", 434 "C. Endres", 435 "T. Holz", 436 "M. Fritz" 437 ], 438 "year": 2023, 439 "relevance": "Foundational work on indirect prompt injection attacks against LLM-integrated applications." 
440 }, 441 { 442 "title": "The attacker moves second: Stronger adaptive attacks bypass defenses against llm jailbreaks and prompt injections", 443 "authors": [ 444 "M. Nasr", 445 "N. Carlini", 446 "C. Sitawarin" 447 ], 448 "year": 2025, 449 "arxiv_id": "2510.09023", 450 "relevance": "Demonstrates that strong adaptive attacks can break all existing prompt injection defenses including DataFilter (83% ASR)." 451 }, 452 { 453 "title": "Meta SecAlign: A Secure Foundation LLM Against Prompt Injection Attacks", 454 "authors": [ 455 "S. Chen", 456 "A. Zharmagambetov", 457 "D. Wagner", 458 "C. Guo" 459 ], 460 "year": 2025, 461 "arxiv_id": "2507.02735", 462 "relevance": "Fine-tuning defense for prompt injection that inspired DataFilter's training approach and demonstrated cross-domain generalization." 463 }, 464 { 465 "title": "StruQ: Defending against prompt injection with structured queries", 466 "authors": [ 467 "S. Chen", 468 "J. Piet", 469 "C. Sitawarin", 470 "D. Wagner" 471 ], 472 "year": 2025, 473 "relevance": "System-level defense that structures LLM queries to separate prompts from data, requiring model weight access." 474 }, 475 { 476 "title": "InjecAgent: Benchmarking indirect prompt injections in tool-integrated large language model agents", 477 "authors": [ 478 "Q. Zhan", 479 "Z. Liang", 480 "Z. Ying", 481 "D. Kang" 482 ], 483 "year": 2024, 484 "relevance": "Benchmark for evaluating prompt injection attacks in agentic tool-calling scenarios, used as primary evaluation benchmark." 485 }, 486 { 487 "title": "Agentdojo: A dynamic environment to evaluate attacks and defenses for llm agents", 488 "authors": [ 489 "E. Debenedetti", 490 "J. Zhang", 491 "M. Balunović", 492 "L. Beurer-Kellner", 493 "M. Fischer", 494 "F. Tramèr" 495 ], 496 "year": 2024, 497 "relevance": "Multi-tool agent benchmark for prompt injection with both security and utility evaluation, used as primary evaluation benchmark." 498 }, 499 { 500 "title": "Can llms separate instructions from data? 
and what do we even mean by that?", 501 "authors": [ 502 "E. Zverev", 503 "S. Abdelnabi", 504 "M. Fritz", 505 "C. H. Lampert" 506 ], 507 "year": 2025, 508 "relevance": "SEP benchmark providing controlled measurement of instruction-data separation in LLMs, used as evaluation benchmark." 509 }, 510 { 511 "title": "PromptArmor: Simple yet effective prompt injection defenses", 512 "authors": [ 513 "T. Shi", 514 "K. Zhu", 515 "Z. Wang" 516 ], 517 "year": 2025, 518 "arxiv_id": "2507.15219", 519 "relevance": "Concurrent defense using LLM-based detection and fuzzy string matching for injection removal; primary competitor to DataFilter." 520 }, 521 { 522 "title": "DataSentinel: A game-theoretic detection of prompt injection attacks", 523 "authors": [ 524 "Y. Liu", 525 "Y. Jia", 526 "J. Jia", 527 "D. Song", 528 "N. Z. Gong" 529 ], 530 "year": 2025, 531 "relevance": "Game-theoretic prompt injection detector using deliberately vulnerable LLM design, evaluated as baseline defense." 532 }, 533 { 534 "title": "Defeating prompt injections by design", 535 "authors": [ 536 "E. Debenedetti", 537 "I. Shumailov", 538 "T. Fan", 539 "J. Hayes", 540 "N. Carlini" 541 ], 542 "year": 2025, 543 "arxiv_id": "2503.18813", 544 "relevance": "System-level defense providing security-by-design against prompt injection through pipeline redesign." 545 }, 546 { 547 "title": "The Instruction Hierarchy: Training LLMs to Prioritize Privileged Instructions", 548 "authors": [ 549 "E. Wallace", 550 "K. Xiao", 551 "R. Leike", 552 "L. Weng", 553 "J. Heidecke", 554 "A. Beutel" 555 ], 556 "year": 2024, 557 "arxiv_id": "2404.13208", 558 "relevance": "Model-level defense training LLMs to prioritize system instructions over injected instructions in data." 559 }, 560 { 561 "title": "Defending against indirect prompt injection attacks with spotlighting", 562 "authors": [ 563 "K. Hines", 564 "G. Lopez", 565 "M. Hall", 566 "F. Zarfati", 567 "Y. Zunger", 568 "E. 
Kiciman" 569 ], 570 "year": 2024, 571 "arxiv_id": "2403.14720", 572 "relevance": "Prompt-based defense using delimiting to mark untrusted data, evaluated as baseline." 573 } 574 ], 575 "engagement_factors": { 576 "practical_relevance": { 577 "score": 2, 578 "justification": "Released model and code for a plug-and-play prompt injection filter that can protect any backend LLM without modification, directly applicable to production agent systems." 579 }, 580 "surprise_contrarian": { 581 "score": 1, 582 "justification": "The main claim of near-zero ASR is undermined by the buried finding that adaptive LLM-based attacks still achieve 83% ASR, but the paper doesn't frame this tension as its headline." 583 }, 584 "fear_safety": { 585 "score": 2, 586 "justification": "Prompt injection defense is the core theme, with concrete demonstrations of real-world attacks against Google Bard, Slack AI, Claude Computer Use, and OpenAI Operator." 587 }, 588 "drama_conflict": { 589 "score": 1, 590 "justification": "Mildly challenges PromptArmor (a concurrent competitor they re-implemented themselves) and implicitly questions model providers for not shipping robust models, but no major controversy." 591 }, 592 "demo_ability": { 593 "score": 2, 594 "justification": "Model weights and reproduction code are released, requiring a Llama-3.1-8B setup but providing clear benchmarks to reproduce." 595 }, 596 "brand_recognition": { 597 "score": 1, 598 "justification": "UC Berkeley is well-recognized in security research but the authors and the tool itself are not household names in the broader tech community." 599 } 600 } 601 }