scan-v5.json (26359B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Defending Against Prompt Injection with DataFilter", 6 "authors": ["Yizhu Wang", "Sizhe Chen", "Raghad F Alkhudair", "Basel Alomair", "David Wagner"], 7 "year": 2025, 8 "venue": "arXiv.org", 9 "arxiv_id": "2510.19207", 10 "doi": "10.48550/arXiv.2510.19207" 11 }, 12 "checklist": { 13 "claims_and_evidence": { 14 "abstract_claims_supported": { 15 "applies": true, 16 "answer": true, 17 "justification": "All abstract claims are supported: ASR reduction to near-zero is shown in Tables II–IV, utility preservation within 1–2% in Tables V–VI, and superiority over baselines in Figure 2.", 18 "source": "haiku" 19 }, 20 "causal_claims_justified": { 21 "applies": true, 22 "answer": true, 23 "justification": "Controlled experiments hold all variables constant except presence of DataFilter, making causal attribution appropriate for claims about its effect on ASR and utility.", 24 "source": "haiku" 25 }, 26 "generalization_bounded": { 27 "applies": true, 28 "answer": true, 29 "justification": "The limitations section explicitly states DataFilter cannot defend against optimization-based adaptive attacks (83% ASR) and struggles with very long user prompts, bounding the generalization claims.", 30 "source": "haiku" 31 }, 32 "alternative_explanations_discussed": { 33 "applies": true, 34 "answer": false, 35 "justification": "The paper does not discuss whether Llama-3.1-8B's inherent instruction-following strength rather than the filtering mechanism drives results, nor other confounds like benchmark difficulty differences.", 36 "source": "haiku" 37 }, 38 "proxy_outcome_distinction": { 39 "applies": true, 40 "answer": true, 41 "justification": "ASR (whether malicious API call occurs) and utility (task completion rate) are clearly defined and tied to specific claims; no conflation between what is measured and what is claimed.", 42 "source": "haiku" 43 } 44 }, 45 "limitations_and_scope": { 46 "limitations_section_present": { 47 "applies": true, 48 "answer": true, 49 "justification": "Section VI contains a dedicated 'Limitations' paragraph listing inference overhead, failure against optimization-based attacks, and difficulties with long user prompts.", 50 "source": "haiku" 51 }, 52 "threats_to_validity_specific": { 53 "applies": true, 54 "answer": true, 55 "justification": "Specific threats are named: strong adaptive LLM-based attacks break the defense (83% ASR), and DataFilter requires developers to extract short user instructions when the full prompt is very long.", 56 "source": "haiku" 57 }, 58 "scope_boundaries_stated": { 59 "applies": true, 60 "answer": true, 61 "justification": "The paper explicitly states DataFilter 'cannot defend against the strong optimization-based adaptive attacks' and 'may not yet match the absolute strongest protection possible with model-level defenses.'", 62 "source": "haiku" 63 } 64 }, 65 "conflicts_of_interest": { 66 "funding_disclosed": { 67 "applies": true, 68 "answer": true, 69 "justification": "Funding is disclosed: KACST-UC Berkeley Center of Excellence for Secure Computing, NSF grant 2229876, and gifts from Google, Meta, and Noyce Foundation.", 70 "source": "haiku" 71 }, 72 "affiliations_disclosed": { 73 "applies": true, 74 "answer": true, 75 "justification": "Author affiliations are clearly stated on the title page: UC Berkeley and KACST.", 76 "source": "haiku" 77 }, 78 "funder_independent_of_outcome": { 79 "applies": true, 80 "answer": false, 81 "justification": "Meta and Google are funders; Meta's PromptGuard is one of the baselines being outperformed, and DataFilter uses Meta's Llama-3.1-8B as its backbone model.", 82 "source": "haiku" 83 }, 84 "financial_interests_declared": { 85 "applies": true, 86 "answer": false, 87 "justification": "No competing interests statement or declaration of financial interests (patents, equity, consulting) is provided beyond the funding acknowledgment.", 88 "source": "haiku" 89 } 90 }, 91 "scope_and_framing": { 92 "key_terms_defined": { 93 "applies": true, 94 "answer": true, 95 "justification": "Prompt injection attack, attack success rate, utility, and model-agnostic are all explicitly defined in Sections II and IV, with attacker and defender goals formally stated.", 96 "source": "haiku" 97 }, 98 "intended_contribution_clear": { 99 "applies": true, 100 "answer": true, 101 "justification": "The paper clearly contributes DataFilter: a test-time, model-agnostic SFT-based defense that removes injected instructions from untrusted data before it reaches the backend LLM.", 102 "source": "haiku" 103 }, 104 "engagement_with_prior_work": { 105 "applies": true, 106 "answer": true, 107 "justification": "Section III provides extensive related work; Table I explicitly positions DataFilter against fine-tuning, prompting, detection, and system-level defenses, with concurrent work (PromptArmor, PromptLocate) distinguished.", 108 "source": "haiku" 109 } 110 } 111 }, 112 "type_checklist": { 113 "empirical": { 114 "artifacts": { 115 "code_released": { 116 "applies": true, 117 "answer": true, 118 "justification": "The abstract states 'Our DataFilter model is released here for immediate use, with the code to reproduce our results here,' indicating release of both model and code.", 119 "source": "haiku" 120 }, 121 "data_released": { 122 "applies": true, 123 "answer": true, 124 "justification": "All evaluation benchmarks are publicly available (SEP, InjecAgent, AgentDojo, AlpacaEval2), and training uses the public Alpaca dataset.", 125 "source": "haiku" 126 }, 127 "environment_specified": { 128 "applies": true, 129 "answer": false, 130 "justification": "Training hardware (A100/H100 GPUs) and key hyperparameters are stated, but no requirements.txt, Dockerfile, or explicit dependency specification is provided in the paper.", 131 "source": "haiku" 132 }, 133 "reproduction_instructions": { 134 "applies": true, 135 "answer": true, 136 "justification": "Algorithm 1 provides step-by-step SFT dataset construction, Section V-A describes all training parameters, and code is released; sufficient to reproduce without guessing.", 137 "source": "haiku" 138 } 139 }, 140 "statistical_methodology": { 141 "confidence_intervals_or_error_bars": { 142 "applies": true, 143 "answer": false, 144 "justification": "All results in Tables II–IX are single point estimates with no confidence intervals or error bars, despite the paper acknowledging GPT-4o is non-deterministic.", 145 "source": "haiku" 146 }, 147 "significance_tests": { 148 "applies": true, 149 "answer": false, 150 "justification": "No statistical significance tests are applied to any comparative claims despite making superiority claims over multiple baselines.", 151 "source": "haiku" 152 }, 153 "effect_sizes_reported": { 154 "applies": true, 155 "answer": true, 156 "justification": "Absolute percentage differences are reported (e.g., average ASR 2.2% vs 5.9% for PromptArmor; utility drop 1.0% vs 4.1%), providing practical effect size context.", 157 "source": "haiku" 158 }, 159 "sample_size_justified": { 160 "applies": true, 161 "answer": false, 162 "justification": "SEP is evaluated on a random 1K subset of 9.1K samples with no justification for the subset size or representativeness confirmation; no power analysis anywhere.", 163 "source": "haiku" 164 }, 165 "variance_reported": { 166 "applies": true, 167 "answer": false, 168 "justification": "No variance, standard deviation, or spread measures are reported across any experimental runs, despite acknowledged model non-determinism.", 169 "source": "haiku" 170 } 171 }, 172 "evaluation_design": { 173 "baselines_included": { 174 "applies": true, 175 "answer": true, 176 "justification": "Seven baselines are tested: PromptGuard, DataSentinel, Sandwich, Instructional, Spotlight, Tool Filter, and PromptArmor, spanning detection-based, prompt-based, and system-level approaches.", 177 "source": "haiku" 178 }, 179 "baselines_contemporary": { 180 "applies": true, 181 "answer": true, 182 "justification": "All baselines are from 2023–2025 publications and represent the current state of the art in model-agnostic prompt injection defense.", 183 "source": "haiku" 184 }, 185 "ablation_study": { 186 "applies": true, 187 "answer": false, 188 "justification": "Four training goals are described but their individual contributions are not systematically ablated; only a brief mention of training without user prompt context appears in the discussion.", 189 "source": "haiku" 190 }, 191 "multiple_metrics": { 192 "applies": true, 193 "answer": true, 194 "justification": "Multiple metrics are used: ASR, benign utility, utility under attack (AgentDojo), and length-controlled win rate (AlpacaEval2).", 195 "source": "haiku" 196 }, 197 "human_evaluation": { 198 "applies": false, 199 "answer": false, 200 "justification": "Human evaluation is not standard for prompt injection defense evaluation; utility is measured via GPT-4-based automatic evaluation (AlpacaEval2).", 201 "source": "haiku" 202 }, 203 "held_out_test_set": { 204 "applies": true, 205 "answer": true, 206 "justification": "DataFilter is trained on Alpaca and evaluated on entirely separate benchmarks (SEP, InjecAgent, AgentDojo, AlpacaEval2) not used in training.", 207 "source": "haiku" 208 }, 209 "per_category_breakdown": { 210 "applies": true, 211 "answer": true, 212 "justification": "Results are broken down by attack type (6 in SEP, 4 in AgentDojo, 2 in InjecAgent), backend model (gpt-4o vs Llama), and benchmark, providing granular breakdowns.", 213 "source": "haiku" 214 }, 215 "failure_cases_discussed": { 216 "applies": true, 217 "answer": true, 218 "justification": "Appendix C provides concrete false negative (billing document confusion) and false positive (cooking recipe instructions) examples with full input/output shown.", 219 "source": "haiku" 220 }, 221 "negative_results_reported": { 222 "applies": true, 223 "answer": true, 224 "justification": "DataFilter fails against strong LLM-based adaptive attacks (83% ASR); false positives on benign imperative content are documented; limitations with long prompts reported.", 225 "source": "haiku" 226 } 227 }, 228 "setup_transparency": { 229 "model_versions_specified": { 230 "applies": true, 231 "answer": true, 232 "justification": "Exact model versions are specified: gpt-4o-2024-05-13, meta-llama/Llama-Prompt-Guard-2-86M, Llama-3.1-8B-Instruct, and GPT-5.1/GPT-4.1 for relevant comparisons.", 233 "source": "haiku" 234 }, 235 "prompts_provided": { 236 "applies": true, 237 "answer": true, 238 "justification": "The full system prompt and user message template for DataFilter are shown verbatim in Section IV-C, including the filter instruction and special token formatting.", 239 "source": "haiku" 240 }, 241 "hyperparameters_reported": { 242 "applies": true, 243 "answer": true, 244 "justification": "All key hyperparameters reported: batch size 1, gradient accumulation 16, learning rate 2×10^-5, cosine schedule, 100 warmup steps, BF16 precision, 300 training steps.", 245 "source": "haiku" 246 }, 247 "scaffolding_described": { 248 "applies": true, 249 "answer": true, 250 "justification": "JSON parsing and recursive filtering for structured agentic data (Section IV-D) and the multi-turn agent setup in AgentDojo are described in sufficient detail.", 251 "source": "haiku" 252 }, 253 "data_preprocessing_documented": { 254 "applies": true, 255 "answer": true, 256 "justification": "Algorithm 1 documents exact preprocessing: truncation proportions (65%/10%/10%/15%), injection position distributions (20%/20%/60%), and attack type assignments.", 257 "source": "haiku" 258 } 259 }, 260 "data_integrity": { 261 "raw_data_available": { 262 "applies": true, 263 "answer": false, 264 "justification": "The constructed SFT training dataset is not explicitly released as a separate artifact; only the base Alpaca source and the trained model are released.", 265 "source": "haiku" 266 }, 267 "data_collection_described": { 268 "applies": true, 269 "answer": true, 270 "justification": "Algorithm 1 provides the complete data construction procedure from Alpaca samples to (prompt, data, output) triples with all design decisions and proportions documented.", 271 "source": "haiku" 272 }, 273 "recruitment_methods_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants; evaluation uses automated benchmarks requiring no recruitment.", 277 "source": "haiku" 278 }, 279 "data_pipeline_documented": { 280 "applies": true, 281 "answer": true, 282 "justification": "The full pipeline from Alpaca → SFT dataset construction (Algorithm 1) → fine-tuning → deployment is documented with specific parameters and design rationale for each step.", 283 "source": "haiku" 284 } 285 }, 286 "contamination": { 287 "training_cutoff_stated": { 288 "applies": true, 289 "answer": false, 290 "justification": "Llama-3.1-8B-Instruct's training data cutoff is not stated; it is possible the model's pretraining included examples similar to or identical to evaluation benchmarks.", 291 "source": "haiku" 292 }, 293 "train_test_overlap_discussed": { 294 "applies": true, 295 "answer": false, 296 "justification": "The paper does not discuss whether Llama-3.1-8B's pretraining data overlaps with the evaluation benchmarks (SEP, InjecAgent, AgentDojo), which could inflate filtering performance.", 297 "source": "haiku" 298 }, 299 "benchmark_contamination_addressed": { 300 "applies": true, 301 "answer": false, 302 "justification": "SEP and InjecAgent were published before Llama 3.1's likely training cutoff; potential contamination of the filter model's base knowledge is not discussed.", 303 "source": "haiku" 304 } 305 }, 306 "human_studies": { 307 "pre_registered": { 308 "applies": false, 309 "answer": false, 310 "justification": "No human participants.", 311 "source": "haiku" 312 }, 313 "irb_or_ethics_approval": { 314 "applies": false, 315 "answer": false, 316 "justification": "No human participants.", 317 "source": "haiku" 318 }, 319 "demographics_reported": { 320 "applies": false, 321 "answer": false, 322 "justification": "No human participants.", 323 "source": "haiku" 324 }, 325 "inclusion_exclusion_criteria": { 326 "applies": false, 327 "answer": false, 328 "justification": "No human participants.", 329 "source": "haiku" 330 }, 331 "randomization_described": { 332 "applies": false, 333 "answer": false, 334 "justification": "No human participants.", 335 "source": "haiku" 336 }, 337 "blinding_described": { 338 "applies": false, 339 "answer": false, 340 "justification": "No human participants.", 341 "source": "haiku" 342 }, 343 "attrition_reported": { 344 "applies": false, 345 "answer": false, 346 "justification": "No human participants.", 347 "source": "haiku" 348 } 349 }, 350 "cost_and_practicality": { 351 "inference_cost_reported": { 352 "applies": true, 353 "answer": true, 354 "justification": "Table IX reports per-sample monetary cost and wall-clock time for GPT-5.1 (+3.7% cost, +4.0% latency) and GPT-4o (+1.0% cost, +17.5% latency) with DataFilter.", 355 "source": "haiku" 356 }, 357 "compute_budget_stated": { 358 "applies": true, 359 "answer": false, 360 "justification": "Training hardware (two 80GB A100/H100 GPUs) and steps (300) are mentioned but total GPU-hours for training are not reported.", 361 "source": "haiku" 362 } 363 } 364 } 365 }, 366 "claims": [ 367 { 368 "claim": "DataFilter reduces average ASR from over 40% to approximately 2% across multiple benchmarks", 369 "evidence": "Tables II, III, IV show ASR reductions to max 1.2% on AgentDojo, ~2% on InjecAgent Base, and 1.5–3.4% on SEP for gpt-4o backend", 370 "supported": "strong" 371 }, 372 { 373 "claim": "DataFilter preserves utility within 1–2% of the undefended baseline", 374 "evidence": "Table V shows benign utility 79.4% vs 81.4% baseline on AgentDojo; Table VI shows 54.1% vs 54.0% on AlpacaEval2 for gpt-4o", 375 "supported": "strong" 376 }, 377 { 378 "claim": "DataFilter outperforms all tested model-agnostic baselines on security-utility tradeoff", 379 "evidence": "Figure 2 shows DataFilter closest to ideal defense; average ASR 2.2% vs PromptArmor 5.9%; average utility drop 1.0% vs 4.1% for PromptArmor", 380 "supported": "strong" 381 }, 382 { 383 "claim": "DataFilter trained on general instruction-tuning data generalizes to unseen agentic settings", 384 "evidence": "DataFilter trained on non-agentic Alpaca achieves low ASR on agentic benchmarks AgentDojo and InjecAgent involving multi-turn tool calls", 385 "supported": "moderate" 386 }, 387 { 388 "claim": "DataFilter is the first model-agnostic defense simultaneously achieving strong security and high utility", 389 "evidence": "Table I categorizes all prior defenses as lacking at least one of security, utility, or model-agnostic properties; DataFilter satisfies all three", 390 "supported": "moderate" 391 }, 392 { 393 "claim": "Strong optimization-based adaptive attacks break DataFilter with 83% ASR", 394 "evidence": "Table VIII shows DataFilter achieves 83% ASR under genetic algorithm-based LLM attack, though lowest among all tested defenses (93–100% for others)", 395 "supported": "strong" 396 } 397 ], 398 "methodology_tags": ["benchmark-eval"], 399 "key_findings": "DataFilter, a supervised fine-tuned Llama-3.1-8B model, reduces prompt injection ASR from >40% to ~2% across three benchmarks while maintaining utility within 2% of baseline, outperforming all tested model-agnostic defenses on the security-utility tradeoff. Training on general-purpose Alpaca data enables generalization to unseen agentic settings (AgentDojo, InjecAgent) without domain-specific adaptation. However, strong optimization-based adaptive attacks still achieve 83% ASR, and the defense struggles with very long user prompts requiring developer intervention. Marginal inference overhead (+1–4% cost, +4–18% latency) and plug-and-play deployment make it immediately practical for black-box commercial LLMs.", 400 "red_flags": [ 401 { 402 "flag": "No statistical testing", 403 "detail": "All comparative claims are made without confidence intervals, significance tests, or variance reporting, despite the paper acknowledging non-determinism in GPT-4o; results may not be reliable across runs." 404 }, 405 { 406 "flag": "Funder conflict with baseline", 407 "detail": "Meta and Google are funders; Meta's PromptGuard is a baseline being outperformed, and DataFilter uses Meta's Llama-3.1-8B as its backbone model." 408 }, 409 { 410 "flag": "PromptArmor reproduced by authors", 411 "detail": "Authors reproduced PromptArmor from scratch (no official code) and modified its detection prompt, which may not reflect the strongest possible PromptArmor configuration." 412 }, 413 { 414 "flag": "No ablation table", 415 "detail": "Four training goals (benign preservation, anti-hallucination, anti-repetition, position robustness) are described but their individual contributions are not systematically ablated in a table." 416 }, 417 { 418 "flag": "Contamination unaddressed", 419 "detail": "Llama-3.1-8B's training cutoff is not stated; evaluation benchmarks (SEP, InjecAgent) predate Llama 3.1 and may have been seen during pretraining, potentially inflating filtering performance." 420 }, 421 { 422 "flag": "SEP subsample without justification", 423 "detail": "Only 1K of 9.1K SEP samples are evaluated with no justification for subset size or confirmation that the subsample is representative." 424 } 425 ], 426 "cited_papers": [ 427 { 428 "title": "AgentDojo: A Dynamic Environment to Evaluate Attacks and Defenses for LLM Agents", 429 "relevance": "Primary evaluation benchmark for both security and utility of DataFilter in multi-turn agentic tool-calling settings" 430 }, 431 { 432 "title": "InjecAgent: Benchmarking Indirect Prompt Injections in Tool-Integrated Large Language Model Agents", 433 "relevance": "Secondary evaluation benchmark measuring indirect injection in API-calling scenarios with 1K samples" 434 }, 435 { 436 "title": "Can LLMs Separate Instructions from Data? And What Do We Even Mean by That?", 437 "relevance": "SEP benchmark used for instruction-following security evaluation across 6 attack types" 438 }, 439 { 440 "title": "Meta SecAlign: A Secure Foundation LLM Against Prompt Injection Attacks", 441 "relevance": "State-of-the-art fine-tuning defense, used as reference for training strategy design and as comparison for model-level vs model-agnostic tradeoffs" 442 }, 443 { 444 "title": "The Attacker Moves Second: Stronger Adaptive Attacks Bypass Defenses Against LLM Jailbreaks and Prompt Injections", 445 "relevance": "Strong adaptive attack that breaks DataFilter, establishing the ceiling on defense effectiveness against optimized adversaries" 446 }, 447 { 448 "title": "Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection", 449 "relevance": "Foundational work defining indirect prompt injection and motivating the threat landscape for LLM agents" 450 }, 451 { 452 "title": "StruQ: Defending Against Prompt Injection with Structured Queries", 453 "relevance": "Fine-tuning defense using structured query format, key prior work in model-level defenses that DataFilter is positioned against" 454 }, 455 { 456 "title": "DataSentinel: A Game-Theoretic Detection of Prompt Injection Attacks", 457 "relevance": "Detection-based baseline that DataFilter outperforms, demonstrating the detection-vs-filtering design space tradeoff" 458 }, 459 { 460 "title": "Defeating Prompt Injections by Design", 461 "relevance": "System-level defense providing security-by-design guarantees, representing the alternative architectural approach to DataFilter" 462 }, 463 { 464 "title": "AlpacaEval: An Automatic Evaluator of Instruction-following Models", 465 "relevance": "Utility evaluation benchmark used to measure instruction-following quality with and without DataFilter applied" 466 } 467 ], 468 "engagement_factors": { 469 "practical_relevance": { 470 "score": 3, 471 "justification": "DataFilter is released as a plug-and-play defense for any LLM system, directly addressing OWASP #1 LLM threat with marginal overhead and no backend model access required." 472 }, 473 "surprise_contrarian": { 474 "score": 2, 475 "justification": "Challenges the assumed security-utility tradeoff in model-agnostic defenses, showing it is possible to nearly eliminate injections without meaningful utility loss." 476 }, 477 "fear_safety": { 478 "score": 3, 479 "justification": "Directly addresses OWASP #1 LLM threat citing real attacks against Google Bard, Slack AI, Anthropic Claude Computer Use, and OpenAI Operator causing data leakage and malware execution." 480 }, 481 "drama_conflict": { 482 "score": 1, 483 "justification": "Mild security arms race framing with acknowledgment that strong adaptive attacks break the defense, but no major controversy or conflict angle." 484 }, 485 "demo_ability": { 486 "score": 3, 487 "justification": "Model and code are explicitly released for immediate use; practitioners can deploy DataFilter today on any LLM application without accessing backend model weights." 488 }, 489 "brand_recognition": { 490 "score": 2, 491 "justification": "UC Berkeley affiliation, Meta and Google funding, and evaluation on GPT-4o/GPT-5.1 add credibility; David Wagner is a well-known security researcher." 492 } 493 }, 494 "hn_data": { 495 "threads": [ 496 { 497 "hn_id": "42919597", 498 "title": "Efficient Reasoning with Hidden Thinking", 499 "points": 172, 500 "comments": 43, 501 "url": "https://news.ycombinator.com/item?id=42919597", 502 "created_at": "2025-02-03T16:06:48Z" 503 }, 504 { 505 "hn_id": "38355249", 506 "title": "Open Problems in DAOs", 507 "points": 3, 508 "comments": 0, 509 "url": "https://news.ycombinator.com/item?id=38355249", 510 "created_at": "2023-11-20T21:39:59Z" 511 }, 512 { 513 "hn_id": "46311266", 514 "title": "Tiny-TSM: Efficiently Training a Lightweight SOTA Time Series Foundation Model", 515 "points": 1, 516 "comments": 0, 517 "url": "https://news.ycombinator.com/item?id=46311266", 518 "created_at": "2025-12-18T11:07:07Z" 519 }, 520 { 521 "hn_id": "37939342", 522 "title": "Can Large Language Models Explain Themselves? A Study", 523 "points": 1, 524 "comments": 0, 525 "url": "https://news.ycombinator.com/item?id=37939342", 526 "created_at": "2023-10-19T06:41:38Z" 527 } 528 ], 529 "top_points": 172, 530 "total_points": 177, 531 "total_comments": 43 532 } 533 }