scan-v4.json (33964B)
1 { 2 "scan_version": 4, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Defending Against Prompt Injection with DataFilter", 6 "authors": [ 7 "Yizhu Wang", 8 "Sizhe Chen", 9 "Raghad F Alkhudair", 10 "Basel Alomair", 11 "David Wagner" 12 ], 13 "year": 2025, 14 "venue": "arXiv.org", 15 "arxiv_id": "2510.19207", 16 "doi": "10.48550/arXiv.2510.19207" 17 }, 18 "checklist": { 19 "claims_and_evidence": { 20 "abstract_claims_supported": { 21 "applies": true, 22 "answer": true, 23 "justification": "The abstract claims 'reduces the prompt injection attack success rates to near zero while maintaining the LLMs' utility.' Results show average ASR ~2.2% (Tables II-IV) and utility drop of ~1-2% (Tables V-VI), consistent with the claims.", 24 "source": "opus" 25 }, 26 "causal_claims_justified": { 27 "applies": true, 28 "answer": true, 29 "justification": "Claims that DataFilter 'reduces' ASR are supported by controlled comparisons: same benchmarks, same backend LLMs, with and without the defense applied. This controlled experimental design adequately supports the causal claims.", 30 "source": "opus" 31 }, 32 "generalization_bounded": { 33 "applies": true, 34 "answer": true, 35 "justification": "The paper tests on multiple benchmarks (instruction-following and agentic), two backend LLMs (GPT-4o and Llama-3.1-8B), and multiple attack types. The Limitations section explicitly bounds scope: 'cannot defend against the strong optimization-based adaptive attacks' and acknowledges issues with very long prompts.", 36 "source": "opus" 37 }, 38 "alternative_explanations_discussed": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper does not substantively discuss alternative explanations for DataFilter's effectiveness. There is no discussion of whether the specific Alpaca training distribution, the choice of Llama-3.1-8B as filter backbone, or the particular benchmark properties drive the results. 
The utility-security tradeoff discussion is about mechanism design, not confounds.", 42 "source": "opus" 43 }, 44 "proxy_outcome_distinction": { 45 "applies": true, 46 "answer": true, 47 "justification": "The paper directly measures ASR (whether malicious actions occur) and utility (task success/win rates). These are direct measurements of the claimed defense properties, not proxies.", 48 "source": "opus" 49 } 50 }, 51 "limitations_and_scope": { 52 "limitations_section_present": { 53 "applies": true, 54 "answer": true, 55 "justification": "Section VI contains a 'Limitations' paragraph with substantive discussion of three specific limitations: inference overhead, vulnerability to optimization-based attacks, and challenges with long user prompts.", 56 "source": "opus" 57 }, 58 "threats_to_validity_specific": { 59 "applies": true, 60 "answer": true, 61 "justification": "Specific threats discussed: DataFilter fails against optimization-based adaptive attacks (citing [12] which 'breaks our defense, as it breaches all existing defenses'), struggles with very long user prompts requiring developer extraction of short instructions, and adds an integration step.", 62 "source": "opus" 63 }, 64 "scope_boundaries_stated": { 65 "applies": true, 66 "answer": true, 67 "justification": "The paper explicitly states it does not consider optimization-based attacks (Section II-D: 'We do not consider optimization-based attacks'), positions DataFilter as 'a practical defense in the short and medium term', and states in Section VI that no work has solved the problem of strongest attacks.", 68 "source": "opus" 69 } 70 }, 71 "conflicts_of_interest": { 72 "funding_disclosed": { 73 "applies": true, 74 "answer": true, 75 "justification": "Acknowledgments section lists: 'KACST-UC Berkeley Center of Excellence for Secure Computing, the NSF ACTION center through NSF grant 2229876, and by generous gifts from Google, Meta, and Noyce foundation.'", 76 "source": "opus" 77 }, 78 
"affiliations_disclosed": { 79 "applies": true, 80 "answer": true, 81 "justification": "Author affiliations are clearly listed: UC Berkeley (Wang, Chen, Wagner) and KACST (Alkhudair, Alomair). They are not evaluating products from their own organizations.", 82 "source": "opus" 83 }, 84 "funder_independent_of_outcome": { 85 "applies": true, 86 "answer": false, 87 "justification": "Google and Meta both fund this work and are major LLM providers. Both companies have commercial interest in prompt injection defenses (Google Bard and Meta's Llama are mentioned as targets). The paper evaluates Llama-3.1-8B-Instruct (Meta's model) as both filter and backend, creating a potential alignment between funder interest and positive results.", 88 "source": "opus" 89 }, 90 "financial_interests_declared": { 91 "applies": true, 92 "answer": false, 93 "justification": "No competing interests or financial disclosure statement is present in the paper.", 94 "source": "opus" 95 } 96 }, 97 "scope_and_framing": { 98 "key_terms_defined": { 99 "applies": true, 100 "answer": true, 101 "justification": "Prompt injection, indirect prompt injection, attack success rate, model-agnostic, and the attacker/defender threat model are explicitly defined in Section II.", 102 "source": "haiku" 103 }, 104 "intended_contribution_clear": { 105 "applies": true, 106 "answer": true, 107 "justification": "The paper clearly states DataFilter is a test-time model-agnostic defense that removes injected instructions from data before they reach the backend LLM, explicitly positioned to fill the gap between system-level and model-level defenses.", 108 "source": "haiku" 109 }, 110 "engagement_with_prior_work": { 111 "applies": true, 112 "answer": true, 113 "justification": "Section III systematically reviews three categories of prior work and Section VI explicitly positions DataFilter relative to concurrent PromptArmor and PromptLocate, explaining design differences.", 114 "source": "haiku" 115 } 116 } 117 }, 118 
"type_checklist": { 119 "empirical": { 120 "artifacts": { 121 "code_released": { 122 "applies": true, 123 "answer": true, 124 "justification": "The abstract states 'Our DataFilter model is released here for immediate use, with the code to reproduce our results here' with hyperlinks provided in footnote 1.", 125 "source": "opus" 126 }, 127 "data_released": { 128 "applies": true, 129 "answer": true, 130 "justification": "The training data is the publicly available Alpaca dataset [59]. All evaluation benchmarks (SEP, InjecAgent, AgentDojo, AlpacaEval2) are public. The SFT dataset construction is fully specified by Algorithm 1.", 131 "source": "opus" 132 }, 133 "environment_specified": { 134 "applies": true, 135 "answer": false, 136 "justification": "The paper mentions 'two 80GB GPUs (A100/H100) using DeepSpeed ZeRO-3' and 'BF16 precision' but does not provide a requirements.txt, Dockerfile, or detailed library version listing sufficient to recreate the environment.", 137 "source": "opus" 138 }, 139 "reproduction_instructions": { 140 "applies": true, 141 "answer": true, 142 "justification": "The paper provides Algorithm 1 for SFT dataset construction, full training hyperparameters in Section V-A, and states the code to reproduce results is released. The prompt template is provided in full.", 143 "source": "opus" 144 } 145 }, 146 "statistical_methodology": { 147 "confidence_intervals_or_error_bars": { 148 "applies": true, 149 "answer": false, 150 "justification": "All results in Tables II-VIII are reported as single point estimates (e.g., '2.2% ASR') with no confidence intervals or error bars.", 151 "source": "opus" 152 }, 153 "significance_tests": { 154 "applies": true, 155 "answer": false, 156 "justification": "No statistical significance tests are performed. 
Claims like 'DataFilter outperforms PromptArmor' are based solely on comparing raw percentages without any statistical test.", 157 "source": "opus" 158 }, 159 "effect_sizes_reported": { 160 "applies": true, 161 "answer": true, 162 "justification": "The paper reports effect sizes with baseline context throughout, e.g., 'average ASR 2.2% vs 5.9%' for DataFilter vs PromptArmor, 'average utility drop of 1.0% vs 4.1%', and provides both absolute values and differences (Section V-E).", 163 "source": "opus" 164 }, 165 "sample_size_justified": { 166 "applies": true, 167 "answer": false, 168 "justification": "No justification for sample sizes. They note 'we randomly select 1K samples from SEP' for efficiency but provide no power analysis or justification for any benchmark sample sizes.", 169 "source": "opus" 170 }, 171 "variance_reported": { 172 "applies": true, 173 "answer": false, 174 "justification": "No variance or standard deviation reported. The paper acknowledges 'the backend LLM (gpt-4o) is non-deterministic despite setting the sampling temperature to 0, rendering inevitable variability' (Section V-E) but does not report multi-run variance.", 175 "source": "opus" 176 } 177 }, 178 "evaluation_design": { 179 "baselines_included": { 180 "applies": true, 181 "answer": true, 182 "justification": "Extensive baselines: PromptGuard, DataSentinel, Sandwich, Instructional, Spotlight, PromptArmor, and Tool Filter (Tables II-VI).", 183 "source": "opus" 184 }, 185 "baselines_contemporary": { 186 "applies": true, 187 "answer": true, 188 "justification": "Baselines include concurrent work PromptArmor (2025), DataSentinel (2025), PromptGuard (2024), and Tool Filter (2025). These represent the current state of the art for model-agnostic defenses.", 189 "source": "opus" 190 }, 191 "ablation_study": { 192 "applies": true, 193 "answer": false, 194 "justification": "No systematic ablation study. 
The discussion mentions a preliminary experiment where 'we trained a filter without providing the user's prompt as context' (Section VI), but there is no systematic ablation of the four training goals (anti-hallucination, EOS token, position randomization, etc.) that comprise DataFilter's design.", 195 "source": "opus" 196 }, 197 "multiple_metrics": { 198 "applies": true, 199 "answer": true, 200 "justification": "Security is measured by ASR across multiple benchmarks; utility is measured by task success rate (AgentDojo) and length-controlled WinRate (AlpacaEval2). Both dimensions are evaluated.", 201 "source": "opus" 202 }, 203 "human_evaluation": { 204 "applies": true, 205 "answer": false, 206 "justification": "No human evaluation is included. All security and utility assessments are automated. Human evaluation of filtered output quality or filter accuracy could have been informative.", 207 "source": "opus" 208 }, 209 "held_out_test_set": { 210 "applies": true, 211 "answer": true, 212 "justification": "The filter is trained on the Alpaca dataset and evaluated on entirely separate benchmarks (SEP, InjecAgent, AgentDojo, AlpacaEval2), providing clear train/test separation.", 213 "source": "opus" 214 }, 215 "per_category_breakdown": { 216 "applies": true, 217 "answer": true, 218 "justification": "Results are broken down by attack type (6 types on SEP in Table IV, 4 on AgentDojo in Table II, 2 on InjecAgent in Table III) and by benchmark.", 219 "source": "opus" 220 }, 221 "failure_cases_discussed": { 222 "applies": true, 223 "answer": true, 224 "justification": "Appendix C provides detailed false positive and false negative examples. 
The Limitations section discusses when DataFilter fails (optimization-based attacks, very long prompts, injections disguised as task-relevant instructions).", 225 "source": "opus" 226 }, 227 "negative_results_reported": { 228 "applies": true, 229 "answer": true, 230 "justification": "Table VIII shows that strong adaptive LLM-based attacks still achieve 83% ASR against DataFilter — the defense remains substantially breakable. The Limitations section acknowledges several failure modes. The preliminary experiment without user prompt context showed utility degradation.", 231 "source": "opus" 232 } 233 }, 234 "setup_transparency": { 235 "model_versions_specified": { 236 "applies": true, 237 "answer": true, 238 "justification": "Specifies 'gpt-4o-2024-05-13' (Section V-E), 'Llama-3.1-8B-Instruct' (filter model and backend), 'meta-llama/Llama-Prompt-Guard-2-86M' (baseline), and 'GPT-4.1' (PromptArmor detector). The snapshot date for gpt-4o is included.", 239 "source": "opus" 240 }, 241 "prompts_provided": { 242 "applies": true, 243 "answer": true, 244 "justification": "The full system prompt template for DataFilter is provided in Section IV-C (grey box). The PromptArmor reproduction prompt is also provided in Section V-C. Adaptive attack prompts are provided in Appendix B.", 245 "source": "opus" 246 }, 247 "hyperparameters_reported": { 248 "applies": true, 249 "answer": true, 250 "justification": "Section V-A reports: batch size 1, gradient accumulation 16, learning rate 2×10^-5, cosine schedule, 100 warmup steps, BF16 precision, 300 steps. Backend LLM temperature set to 0 (Section V-E).", 251 "source": "opus" 252 }, 253 "scaffolding_described": { 254 "applies": true, 255 "answer": true, 256 "justification": "The DataFilter deployment architecture is described in Figure 1 and Section IV-B. 
The JSON data handling strategy for agentic applications is detailed in Section IV-D (recursive filtering of keys/values).", 257 "source": "opus" 258 }, 259 "data_preprocessing_documented": { 260 "applies": true, 261 "answer": true, 262 "justification": "Algorithm 1 fully specifies the SFT dataset construction including truncation ratios, injection position distributions, and attack type simulation. The special tokens (<|end_of_instruction|>, <|end_of_data|>) are documented.", 263 "source": "opus" 264 } 265 }, 266 "data_integrity": { 267 "raw_data_available": { 268 "applies": true, 269 "answer": true, 270 "justification": "All evaluation benchmarks (SEP, InjecAgent, AgentDojo, AlpacaEval2) are public. The Alpaca training dataset is public. Code and model are released for independent verification.", 271 "source": "opus" 272 }, 273 "data_collection_described": { 274 "applies": true, 275 "answer": true, 276 "justification": "Algorithm 1 fully specifies SFT dataset construction from Alpaca (N=19K samples with non-empty data part). Evaluation benchmarks are cited with descriptions (Sections V-B).", 277 "source": "opus" 278 }, 279 "recruitment_methods_described": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants. Data sources are standard public benchmarks and a public instruction-tuning dataset.", 283 "source": "opus" 284 }, 285 "data_pipeline_documented": { 286 "applies": true, 287 "answer": true, 288 "justification": "Algorithm 1 documents the full pipeline from Alpaca samples to SFT triples, including truncation, injection simulation, and position randomization with exact percentages at each step.", 289 "source": "opus" 290 } 291 }, 292 "contamination": { 293 "training_cutoff_stated": { 294 "applies": false, 295 "answer": false, 296 "justification": "The paper tests a defense mechanism, not model knowledge on benchmarks. 
The benchmarks measure defense effectiveness (ASR, utility under defense), not whether the model has memorized benchmark answers.", 297 "source": "opus" 298 }, 299 "train_test_overlap_discussed": { 300 "applies": false, 301 "answer": false, 302 "justification": "The paper tests defenses rather than model knowledge. Contamination of the backend LLM with benchmark data is not the focus — the defense's filtering behavior is what's evaluated.", 303 "source": "opus" 304 }, 305 "benchmark_contamination_addressed": { 306 "applies": false, 307 "answer": false, 308 "justification": "Same rationale: the paper evaluates a defense mechanism, not a pre-trained model's capability on benchmarks.", 309 "source": "opus" 310 } 311 }, 312 "human_studies": { 313 "pre_registered": { 314 "applies": false, 315 "answer": false, 316 "justification": "No human participants in this study.", 317 "source": "opus" 318 }, 319 "irb_or_ethics_approval": { 320 "applies": false, 321 "answer": false, 322 "justification": "No human participants in this study.", 323 "source": "opus" 324 }, 325 "demographics_reported": { 326 "applies": false, 327 "answer": false, 328 "justification": "No human participants in this study.", 329 "source": "opus" 330 }, 331 "inclusion_exclusion_criteria": { 332 "applies": false, 333 "answer": false, 334 "justification": "No human participants in this study.", 335 "source": "opus" 336 }, 337 "randomization_described": { 338 "applies": false, 339 "answer": false, 340 "justification": "No human participants in this study.", 341 "source": "opus" 342 }, 343 "blinding_described": { 344 "applies": false, 345 "answer": false, 346 "justification": "No human participants in this study.", 347 "source": "opus" 348 }, 349 "attrition_reported": { 350 "applies": false, 351 "answer": false, 352 "justification": "No human participants in this study.", 353 "source": "opus" 354 } 355 }, 356 "cost_and_practicality": { 357 "inference_cost_reported": { 358 "applies": true, 359 "answer": true, 
360 "justification": "Table IX reports per-sample cost overhead (<$0.0005) and latency overhead (<0.60s) for DataFilter with both GPT-5.1 and GPT-4o backends, computed using OpenRouter pricing.", 361 "source": "opus" 362 }, 363 "compute_budget_stated": { 364 "applies": true, 365 "answer": false, 366 "justification": "Training hardware is stated (two 80GB GPUs, A100/H100) and training steps (300) but total training time, GPU hours, or total training cost are not explicitly reported.", 367 "source": "opus" 368 } 369 }, 370 "experimental_rigor": { 371 "seed_sensitivity_reported": { 372 "applies": true, 373 "answer": false, 374 "justification": "The paper acknowledges non-determinism of gpt-4o (Section V-E: 'the backend LLM (gpt-4o) is non-deterministic despite setting the sampling temperature to 0') but does not report results across multiple seeds or runs.", 375 "source": "opus" 376 }, 377 "number_of_runs_stated": { 378 "applies": true, 379 "answer": false, 380 "justification": "The number of experimental runs is never stated. Results appear to be from single runs without explicit mention of repetition.", 381 "source": "opus" 382 }, 383 "hyperparameter_search_budget": { 384 "applies": true, 385 "answer": false, 386 "justification": "Training hyperparameters are reported but no hyperparameter search budget is described. The truncation ratios (65%/10%/10%/15%) and injection position ratios (20%/20%/60%) are presented as 'heuristically' chosen without justification.", 387 "source": "opus" 388 }, 389 "best_config_selection_justified": { 390 "applies": true, 391 "answer": false, 392 "justification": "No justification for how the final configuration was selected. 
Heuristic choices (truncation percentages, position distributions, 300 training steps) are stated without explaining why these values were chosen over alternatives.", 393 "source": "opus" 394 }, 395 "multiple_comparison_correction": { 396 "applies": true, 397 "answer": false, 398 "justification": "The paper makes numerous comparisons across 7 defenses, 6+ attack types, 4 benchmarks, and 2 backend LLMs without any statistical tests, let alone corrections for multiple comparisons.", 399 "source": "opus" 400 }, 401 "self_comparison_bias_addressed": { 402 "applies": true, 403 "answer": false, 404 "justification": "The authors evaluate their own DataFilter against baselines including their own re-implementation of PromptArmor (whose code was not released). No acknowledgment of self-evaluation bias, particularly regarding the re-implementation of concurrent work.", 405 "source": "opus" 406 }, 407 "compute_budget_vs_performance": { 408 "applies": true, 409 "answer": false, 410 "justification": "While Table IX reports DataFilter's overhead, there is no comparison of defense effectiveness as a function of compute budget. PromptArmor uses GPT-4.1 (much more expensive) vs DataFilter's Llama-3.1-8B, but this cost-effectiveness comparison is not made.", 411 "source": "opus" 412 }, 413 "benchmark_construct_validity": { 414 "applies": true, 415 "answer": false, 416 "justification": "No discussion of whether the benchmarks (SEP, InjecAgent, AgentDojo) actually measure real-world prompt injection defense effectiveness. No analysis of construct validity or comparison with real-world attack scenarios beyond benchmark settings.", 417 "source": "opus" 418 }, 419 "scaffold_confound_addressed": { 420 "applies": true, 421 "answer": true, 422 "justification": "DataFilter is applied as a preprocessing step before the same backend LLM and agent scaffold, so the scaffolding is identical across all defense comparisons. 
The confound is controlled by design.", 423 "source": "opus" 424 } 425 }, 426 "data_leakage": { 427 "temporal_leakage_addressed": { 428 "applies": true, 429 "answer": false, 430 "justification": "No discussion of temporal leakage. The filter is trained on Alpaca (2023) and tested on benchmarks released around the same time or later, but this is not discussed.", 431 "source": "opus" 432 }, 433 "feature_leakage_addressed": { 434 "applies": true, 435 "answer": false, 436 "justification": "No discussion of whether the evaluation setup leaks information about attack patterns through features available to the filter that wouldn't be available in real deployment.", 437 "source": "opus" 438 }, 439 "non_independence_addressed": { 440 "applies": true, 441 "answer": false, 442 "justification": "No discussion of potential overlap or structural similarity between Alpaca training data and the evaluation benchmarks (particularly AlpacaEval2, which is derived from a related dataset).", 443 "source": "opus" 444 }, 445 "leakage_detection_method": { 446 "applies": true, 447 "answer": false, 448 "justification": "No leakage detection or prevention method is used or discussed.", 449 "source": "opus" 450 } 451 } 452 } 453 }, 454 "claims": [ 455 { 456 "claim": "DataFilter reduces average attack success rate from over 40% to approximately 2% across multiple benchmarks", 457 "evidence": "Tables II–IV show ASR reductions: AgentDojo average 0.4%, InjecAgent Base ~2%, SEP ≤4.6% across all six attack types and both gpt-4o and Llama backends", 458 "supported": "strong" 459 }, 460 { 461 "claim": "DataFilter maintains utility within approximately 1–2 percentage points of the undefended model", 462 "evidence": "Table V shows benign utility 79.4% vs 81.4% baseline on AgentDojo; Table VI shows negligible AlpacaEval2 difference (54.1% vs 54.0% for gpt-4o)", 463 "supported": "strong" 464 }, 465 { 466 "claim": "DataFilter achieves a better security-utility tradeoff than all tested model-agnostic 
defenses", 467 "evidence": "Figures 2 and 3 show DataFilter closest to the ideal zero-ASR/no-utility-drop corner; it outperforms PromptArmor on both average ASR (2.2% vs 5.9%) and utility drop (1.0% vs 4.1%)", 468 "supported": "strong" 469 }, 470 { 471 "claim": "DataFilter generalizes to attack patterns not seen during training, including the Context attack", 472 "evidence": "Table IV shows DataFilter achieves 2.2% ASR on the Context attack (gpt-4o) while baselines trained on similar patterns (DataSentinel) remain at 8.6%; filter was trained only on Straightforward/Ignore/Completion", 473 "supported": "strong" 474 }, 475 { 476 "claim": "DataFilter is model-agnostic and protects any black-box backend LLM without requiring access to model weights", 477 "evidence": "Demonstrated on proprietary gpt-4o and open Llama-3.1-8B-Instruct backends; only two backend models tested, limiting generalization strength", 478 "supported": "moderate" 479 }, 480 { 481 "claim": "Strong optimization-based adaptive attacks break DataFilter at 83% ASR, as they break all existing defenses", 482 "evidence": "Table VIII shows DataFilter achieves lowest ASR (83%) among all defenses under genetic-algorithm LLM-based attack, with the next-best (PromptArmor) at 93%", 483 "supported": "strong" 484 }, 485 { 486 "claim": "DataFilter introduces only marginal computational overhead (under 4% cost, under 18% latency increase)", 487 "evidence": "Table IX: +$0.0005/sample cost increase for GPT-5.1 (+3.7%), +0.57s latency for GPT-4o (+17.5%), computed from industry pricing statistics", 488 "supported": "strong" 489 } 490 ], 491 "methodology_tags": [ 492 "benchmark-eval" 493 ], 494 "key_findings": "DataFilter, a supervised fine-tuned Llama-3.1-8B-Instruct model, filters malicious prompt injections from untrusted data before it reaches a backend LLM, reducing average ASR from over 40% to approximately 2% across three security benchmarks while maintaining utility within 2% of the undefended baseline. 
It outperforms all tested model-agnostic defenses on the security-utility tradeoff and generalizes to unseen attack patterns including semantically sophisticated context-aware injections. However, strong optimization-based adaptive attacks using LLM-guided genetic algorithms achieve 83% ASR, indicating this defense class does not solve the fundamental prompt injection problem. Inference overhead is marginal (under $0.0005 and 0.60s per sample added).", 495 "red_flags": [ 496 { 497 "flag": "No statistical tests or CIs", 498 "detail": "All results are single-run point estimates with no confidence intervals, error bars, or significance tests, despite the paper explicitly acknowledging GPT-4o non-determinism introduces variability in results." 499 }, 500 { 501 "flag": "No formal ablation study", 502 "detail": "Key training design choices (65/10/10/15% truncation distribution, 20/20/60% injection position distribution, 4N total training samples) have no ablation; only one informal comparison without a table is mentioned." 503 }, 504 { 505 "flag": "Funder–model alignment undisclosed", 506 "detail": "Meta provides research gifts to the project and Meta's Llama-3.1-8B-Instruct is chosen as the DataFilter backbone; no competing interests statement addresses this alignment." 507 }, 508 { 509 "flag": "PromptArmor baseline is a reproduction, not official", 510 "detail": "Authors reproduced PromptArmor and modified its detection prompt because the original code was not released; modifications could affect the comparison favorably for DataFilter." 511 }, 512 { 513 "flag": "SEP subsample without power analysis", 514 "detail": "Only 1K of 9.1K SEP samples are evaluated 'for efficient evaluation' with no power analysis or evidence that this subsample is representative across attack types." 
515 }, 516 { 517 "flag": "Single-run results for non-deterministic model", 518 "detail": "GPT-4o results are single-run point estimates despite the paper acknowledging the model is non-deterministic even at temperature=0, making minor differences between defenses unreliable." 519 } 520 ], 521 "cited_papers": [ 522 { 523 "title": "AgentDojo: A Dynamic Environment to Evaluate Attacks and Defenses for LLM Agents", 524 "relevance": "Primary evaluation benchmark for security (ASR) and utility in multi-turn agentic tool-calling scenarios" 525 }, 526 { 527 "title": "InjecAgent: Benchmarking Indirect Prompt Injections in Tool-Integrated LLM Agents", 528 "relevance": "Core security benchmark measuring ASR in API-calling agent scenarios with embedded tool-output injections" 529 }, 530 { 531 "title": "Can LLMs Separate Instructions from Data? (SEP benchmark, ICLR 2025)", 532 "relevance": "Primary security benchmark testing six attack types on instruction-following models" 533 }, 534 { 535 "title": "Meta SecAlign: A Secure Foundation LLM Against Prompt Injection Attacks", 536 "relevance": "Key design inspiration; DataFilter adopts similar special token separator but extends to model-agnostic black-box settings" 537 }, 538 { 539 "title": "StruQ: Defending Against Prompt Injection with Structured Queries (USENIX Security 2025)", 540 "relevance": "Closely related model-level defense; DataFilter extends structured-query concepts to model-agnostic deployment" 541 }, 542 { 543 "title": "The Attacker Moves Second: Stronger Adaptive Attacks Bypass Defenses Against LLM Jailbreaks and Prompt Injections", 544 "relevance": "Provides the optimization-based adaptive attack that breaks all existing defenses including DataFilter (83% ASR in Table VIII)" 545 }, 546 { 547 "title": "DataSentinel: A Game-Theoretic Detection of Prompt Injection Attacks (IEEE S&P 2025)", 548 "relevance": "Key detection-based baseline compared in all benchmarks; represents the detection-without-filtering defense 
category" 549 }, 550 { 551 "title": "Defeating Prompt Injections by Design (Debenedetti et al., 2025)", 552 "relevance": "System-level defense representing the guaranteed-security-by-design approach with limited task coverage tradeoff" 553 }, 554 { 555 "title": "PromptArmor: Simple Yet Effective Prompt Injection Defenses", 556 "relevance": "Concurrent filtering defense used as the strongest model-agnostic baseline; DataFilter outperforms it on both security and utility metrics" 557 }, 558 { 559 "title": "Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection", 560 "relevance": "Foundational work establishing indirect prompt injection as a threat class; cited to motivate DataFilter's design" 561 } 562 ], 563 "engagement_factors": { 564 "practical_relevance": { 565 "score": 2, 566 "justification": "Released model and code for a plug-and-play prompt injection filter that can protect any backend LLM without modification, directly applicable to production agent systems." 567 }, 568 "surprise_contrarian": { 569 "score": 1, 570 "justification": "The main claim of near-zero ASR is undermined by the buried finding that adaptive LLM-based attacks still achieve 83% ASR, but the paper doesn't frame this tension as its headline." 571 }, 572 "fear_safety": { 573 "score": 2, 574 "justification": "Prompt injection defense is the core theme, with concrete demonstrations of real-world attacks against Google Bard, Slack AI, Claude Computer Use, and OpenAI Operator." 575 }, 576 "drama_conflict": { 577 "score": 1, 578 "justification": "Mildly challenges PromptArmor (a concurrent competitor they re-implemented themselves) and implicitly questions model providers for not shipping robust models, but no major controversy." 579 }, 580 "demo_ability": { 581 "score": 2, 582 "justification": "Model weights and reproduction code are released, requiring a Llama-3.1-8B setup but providing clear benchmarks to reproduce." 
583 }, 584 "brand_recognition": { 585 "score": 1, 586 "justification": "UC Berkeley is well-recognized in security research but the authors and the tool itself are not household names in the broader tech community." 587 } 588 }, 589 "hn_data": { 590 "threads": [ 591 { 592 "hn_id": "42919597", 593 "title": "Efficient Reasoning with Hidden Thinking", 594 "points": 172, 595 "comments": 43, 596 "url": "https://news.ycombinator.com/item?id=42919597", 597 "created_at": "2025-02-03T16:06:48Z" 598 }, 599 { 600 "hn_id": "38355249", 601 "title": "Open Problems in DAOs", 602 "points": 3, 603 "comments": 0, 604 "url": "https://news.ycombinator.com/item?id=38355249", 605 "created_at": "2023-11-20T21:39:59Z" 606 }, 607 { 608 "hn_id": "46311266", 609 "title": "Tiny-TSM: Efficiently Training a Lightweight SOTA Time Series Foundation Model", 610 "points": 1, 611 "comments": 0, 612 "url": "https://news.ycombinator.com/item?id=46311266", 613 "created_at": "2025-12-18T11:07:07Z" 614 }, 615 { 616 "hn_id": "37939342", 617 "title": "Can Large Language Models Explain Themselves? A Study", 618 "points": 1, 619 "comments": 0, 620 "url": "https://news.ycombinator.com/item?id=37939342", 621 "created_at": "2023-10-19T06:41:38Z" 622 } 623 ], 624 "top_points": 172, 625 "total_points": 177, 626 "total_comments": 43 627 } 628 }