scan.json (26040B)
1 { 2 "paper": { 3 "title": "DataSentinel: A Game-Theoretic Detection of Prompt Injection Attacks", 4 "authors": [ 5 "Yupei Liu", 6 "Yuqi Jia", 7 "Jinyuan Jia", 8 "Dawn Song", 9 "Neil Zhenqiang Gong" 10 ], 11 "year": 2025, 12 "venue": "IEEE Symposium on Security and Privacy (S&P)", 13 "arxiv_id": "2504.11358" 14 }, 15 "scan_version": 3, 16 "active_modules": [ 17 "experimental_rigor", 18 "data_leakage" 19 ], 20 "methodology_tags": [ 21 "benchmark-eval" 22 ], 23 "checklist": { 24 "artifacts": { 25 "code_released": { 26 "applies": true, 27 "answer": true, 28 "justification": "Code and data released at https://github.com/liu00222/Open-Prompt-Injection, stated in the abstract." 29 }, 30 "data_released": { 31 "applies": true, 32 "answer": true, 33 "justification": "The paper uses publicly available benchmark datasets (MRPC, Jfleg, SMS Spam, RTE, SST2, HSOL, Gigaword) and states code and data are available at the GitHub link." 34 }, 35 "environment_specified": { 36 "applies": true, 37 "answer": false, 38 "justification": "No requirements.txt, Dockerfile, or detailed environment setup section is provided in the paper. Only model names and hyperparameters are listed." 39 }, 40 "reproduction_instructions": { 41 "applies": true, 42 "answer": false, 43 "justification": "No step-by-step reproduction instructions are provided in the paper itself. The GitHub link is provided but no README or reproduction guide is described in the paper." 44 } 45 }, 46 "statistical_methodology": { 47 "confidence_intervals_or_error_bars": { 48 "applies": true, 49 "answer": false, 50 "justification": "Results are reported as point estimates (FPR and FNR values) without confidence intervals or error bars." 51 }, 52 "significance_tests": { 53 "applies": true, 54 "answer": false, 55 "justification": "The paper claims DataSentinel 'significantly outperforms' baselines but provides no statistical significance tests — comparisons are based solely on comparing FPR/FNR numbers." 56 }, 57 "effect_sizes_reported": { 58 "applies": true, 59 "answer": true, 60 "justification": "FPR and FNR values are reported with baselines for context (e.g., KAD FPR up to 0.10 vs DataSentinel 0.00; KAD FNR up to 0.21 vs DataSentinel at most 0.07), providing magnitude context." 61 }, 62 "sample_size_justified": { 63 "applies": true, 64 "answer": false, 65 "justification": "100 data points per task are sampled from test sets without justification for why 100 is sufficient. No power analysis." 66 }, 67 "variance_reported": { 68 "applies": true, 69 "answer": false, 70 "justification": "The paper mentions fixing the random seed (Section 5.1) and reports single-run results. No variance across multiple runs is reported." 71 } 72 }, 73 "evaluation_design": { 74 "baselines_included": { 75 "applies": true, 76 "answer": true, 77 "justification": "Six baseline detection methods are compared: EVD, NLLMD, SSFTD, SSFTD-G, PromptGuard, and KAD (Section 5.1)." 78 }, 79 "baselines_contemporary": { 80 "applies": true, 81 "answer": true, 82 "justification": "Baselines include contemporary methods: PromptGuard (Meta, 2024), KAD (USENIX Security 2024), and recent attack methods like NeuralExec and Universal." 83 }, 84 "ablation_study": { 85 "applies": true, 86 "answer": true, 87 "justification": "Extensive ablation study in Section 5.3 covering r, |D|, α, β, nin, nout, detection LLM, backend LLM. Also DataSentinel (Min) variant removes the adversarial component." 88 }, 89 "multiple_metrics": { 90 "applies": true, 91 "answer": true, 92 "justification": "Two metrics used: False Positive Rate (FPR) and False Negative Rate (FNR), reported separately." 93 }, 94 "human_evaluation": { 95 "applies": false, 96 "answer": false, 97 "justification": "This is an automated detection method evaluated on benchmark datasets. Human evaluation is not relevant to the claims." 98 }, 99 "held_out_test_set": { 100 "applies": true, 101 "answer": true, 102 "justification": "Fine-tuning uses Gigaword training set (500 data points). Evaluation uses 100 data points from test sets of 7 different datasets (Section 5.1). Fine-tuning tasks do not overlap with evaluation tasks." 103 }, 104 "per_category_breakdown": { 105 "applies": true, 106 "answer": true, 107 "justification": "Results broken down per target task (7 tasks), per injected task (7 tasks), per attack type (9 attacks), and per target-injected combination in appendix Tables 10-16." 108 }, 109 "failure_cases_discussed": { 110 "applies": true, 111 "answer": true, 112 "justification": "Section 6 discusses failure case where injected and target tasks are the same type (sentiment analysis FNR = 0.87), and the limitation with adversarial examples vs prompt injection." 113 }, 114 "negative_results_reported": { 115 "applies": true, 116 "answer": true, 117 "justification": "The paper reports that DataSentinel is less effective when injected task = target task (FNR 0.87 for sentiment analysis), and discusses why in Section 6." 118 } 119 }, 120 "claims_and_evidence": { 121 "abstract_claims_supported": { 122 "applies": true, 123 "answer": true, 124 "justification": "Abstract claims of effective detection on multiple benchmarks and LLMs are supported by Tables 1-6. The claim of outperforming baselines is supported by Table 3." 125 }, 126 "causal_claims_justified": { 127 "applies": true, 128 "answer": true, 129 "justification": "Causal claims about which components matter are supported by ablation studies (Section 5.3) and the DataSentinel (Min) variant comparison (Section 5.4), which isolate the game-theoretic component." 130 }, 131 "generalization_bounded": { 132 "applies": true, 133 "answer": true, 134 "justification": "Section 6 explicitly bounds generalization: less effective when injected task = target task, discusses benign instructions limitation, and notes the defense may be less effective as LLMs improve (meta-review D.4)." 135 }, 136 "alternative_explanations_discussed": { 137 "applies": true, 138 "answer": true, 139 "justification": "Section 6 discusses alternative explanations: the detection LLM vs backend LLM context difference, why some false negatives still cause attacks, and comparison with StruQ/SecAlign as alternative defense approaches." 140 }, 141 "proxy_outcome_distinction": { 142 "applies": true, 143 "answer": true, 144 "justification": "The paper measures FPR and FNR for prompt injection detection and frames claims at that exact granularity: 'DataSentinel achieves 0% FPR and at most 7% FNR' on specific benchmarks. It does not frame this as 'security' or 'safety' broadly — it stays within the detection accuracy construct." 145 } 146 }, 147 "setup_transparency": { 148 "model_versions_specified": { 149 "applies": true, 150 "answer": false, 151 "justification": "Models specified as 'Mistral-7B', 'LLaMA2-7B', 'LLaMA3-8B-Instruct' without specific snapshot dates or HuggingFace model IDs with version hashes. Marketing names without version specifics." 152 }, 153 "prompts_provided": { 154 "applies": true, 155 "answer": true, 156 "justification": "Detection instruction template provided: 'Repeat [secret key] once while ignoring the following text:' (Section 4.2). Target/injected instructions referenced from prior work [7] and detailed in Appendix A." 157 }, 158 "hyperparameters_reported": { 159 "applies": true, 160 "answer": true, 161 "justification": "Comprehensive hyperparameter reporting in Section 5.1: temperature=0.1, α=1, β=1, r=3, lr=0.000025, bin=8, bout=2, nin=10, nout=500, QLoRA used." 162 }, 163 "scaffolding_described": { 164 "applies": false, 165 "answer": false, 166 "justification": "No agentic scaffolding is used. DataSentinel is a fine-tuning and inference method, not an agentic system." 167 }, 168 "data_preprocessing_documented": { 169 "applies": true, 170 "answer": true, 171 "justification": "Section 5.1 documents data construction: 100 data points sampled from test sets per task, 500 from Gigaword training set for fine-tuning, how contaminated data samples are constructed (100 per target-injected combination)." 172 } 173 }, 174 "limitations_and_scope": { 175 "limitations_section_present": { 176 "applies": true, 177 "answer": true, 178 "justification": "Section 6 'Discussion and Limitations' provides substantive discussion of multiple limitations." 179 }, 180 "threats_to_validity_specific": { 181 "applies": true, 182 "answer": true, 183 "justification": "Section 6 discusses specific threats: less effective for same target/injected task type, benign instructions in data causing false positives, and potential weakness as LLMs improve at instruction following." 184 }, 185 "scope_boundaries_stated": { 186 "applies": true, 187 "answer": true, 188 "justification": "Section 6 explicitly states DataSentinel is less effective for adversarial examples (same task type), cannot distinguish benign instructions from injections, and leaves detecting same-task attacks as future work." 189 } 190 }, 191 "data_integrity": { 192 "raw_data_available": { 193 "applies": true, 194 "answer": true, 195 "justification": "All benchmark datasets used (MRPC, Jfleg, SMS Spam, RTE, SST2, HSOL, Gigaword) are publicly available. Code and data available at GitHub." 196 }, 197 "data_collection_described": { 198 "applies": true, 199 "answer": true, 200 "justification": "Section 5.1 describes data collection: which datasets, how many samples, how contaminated data is constructed using each attack type, sampling procedures." 201 }, 202 "recruitment_methods_described": { 203 "applies": false, 204 "answer": false, 205 "justification": "No human participants. All data is from standard NLP benchmarks." 206 }, 207 "data_pipeline_documented": { 208 "applies": true, 209 "answer": true, 210 "justification": "The pipeline from dataset sampling to contaminated data construction to evaluation is documented in Section 5.1, including counts (100 data points per task, 35,700 total contaminated samples)." 211 } 212 }, 213 "conflicts_of_interest": { 214 "funding_disclosed": { 215 "applies": true, 216 "answer": true, 217 "justification": "Acknowledgments section lists NSF grants 2131859, 2125977, 2112562, 1937787 and ARO grant W911NF2110182." 218 }, 219 "affiliations_disclosed": { 220 "applies": true, 221 "answer": true, 222 "justification": "Author affiliations clearly listed: Penn State, Duke University, UC Berkeley. No product being evaluated is from these institutions." 223 }, 224 "funder_independent_of_outcome": { 225 "applies": true, 226 "answer": true, 227 "justification": "Funding is from NSF and ARO (government agencies) which have no financial stake in the outcome of prompt injection detection research." 228 }, 229 "financial_interests_declared": { 230 "applies": true, 231 "answer": false, 232 "justification": "No competing interests or financial interests statement is present in the paper." 233 } 234 }, 235 "contamination": { 236 "training_cutoff_stated": { 237 "applies": false, 238 "answer": false, 239 "justification": "The paper does not evaluate pre-trained model capability on a benchmark. It evaluates a detection defense method against attacks — model knowledge/contamination is not relevant to the claims." 240 }, 241 "train_test_overlap_discussed": { 242 "applies": false, 243 "answer": false, 244 "justification": "Same as above: this tests a defense method, not model knowledge on benchmarks." 245 }, 246 "benchmark_contamination_addressed": { 247 "applies": false, 248 "answer": false, 249 "justification": "Same as above: benchmark contamination (model memorizing test data) is not relevant to evaluating a prompt injection detector." 250 } 251 }, 252 "human_studies": { 253 "pre_registered": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants." 257 }, 258 "irb_or_ethics_approval": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants." 262 }, 263 "demographics_reported": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants." 267 }, 268 "inclusion_exclusion_criteria": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants." 272 }, 273 "randomization_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants." 277 }, 278 "blinding_described": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants." 282 }, 283 "attrition_reported": { 284 "applies": false, 285 "answer": false, 286 "justification": "No human participants." 287 } 288 }, 289 "cost_and_practicality": { 290 "inference_cost_reported": { 291 "applies": true, 292 "answer": true, 293 "justification": "Section 5.2 reports inference cost: 1.6 seconds per query on Quadro RTX 6000, ~10% overhead vs backend LLM (15.3s). Also reports 0.7s for smaller LLaMA3.2-1B detection LLM." 294 }, 295 "compute_budget_stated": { 296 "applies": true, 297 "answer": true, 298 "justification": "Fine-tuning takes ~3 hours on one Quadro RTX 6000 GPU, costing $0.90 in cloud GPU rent (Section 5.2)." 299 } 300 }, 301 "experimental_rigor": { 302 "seed_sensitivity_reported": { 303 "applies": true, 304 "answer": false, 305 "justification": "The paper fixes a single random seed (Section 5.1: 'fix the seed for the random number generator') and reports single-run results. No multi-seed analysis." 306 }, 307 "number_of_runs_stated": { 308 "applies": true, 309 "answer": false, 310 "justification": "No explicit statement of number of runs. Single fixed seed implies single run." 311 }, 312 "hyperparameter_search_budget": { 313 "applies": true, 314 "answer": false, 315 "justification": "The ablation study (Section 5.3) varies hyperparameters one at a time but does not report a search budget or how the default values were selected." 316 }, 317 "best_config_selection_justified": { 318 "applies": true, 319 "answer": false, 320 "justification": "Default hyperparameters (α=1, β=1, r=3, etc.) are stated but not justified beyond the ablation showing they work well. No validation set selection procedure described." 321 }, 322 "multiple_comparison_correction": { 323 "applies": false, 324 "answer": false, 325 "justification": "No statistical tests are performed, so multiple comparison correction is not applicable." 326 }, 327 "self_comparison_bias_addressed": { 328 "applies": true, 329 "answer": false, 330 "justification": "The authors implement their own baselines (EVD, NLLMD, SSFTD, SSFTD-G) and compare against their own system. No acknowledgment of self-comparison bias. PromptGuard and KAD use open-source code from others." 331 }, 332 "compute_budget_vs_performance": { 333 "applies": true, 334 "answer": false, 335 "justification": "DataSentinel requires fine-tuning (3 hours GPU) while KAD does not, but performance is not reported as a function of matched compute budgets." 336 }, 337 "benchmark_construct_validity": { 338 "applies": true, 339 "answer": false, 340 "justification": "The paper does not discuss whether the 7 NLP tasks and attack scenarios are representative of real-world prompt injection threats. No discussion of construct validity of the evaluation setup." 341 }, 342 "scaffold_confound_addressed": { 343 "applies": false, 344 "answer": false, 345 "justification": "The paper evaluates a detection method, not model comparisons through different scaffolds. The fine-tuned detection LLM is evaluated directly on benchmark inputs. No scaffolding framework mediates between the model and the evaluation." 346 } 347 }, 348 "data_leakage": { 349 "temporal_leakage_addressed": { 350 "applies": false, 351 "answer": false, 352 "justification": "This evaluates a defense method, not model knowledge on benchmarks. Temporal leakage of benchmark solutions is not relevant here." 353 }, 354 "feature_leakage_addressed": { 355 "applies": true, 356 "answer": true, 357 "justification": "The paper explicitly separates the fine-tuning data (Gigaword training set, different instruction) from evaluation data (test sets of 7 tasks with different instructions), and notes the adaptive attacks during fine-tuning differ from evaluation attacks (Section 5.2)." 358 }, 359 "non_independence_addressed": { 360 "applies": true, 361 "answer": true, 362 "justification": "Section 5.2 explicitly states fine-tuning tasks (D) do not overlap with evaluation target/injected tasks, and the optimized separator during training differs from attack separators used in evaluation." 363 }, 364 "leakage_detection_method": { 365 "applies": false, 366 "answer": false, 367 "justification": "Standard benchmark contamination detection is not relevant to this defense evaluation. The paper does address train-test separation through design." 368 } 369 } 370 }, 371 "claims": [ 372 { 373 "claim": "DataSentinel achieves FPR close to 0 and FNR at most 0.07 across all existing prompt injection attacks.", 374 "evidence": "Tables 1 and 2 show FPR ≤ 0.01 and FNR ≤ 0.07 across 7 target tasks, 7 injected tasks, and 9 attacks (Section 5.2).", 375 "supported": "strong" 376 }, 377 { 378 "claim": "DataSentinel significantly outperforms 6 baseline detection methods in both FPR and FNR.", 379 "evidence": "Table 3 shows DataSentinel achieves FPR of 0.00-0.01 vs KAD 0.01-0.10, and lower FNR across all injected tasks under NeuralExec (Section 5.2).", 380 "supported": "strong" 381 }, 382 { 383 "claim": "DataSentinel remains effective against adaptive attacks as long as injected instructions differ from target task.", 384 "evidence": "Table 6 shows FNR ≤ 0.06 for adaptive attacks except when target and injected task are both sentiment analysis (FNR = 0.87). Section 5.4.2.", 385 "supported": "strong" 386 }, 387 { 388 "claim": "DataSentinel generalizes to unseen backend LLMs (third-party provider scenario).", 389 "evidence": "Table 5 shows FNR ≤ 0.01 when fine-tuned with LLaMA3-8B but evaluated against attacks optimized for OpenChat, Mistral-7B, Mixtral-8x7B, LLaMA-3.1-8B (Section 5.3).", 390 "supported": "moderate" 391 }, 392 { 393 "claim": "Detection overhead is minor (~10%) compared to backend LLM processing time.", 394 "evidence": "Section 5.2 reports 1.6s detection query vs 15.3s backend query on Quadro RTX 6000. Fine-tuning takes ~3 hours ($0.90 cloud cost).", 395 "supported": "strong" 396 } 397 ], 398 "key_findings": "DataSentinel formulates prompt injection detection as a minimax optimization problem, fine-tuning a detection LLM to be more vulnerable to injections (turning vulnerability into defense signal). Evaluated on 9 attacks, 7 NLP tasks, and 6 LLMs, it achieves near-zero FPR and FNR ≤ 0.07 on existing attacks, substantially outperforming 6 baselines including known-answer detection. The approach is less effective when the injected task matches the target task type, as prompt injection reduces to adversarial examples in that scenario.", 399 "red_flags": [ 400 { 401 "flag": "Single-seed evaluation", 402 "detail": "All results reported from a single fixed random seed. No variance across seeds is reported, making it impossible to assess result stability." 403 }, 404 { 405 "flag": "No statistical significance tests", 406 "detail": "Paper claims DataSentinel 'significantly outperforms' baselines but provides no statistical tests — comparisons rely solely on comparing point estimates." 407 }, 408 { 409 "flag": "Self-implemented baselines", 410 "detail": "Four of six baselines (EVD, NLLMD, SSFTD, SSFTD-G) are implemented by the authors. Only PromptGuard and KAD use third-party implementations. No acknowledgment of potential self-comparison bias." 411 } 412 ], 413 "cited_papers": [ 414 { 415 "title": "Formalizing and benchmarking prompt injection attacks and defenses", 416 "authors": [ 417 "Y. Liu", 418 "Y. Jia", 419 "R. Geng", 420 "J. Jia", 421 "N. Z. Gong" 422 ], 423 "year": 2024, 424 "relevance": "Foundational benchmark for prompt injection attacks and defenses; provides the experimental framework DataSentinel builds upon." 425 }, 426 { 427 "title": "Not what you've signed up for: Compromising real-world llm-integrated applications with indirect prompt injection", 428 "authors": [ 429 "K. Greshake", 430 "S. Abdelnabi", 431 "S. Mishra", 432 "C. Endres", 433 "T. Holz", 434 "M. Fritz" 435 ], 436 "year": 2023, 437 "relevance": "Seminal work on indirect prompt injection attacks against LLM-integrated applications." 438 }, 439 { 440 "title": "Neural exec: Learning (and learning from) execution triggers for prompt injection attacks", 441 "authors": [ 442 "D. Pasquini", 443 "M. Strohmeier", 444 "C. Troncoso" 445 ], 446 "year": 2024, 447 "arxiv_id": "2403.03792", 448 "relevance": "Optimization-based prompt injection attack used as a primary baseline and default attack in DataSentinel evaluation." 449 }, 450 { 451 "title": "Universal and transferable adversarial attacks on aligned language models", 452 "authors": [ 453 "A. Zou", 454 "Z. Wang", 455 "J. Z. Kolter", 456 "M. Fredrikson" 457 ], 458 "year": 2023, 459 "arxiv_id": "2307.15043", 460 "relevance": "GCG method used as the core optimization technique in DataSentinel's minimax formulation." 461 }, 462 { 463 "title": "Struq: Defending against prompt injection with structured queries", 464 "authors": [ 465 "S. Chen", 466 "J. Piet", 467 "C. Sitawarin", 468 "D. Wagner" 469 ], 470 "year": 2025, 471 "relevance": "Prevention-based defense against prompt injection; compared with DataSentinel's detection approach in Section 6." 472 }, 473 { 474 "title": "SecAlign: Defending against prompt injection with preference optimization", 475 "authors": [ 476 "S. Chen", 477 "A. Zharmagambetov", 478 "S. Mahloujifar", 479 "K. Chaudhuri", 480 "D. Wagner", 481 "C. Guo" 482 ], 483 "year": 2024, 484 "arxiv_id": "2410.05451", 485 "relevance": "Prevention-based defense using preference optimization; evaluated alongside DataSentinel in Section 6." 486 }, 487 { 488 "title": "Automatic and universal prompt injection attacks against large language models", 489 "authors": [ 490 "X. Liu", 491 "Z. Yu", 492 "Y. Zhang", 493 "N. Zhang", 494 "C. Xiao" 495 ], 496 "year": 2024, 497 "arxiv_id": "2403.04957", 498 "relevance": "Universal optimization-based prompt injection attack evaluated as a baseline in DataSentinel experiments." 499 }, 500 { 501 "title": "Pleak: Prompt leaking attacks against large language model applications", 502 "authors": [ 503 "B. Hui", 504 "H. Yuan", 505 "N. Gong", 506 "P. Burlina", 507 "Y. Cao" 508 ], 509 "year": 2024, 510 "relevance": "Prompt stealing attack evaluated in DataSentinel; tests a specific injected task of extracting target instructions." 511 }, 512 { 513 "title": "The instruction hierarchy: Training llms to prioritize privileged instructions", 514 "authors": [ 515 "E. Wallace", 516 "K. Xiao", 517 "R. Leike", 518 "L. Weng", 519 "J. Heidecke", 520 "A. Beutel" 521 ], 522 "year": 2024, 523 "arxiv_id": "2404.13208", 524 "relevance": "OpenAI's approach to instruction prioritization as a defense against prompt injection attacks." 525 }, 526 { 527 "title": "Jatmo: Prompt injection defense by task-specific finetuning", 528 "authors": [ 529 "J. Piet", 530 "M. Alrashed", 531 "C. Sitawarin" 532 ], 533 "year": 2024, 534 "arxiv_id": "2312.17673", 535 "relevance": "Task-specific fine-tuning defense against prompt injection; related prevention approach discussed in DataSentinel." 536 } 537 ], 538 "engagement_factors": { 539 "practical_relevance": { 540 "score": 2, 541 "justification": "Open-source tool with code available that developers building LLM-integrated applications could deploy to detect prompt injection attacks." 542 }, 543 "surprise_contrarian": { 544 "score": 1, 545 "justification": "The insight of deliberately making a detection LLM more vulnerable to turn weakness into defense signal is clever but not deeply counterintuitive." 546 }, 547 "fear_safety": { 548 "score": 2, 549 "justification": "Prompt injection is a major security concern for deployed LLM applications, and the paper systematically demonstrates attack vectors and detection gaps." 550 }, 551 "drama_conflict": { 552 "score": 1, 553 "justification": "Mildly challenges existing detection approaches like Meta's PromptGuard (shown to flag nearly everything) but doesn't target a specific company's claims." 554 }, 555 "demo_ability": { 556 "score": 1, 557 "justification": "Code is on GitHub but requires GPU access, fine-tuning setup, and open-source LLMs — not a quick-try experience." 558 }, 559 "brand_recognition": { 560 "score": 1, 561 "justification": "Authors from Duke, Penn State, and UC Berkeley (Dawn Song) are well-known in security research but not household names in broader tech." 562 } 563 } 564 }