scan.json (33411B)
1 { 2 "paper": { 3 "title": "Defending Against Prompt Injection With a Few DefensiveTokens", 4 "authors": [ 5 "Sizhe Chen", 6 "Yizhu Wang", 7 "Nicholas Carlini", 8 "Chawin Sitawarin", 9 "David Wagner" 10 ], 11 "year": 2025, 12 "venue": "AISec@CCS", 13 "arxiv_id": "2507.07974", 14 "doi": "10.1145/3733799.3762982" 15 }, 16 "scan_version": 3, 17 "active_modules": [ 18 "experimental_rigor", 19 "data_leakage" 20 ], 21 "methodology_tags": [ 22 "benchmark-eval" 23 ], 24 "key_findings": "DefensiveToken introduces optimized special token embeddings prepended to LLM input as a test-time prompt injection defense. On the largest benchmark (TaskTracker, 31K samples), it reduces attack success rate to 0.24% averaged across four 7B/8B models, comparable to training-time defenses (0.20–0.51%) and far below other test-time defenses (>11%). The defense adds only 5 tokens (~20K parameters) with minimal utility loss, and can be toggled on/off at deployment time. Against adaptive optimization-based GCG attacks, ASR drops from 95.2% to 48.8%, outperforming all other test-time defenses.", 25 "checklist": { 26 "artifacts": { 27 "code_released": { 28 "applies": true, 29 "answer": true, 30 "justification": "The abstract states 'The code is available here' (present tense with hyperlink in PDF). The paper also references using code from [5] (Meta SecAlign)." 31 }, 32 "data_released": { 33 "applies": true, 34 "answer": true, 35 "justification": "All datasets used are publicly available: Cleaned Alpaca (training), AlpacaFarm, SEP, TaskTracker, CyberSecEval2, and InjecAgent (evaluation). Sources are cited with references." 36 }, 37 "environment_specified": { 38 "applies": true, 39 "answer": false, 40 "justification": "The paper mentions 'four NVIDIA Tesla A100s (80GB) with PyTorch FSDP' and 'peft library' but provides no version numbers for PyTorch, PEFT, transformers, or other dependencies. No requirements.txt or environment specification is provided." 
41 }, 42 "reproduction_instructions": { 43 "applies": true, 44 "answer": false, 45 "justification": "No step-by-step reproduction instructions are provided in the paper. Algorithm 1 gives a high-level pseudocode for DefensiveToken optimization, but there are no concrete commands or README-style instructions." 46 } 47 }, 48 "statistical_methodology": { 49 "confidence_intervals_or_error_bars": { 50 "applies": true, 51 "answer": false, 52 "justification": "Main results in Table 3 and Figure 2 report point estimates only with no confidence intervals or error bars. One ablation reports ±std from 5 runs (Section 4.5: 'WinRate 53.84 ± 0.56' and 'ASR 2.81 ± 1.09'), but this is an ablation, not part of the main results." 53 }, 54 "significance_tests": { 55 "applies": true, 56 "answer": false, 57 "justification": "No statistical significance tests are performed. Claims like 'DefensiveToken reduces ASR by an order of magnitude' and comparisons between methods are based solely on comparing point estimates." 58 }, 59 "effect_sizes_reported": { 60 "applies": true, 61 "answer": true, 62 "justification": "Effect sizes are reported with baseline context throughout. E.g., 'ASR of 0.24% (averaged across four models), which is comparable to training-time defenses (ASRs 0.20% to 0.51%) and significantly lower than three test-time alternatives (ASRs over 11.0%).' Table 3 provides full absolute numbers for all methods." 63 }, 64 "sample_size_justified": { 65 "applies": true, 66 "answer": false, 67 "justification": "No justification for why these particular benchmark sizes are sufficient. The paper uses existing benchmarks (55 to 31K samples) without discussing whether the sample sizes are adequate for the claimed conclusions." 68 }, 69 "variance_reported": { 70 "applies": true, 71 "answer": false, 72 "justification": "Main results in Table 3 are point estimates with no variance reported, and the number of runs behind them is not stated. 
The 5-run variance analysis in Section 4.5 is limited to one model (Llama3.1-8B-Instruct) on one benchmark (SEP), and is presented as an ablation, not as main results." 73 } 74 }, 75 "evaluation_design": { 76 "baselines_included": { 77 "applies": true, 78 "answer": true, 79 "justification": "Comprehensive baselines: three test-time defenses (Reminder, Sandwich, TextGrad) and three training-time defenses (StruQ-LoRA, StruQ-Full, SecAlign-LoRA), plus no-defense baseline. All compared in Table 3." 80 }, 81 "baselines_contemporary": { 82 "applies": true, 83 "answer": true, 84 "justification": "Baselines are contemporary: StruQ (2025), SecAlign (2025), TextGrad (2025), Reminder and Sandwich from recent prompt defense literature. The authors also introduce TextGrad as a new baseline using a 2025 framework." 85 }, 86 "ablation_study": { 87 "applies": true, 88 "answer": true, 89 "justification": "Extensive ablation studies in Section 4.5: number of tokens (Table 4), initialization strategy (Table 5), loss function (Table 6), position (Table 7), learning rate (Table 8), and multiple runs for reliability." 90 }, 91 "multiple_metrics": { 92 "applies": true, 93 "answer": true, 94 "justification": "Two primary metrics: Attack Success Rate (ASR, lower=better) for security and WinRate from AlpacaEval2 (higher=better) for utility. Both are reported for all methods." 95 }, 96 "human_evaluation": { 97 "applies": true, 98 "answer": false, 99 "justification": "All evaluation is automated. Security is assessed by LLM judge (gpt-4o) or string matching ('hacked' prefix). Utility is assessed by AlpacaEval2 LLM judge. No human evaluation of model outputs." 100 }, 101 "held_out_test_set": { 102 "applies": true, 103 "answer": true, 104 "justification": "Training uses Cleaned Alpaca (51K samples). Testing uses separate benchmarks: AlpacaFarm, SEP, TaskTracker, CyberSecEval2, InjecAgent. 
The paper explicitly notes 'This testset, AlpacaFarm, is different and in another domain from Cleaned Alpaca, the training dataset.'" 105 }, 106 "per_category_breakdown": { 107 "applies": true, 108 "answer": true, 109 "justification": "Table 3 provides full per-model (4 models) and per-benchmark (5 benchmarks) breakdowns for all methods. Results are not just averaged — individual model×benchmark numbers are shown." 110 }, 111 "failure_cases_discussed": { 112 "applies": true, 113 "answer": true, 114 "justification": "The paper discusses where DefensiveToken falls short: GCG attack ASR of 48.8% (still allows ~half of attacks), weaker performance on InjecAgent compared to training-time defenses, and Falcon3 needing more tokens (70% ASR with 1 token). It also reports a baseline failure: TextGrad causing harm on some models (97.1% ASR on Falcon3)." 115 }, 116 "negative_results_reported": { 117 "applies": true, 118 "answer": true, 119 "justification": "Multiple negative results: SecAlign loss hurts utility drastically (Table 6, WinRate 18.70 vs 28.44), end-position placement destroys utility (Table 7, WinRate drops to 5.08), small learning rate 0.01 provides no security (Table 8), and large learning rate 1.0 destabilizes training." 120 } 121 }, 122 "claims_and_evidence": { 123 "abstract_claims_supported": { 124 "applies": true, 125 "answer": true, 126 "justification": "The abstract claims 'test-time defense with prompt injection robustness comparable to training-time alternatives' and 'minimal utility drop.' Table 3 and Figures 2-3 support both: ASR on TaskTracker is 0.24% vs 0.20-0.51% for training-time, and WinRate drops are small." 127 }, 128 "causal_claims_justified": { 129 "applies": true, 130 "answer": true, 131 "justification": "Causal claims like 'DefensiveToken reduces ASR' are supported by controlled experiments: same models tested with and without DefensiveTokens, and ablation studies isolating individual design choices (loss function, position, initialization, number of tokens)." 
132 }, 133 "generalization_bounded": { 134 "applies": true, 135 "answer": false, 136 "justification": "The title 'Defending Against Prompt Injection With a Few DefensiveTokens' implies general applicability, but results are only on four 7B/8B open-source models. The paper discusses the approach as if it generalizes to any provider/model, e.g., 'the LLM provider (e.g., OpenAI) who optimizes DefensiveTokens,' without testing on larger models or commercial APIs." 137 }, 138 "alternative_explanations_discussed": { 139 "applies": true, 140 "answer": false, 141 "justification": "The paper hypothesizes why DefensiveToken works (large continuous optimization space, larger embedding magnitudes per Table 2) but does not discuss alternative explanations for the results or potential confounds. No consideration of whether improvements could be due to other factors." 142 }, 143 "proxy_outcome_distinction": { 144 "applies": true, 145 "answer": true, 146 "justification": "The paper measures ASR (attack success rate) and WinRate and claims security and utility respectively. ASR directly measures whether injections are followed, which is the security objective. WinRate measures response quality. Measurements match claims at the right granularity." 147 } 148 }, 149 "setup_transparency": { 150 "model_versions_specified": { 151 "applies": true, 152 "answer": true, 153 "justification": "Evaluated models are precisely specified: Llama3-8B-Instruct, Llama3.1-8B-Instruct, Falcon3-7B-Instruct, Qwen2.5-7B-Instruct. The AlpacaEval2 reference model is 'GPT-4 version turbo-2024-04-09.' The LLM judge is specified as 'gpt-4o' (lacks snapshot date but is not the primary evaluated system)." 154 }, 155 "prompts_provided": { 156 "applies": true, 157 "answer": true, 158 "justification": "Full prompt examples with actual content are provided for each benchmark evaluation (AlpacaFarm with all 3 attack variants + GCG, SEP, TaskTracker, CyberSecEval2). 
Baseline defense prompts are quoted verbatim (Reminder, Sandwich text). The [INST]/[DATA]/[RESP] format is fully specified." 159 }, 160 "hyperparameters_reported": { 161 "applies": true, 162 "answer": true, 163 "justification": "Hyperparameters are reported: learning rate 0.1, 5 defensive tokens, 1 epoch, Cleaned Alpaca with 51K samples, LoRA r=64, lora_alpha=8, lora_dropout=0.1, target_modules=['q_proj','v_proj'], learning rates 4×10⁻⁶ (full) and 1.6×10⁻⁴ (LoRA) for baselines, GCG details." 164 }, 165 "scaffolding_described": { 166 "applies": false, 167 "answer": false, 168 "justification": "No agentic scaffolding is used. DefensiveToken is a simple embedding prepend, not an agent. InjecAgent benchmark uses ReAct prompts, but these are from the benchmark itself, not the defense." 169 }, 170 "data_preprocessing_documented": { 171 "applies": true, 172 "answer": true, 173 "justification": "Data preprocessing is documented: defensive training set construction follows StruQ (half unchanged, half attacked with two injection variants in equal probabilities), self-labeling trick using undefended LLM to generate responses (Algorithm 1, Line 1). Benchmark-specific injection procedures are described per benchmark." 174 } 175 }, 176 "limitations_and_scope": { 177 "limitations_section_present": { 178 "applies": true, 179 "answer": false, 180 "justification": "No dedicated limitations section. The conclusion contains a paragraph with limitations (only defends prompt injection, not jailbreaks; utility unknown on more datasets), but this is embedded in the conclusion, not a separate substantive section." 181 }, 182 "threats_to_validity_specific": { 183 "applies": true, 184 "answer": true, 185 "justification": "Specific threats mentioned in the conclusion: (1) 'does not apply to other safety settings, e.g., preventing jailbreaks, system following attacks, and data extraction attacks,' (2) 'we do not know the utility on more labeled datasets.' These are specific to this study." 
186 }, 187 "scope_boundaries_stated": { 188 "applies": true, 189 "answer": true, 190 "justification": "Explicit scope boundaries: 'DefensiveToken only defends against prompt injections, where the user (instruction) is benign, and application-retrieved external data is malicious. DefensiveToken does not apply to other safety settings, e.g., preventing jailbreaks, system following attacks, and data extraction attacks, where the user is malicious.'" 191 } 192 }, 193 "data_integrity": { 194 "raw_data_available": { 195 "applies": true, 196 "answer": false, 197 "justification": "Benchmark input data is publicly available (AlpacaFarm, SEP, TaskTracker, etc.), but raw experimental outputs (model responses per sample, per-sample ASR judgments) are not released for independent verification." 198 }, 199 "data_collection_described": { 200 "applies": true, 201 "answer": true, 202 "justification": "Training data source is described (Cleaned Alpaca dataset, 51K samples, reference [34]). Each test benchmark is described with its provenance, size, and format (Section 4.1). The defensive dataset construction procedure is specified." 203 }, 204 "recruitment_methods_described": { 205 "applies": false, 206 "answer": false, 207 "justification": "No human participants. All data comes from standard publicly available benchmarks." 208 }, 209 "data_pipeline_documented": { 210 "applies": true, 211 "answer": true, 212 "justification": "Algorithm 1 documents the pipeline: (1) build defensive training set from self-labeled Cleaned Alpaca, (2) initialize random embeddings, (3) optimize with gradient descent. Evaluation pipeline is described per benchmark including injection placement and success criteria." 
213 } 214 }, 215 "conflicts_of_interest": { 216 "funding_disclosed": { 217 "applies": true, 218 "answer": true, 219 "justification": "Funding disclosed in Acknowledgments: 'Google-BAIR Commons (Year 6, project 03), National Science Foundation under grant 2229876 (the ACTION center), OpenAI, Open Philanthropy, Google, the Department of Homeland Security, and IBM.'" 220 }, 221 "affiliations_disclosed": { 222 "applies": true, 223 "answer": true, 224 "justification": "Author affiliations are disclosed: UC Berkeley (Chen, Wang, Wagner), Google DeepMind and Anthropic (Carlini), Google DeepMind (Sitawarin)." 225 }, 226 "funder_independent_of_outcome": { 227 "applies": true, 228 "answer": false, 229 "justification": "Google and OpenAI are both funders and major LLM providers who face prompt injection threats. They have commercial interest in effective prompt injection defenses. The paper frames the defense as something 'the LLM provider (e.g., OpenAI)' would release." 230 }, 231 "financial_interests_declared": { 232 "applies": true, 233 "answer": false, 234 "justification": "No competing interests statement. Nicholas Carlini has affiliations with both Google DeepMind and Anthropic (LLM providers), Chawin Sitawarin with Google DeepMind. No declaration of financial interests, patents, or equity related to the findings." 235 } 236 }, 237 "contamination": { 238 "training_cutoff_stated": { 239 "applies": false, 240 "answer": false, 241 "justification": "The paper tests a defense mechanism against prompt injection, not a pre-trained model's capability on a knowledge benchmark. The benchmarks assess whether injections are followed, not whether the model has memorized answers." 242 }, 243 "train_test_overlap_discussed": { 244 "applies": false, 245 "answer": false, 246 "justification": "Contamination in the traditional sense (model memorizing benchmark answers) is not relevant to this defense evaluation. The concern is injection robustness, not model knowledge." 
247 }, 248 "benchmark_contamination_addressed": { 249 "applies": false, 250 "answer": false, 251 "justification": "Standard benchmark contamination does not apply to prompt injection defense evaluation. The benchmarks test behavior (following/ignoring injections), not memorized knowledge." 252 } 253 }, 254 "human_studies": { 255 "pre_registered": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants in this study. All evaluation is automated on benchmark datasets." 259 }, 260 "irb_or_ethics_approval": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants." 264 }, 265 "demographics_reported": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants." 269 }, 270 "inclusion_exclusion_criteria": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants." 274 }, 275 "randomization_described": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants." 279 }, 280 "blinding_described": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants." 284 }, 285 "attrition_reported": { 286 "applies": false, 287 "answer": false, 288 "justification": "No human participants." 289 } 290 }, 291 "cost_and_practicality": { 292 "inference_cost_reported": { 293 "applies": true, 294 "answer": false, 295 "justification": "Inference cost or latency with DefensiveTokens is not reported. The paper notes the defense adds 'only 5 more tokens' but does not quantify the inference time overhead, tokens consumed, or per-example cost." 296 }, 297 "compute_budget_stated": { 298 "applies": true, 299 "answer": true, 300 "justification": "Training compute is stated: 'Our training requires four NVIDIA Tesla A100s (80GB) with PyTorch FSDP and takes one hour to complete.' TextGrad baseline optimization cost is also described (150 steps, batch size 8)." 
301 } 302 }, 303 "experimental_rigor": { 304 "seed_sensitivity_reported": { 305 "applies": true, 306 "answer": true, 307 "justification": "Section 4.5 reports explicit seed sensitivity: '5 runs when optimizing different 5 DefensiveTokens for Llama3.1-8B-Instruct' with results 'WinRate 53.84 ± 0.56' and 'ASR 2.81 ± 1.09' with all 5 individual values listed." 308 }, 309 "number_of_runs_stated": { 310 "applies": true, 311 "answer": false, 312 "justification": "The seed sensitivity ablation states 5 runs, but the main results in Table 3 do not state the number of experimental runs. It is unclear whether main results are from single or multiple runs." 313 }, 314 "hyperparameter_search_budget": { 315 "applies": true, 316 "answer": false, 317 "justification": "Ablation studies vary learning rate (0.01, 0.1, 1.0), number of tokens (1, 5, 20), and initialization strategy, but these are presented as ablations, not as a hyperparameter search. Total search budget is not reported." 318 }, 319 "best_config_selection_justified": { 320 "applies": true, 321 "answer": true, 322 "justification": "Configuration choices are justified through ablation tables: 5 tokens selected because 'sufficient for good security with minimal utility loss' (Table 4), learning rate 0.1 chosen as 'a good choice for security and utility' (Table 8), random initialization shown to outperform alternatives (Table 5)." 323 }, 324 "multiple_comparison_correction": { 325 "applies": false, 326 "answer": false, 327 "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable." 328 }, 329 "self_comparison_bias_addressed": { 330 "applies": true, 331 "answer": false, 332 "justification": "The authors implement their own version of TextGrad defense and re-implement training-time baselines without acknowledging the potential bias of evaluating their own system against their implementations of competing approaches." 
333 }, 334 "compute_budget_vs_performance": { 335 "applies": true, 336 "answer": false, 337 "justification": "The paper states 'Optimizing DefensiveTokens requires similar computation to the training-time defense' but does not systematically compare performance as a function of compute. No compute-matched comparisons are provided." 338 }, 339 "benchmark_construct_validity": { 340 "applies": true, 341 "answer": false, 342 "justification": "Five prompt injection benchmarks are used without questioning whether they accurately measure real-world prompt injection defense effectiveness. No discussion of construct validity, ecological validity, or how benchmark attacks compare to real-world injection attempts." 343 }, 344 "scaffold_confound_addressed": { 345 "applies": false, 346 "answer": false, 347 "justification": "No scaffolding confound. DefensiveToken is a simple embedding prepend. Model comparisons all use the same setup with or without DefensiveTokens." 348 } 349 }, 350 "data_leakage": { 351 "temporal_leakage_addressed": { 352 "applies": true, 353 "answer": true, 354 "justification": "The paper addresses train/test separation: 'This testset, AlpacaFarm, is different and in another domain from Cleaned Alpaca, the training dataset' and 'The user instructions and injections in evaluation have no overlap with those used in model training.'" 355 }, 356 "feature_leakage_addressed": { 357 "applies": true, 358 "answer": false, 359 "justification": "No discussion of whether the evaluation setup leaks information. For example, the GCG attack has white-box access to DefensiveToken embeddings — no discussion of whether this evaluation paradigm leaks information beyond what a real attacker would have." 360 }, 361 "non_independence_addressed": { 362 "applies": true, 363 "answer": false, 364 "justification": "No analysis of structural similarities between training data (Cleaned Alpaca) and test benchmarks. 
While different datasets are used, no analysis of distributional overlap or shared patterns." 365 }, 366 "leakage_detection_method": { 367 "applies": true, 368 "answer": false, 369 "justification": "No concrete leakage detection or prevention method is applied. The paper relies on using separate dataset names but does not verify non-overlap through any formal method (n-gram analysis, decontamination, etc.)." 370 } 371 } 372 }, 373 "claims": [ 374 { 375 "claim": "DefensiveToken achieves prompt injection robustness comparable to training-time defenses while being a test-time defense.", 376 "evidence": "Table 3 and Figure 2: On TaskTracker (31K samples), DefensiveToken achieves 0.24% average ASR vs 0.20-0.51% for training-time defenses (StruQ-LoRA, StruQ-Full, SecAlign-LoRA). Similar pattern on AlpacaFarm, SEP, and CyberSecEval2.", 377 "supported": "strong" 378 }, 379 { 380 "claim": "DefensiveToken has minimal utility loss compared to all baselines.", 381 "evidence": "Table 3 and Figure 3: WinRate drops are small (e.g., Llama3.1 from 29.07 to 28.53, Qwen2.5 from 32.69 to 34.16 [increase]). DefensiveToken is consistently closest to the 'ideal defense' point in Figure 3.", 382 "supported": "strong" 383 }, 384 { 385 "claim": "DefensiveToken reduces GCG attack success rate from 95.2% to 48.8% on average.", 386 "evidence": "Figure 2 (GCG subplot) and Table 3: Average across 4 models, GCG ASR drops from ~95% (no defense) to 37.5-73.6% per model, averaging 48.8%.", 387 "supported": "moderate" 388 }, 389 { 390 "claim": "DefensiveToken reduces InjecAgent ASR by 5 times compared to 2 times from the best test-time baseline.", 391 "evidence": "Table 3 InjecAgent column: Average ASR drops from ~26.6% (no defense) to ~5.2% (DefensiveToken). Sandwich defense achieves ~12.8%. 
The claimed reduction ratios are approximately correct.", 392 "supported": "moderate" 393 }, 394 { 395 "claim": "5 DefensiveTokens are sufficient for all tested models.", 396 "evidence": "Table 4: With 5 tokens, ASR drops to <5% for all 4 models. Llama3 and Llama3.1 reach <1% ASR with even 1 token. Falcon3 needs 5 tokens (4.81% ASR) and further benefits from 20 tokens (0.48%).", 397 "supported": "strong" 398 }, 399 { 400 "claim": "Optimized DefensiveToken embeddings have two orders of magnitude larger norms than vocabulary embeddings.", 401 "evidence": "Table 2: Average 1-norm of vocabulary token embeddings is 34, while DefensiveToken embeddings average 4332 (127× larger) for Llama-3.1-8B-Instruct.", 402 "supported": "strong" 403 }, 404 { 405 "claim": "Random initialization outperforms vocabulary-based initialization for DefensiveTokens.", 406 "evidence": "Table 5: Random init with 1 token achieves 0.48% ASR vs 7.7% for space-based init on Llama3.1-8B-Instruct. Random init also preserves utility better (WinRate 28.44 vs 27.49).", 407 "supported": "moderate" 408 } 409 ], 410 "red_flags": [ 411 { 412 "flag": "No error bars on main results", 413 "detail": "Table 3 (the main results table) reports point estimates for all 4 models × 5 benchmarks × 8 methods. Only one ablation study (Section 4.5) reports variance from 5 runs, and only for one model on one benchmark. The stability of the main claimed results is unverified." 414 }, 415 { 416 "flag": "Funder conflict of interest", 417 "detail": "Research is funded by Google and OpenAI (among others), both major LLM providers with commercial interest in prompt injection defenses. Authors include Google DeepMind and Anthropic researchers. The paper frames the approach as something LLM providers would deploy. No competing interests statement is provided." 
418 }, 419 { 420 "flag": "LLM judge without human validation", 421 "detail": "Security evaluation on 4 of 5 benchmarks relies on gpt-4o as the LLM judge to determine attack success. Utility is also judged by gpt-4o (AlpacaEval2). No human validation of the LLM judge's accuracy or agreement with human judgments on these specific tasks." 422 }, 423 { 424 "flag": "Limited model diversity", 425 "detail": "All four evaluated models are 7B/8B open-source instruction-tuned models. Despite framing the defense as suitable for 'the LLM provider (e.g., OpenAI),' no testing on larger models (70B+), different architectures, or commercial APIs. It is unknown whether results transfer to different model scales." 426 } 427 ], 428 "cited_papers": [ 429 { 430 "title": "StruQ: Defending against prompt injection with structured queries", 431 "authors": [ 432 "Sizhe Chen", 433 "Julien Piet", 434 "Chawin Sitawarin", 435 "David Wagner" 436 ], 437 "year": 2025, 438 "arxiv_id": "2402.06363", 439 "relevance": "Training-time prompt injection defense that DefensiveToken builds on; provides the loss function and dataset construction used by DefensiveToken." 440 }, 441 { 442 "title": "SecAlign: Defending Against Prompt Injection with Preference Optimization", 443 "authors": [ 444 "Sizhe Chen", 445 "Arman Zharmagambetov", 446 "Saeed Mahloujifar", 447 "Kamalika Chaudhuri", 448 "David Wagner", 449 "Chuan Guo" 450 ], 451 "year": 2025, 452 "arxiv_id": "2410.05451", 453 "relevance": "Training-time prompt injection defense using preference optimization; key baseline and comparison point." 
454 }, 455 { 456 "title": "The Instruction Hierarchy: Training LLMs to Prioritize Privileged Instructions", 457 "authors": [ 458 "Eric Wallace", 459 "Kai Xiao", 460 "Reimar Leike", 461 "Lilian Weng", 462 "Johannes Heidecke", 463 "Alex Beutel" 464 ], 465 "year": 2024, 466 "arxiv_id": "2404.13208", 467 "relevance": "Defines multi-layer instruction priority for prompt injection defense, implemented in GPT-4o and Gemini; training-time defense approach." 468 }, 469 { 470 "title": "Defeating prompt injections by design", 471 "authors": [ 472 "Edoardo Debenedetti", 473 "Ilia Shumailov", 474 "Tianqi Fan", 475 "Jamie Hayes", 476 "Nicholas Carlini" 477 ], 478 "year": 2025, 479 "arxiv_id": "2503.18813", 480 "relevance": "System-level prompt injection defense using system security principles; represents the system-design approach to defense." 481 }, 482 { 483 "title": "Get my drift? Catching LLM Task Drift with Activation Deltas", 484 "authors": [ 485 "Sahar Abdelnabi", 486 "Aideen Fay", 487 "Giovanni Cherubin", 488 "Ahmed Salem", 489 "Mario Fritz", 490 "Andrew Paverd" 491 ], 492 "year": 2025, 493 "arxiv_id": "2406.00799", 494 "relevance": "TaskTracker benchmark (31K samples) used as the largest evaluation benchmark; activation-based prompt injection detection." 495 }, 496 { 497 "title": "Instructional Segment Embedding: Improving LLM Safety with Instruction Hierarchy", 498 "authors": [ 499 "Tong Wu", 500 "Shujian Zhang", 501 "Kaiqiang Song" 502 ], 503 "year": 2025, 504 "arxiv_id": "2410.09102", 505 "relevance": "Training-time defense using instructional segment embeddings for prompt injection; related embedding-based defense approach." 506 }, 507 { 508 "title": "Can LLMs Separate Instructions From Data? 
And What Do We Even Mean By That?", 509 "authors": [ 510 "Egor Zverev", 511 "Sahar Abdelnabi", 512 "Mario Fritz", 513 "Christoph H Lampert" 514 ], 515 "year": 2025, 516 "relevance": "SEP benchmark (9.1K samples) for prompt injection evaluation; provides the utility-security evaluation framework used in this paper." 517 }, 518 { 519 "title": "InjecAgent: Benchmarking Indirect Prompt Injections in Tool-Integrated Large Language Model Agents", 520 "authors": [ 521 "Qiusi Zhan", 522 "Zhixiang Liang", 523 "Zifan Ying", 524 "Daniel Kang" 525 ], 526 "year": 2024, 527 "doi": "10.18653/v1/2024.findings-acl.624", 528 "relevance": "Agentic tool-calling prompt injection benchmark (1K samples); tests defense generalization to API-calling scenarios." 529 }, 530 { 531 "title": "Universal and Transferable Adversarial Attacks on Aligned Language Models", 532 "authors": [ 533 "Andy Zou", 534 "Zifan Wang", 535 "Nicholas Carlini", 536 "Milad Nasr", 537 "J. Zico Kolter", 538 "Matt Fredrikson" 539 ], 540 "year": 2023, 541 "arxiv_id": "2307.15043", 542 "relevance": "GCG attack used as the optimization-based adversarial attack for evaluating defense robustness." 543 }, 544 { 545 "title": "Cyberseceval 2: A wide-ranging cybersecurity evaluation suite for large language models", 546 "authors": [ 547 "Manish Bhatt", 548 "Sahana Chennabasappa" 549 ], 550 "year": 2024, 551 "arxiv_id": "2404.13161", 552 "relevance": "CyberSecEval2 prompt injection benchmark (55 test cases) used in evaluation." 553 }, 554 { 555 "title": "Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection", 556 "authors": [ 557 "Kai Greshake", 558 "Sahar Abdelnabi", 559 "Shailesh Mishra", 560 "Christoph Endres", 561 "Thorsten Holz", 562 "Mario Fritz" 563 ], 564 "year": 2023, 565 "doi": "10.1145/3605764.3623985", 566 "relevance": "Foundational work on indirect prompt injection attacks in real-world LLM applications." 
567 }, 568 { 569 "title": "Lessons from Defending Gemini Against Indirect Prompt Injections", 570 "authors": [ 571 "Chongyang Shi", 572 "Sharon Lin", 573 "Shuang Song" 574 ], 575 "year": 2025, 576 "arxiv_id": "2505.14534", 577 "relevance": "Industry-scale perspective on defending against prompt injection in production (Gemini); instruction hierarchy implementation." 578 } 579 ], 580 "engagement_factors": { 581 "practical_relevance": { 582 "score": 2, 583 "justification": "Offers a deployable, toggleable prompt injection defense that LLM providers and system developers can integrate with minimal infrastructure changes." 584 }, 585 "surprise_contrarian": { 586 "score": 1, 587 "justification": "The finding that ~20K optimized parameters match full fine-tuning defenses is mildly surprising, but the general approach (soft prompt tuning for security) is an incremental extension of known techniques." 588 }, 589 "fear_safety": { 590 "score": 2, 591 "justification": "Directly addresses the OWASP #1 LLM threat (prompt injection) with concrete attack/defense demonstrations, though it's a defense paper rather than a novel attack." 592 }, 593 "drama_conflict": { 594 "score": 1, 595 "justification": "Implicitly challenges the adequacy of popular prompting defenses (Reminder, Sandwich) by showing they barely reduce ASR, but doesn't target specific companies or create controversy." 596 }, 597 "demo_ability": { 598 "score": 1, 599 "justification": "Code is released but requires A100 GPUs for optimization and setup of multiple 7B/8B models, making casual reproduction difficult." 600 }, 601 "brand_recognition": { 602 "score": 2, 603 "justification": "Authors include Nicholas Carlini (Google DeepMind/Anthropic, prominent adversarial ML researcher) and the work is funded by Google and OpenAI, lending significant credibility." 604 } 605 } 606 }