scan.json (32576B)
1 { 2 "paper": { 3 "title": "PISanitizer: Preventing Prompt Injection to Long-Context LLMs via Prompt Sanitization", 4 "authors": [ 5 "Runpeng Geng", 6 "Yanting Wang", 7 "Chenlong Yin", 8 "Minhao Cheng", 9 "Ying Chen", 10 "Jinyuan Jia" 11 ], 12 "year": 2025, 13 "venue": "arXiv.org", 14 "arxiv_id": "2511.10720", 15 "doi": "10.48550/arXiv.2511.10720" 16 }, 17 "scan_version": 3, 18 "active_modules": ["experimental_rigor", "data_leakage"], 19 "methodology_tags": ["benchmark-eval"], 20 "key_findings": "PISanitizer defends against prompt injection in long-context LLMs by using an attention-based sanitization approach that intentionally lets an LLM follow any instructions in context, then removes high-attention tokens driving instruction-following behavior. Across 6 LongBench datasets, 7 LLMs (including GPT-5), and multiple attack types, PISanitizer reduces attack success rates to near 0 while maintaining utility, outperforming baselines like Meta-SecAlign. The design creates an inherent dilemma for attackers: the more effectively an injected instruction compels an LLM to follow it, the more likely it is to be sanitized.", 21 "checklist": { 22 "artifacts": { 23 "code_released": { 24 "applies": true, 25 "answer": true, 26 "justification": "The abstract states: 'The code is available at https://github.com/sleeepeer/PISanitizer.' A concrete repository URL is provided." 27 }, 28 "data_released": { 29 "applies": true, 30 "answer": true, 31 "justification": "All datasets used are publicly available benchmarks: LongBench, TaskTracker, AlpacaFarm, SQuAD-v2, Dolly, InjecAgent, and AgentDojo. Attack implementations reference open-source code from prior work." 32 }, 33 "environment_specified": { 34 "applies": true, 35 "answer": false, 36 "justification": "The paper mentions '96GB H100 GPU' for runtime and scipy.signal for implementation, but no requirements.txt, Dockerfile, or detailed environment specification is provided." 
37 }, 38 "reproduction_instructions": { 39 "applies": true, 40 "answer": false, 41 "justification": "While Algorithm 1 is provided and hyperparameters are detailed in Appendix B, the paper does not include step-by-step reproduction instructions, a README with commands, or scripts to replicate experiments." 42 } 43 }, 44 "statistical_methodology": { 45 "confidence_intervals_or_error_bars": { 46 "applies": true, 47 "answer": false, 48 "justification": "All results in Tables 1-14 are point estimates. No confidence intervals, error bars, or ± notation appears anywhere in the paper." 49 }, 50 "significance_tests": { 51 "applies": true, 52 "answer": false, 53 "justification": "The paper claims 'PISanitizer outperforms baselines' and 'significantly outperform state-of-the-art baselines' by comparing raw numbers in Table 4 without any statistical significance tests (no p-values, t-tests, etc.)." 54 }, 55 "effect_sizes_reported": { 56 "applies": true, 57 "answer": true, 58 "justification": "Tables report absolute values for both baseline and PISanitizer (e.g., ASR drops from 0.92 to 0.0, utility maintained at 0.59 vs 0.59 without attack), providing sufficient context for the magnitude of effects." 59 }, 60 "sample_size_justified": { 61 "applies": true, 62 "answer": false, 63 "justification": "They use 100 samples per dataset (selected from 200 in LongBench) with no justification for why 100 is sufficient and no power analysis." 64 }, 65 "variance_reported": { 66 "applies": true, 67 "answer": false, 68 "justification": "No standard deviations, variance across runs, or spread measures are reported. All results appear to be from single experimental runs." 
69 } 70 }, 71 "evaluation_design": { 72 "baselines_included": { 73 "applies": true, 74 "answer": true, 75 "justification": "Table 4 compares against six named baselines: Sandwich Prevention, Instructional Prevention, Meta-SecAlign, PromptArmor, PromptLocate, and DataFilter. Detection-based defenses (DataSentinel, PromptGuard, AttentionTracker) are evaluated separately in Table 5." 76 }, 77 "baselines_contemporary": { 78 "applies": true, 79 "answer": true, 80 "justification": "Baselines include very recent work: Meta-SecAlign (2025), DataSentinel (2025), DataFilter (2025), PromptLocate (2026), AttentionTracker (2025). These represent the current state of the art." 81 }, 82 "ablation_study": { 83 "applies": true, 84 "answer": true, 85 "justification": "Section 5.3 ablates: consecutive token joint consideration vs. individual thresholding (Table 7), different sanitization instructions (Table 8), noise-aware vs. average attention aggregation (Table 9), and hyperparameter sensitivity (Figure 3)." 86 }, 87 "multiple_metrics": { 88 "applies": true, 89 "answer": true, 90 "justification": "Evaluation uses both Utility (task-specific: F1, ROUGE-L, Edit Sim, exact match) and ASR. Table 14 additionally reports token-level precision, recall, and F1-score. Table 4 also reports runtime." 91 }, 92 "human_evaluation": { 93 "applies": true, 94 "answer": false, 95 "justification": "All evaluation is automated. ASR uses string matching or LLM-as-a-judge (Appendix D). Utility uses automated metrics from LongBench. No human evaluation of PISanitizer's outputs." 96 }, 97 "held_out_test_set": { 98 "applies": true, 99 "answer": false, 100 "justification": "The paper selects 100 samples per dataset for evaluation but does not describe any held-out test set separate from data used to tune hyperparameters (ws, d, θ). It is unclear whether the same data used for ablation was used for final reporting." 
101 }, 102 "per_category_breakdown": { 103 "applies": true, 104 "answer": true, 105 "justification": "Results are broken down by dataset (6 tasks), attack type (6 heuristic + GCG), backend LLM (7 models), injected task type (4 types), context length (Table 13), and token-level sanitization accuracy per dataset (Table 14)." 106 }, 107 "failure_cases_discussed": { 108 "applies": true, 109 "answer": true, 110 "justification": "Section 6.1 discusses failure with knowledge corruption attacks (non-instructional manipulation). Section 6.2 discusses failure with benign instructions in context (11/13 AgentDojo utility loss cases). Section 6.1 also discusses potential 'weak' attacks." 111 }, 112 "negative_results_reported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Section 6.2 reports utility loss on AgentDojo (0.82 → 0.71 without attacks). Section 6.1 reports PISanitizer cannot restore full utility when injected data (not just instructions) is present. Table 11 notes LCC utility doesn't fully recover." 116 } 117 }, 118 "claims_and_evidence": { 119 "abstract_claims_supported": { 120 "applies": true, 121 "answer": true, 122 "justification": "Abstract claims (ASR near 0, maintained utility, outperforms baselines, efficient at ~1.8s, robust to adaptive attacks) are all supported by Tables 1, 2, 4, 10, and associated text in Section 5." 123 }, 124 "causal_claims_justified": { 125 "applies": true, 126 "answer": true, 127 "justification": "Causal claims (sanitization reduces ASR) are supported by controlled ablation experiments in Section 5.3: removing consecutive token consideration (Table 7), changing sanitization instructions (Table 8), and comparing attention aggregation strategies (Table 9) isolate component contributions." 
128 }, 129 "generalization_bounded": { 130 "applies": true, 131 "answer": true, 132 "justification": "Section 6 explicitly states failure modes: knowledge corruption attacks (6.1), benign instructions (6.2), and potential weak attacks (6.1). The scope is bounded to instruction-based prompt injection. Title is appropriately scoped to 'Prompt Injection to Long-Context LLMs.'" 133 }, 134 "alternative_explanations_discussed": { 135 "applies": true, 136 "answer": false, 137 "justification": "The paper does not consider alternative explanations for PISanitizer's effectiveness. It does not discuss whether, e.g., the sanitization process itself (context modification) contributes to reduced ASR independent of the attention mechanism, or whether removal of any random tokens might partially achieve the same effect." 138 }, 139 "proxy_outcome_distinction": { 140 "applies": true, 141 "answer": true, 142 "justification": "Metrics directly measure what is claimed: ASR measures attack success (string match or LLM judge), Utility measures task performance (F1, ROUGE-L, etc.). No proxy gap exists between measurement and framing." 143 } 144 }, 145 "setup_transparency": { 146 "model_versions_specified": { 147 "applies": true, 148 "answer": false, 149 "justification": "Open-source models are specified (Llama-3.1-8B-Instruct, Llama-3.1-70B-Instruct, Qwen3-Omni-30B-A3B-Instruct) but closed-source models are listed only as 'GPT-4o', 'GPT-4o-mini', 'GPT-4.1', 'GPT-5' without snapshot dates or API versions." 150 }, 151 "prompts_provided": { 152 "applies": true, 153 "answer": true, 154 "justification": "Full sanitization instructions provided in Section 4.1 and Appendix C (4 variants). Injected task prompts in Appendix A. LLM-as-a-judge prompt in Appendix D. GCG suffix initialization in Appendix E." 
155 }, 156 "hyperparameters_reported": { 157 "applies": true, 158 "answer": true, 159 "justification": "Appendix B details: ws=9 (>500 tokens) or ws=5 (<500 tokens), d=10, θ=0.01, max 5 repetitions. GCG: 500 iterations, suffix of 50 tokens. Adaptive attack: β=5000. Peak filtering threshold 0.005." 160 }, 161 "scaffolding_described": { 162 "applies": false, 163 "answer": false, 164 "justification": "PISanitizer is a preprocessing/sanitization step, not agentic scaffolding. No agentic workflow, tool use, or multi-step scaffolding is involved in the defense mechanism itself." 165 }, 166 "data_preprocessing_documented": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 5.1 describes dataset construction: 100 random samples per dataset from LongBench's 200, random injection location for contaminated contexts. Appendix A describes target answer generation using GPT-4o with the exact prompt. Appendix E details suffix initialization for GCG attacks." 170 } 171 }, 172 "limitations_and_scope": { 173 "limitations_section_present": { 174 "applies": true, 175 "answer": true, 176 "justification": "Section 6 'Discussion and Limitation' contains substantive discussion in two subsections: 6.1 'Core Motivations of PISanitizer' (failure conditions with experiments) and 6.2 'Benign Instructions in a Context' (with AgentDojo experiments)." 177 }, 178 "threats_to_validity_specific": { 179 "applies": true, 180 "answer": true, 181 "justification": "Section 6.1 identifies specific threats: (1) knowledge corruption attacks bypass PISanitizer because they don't use explicit instructions, with experimental validation on Open-Prompt-Injection. (2) 'Weak' attacks that receive low attention could evade detection. Section 6.2: benign instructions removed in 11/13 AgentDojo failure cases." 
182 }, 183 "scope_boundaries_stated": { 184 "applies": true, 185 "answer": true, 186 "justification": "Section 6.1 explicitly states PISanitizer cannot defend against knowledge corruption (non-instructional manipulation) and may fail against 'weak' attacks with low attention. Section 6.2 states it cannot distinguish benign from malicious instructions. Future work is bounded to multimodal extension." 187 } 188 }, 189 "data_integrity": { 190 "raw_data_available": { 191 "applies": true, 192 "answer": false, 193 "justification": "No raw experimental data (per-sample results, attention weight dumps, individual trial outcomes) is released. Only aggregate metrics are reported in tables." 194 }, 195 "data_collection_described": { 196 "applies": true, 197 "answer": true, 198 "justification": "Section 5.1 describes data sources (LongBench with 6 datasets, 200 samples each, 100 selected), how contaminated contexts are constructed, and injected task construction (Appendix A)." 199 }, 200 "recruitment_methods_described": { 201 "applies": false, 202 "answer": false, 203 "justification": "No human participants. All data comes from standard public benchmarks (LongBench, TaskTracker, AlpacaFarm, SQuAD-v2, Dolly, InjecAgent, AgentDojo)." 204 }, 205 "data_pipeline_documented": { 206 "applies": true, 207 "answer": true, 208 "justification": "The pipeline is documented: benchmark selection → sample selection (100/200) → contaminated context construction (attack-specific injection at random location) → evaluation with ASR/Utility metrics. Appendices A, D, E provide implementation details." 209 } 210 }, 211 "conflicts_of_interest": { 212 "funding_disclosed": { 213 "applies": true, 214 "answer": false, 215 "justification": "No funding or acknowledgments section is present in the paper. Funding sources are not disclosed." 
216 }, 217 "affiliations_disclosed": { 218 "applies": true, 219 "answer": true, 220 "justification": "All authors are affiliated with The Pennsylvania State University, clearly stated in the header. They are not affiliated with any company whose products are evaluated." 221 }, 222 "funder_independent_of_outcome": { 223 "applies": true, 224 "answer": false, 225 "justification": "Cannot determine funder independence because no funding source is disclosed. Without disclosure, independence cannot be verified." 226 }, 227 "financial_interests_declared": { 228 "applies": true, 229 "answer": false, 230 "justification": "No competing interests or financial interests statement appears in the paper." 231 } 232 }, 233 "contamination": { 234 "training_cutoff_stated": { 235 "applies": false, 236 "answer": false, 237 "justification": "This paper tests a defense mechanism (PISanitizer) against prompt injection, not model knowledge or capability on benchmarks. Contamination of benchmark data in model training is not the relevant concern here." 238 }, 239 "train_test_overlap_discussed": { 240 "applies": false, 241 "answer": false, 242 "justification": "The paper evaluates a defense rather than model capability. Whether the model has seen LongBench tasks during training is not the primary evaluation concern." 243 }, 244 "benchmark_contamination_addressed": { 245 "applies": false, 246 "answer": false, 247 "justification": "The paper tests defense effectiveness against prompt injection, not pre-trained model knowledge. Benchmark contamination is structurally inapplicable to this evaluation focus." 248 } 249 }, 250 "human_studies": { 251 "pre_registered": { 252 "applies": false, 253 "answer": false, 254 "justification": "No human participants in this study. All evaluation is automated using benchmark datasets and LLM-as-a-judge." 255 }, 256 "irb_or_ethics_approval": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human participants. 
Study involves only automated evaluation on benchmark datasets." 260 }, 261 "demographics_reported": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants in the study." 265 }, 266 "inclusion_exclusion_criteria": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants in the study." 270 }, 271 "randomization_described": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants in the study." 275 }, 276 "blinding_described": { 277 "applies": false, 278 "answer": false, 279 "justification": "No human participants in the study." 280 }, 281 "attrition_reported": { 282 "applies": false, 283 "answer": false, 284 "justification": "No human participants in the study." 285 } 286 }, 287 "cost_and_practicality": { 288 "inference_cost_reported": { 289 "applies": true, 290 "answer": true, 291 "justification": "Table 4 reports runtime in seconds for all methods across 6 datasets. Section 5.2 states: 'it takes around 1.8s for PISanitizer to sanitize a very long input containing thousands of tokens.' Runtime is measured on a 96GB H100 GPU." 292 }, 293 "compute_budget_stated": { 294 "applies": true, 295 "answer": false, 296 "justification": "The paper mentions '96GB H100 GPU' for runtime measurement but does not state total computational budget (total GPU hours, total experiments, API costs for closed-source LLMs like GPT-5)." 297 } 298 }, 299 "experimental_rigor": { 300 "seed_sensitivity_reported": { 301 "applies": true, 302 "answer": false, 303 "justification": "No mention of multiple random seeds or seed sensitivity analysis. Results appear to be from single runs. The random sample selection (100/200) and random injection locations are not assessed for sensitivity." 304 }, 305 "number_of_runs_stated": { 306 "applies": true, 307 "answer": false, 308 "justification": "The paper does not state how many runs produced the reported results. 
No 'averaged over K runs' or similar statement appears." 309 }, 310 "hyperparameter_search_budget": { 311 "applies": true, 312 "answer": false, 313 "justification": "Hyperparameters (ws, d, θ) are reported and ablated in Figure 3 and Appendix F, but no search budget is stated — it is unclear how the default values were selected." 314 }, 315 "best_config_selection_justified": { 316 "applies": true, 317 "answer": false, 318 "justification": "Default hyperparameters (ws=9, d=10, θ=0.01) are stated in Appendix B but no justification for their selection is provided. The ablation study (Figure 3) shows robustness but does not explain how these specific defaults were chosen." 319 }, 320 "multiple_comparison_correction": { 321 "applies": true, 322 "answer": false, 323 "justification": "The paper performs many comparisons across 6 datasets × multiple attacks × multiple baselines with no correction for multiple comparisons (no Bonferroni, Holm, or similar)." 324 }, 325 "self_comparison_bias_addressed": { 326 "applies": true, 327 "answer": false, 328 "justification": "The authors re-implement PromptArmor themselves ('As there is no open-source implementation for PromptArmor, we implement it ourselves') and compare against it without acknowledging potential author-evaluation bias." 329 }, 330 "compute_budget_vs_performance": { 331 "applies": true, 332 "answer": false, 333 "justification": "Table 4 reports runtime alongside performance but does not show performance as a function of compute budget. No analysis at matched compute levels or performance curves across compute. PromptLocate is ~50x slower but this tradeoff is not systematically analyzed." 334 }, 335 "benchmark_construct_validity": { 336 "applies": true, 337 "answer": false, 338 "justification": "The paper uses LongBench as the primary benchmark without discussing whether its tasks are representative of real-world prompt injection scenarios. No discussion of construct validity for the evaluation setup." 
339 }, 340 "scaffold_confound_addressed": { 341 "applies": false, 342 "answer": false, 343 "justification": "PISanitizer is a preprocessing defense applied uniformly across all model comparisons, not a scaffold. The paper does not compare models with different scaffolding." 344 } 345 }, 346 "data_leakage": { 347 "temporal_leakage_addressed": { 348 "applies": true, 349 "answer": false, 350 "justification": "No discussion of whether LongBench tasks (published 2024) could appear in training data of models like GPT-5 or Llama-3.1. Temporal leakage is not addressed." 351 }, 352 "feature_leakage_addressed": { 353 "applies": true, 354 "answer": false, 355 "justification": "No discussion of whether the evaluation setup leaks information. For example, the sanitization LLM (Llama-3.1-8B) might have seen similar attack patterns during training." 356 }, 357 "non_independence_addressed": { 358 "applies": true, 359 "answer": false, 360 "justification": "No discussion of whether training and test data share structural similarities. The 100 samples per dataset are randomly selected but independence is not verified." 361 }, 362 "leakage_detection_method": { 363 "applies": true, 364 "answer": false, 365 "justification": "No concrete leakage detection or prevention method is applied (no canary strings, membership inference, or decontamination)." 366 } 367 } 368 }, 369 "claims": [ 370 { 371 "claim": "PISanitizer reduces attack success rates to nearly 0 across 6 datasets and multiple prompt injection attacks including GCG optimization-based attacks.", 372 "evidence": "Table 1 shows ASR ≤ 0.04 across all 6 LongBench datasets and 6 attack types (most entries 0.0). 
GCG attack ASR reduced from 0.82-1.0 to 0.0-0.02.", 373 "supported": "strong" 374 }, 375 { 376 "claim": "PISanitizer maintains utility comparable to no-defense settings on clean contexts.", 377 "evidence": "Table 1 shows identical or near-identical utility scores with and without PISanitizer when no attack is present (e.g., HotpotQA: 0.59 vs 0.59, GovReport: 0.34 vs 0.34).", 378 "supported": "strong" 379 }, 380 { 381 "claim": "PISanitizer generalizes effectively across 7 different backend LLMs including GPT-5.", 382 "evidence": "Table 2 shows ASR reduced to ≤ 0.02 across all 7 LLMs (Llama-3.1-8B, Llama-3.1-70B, Qwen3-Omni-30B, GPT-4o, GPT-4o-mini, GPT-4.1, GPT-5) while maintaining utility.", 383 "supported": "strong" 384 }, 385 { 386 "claim": "PISanitizer significantly outperforms state-of-the-art baselines including Meta-SecAlign.", 387 "evidence": "Table 4 shows PISanitizer achieves ASR 0.0-0.02 under GCG attack while Meta-SecAlign has ASR 0.85-1.0. Under Combined Attack, PISanitizer achieves ASR 0.0-0.01 vs Meta-SecAlign 0.0-0.58. However, no statistical significance tests are used.", 388 "supported": "moderate" 389 }, 390 { 391 "claim": "PISanitizer is efficient, taking around 1.8 seconds to sanitize a context with thousands of tokens.", 392 "evidence": "Table 4 shows PISanitizer runtime of 10.14-13.92s total (including backend LLM generation), comparable to other defenses except PromptLocate (633-761s). 
The sanitization-specific time of ~1.8s is stated in Section 5.2.", 393 "supported": "strong" 394 }, 395 { 396 "claim": "PISanitizer is robust to optimization-based and heuristic adaptive attacks.", 397 "evidence": "Table 10 shows ASR ≤ 0.04 across 6 adaptive attacks (4 heuristic, 2 optimization-based), though these represent specific adaptive strategies the authors designed rather than a comprehensive adversarial evaluation.", 398 "supported": "moderate" 399 }, 400 { 401 "claim": "PISanitizer achieves high precision (0.80) and recall (0.90) for token-level sanitization of injected prompts.", 402 "evidence": "Table 14 provides per-dataset, per-attack precision/recall/F1 for token sanitization. Average across all settings: 0.80 precision, 0.90 recall, 0.82 F1-score.", 403 "supported": "strong" 404 }, 405 { 406 "claim": "Existing detection-based defenses are ineffective for long-context prompt injection.", 407 "evidence": "Table 5 shows DataSentinel and AttentionTracker have 1.0 FPR (flag everything as contaminated), while PromptGuard has 0.50-0.99 FNR (misses most injections).", 408 "supported": "strong" 409 } 410 ], 411 "red_flags": [ 412 { 413 "flag": "No error bars or variance reporting", 414 "detail": "All results across Tables 1-14 are point estimates from apparently single experimental runs. With 100 samples per dataset and random injection locations, variance across sample selections and injection positions could be substantial but is never quantified." 415 }, 416 { 417 "flag": "No statistical significance tests", 418 "detail": "The paper claims PISanitizer 'significantly outperforms' baselines (Section 1) based solely on comparing raw numbers across many datasets and attack types, without any statistical tests to support these comparative claims." 
419 }, 420 { 421 "flag": "Self-implemented baseline", 422 "detail": "PromptArmor is re-implemented by the authors ('As there is no open-source implementation for PromptArmor, we implement it ourselves and use GPT-4o') creating potential for unfair comparison. No validation of their re-implementation against the original is provided." 423 }, 424 { 425 "flag": "Adaptive attacks designed by defenders", 426 "detail": "All 6 adaptive attacks (Section 5.4) are designed by the authors/defenders themselves. The paper does not involve external red-teamers or independent adversarial evaluation, which could miss stronger attack strategies." 427 }, 428 { 429 "flag": "No held-out validation", 430 "detail": "Hyperparameters (ws=9, d=10, θ=0.01) appear tuned on the same data used for evaluation. No separate validation split is described for hyperparameter selection." 431 } 432 ], 433 "cited_papers": [ 434 { 435 "title": "Formalizing and benchmarking prompt injection attacks and defenses", 436 "authors": ["Y. Liu", "Y. Jia", "R. Geng", "J. Jia", "N. Z. Gong"], 437 "year": 2024, 438 "relevance": "Key benchmark paper formalizing prompt injection attacks and defenses, provides the Open-Prompt-Injection benchmark and attack implementations used in this evaluation." 439 }, 440 { 441 "title": "Meta SecAlign: A secure foundation LLM against prompt injection attacks", 442 "authors": ["S. Chen", "A. Zharmagambetov", "D. Wagner", "C. Guo"], 443 "year": 2025, 444 "arxiv_id": "2507.02735", 445 "relevance": "State-of-the-art fine-tuning-based defense against prompt injection; primary baseline comparison showing PISanitizer's superiority." 446 }, 447 { 448 "title": "SecAlign: Defending against prompt injection with preference optimization", 449 "authors": ["S. Chen", "A. Zharmagambetov", "S. Mahloujifar", "K. Chaudhuri", "D. Wagner", "C. 
Guo"], 450 "year": 2025, 451 "relevance": "DPO-based defense against prompt injection; precursor to Meta-SecAlign and relevant baseline for understanding fine-tuning-based defense approaches." 452 }, 453 { 454 "title": "StruQ: Defending against prompt injection with structured queries", 455 "authors": ["S. Chen", "J. Piet", "C. Sitawarin", "D. Wagner"], 456 "year": 2024, 457 "relevance": "Prevention-based defense using special delimiters and fine-tuning; foundational work in the fine-tuning defense family." 458 }, 459 { 460 "title": "DataSentinel: A game-theoretic detection of prompt injection attacks", 461 "authors": ["Y. Liu", "Y. Jia", "J. Jia", "D. Song", "N. Z. Gong"], 462 "year": 2025, 463 "relevance": "State-of-the-art detection-based defense for prompt injection; key baseline shown to be ineffective for long contexts." 464 }, 465 { 466 "title": "Neural exec: Learning (and learning from) execution triggers for prompt injection attacks", 467 "authors": ["D. Pasquini", "M. Strohmeier", "C. Troncoso"], 468 "year": 2024, 469 "relevance": "Optimization-based prompt injection attack learning execution triggers; demonstrates advanced attack techniques evaluated against PISanitizer." 470 }, 471 { 472 "title": "The instruction hierarchy: Training LLMs to prioritize privileged instructions", 473 "authors": ["E. Wallace", "K. Xiao", "R. Leike", "L. Weng", "J. Heidecke", "A. Beutel"], 474 "year": 2024, 475 "relevance": "OpenAI's instruction hierarchy defense deployed in GPT-4o-mini; relevant prevention-based approach to prioritizing system messages over third-party content." 476 }, 477 { 478 "title": "Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection", 479 "authors": ["K. Greshake", "S. Abdelnabi", "S. Mishra", "C. Endres", "T. Holz", "M. Fritz"], 480 "year": 2023, 481 "relevance": "Foundational paper on indirect prompt injection attacks against real-world LLM applications." 
482 }, 483 { 484 "title": "Defeating prompt injections by design", 485 "authors": ["E. Debenedetti", "I. Shumailov", "T. Fan", "J. Hayes", "N. Carlini"], 486 "year": 2025, 487 "arxiv_id": "2503.18813", 488 "relevance": "CaMeL defense using security policies and control/data flow analysis to prevent prompt injection; alternative defense paradigm discussed as a limitation." 489 }, 490 { 491 "title": "A critical evaluation of defenses against prompt injection attacks", 492 "authors": ["Y. Jia", "Z. Shao", "Y. Liu", "J. Jia", "D. Song", "N. Z. Gong"], 493 "year": 2025, 494 "arxiv_id": "2505.18333", 495 "relevance": "Critical evaluation showing fine-tuning-based defenses are vulnerable to optimization-based attacks, motivating PISanitizer's different approach." 496 }, 497 { 498 "title": "PromptLocate: Localizing prompt injection attacks", 499 "authors": ["Y. Jia", "Y. Liu", "Z. Shao", "J. Jia", "N. Z. Gong"], 500 "year": 2026, 501 "relevance": "Attribution-based defense localizing injected prompts in context; key baseline comparison showing PISanitizer's better utility-effectiveness tradeoff." 502 }, 503 { 504 "title": "Defending against prompt injection with DataFilter", 505 "authors": ["Y. Wang", "S. Chen", "R. Alkhudair", "B. Alomair", "D. Wagner"], 506 "year": 2025, 507 "arxiv_id": "2510.19207", 508 "relevance": "Concurrent work training a sequence-to-sequence model to filter injected instructions; baseline comparison showing PISanitizer's advantage in long contexts." 509 }, 510 { 511 "title": "Universal and transferable adversarial attacks on aligned language models", 512 "authors": ["A. Zou", "Z. Wang", "N. Carlini", "M. Nasr", "J. Z. Kolter", "M. Fredrikson"], 513 "year": 2023, 514 "arxiv_id": "2307.15043", 515 "relevance": "GCG attack method used as the optimization-based attack in this paper's evaluation." 516 }, 517 { 518 "title": "InjecAgent: Benchmarking indirect prompt injections in tool-integrated large language model agents", 519 "authors": ["Q. 
Zhan", "Z. Liang", "Z. Ying", "D. Kang"], 520 "year": 2024, 521 "relevance": "Benchmark for evaluating prompt injection in LLM agents; used to evaluate PISanitizer in agent scenarios." 522 }, 523 { 524 "title": "AgentDojo: A dynamic environment to evaluate attacks and defenses for LLM agents", 525 "authors": ["E. Debenedetti", "J. Zhang", "M. Balunović", "L. Beurer-Kellner", "M. Fischer", "F. Tramèr"], 526 "year": 2024, 527 "relevance": "Dynamic environment for evaluating prompt injection attacks and defenses in LLM agents; used to test PISanitizer's limitations with benign instructions." 528 } 529 ], 530 "engagement_factors": { 531 "practical_relevance": { 532 "score": 2, 533 "justification": "PISanitizer is a deployable defense that works as a preprocessing step with any backend LLM, including closed-source ones like GPT-5, and code is released." 534 }, 535 "surprise_contrarian": { 536 "score": 2, 537 "justification": "The core insight reverses conventional wisdom: instead of preventing LLMs from following injected instructions (the standard approach), PISanitizer intentionally encourages it, creating an attacker dilemma." 538 }, 539 "fear_safety": { 540 "score": 2, 541 "justification": "Demonstrates that existing defenses (including GPT-4o-mini's built-in protection) fail for long-context prompt injection, raising security concerns for production LLM applications." 542 }, 543 "drama_conflict": { 544 "score": 0, 545 "justification": "No controversy, no claims of misconduct, no drama — straightforward defense research." 546 }, 547 "demo_ability": { 548 "score": 2, 549 "justification": "Code is released on GitHub (https://github.com/sleeepeer/PISanitizer) and only requires an open-source LLM (Llama-3.1-8B) and scipy." 550 }, 551 "brand_recognition": { 552 "score": 1, 553 "justification": "Penn State is a known university but not a famous AI lab. Paper evaluates GPT-5 which adds some brand appeal." 554 } 555 } 556 }