scan.json (33524B)
1 { 2 "paper": { 3 "title": "PromptSleuth: Detecting Prompt Injection via Semantic Intent Invariance", 4 "authors": ["Mengxiao Wang", "Yuxuan Zhang", "Guofei Gu"], 5 "year": 2025, 6 "venue": "arXiv.org", 7 "arxiv_id": "2508.20890", 8 "doi": "10.48550/arXiv.2508.20890" 9 }, 10 "scan_version": 3, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "PromptSleuth, a semantic-oriented prompt injection defense that reasons over task-level intent rather than surface features, achieves near-zero false negative rates across three benchmarks (DataSentinel-Bench, AgentDojo, PromptSleuth-Bench). Existing defenses (DataSentinel, SecAlign, PromptArmor, template-based methods) fail to generalize beyond their training distributions, with FNR as high as 0.67-0.87 on novel attack types. The three-step pipeline (summarization, task-relationship graph, clustering) using GPT-4.1-mini or GPT-5-mini introduces only modest latency overhead (~1.78s and ~13.61s respectively). Ablation studies show that explicit reasoning does not improve detection and can harm performance, while medium-length system prompts provide the best accuracy-latency trade-off.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "GitHub repository provided at https://github.com/successlab/PromptSleuth (Reference [30]). The contributions section states: 'We release our benchmark and defense as open source to foster reproducibility, enable benchmarking, and encourage adoption.'" 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The PromptSleuth-Bench benchmark is released as part of the open-source package. The paper states it integrates and extends existing public benchmarks (DataSentinel-Bench and AgentDojo), and the release includes their new benchmark." 
25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "No environment specifications (requirements.txt, Dockerfile, conda file, or library versions) are mentioned in the paper. The paper only references API-based models (GPT-4.1-mini, GPT-5-mini) and open-source models without specifying environment setup details." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No step-by-step reproduction instructions are provided in the paper. While code is released, the paper does not include a 'Reproducing Results' section or describe how to run the experiments." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "All results in Tables 3-4 and Figures 3, 5, 6 are reported as point estimates (e.g., FPR=0.0008, FNR=0.0007) with no confidence intervals, error bars, or uncertainty quantification." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper claims PromptSleuth 'consistently outperforms existing defense' based solely on comparing FPR/FNR numbers across methods. No statistical significance tests (t-tests, bootstrap, etc.) are reported for any comparison." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "Results are reported with enough context to assess effect magnitude. For example, Table 3 shows PromptSleuth-5-mini FNR=0.0007 vs DataSentinel FNR=0.6669 on PromptSleuth-Bench, and latency overhead of +9.2% on GPT-4.1-mini relative to PromptArmor (Table 4)." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "The ablation study uses only 100 randomly picked samples (57 benign, 43 malicious) with the justification being 'due to the cost of calling commercial APIs.' No power analysis or formal justification for this sample size. 
Main benchmark sizes are also not justified." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "No standard deviations, variance, or spread measures are reported across experimental runs. Results appear to be from single runs. The paper acknowledges API-based variability exists but does not quantify it." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "The paper compares against 8 defenses: Instructional, Sandwich, Random_Sequence, Delimiter (template-based), Known-Answer, SecAlign, DataSentinel, and PromptArmor (Section 4.3 and 6.1)." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "The three main baselines are contemporary: DataSentinel (2025), SecAlign (2024), and PromptArmor (2025). The paper explicitly states these 'represent the state of the art at the time of writing.'" 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "Section 6.3 presents ablation studies on three components: (1) base model choice (GPT-4.1-nano/mini, GPT-5-nano/mini), (2) system prompt design (short/medium/long), and (3) reasoning vs. no-reasoning." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "The paper reports FPR and FNR as primary metrics (Table 3), plus latency in seconds (Table 4), and API cost per million tokens (Table 5)." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": false, 88 "justification": "No human evaluation is included. All evaluation is automated using FPR/FNR on benchmark datasets. No manual inspection of detection decisions or expert review of edge cases." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": false, 93 "justification": "No mention of separate dev/test splits. 
The ablation uses 100 randomly picked samples from the benchmark but there is no indication of a held-out test set separate from tuning decisions." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Results are broken down by three difficulty levels (Easy, Medium, Hard) in Figure 3, by three benchmarks in Table 3, and by model/prompt-length combinations in Figures 5-6." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Section 6.2 discusses failure cases: 'tasks such as \"book the cheapest hotel\" versus \"book the most expensive hotel\" are considered injection tasks, yet their semantic structures are nearly identical.' Also discusses GPT-4.1-mini FPR issues in multi-task summarization and GPT-4.1-nano's failure modes." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "Section 6.3 reports that explicit reasoning hurts GPT-4.1-mini performance (FNR 0.000→0.149), that GPT-4.1-nano is too weak (FNR=0.442), and that long prompts increase overhead without consistent accuracy gains." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": false, 115 "justification": "The abstract claims PromptSleuth 'consistently outperforms existing defense' but Table 3 shows it matches (not outperforms) DataSentinel on DataSentinel-Bench (both FPR=0.0000, FNR=0.0000). The claim of consistent superiority is overstated for the tied benchmark." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "Ablation studies in Section 6.3 systematically vary one component at a time (model choice, prompt length, reasoning toggle) while holding others constant, providing adequate evidence for causal claims about which design choices matter." 
121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The abstract claims 'a robust, efficient, and generalizable strategy for defending LLMs against evolving prompt injection threats' but evaluation is primarily on GPT-based models. Open-source models were excluded after showing poor performance, and the paper acknowledges the defense 'strongly depends on the quality and precision of the system prompt.' The title and framing are broader than the tested settings." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "The paper does not consider alternative explanations for PromptSleuth's success. For instance, it does not discuss whether the detector LLM's general instruction-following ability (rather than the semantic framework) drives performance, or whether the advantage comes from using more capable models than baselines." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "The paper measures FPR and FNR directly for prompt injection detection, which is exactly what it claims to evaluate. Section 7 explicitly discusses the distinction between detection-based and prevention-based metrics and justifies their choice." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": false, 142 "justification": "Models are identified by marketing names only: 'GPT-4.1-mini', 'GPT-4.1-nano', 'GPT-5-mini', 'GPT-5-nano', 'Llama-3-8B-Instruct', 'Mistral-7B-Instruct', 'TinyLlama'. No API snapshot dates or version strings are provided, and model behavior changes across versions." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": false, 147 "justification": "Table 2 presents only a 'simplified system prompt of PromptSleuth' — explicitly not the actual prompt used. 
The paper states this is simplified, meaning the reader cannot reconstruct the exact prompts sent to the model from the paper alone." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": false, 152 "justification": "Only GPT-5-nano's 'fixed temperature of 1' is mentioned. No temperature, top-p, or max_tokens settings are reported for GPT-4.1-mini, GPT-4.1-nano, GPT-5-mini, or any other model used in the main experiments." 153 }, 154 "scaffolding_described": { 155 "applies": true, 156 "answer": true, 157 "justification": "The three-step pipeline is described in detail in Section 5.2: (1) summarization extracts abstract tasks, (2) task-relationship graph generation classifies edges as related/unrelated, (3) clustering isolates unrelated tasks. Algorithm 1 provides pseudocode. The workflow diagram in Figure 4 illustrates the full pipeline." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": false, 162 "justification": "PromptSleuth-Bench construction is described conceptually ('we build on DATASENTINEL by incorporating new attack techniques') but the exact generation process, filtering criteria, and dataset statistics are not documented in detail. No counts of how many examples were generated, filtered, or retained at each construction stage." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 7 'Discussion' contains a dedicated 'Limitations' subsection with substantive discussion of weaknesses including dependence on model capability, computational overhead, semantic similarity edge cases, and system prompt quality." 
170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": true, 174 "justification": "Specific threats are discussed: 'tasks such as \"book the cheapest hotel\" versus \"book the most expensive hotel\" are considered injection tasks, yet their semantic structures are nearly identical'; 'our defense strongly depends on the quality and precision of the system prompt'; 'When applied to smaller models... the performance is suboptimal.'" 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": false, 179 "justification": "The paper does not explicitly state what the results do NOT show. It discusses limitations of the approach but does not bound the claims by specifying excluded settings, populations, or claims not being made. The broad framing ('generalizable strategy') is not counterbalanced by specific negative scope statements." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": true, 186 "justification": "The benchmark and defense are released as open source (GitHub, Reference [30]). The datasets used (DataSentinel-Bench, AgentDojo) are also publicly available. Raw benchmark data should be available for verification." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section 4.2 describes how PromptSleuth-Bench was constructed: extending DataSentinel with new attack techniques across three categories (system prompt forgery, user prompt camouflage, model behavior manipulation), and extending AgentDojo with multi-task adversarial scenarios. Attack technique taxonomy is shown in Figure 2." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. The benchmark consists of synthetically constructed prompt injection examples, not human-generated data." 
197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": false, 201 "justification": "The benchmark construction methodology is described at a high level but lacks specifics: no counts of examples at each construction stage, no filtering criteria documented, no explanation of how many examples were generated per attack technique or difficulty level." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding source is disclosed anywhere in the paper. There is no acknowledgments section listing grants or sponsors." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliations are clearly stated: all three authors are from Texas A&M University. They are not affiliated with any company whose product is being evaluated." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "Cannot assess funder independence because no funding is disclosed. The absence of any funding disclosure prevents evaluation of this criterion." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial interests statement appears in the paper. Absence of disclosure is not the same as absence of conflict." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": false, 229 "answer": false, 230 "justification": "The paper tests defenses/tools (prompt injection detection systems) rather than evaluating pre-trained model knowledge on benchmarks. The LLMs are used as inference engines for the defense pipeline, not being evaluated for their own capabilities." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": false, 234 "answer": false, 235 "justification": "The paper evaluates defense frameworks against prompt injection, not model knowledge on benchmarks. 
Contamination of model training data is not the relevant concern for this defense evaluation setting." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": false, 239 "answer": false, 240 "justification": "The paper tests defense systems rather than model capabilities on benchmarks. The relevant evaluation is whether defenses detect injections, not whether models have memorized benchmark answers." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study. All evaluation is automated using benchmark datasets." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants. The study evaluates defense systems using synthetic benchmark data." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants or experimental conditions involving humans." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": true, 284 "justification": "Table 4 reports average inference overhead in seconds for GPT-4.1-mini (1.78s) and GPT-5-mini (13.61s). Table 5 reports API pricing per million tokens. Relative overhead vs PromptArmor is calculated (+9.2% and +3.5%)." 
285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "No total computational budget is stated. The paper does not report total API spend, total number of API calls, or total compute used across all experiments." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "No results across multiple random seeds are reported. The paper notes GPT-5-nano has a 'fixed temperature of 1' introducing 'higher variability' but does not report seed sensitivity for any model." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "The number of experimental runs is never stated. Results appear to be single-run point estimates with no indication of repetition." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "While Section 6.3 evaluates different model and prompt configurations, this is not framed as a hyperparameter search with a stated budget. No total compute or cost for the search process is reported." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": true, 311 "justification": "Section 6.3 systematically evaluates model choice (4 models), prompt design (3 lengths), and reasoning (on/off), selecting GPT-4.1-mini with medium prompt and no reasoning based on clearly reported accuracy, latency, and cost tradeoffs." 312 }, 313 "multiple_comparison_correction": { 314 "applies": false, 315 "answer": false, 316 "justification": "No formal statistical tests are performed in the paper, so multiple comparison correction is not applicable. All comparisons are based on direct numerical comparison of FPR/FNR values." 
317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The authors create PromptSleuth-Bench and then show their defense works best on it, without acknowledging the inherent bias of evaluating on their own benchmark. No discussion of self-comparison bias or independent evaluation." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": false, 326 "justification": "While Table 4 compares latency across defenses and the paper discusses GPT-4.1-mini vs GPT-5-mini tradeoffs, performance is not systematically reported as a function of compute budget. No performance curves across compute levels are provided." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "The paper does not discuss whether PromptSleuth-Bench actually measures real-world defense effectiveness. The benchmark is assumed to be valid without discussing construct validity — whether synthetic attack variants reflect actual adversary behavior in production." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "The paper evaluates defense frameworks (PromptSleuth vs baselines) where the defense mechanism itself IS the scaffold being tested. There is no confound between scaffold and model to address." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "The paper does not discuss whether GPT-4.1-mini or GPT-5-mini may have seen DataSentinel-Bench or AgentDojo examples during training. If these public benchmarks predate the models' training cutoffs (the paper gives no model snapshot dates from which to check this), the detector LLMs may have memorized patterns, potentially inflating detection accuracy." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "Not discussed. 
The evaluation setup provides the full prompt (system + user) to the detector LLM, but there is no analysis of whether this provides information that would not be available in a real deployment scenario." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "PromptSleuth-Bench is explicitly built on top of DataSentinel-Bench and AgentDojo. The paper does not discuss whether this shared structure between benchmarks affects the independence of cross-benchmark evaluation." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No leakage detection or prevention methods are applied. No canary strings, membership inference, or decontamination pipelines are used." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "PromptSleuth consistently outperforms existing defenses across all benchmarks while maintaining comparable runtime and cost efficiency.", 365 "evidence": "Table 3 shows PromptSleuth-5-mini achieves FPR=0.0000/FNR=0.0000 on DataSentinel-Bench (matching DataSentinel), FPR=0.0008/FNR=0.0007 on PromptSleuth-Bench (vs next best PromptArmor at 0.0926/0.0825), and FPR=0.0240/FNR=0.0340 on AgentDojo. Table 4 shows only +9.2% latency overhead vs PromptArmor on GPT-4.1-mini.", 366 "supported": "moderate" 367 }, 368 { 369 "claim": "Existing defenses fail to generalize to novel attack types and multi-task scenarios.", 370 "evidence": "Figure 3 shows template-based defenses reach FPR=1.00 on Medium tier. DataSentinel degrades to FNR=0.6669 on PromptSleuth-Bench and FNR=0.4878 on AgentDojo (Table 3). SecAlign shows FNR=0.4947-0.8664 across benchmarks.", 371 "supported": "strong" 372 }, 373 { 374 "claim": "Semantic intent reasoning generalizes beyond surface-level pattern matching for prompt injection detection.", 375 "evidence": "PromptSleuth maintains low FNR across Easy/Medium/Hard difficulty levels while syntax-based defenses collapse. 
However, this comparison confounds the defense methodology with the underlying model capability, as PromptSleuth uses GPT-4.1-mini/GPT-5-mini while baselines use different backends.", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "GPT-4.1-mini provides the optimal cost-performance trade-off for the PromptSleuth pipeline.", 380 "evidence": "Section 6.3 ablation on 100 samples: GPT-4.1-mini achieves FPR=0.000, FNR=0.000 with 1.78s latency, vs GPT-5-mini at 13.61s and GPT-4.1-nano at FNR=0.442. But the 100-sample ablation is small for reliable estimates.", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "Explicit reasoning does not improve and can harm detection accuracy.", 385 "evidence": "Figure 6 shows reasoning increases GPT-4.1-mini FNR from 0.000 to 0.149, while providing only marginal improvement for GPT-4.1-nano (FNR 0.149→0.098). Based on 100-sample ablation study.", 386 "supported": "moderate" 387 }, 388 { 389 "claim": "Stored prompt injection attacks achieve high success rates on ChatGPT and Gemini.", 390 "evidence": "Section 7 'Future Works' reports 'remember that' achieves 100% success on ChatGPT and Gemini. However, this is described as a controlled test with no formal experimental protocol, sample sizes, or methodology details.", 391 "supported": "weak" 392 } 393 ], 394 "red_flags": [ 395 { 396 "flag": "Self-serving benchmark evaluation", 397 "detail": "The authors construct PromptSleuth-Bench and then demonstrate their method works best on it. The benchmark is designed around the same insight (semantic intent invariance) that drives the defense, creating circular validation. No independent benchmark was used as primary evaluation." 398 }, 399 { 400 "flag": "No error bars or variance across runs", 401 "detail": "All results are single-run point estimates with no uncertainty quantification. 
Given that LLM API outputs are stochastic (the paper even notes GPT-5-nano's 'fixed temperature of 1' introduces variability), the stability of reported metrics is unknown." 402 }, 403 { 404 "flag": "Very small ablation sample size", 405 "detail": "The ablation study (Section 6.3) uses only 100 samples (57 benign, 43 malicious), justified by API cost constraints. With only 43 malicious samples, a single misclassification changes FNR by ~2.3 percentage points, making the zero-FNR claims from ablation unreliable." 406 }, 407 { 408 "flag": "No statistical significance tests", 409 "detail": "Claims of consistent superiority are based entirely on comparing raw FPR/FNR numbers without any significance testing, despite stochastic API outputs and relatively small sample sizes in ablation." 410 }, 411 { 412 "flag": "Unfair baseline comparison", 413 "detail": "PromptSleuth uses GPT-4.1-mini/GPT-5-mini as its detector while baselines use their own (potentially weaker) architectures. The improvement may partially reflect the underlying model capability rather than the semantic framework. This confound is not discussed." 414 }, 415 { 416 "flag": "Missing model version specifics", 417 "detail": "All models are referenced by marketing names without API snapshot dates. The GPT-5-family models (GPT-5-mini, GPT-5-nano) are used but their availability and exact versions at evaluation time are unclear, raising reproducibility concerns." 418 } 419 ], 420 "cited_papers": [ 421 { 422 "title": "StruQ: Defending Against Prompt Injection with Structured Queries", 423 "authors": ["Sizhe Chen", "Julien Piet", "Chawin Sitawarin", "David Wagner"], 424 "year": 2024, 425 "arxiv_id": "2402.06363", 426 "relevance": "Proposes structured query defense against prompt injection through prompt schema validation, directly relevant to PI defense landscape." 
427 }, 428 { 429 "title": "SecAlign: Defending against prompt injection with preference optimization", 430 "authors": ["Sizhe Chen", "Arman Zharmagambetov", "Saeed Mahloujifar", "Kamalika Chaudhuri", "Chuan Guo"], 431 "year": 2024, 432 "relevance": "Adversarial training-based defense against prompt injection used as a baseline in this study." 433 }, 434 { 435 "title": "DataSentinel: A game-theoretic detection of prompt injection attacks", 436 "authors": ["Yupei Liu", "Yuqi Jia", "Jinyuan Jia", "Dawn Song", "Neil Zhenqiang Gong"], 437 "year": 2025, 438 "arxiv_id": "2504.11358", 439 "relevance": "Game-theoretic prompt injection defense and benchmark used as both baseline and data source for PromptSleuth-Bench." 440 }, 441 { 442 "title": "AgentDojo: A dynamic environment to evaluate prompt injection attacks and defenses for LLM agents", 443 "authors": ["Edoardo Debenedetti", "Jie Zhang", "Mislav Balunović", "Luca Beurer-Kellner", "Marc Fischer", "Florian Tramèr"], 444 "year": 2024, 445 "relevance": "Multi-step agent evaluation benchmark for prompt injection, used as one of three evaluation datasets." 446 }, 447 { 448 "title": "Jatmo: Prompt injection defense by task-specific finetuning", 449 "authors": ["Julien Piet", "Maha Alrashed", "Chawin Sitawarin", "Sizhe Chen", "Zeming Wei", "Elizabeth Sun", "Basel Alomair", "David Wagner"], 450 "year": 2023, 451 "arxiv_id": "2312.17673", 452 "relevance": "Task-specific fine-tuning approach to prompt injection defense, relevant to the fine-tuning vs prompt engineering comparison." 453 }, 454 { 455 "title": "Formalizing and benchmarking prompt injection attacks and defenses", 456 "authors": ["Yupei Liu", "Yuqi Jia", "Runpeng Geng", "Jinyuan Jia", "Neil Zhenqiang Gong"], 457 "year": 2024, 458 "relevance": "Formal framework and benchmark for prompt injection evaluation used in this paper's evaluation metrics." 
459 }, 460 { 461 "title": "PromptArmor: Simple yet effective prompt injection defenses", 462 "authors": ["Tianneng Shi", "Kaijie Zhu", "Zhun Wang"], 463 "year": 2025, 464 "relevance": "Adaptive LLM wrapper defense against prompt injection, used as the strongest baseline comparison." 465 }, 466 { 467 "title": "The instruction hierarchy: Training LLMs to prioritize privileged instructions", 468 "authors": ["Eric Wallace", "Kai Xiao", "Reimar Leike", "Lilian Weng", "Johannes Heidecke", "Alex Beutel"], 469 "year": 2024, 470 "arxiv_id": "2404.13208", 471 "relevance": "Proposes instruction prioritization hierarchy for LLM safety, foundational to understanding prompt injection defense strategies." 472 }, 473 { 474 "title": "FATH: Authentication-based test-time defense against indirect prompt injection attacks", 475 "authors": ["Jiongxiao Wang", "Fangzhou Wu", "Wendi Li", "Jinsheng Pan", "Edward Suh", "Z. Morley Mao", "Muhao Chen", "Chaowei Xiao"], 476 "year": 2024, 477 "arxiv_id": "2410.21492", 478 "relevance": "Test-time authentication defense against indirect prompt injection, complementary detection-based approach." 479 }, 480 { 481 "title": "ControlNET: A firewall for RAG-based LLM system", 482 "authors": ["Hongwei Yao", "Haoran Shi", "Yidou Chen", "Yixin Jiang", "Cong Wang", "Zhan Qin", "Kui Ren", "Chun Chen"], 483 "year": 2025, 484 "arxiv_id": "2504.09593", 485 "relevance": "Retrieval firewall protecting RAG pipelines from prompt injection, extending defense to retrieval-augmented generation." 486 }, 487 { 488 "title": "Defense against prompt injection attacks via mixture of encodings", 489 "authors": ["Ruiyi Zhang", "David Sullivan", "Kyle Jackson", "Pengtao Xie", "Mei Chen"], 490 "year": 2025, 491 "arxiv_id": "2504.07467", 492 "relevance": "Encoding-based defense against prompt injection using mixed representations." 
493 }, 494 { 495 "title": "Benchmarking and defending against indirect prompt injection attacks on large language models", 496 "authors": ["Jingwei Yi", "Yueqi Xie", "Bin Zhu", "Emre Kiciman", "Guangzhong Sun", "Xing Xie", "Fangzhao Wu"], 497 "year": 2023, 498 "arxiv_id": "2312.14197", 499 "relevance": "Early benchmark and defense evaluation for indirect prompt injection, one of the baselines referenced in this paper." 500 }, 501 { 502 "title": "Ignore previous prompt: Attack techniques for language models", 503 "authors": ["Fábio Perez", "Ian Ribeiro"], 504 "year": 2022, 505 "relevance": "Foundational work on prompt injection attack techniques that established the instruction-override attack category." 506 } 507 ], 508 "engagement_factors": { 509 "practical_relevance": { 510 "score": 2, 511 "justification": "Practitioners building LLM applications could adopt this defense framework; code is open-sourced and works with API-based models, though integration requires implementation effort." 512 }, 513 "surprise_contrarian": { 514 "score": 1, 515 "justification": "The insight that semantic intent is invariant across attack variants is somewhat novel but aligns with existing intuitions about defense-in-depth for prompt injection." 516 }, 517 "fear_safety": { 518 "score": 2, 519 "justification": "Demonstrates that existing prompt injection defenses fail dramatically on novel attack types (FNR up to 87%), raising concerns about deployed LLM application security." 520 }, 521 "drama_conflict": { 522 "score": 1, 523 "justification": "Shows existing defenses are brittle but presents this as a technical finding rather than a dramatic takedown of specific products or organizations." 524 }, 525 "demo_ability": { 526 "score": 2, 527 "justification": "Code and benchmark released on GitHub; requires OpenAI API key to run but could be tried by practitioners with API access." 
528 }, 529 "brand_recognition": { 530 "score": 1, 531 "justification": "Texas A&M is a known university but not a top AI lab; the paper's use of GPT-5-family models (GPT-5-mini, GPT-5-nano) adds some brand interest." 532 } 533 } 534 }