scan.json (32877B)
1 { 2 "paper": { 3 "title": "InjecGuard: Benchmarking and Mitigating Over-defense in Prompt Injection Guardrail Models", 4 "authors": ["Hao Li", "Xiaogeng Liu"], 5 "year": 2024, 6 "venue": "arXiv.org", 7 "arxiv_id": "2410.22770", 8 "doi": "10.48550/arXiv.2410.22770" 9 }, 10 "scan_version": 3, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "Existing prompt injection guard models suffer from severe over-defense, with open-source models achieving less than 60% accuracy on benign inputs containing trigger words (near random guessing). The proposed InjecGuard model, trained with the MOF strategy on DeBERTa-v3-base, achieves 83.48% average accuracy across benign, malicious, and over-defense dimensions, rivaling GPT-4o (85.53%) at 500x the inference efficiency. The ablation study reveals that data-centric augmentation alone worsens over-defense (75.22% → 64.31%), but combining it with MOF and retraining from scratch yields the best balanced performance.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "GitHub repository URL provided in abstract: https://github.com/leolee99/InjecGuard. The paper states 'The code and datasets are released.'" 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The NotInject dataset (339 samples) and full training data (61,089 benign + 15,666 malicious samples) are stated to be released. The paper emphasizes being 'fully open-source' including 'the training dataset, strategies, code, and model' (Sec. 6). Evaluation also uses publicly available benchmarks (PINT, BIPIA, WildGuard)." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "The paper specifies the backbone (DeBERTaV3-base) and training hyperparameters (Sec. 
5.1) but provides no requirements.txt, Dockerfile, or library version listing that would allow environment recreation." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No step-by-step reproduction instructions are included in the paper. While a GitHub repository is referenced, the paper itself does not contain a 'Reproducing Results' section or specific commands to run." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "All results in Tab. 1, Tab. 2, Tab. 3, Tab. 7, and Tab. 9 are reported as point estimates (e.g., '83.48%') with no confidence intervals, error bars, or ± notation." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper claims InjecGuard 'surpasses' and 'outperforms' baselines based solely on comparing accuracy numbers. No p-values, t-tests, or any statistical significance tests are used to support comparative claims." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "Percentage improvements are reported with baseline context: 'surpassing the open-sourced runner-up prompt guard model by 30.8%' (relative improvement), '54.17%' improvement in over-defense accuracy, and absolute accuracy values for all baselines (Tab. 1), allowing readers to assess magnitude." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "No justification is provided for why NotInject contains exactly 339 samples (113 per subset), why the training set has 61,089 benign and 15,666 malicious samples, or why MOF generates 1,000 samples (though Tab. 9 provides an ablation of the MOF scale, the other sizes are unjustified)." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "All results appear to be from single training runs. 
No standard deviation, variance, or spread across multiple runs is reported anywhere in the paper." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "Eight baselines are compared: five prompt guard models (Fmops, Deepset, PromptGuard, ProtectAIv2, LakeraGuard) and three LLM-based methods (GPT-4o, Llama-2-chat, LlamaGuard3), as shown in Tab. 1." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "Baselines include current state-of-the-art models: ProtectAIv2 (described as current SotA open-source), LakeraGuard (commercial), GPT-4o (2024), and LlamaGuard3 (2024). All are contemporary at time of writing." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "Tab. 2 provides a thorough ablation study showing the contribution of each training component: basic dataset, data-centric augmentation, MOF with finetuning vs. scratch retraining. Tab. 3 compares MOF against a shortcut mitigation baseline. Tab. 9 ablates MOF sampling scale." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Three accuracy dimensions are reported (over-defense, benign, malicious) plus computational overhead metrics (GFLOPs, inference time in ms, efficiency score). Tab. 1 presents all metrics together." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": false, 88 "justification": "Evaluation of InjecGuard's outputs is entirely automated using accuracy on test datasets. No human evaluation of the system's detection outputs is performed. Human involvement is limited to dataset construction (trigger word refinement), not system evaluation." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "Evaluation uses separate benchmarks (PINT test, WildGuard, BIPIA, NotInject) that are distinct from the training data. 
The paper explicitly states that MOF 'does not require any over-defense dataset for training' (Sec. 4.2), keeping NotInject as a held-out evaluation set." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Tab. 7 provides detailed per-benchmark results across all models: NotInject (one-word, two-word, three-word subsets), WildGuard (benign), PINT (benign, injection, overall), and BIPIA (injection)." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Tab. 2 shows data-centric augmentation alone worsens over-defense accuracy (75.22% → 64.31%). Fig. 7 visualizes a case where ProtectAIv2 and PromptGuard fail but InjecGuard succeeds. Tab. 3 shows conventional shortcut mitigation degrades malicious accuracy by 10.42%." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "Tab. 2 explicitly shows that data-centric augmentation alone reduces over-defense accuracy from 75.22% to 64.31% and reduces average accuracy from 74.64% to 73.87%. Tab. 3 shows shortcut mitigation method hurts malicious performance by 10.42%." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The abstract claims state-of-the-art models suffer from over-defense near random guessing (confirmed: ProtectAIv2 at 56.64% in Tab. 1), InjecGuard surpasses the best model by 30.8% (confirmed as relative improvement: (83.48-63.81)/63.81 = 30.83%), and that code/datasets are released (GitHub URL provided)." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "The paper claims MOF 'significantly reduces the bias on trigger words.' The ablation study (Tab. 2) provides controlled single-variable manipulation: basic dataset → +augmentation → +MOF, showing each component's individual effect. Fig. 
3 provides attention weight visualization supporting the mechanistic claim." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The abstract claims InjecGuard offers 'a robust and open-source solution for detecting prompt injection attacks' broadly. The Limitations section acknowledges NotInject 'may not fully capture the diversity of real-world benign inputs, particularly in domain-specific applications,' but the main claims are not bounded to the tested benchmarks. The title uses the general framing 'Prompt Injection Guardrail Models.'" 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "No substantive discussion of alternative explanations for the results. For example, the improvement could partly stem from InjecGuard having more training data than baselines (whose data is closed-source), but this confound is not addressed. The Limitations section discusses dataset diversity but not alternative explanations for observed performance gains." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "The paper measures accuracy on injection detection benchmarks and claims effectiveness at injection detection. The claims match the granularity of measurements — they do not claim broader properties like 'security' that go beyond what accuracy on these benchmarks measures." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": false, 142 "justification": "GPT-4o-mini is versioned ('2024-07-18 version', Sec. 3.2) and DeBERTaV3-base is specified. However, GPT-4o used as a baseline in Tab. 1 has no version or snapshot date. Llama-2-chat-7b and LlamaGuard3 are named without specific checkpoint identifiers." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": true, 147 "justification": "Full prompt text is provided in Appendix D: Fig. 
11 (word-based generation prompt), Fig. 12 (LLM injection detection/refinement prompt), and Fig. 13 (long-tail format augmentation prompt). The GPT-4o evaluation prompt is also shown in Fig. 12." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": true, 152 "justification": "Sec. 5.1 reports: DeBERTaV3-base backbone, batch size 32, 3 epochs, Adam optimizer, learning rate 2e-5, 100-step warm-up, max token length 512." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "No agentic scaffolding is used. InjecGuard is a single-pass text classification model based on DeBERTa." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Sec. 3.2 documents the NotInject pipeline in detail: trigger word identification via frequency analysis (Alg. 1), LLM-based refinement, human evaluation (3 evaluators), and corpus generation with safety checking. Sec. 4.1 documents training data collection from 20 open-source datasets with data-centric augmentation for long-tail formats. Sec. 4.2 documents the MOF pipeline." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "A dedicated 'Limitations' section is present, discussing that NotInject may not capture domain-specific diversity in fields like healthcare or finance." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": false, 174 "justification": "The Limitations section is brief and somewhat generic: 'may not fully capture the diversity of real-world benign inputs, particularly in domain-specific applications.' It does not discuss specific threats like training/test data overlap risk, the effect of using GPT-4o-mini for data generation while the same-family GPT-4o is an evaluated baseline, or whether the trigger word methodology captures all forms of over-defense."
175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": false, 179 "justification": "The paper does not explicitly state what the results do NOT show. It acknowledges NotInject's limitations vaguely but does not bound the claims — e.g., does not state that results are limited to English text, to the specific benchmarks tested, or that the model has not been tested in production settings." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": true, 186 "justification": "The training dataset, NotInject evaluation dataset, and model are stated to be publicly released. The paper claims to be 'the first work to provide a fully open-source prompt guard model against injection, including the training dataset, strategies, code, and model' (Sec. 6)." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Data collection is described in detail: Sec. 3.2 describes NotInject construction (trigger word identification, refinement, corpus generation). Sec. 4.1 lists all 20 source datasets with counts (Tab. 4, 5, 6). Sec. 4.2 describes MOF data generation." 192 }, 193 "recruitment_methods_described": { 194 "applies": true, 195 "answer": false, 196 "justification": "Three human evaluators 'with security expertise' are employed for trigger word refinement (Sec. 3.2), but their recruitment method, institutional affiliation, and selection criteria are not described. Fig. 6 shows the evaluation agreement but not who these evaluators are or how they were selected." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "The full pipeline is documented with counts at each stage: trigger word identification (Alg. 1) → LLM filtering → human refinement (3 evaluators, scoring threshold >3) → 113 trigger words → corpus generation (113 per subset × 3 subsets = 339 samples) → LLM safety check → manual review. 
Training data pipeline in Sec. 4.1 with counts in Tab. 4-6." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding source, acknowledgments section, or grant information is mentioned anywhere in the paper." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliations are clearly stated: Hao Li at Washington University in St. Louis, Xiaogeng Liu at University of Wisconsin-Madison. Both are academic institutions with no obvious conflict with the evaluated products." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": false, 217 "answer": false, 218 "justification": "No funding is disclosed. Because no funder is identified, funder independence cannot be assessed." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests statement or financial interest declaration is present in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": false, 229 "answer": false, 230 "justification": "The paper evaluates prompt injection defense models, not pre-trained model knowledge/capability on benchmarks. InjecGuard is fine-tuned from the DeBERTaV3-base backbone on curated data. While LLM baselines (GPT-4o, Llama-2-chat) are included, they are tested on defense capability, not knowledge recall." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": false, 234 "answer": false, 235 "justification": "This paper tests defenses (prompt injection detection) rather than model knowledge. Contamination in the traditional sense (model memorizing benchmark answers) is not the primary concern." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": false, 239 "answer": false, 240 "justification": "The paper evaluates defense/detection capability rather than pre-trained model knowledge on benchmarks."
241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in the study. The three human evaluators for trigger word refinement are annotators in the data pipeline, not study participants." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants. The Ethics Statement discusses responsible use of the dataset but does not mention IRB approval." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in the study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in the study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in the study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in the study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in the study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": true, 284 "justification": "Tab. 1 reports GFLOPs (60.45 for InjecGuard) and inference time (15.34 ms) for all models. An efficiency metric (Average Accuracy / Inference Time) is also computed. Fig. 1 plots performance vs time efficiency." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "No total training GPU hours, wall-clock training time, or API costs for data generation (GPT-4o-mini calls) are reported. Only inference-time compute is quantified." 
290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "No mention of multiple random seeds or seed sensitivity analysis. Results appear to be from a single training run." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "The number of experimental runs is not stated anywhere. It is unclear whether results are from single or multiple runs." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "Hyperparameters are reported (Sec. 5.1) but no search budget, search method, or number of configurations tried is disclosed. Only the MOF sampling scale is ablated (Tab. 9)." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": true, 311 "justification": "Tab. 9 systematically evaluates MOF sampling scales (500, 1000, 2000) and justifies the selection of 1000 as the best balance. Tab. 2 builds up the full configuration through ablation, showing why each component was included." 312 }, 313 "multiple_comparison_correction": { 314 "applies": true, 315 "answer": false, 316 "justification": "The paper compares 9 models across 4+ benchmarks and 3 accuracy dimensions without any statistical tests, let alone correction for multiple comparisons." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The authors evaluate their own InjecGuard model against baselines without acknowledging self-comparison bias. They also evaluate on their own NotInject dataset. No independent evaluation or acknowledgment of author-evaluation bias is provided." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": true, 326 "justification": "Fig. 1 plots average accuracy against time efficiency (log scale) for all models. Tab. 
1 includes GFLOPs, inference time, and an efficiency metric (Accuracy/Inference Time) enabling performance-compute comparison." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "No discussion of whether the evaluation benchmarks (PINT, BIPIA, WildGuard, NotInject) validly measure real-world prompt injection detection capability. The paper uses these benchmarks without questioning their construct validity or representativeness." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "No scaffolding is involved. InjecGuard is a single-pass text classification model, and baselines are similarly direct classifiers or prompted LLMs." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "No discussion of temporal leakage. The training datasets (Tab. 4-5) draw from many of the same source datasets as the evaluation benchmarks (e.g., BIPIA_train for training, BIPIA for evaluation; prompt-injections from Deepset for both). Whether temporal ordering prevents leakage is not addressed." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether the evaluation setup leaks information. For example, NotInject was constructed using the same trigger word methodology that informs MOF, creating a potential methodological entanglement." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "Training data (Tab. 4-5) includes samples from BIPIA_train, prompt-injections (Deepset), jailbreak-classification, and other datasets that also contribute to evaluation benchmarks. The paper does not verify independence between training and test distributions or check for near-duplicate examples." 
354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No concrete leakage detection or prevention method (e.g., n-gram overlap, decontamination pipeline) is used to verify train/test separation." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "InjecGuard achieves 83.48% average accuracy, surpassing the open-source runner-up (ProtectAIv2 at 63.81%) by 30.8% relative improvement.", 365 "evidence": "Tab. 1 shows InjecGuard at 83.48% average (87.32% over-defense, 85.74% benign, 77.39% malicious) vs ProtectAIv2 at 63.81%. Relative improvement: (83.48-63.81)/63.81 = 30.83%.", 366 "supported": "strong" 367 }, 368 { 369 "claim": "Existing open-source prompt guard models suffer from severe over-defense, with accuracy near random guessing (50%).", 370 "evidence": "Tab. 1: Fmops 5.60%, Deepset 5.31%, PromptGuard 0.88%, ProtectAIv2 56.64% over-defense accuracy. All open-source models are at or below 60%.", 371 "supported": "strong" 372 }, 373 { 374 "claim": "InjecGuard achieves performance comparable to GPT-4o while being ~500x more efficient.", 375 "evidence": "Tab. 1: InjecGuard 83.48% avg at 15.34ms inference vs GPT-4o 85.53% at 7907.18ms. Efficiency ratio: 5.44/0.01 = 544x. InjecGuard is 2.05pp lower in accuracy.", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "MOF (Mitigating Over-defense for Free) significantly reduces over-defense bias without relying on any specific over-defense dataset.", 380 "evidence": "Tab. 2 ablation: adding MOF with scratch retraining to basic+augmented data improves over-defense from 64.31% to 87.32% and average from 73.87% to 83.48%. Tab. 3 shows MOF outperforms conventional shortcut mitigation (87.32% vs 86.73% over-defense, 77.39% vs 65.53% malicious).", 381 "supported": "strong" 382 }, 383 { 384 "claim": "Data-centric augmentation improves benign and malicious accuracy but exacerbates the over-defense issue.", 385 "evidence": "Tab. 
2: augmentation improves benign (78.53% → 81.36%) and malicious (70.17% → 75.95%) but drops over-defense from 75.22% to 64.31%.", 386 "supported": "strong" 387 }, 388 { 389 "claim": "MOF also improves performance in general (non-trigger-word) over-defense scenarios.", 390 "evidence": "Tab. 10 shows InjecGuard with MOF achieves 91.15% on PINT hard-negative dataset vs 87.86% without MOF. However, this is a single benchmark with no statistical testing.", 391 "supported": "moderate" 392 } 393 ], 394 "red_flags": [ 395 { 396 "flag": "Ambiguous improvement metrics", 397 "detail": "The paper reports '30.8% improvement' which is actually relative improvement ((83.48-63.81)/63.81), not the 19.67 percentage point absolute difference. This presentation can mislead readers into thinking the gap is larger than it is. The paper switches between relative and absolute metrics inconsistently." 398 }, 399 { 400 "flag": "Evaluation heavily relies on authors' own benchmark", 401 "detail": "A significant portion of InjecGuard's advantage comes from the NotInject dataset, which the authors created. While NotInject is a legitimate contribution, evaluating primarily on your own benchmark creates a methodological circularity — the same team that identified the problem (over-defense on trigger words) designed both the evaluation and the solution." 402 }, 403 { 404 "flag": "No error bars or multiple runs", 405 "detail": "All results are single point estimates with no variance, standard deviation, or confidence intervals. Fine-tuning DeBERTa can be sensitive to random seed, and performance differences (e.g., 83.48% vs 85.53% for GPT-4o) could be within noise." 406 }, 407 { 408 "flag": "Potential train/test data overlap", 409 "detail": "Training data (Tab. 4-5) and evaluation benchmarks draw from overlapping source datasets (e.g., BIPIA, Deepset prompt-injections, jailbreak-classification). While different splits may be used, no decontamination or overlap verification is reported." 
410 }, 411 { 412 "flag": "GPT-4o-mini used for data generation while same-family GPT-4o serves as a baseline", 413 "detail": "GPT-4o-mini generates both NotInject samples and MOF training data, while GPT-4o (same model family) serves as a baseline. Generated data could subtly favor patterns that GPT-4o recognizes or produces, potentially biasing the comparison." 414 }, 415 { 416 "flag": "Missing training compute costs", 417 "detail": "The paper reports inference costs but omits total training compute and API costs for GPT-4o-mini data generation. The claim of being 'lightweight' applies to inference but the full cost of the approach (including data generation) is unknown." 418 } 419 ], 420 "cited_papers": [ 421 { 422 "title": "Ignore Previous Prompt: Attack Techniques For Language Models", 423 "authors": ["Fábio Perez", "Ian Ribeiro"], 424 "year": 2022, 425 "arxiv_id": "2211.09527", 426 "relevance": "Foundational work identifying prompt injection attacks on LLMs, including goal hijacking and prompt leakage." 427 }, 428 { 429 "title": "Not what you've signed up for: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection", 430 "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra", "Christoph Endres", "Thorsten Holz", "Mario Fritz"], 431 "year": 2023, 432 "arxiv_id": "2302.12173", 433 "relevance": "Demonstrates indirect prompt injection attacks against real-world LLM-integrated applications." 434 }, 435 { 436 "title": "Automatic and universal prompt injection attacks against large language models", 437 "authors": ["Xiaogeng Liu", "Zhiyuan Yu", "Yizhe Zhang", "Ning Zhang", "Chaowei Xiao"], 438 "year": 2024, 439 "arxiv_id": "2403.04957", 440 "relevance": "Proposes automated methods for generating prompt injection attacks against LLMs."
441 }, 442 { 443 "title": "Llama guard: Llm-based input-output safeguard for human-ai conversations", 444 "authors": ["Hakan Inan", "Kartikeya Upasani", "Jianfeng Chi"], 445 "year": 2023, 446 "arxiv_id": "2312.06674", 447 "relevance": "LLM-based guardrail model for detecting unsafe content in human-AI conversations, used as a baseline comparison." 448 }, 449 { 450 "title": "Benchmarking and Defending Against Indirect Prompt Injection Attacks on Large Language Models", 451 "authors": ["Jingwei Yi", "Yueqi Xie", "Bin Zhu"], 452 "year": 2023, 453 "arxiv_id": "2312.14197", 454 "relevance": "BIPIA benchmark for indirect prompt injection attacks, used as evaluation dataset in this paper." 455 }, 456 { 457 "title": "Wildguard: Open one-stop moderation tools for safety risks, jailbreaks", 458 "authors": ["Seungju Han", "Kavel Rao", "Allyson Ettinger"], 459 "year": 2024, 460 "arxiv_id": "2406.18495", 461 "relevance": "Open-source moderation tool with benchmark for safety risk detection, used for benign accuracy evaluation." 462 }, 463 { 464 "title": "Agentdojo: A dynamic environment to evaluate attacks and defenses for llm agents", 465 "authors": ["Edoardo Debenedetti", "Jie Zhang", "Mislav Balunović"], 466 "year": 2024, 467 "arxiv_id": "2406.13352", 468 "relevance": "Dynamic benchmark for evaluating prompt injection attacks and defenses in LLM agent settings." 469 }, 470 { 471 "title": "Injecagent: Benchmarking indirect prompt injections in tool-integrated large language model agents", 472 "authors": ["Qiusi Zhan", "Zhixiang Liang", "Zifan Ying", "Daniel Kang"], 473 "year": 2024, 474 "arxiv_id": "2403.02691", 475 "relevance": "Benchmark for indirect prompt injection in tool-integrated LLM agents." 
476 }, 477 { 478 "title": "Optimization-based prompt injection attack to llm-as-a-judge", 479 "authors": ["Jiawen Shi", "Zenghui Yuan", "Yinuo Liu"], 480 "year": 2024, 481 "arxiv_id": "2403.17710", 482 "relevance": "Optimization-based prompt injection targeting LLM-as-a-judge evaluation paradigms." 483 }, 484 { 485 "title": "Neural exec: Learning (and learning from) execution triggers for prompt injection attacks", 486 "authors": ["Dario Pasquini", "Martin Strohmeier", "Carmela Troncoso"], 487 "year": 2024, 488 "arxiv_id": "2403.03792", 489 "relevance": "Novel approach to learning execution triggers for prompt injection, advancing attack methodology understanding." 490 }, 491 { 492 "title": "Struq: Defending against prompt injection with structured queries", 493 "authors": ["Sizhe Chen", "Julien Piet", "Chawin Sitawarin", "David A. Wagner"], 494 "year": 2024, 495 "relevance": "Defense approach using structured queries to prevent prompt injection attacks." 496 }, 497 { 498 "title": "Prompt Injection Attacks and Defenses in LLM-Integrated Applications", 499 "authors": ["Yupei Liu", "Yuqi Jia", "Runpeng Geng", "Jinyuan Jia", "Neil Zhenqiang Gong"], 500 "year": 2023, 501 "arxiv_id": "2310.12815", 502 "relevance": "Comprehensive study of prompt injection attacks and defenses in LLM-integrated applications." 503 } 504 ], 505 "engagement_factors": { 506 "practical_relevance": { 507 "score": 2, 508 "justification": "InjecGuard is a released, lightweight prompt injection detector that practitioners could deploy, though it addresses a niche security concern rather than being a general-purpose tool." 509 }, 510 "surprise_contrarian": { 511 "score": 1, 512 "justification": "Showing that existing guard models have severe over-defense issues (accuracy near random guessing) is mildly surprising but not deeply contrarian — false positive problems in security classifiers are well-known." 
513 }, 514 "fear_safety": { 515 "score": 2, 516 "justification": "Directly addresses prompt injection attack defense, a real security concern for LLM deployments, and demonstrates that current defenses are inadequate." 517 }, 518 "drama_conflict": { 519 "score": 1, 520 "justification": "Implicitly criticizes Meta's PromptGuard (0.88% over-defense accuracy) and ProtectAI's model, but framed academically without sensationalism." 521 }, 522 "demo_ability": { 523 "score": 2, 524 "justification": "Code, model, and datasets are released on GitHub, making it testable. However, it requires setup as a Python/HuggingFace model rather than being a one-click demo." 525 }, 526 "brand_recognition": { 527 "score": 1, 528 "justification": "From academic labs (WashU, UW-Madison), not a major AI lab. References Meta and OpenAI products as baselines but is not from these organizations." 529 } 530 } 531 }