scan.json (27617B)
1 { 2 "paper": { 3 "title": "Bypassing LLM Guardrails: An Empirical Analysis of Evasion Attacks against Prompt Injection and Jailbreak Detection Systems", 4 "authors": [ 5 "William Hackett", 6 "Lewis Birch", 7 "Stefan Trawicki", 8 "Neeraj Suri", 9 "Peter Garraghan" 10 ], 11 "year": 2025, 12 "venue": "arXiv", 13 "arxiv_id": "2504.11168" 14 }, 15 "checklist": { 16 "artifacts": { 17 "code_released": { 18 "applies": true, 19 "answer": false, 20 "justification": "No source code repository URL is provided in the paper. The authors mention a HuggingFace dataset of evaded samples (footnote 2) but no code for reproducing the attacks or experimental pipeline." 21 }, 22 "data_released": { 23 "applies": true, 24 "answer": true, 25 "justification": "The authors released evaded prompt injection and jailbreak samples on HuggingFace (https://huggingface.co/datasets/Mindgard/evaded-prompt-injection-and-jailbreak-samples, footnote 2 in Section 5). The datasets used (safe-guard-prompt-injection, hackGPT) are publicly available." 26 }, 27 "environment_specified": { 28 "applies": true, 29 "answer": false, 30 "justification": "No requirements.txt, Dockerfile, or detailed environment setup section is provided. The paper mentions using TextAttack (Morris et al., 2020) but does not specify library versions or Python version." 31 }, 32 "reproduction_instructions": { 33 "applies": true, 34 "answer": false, 35 "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The experimental setup section (Section 4) describes the methodology at a high level but not enough to reproduce without guessing implementation details." 36 } 37 }, 38 "statistical_methodology": { 39 "confidence_intervals_or_error_bars": { 40 "applies": true, 41 "answer": false, 42 "justification": "All results are reported as point estimates of Attack Success Rate (ASR) with no confidence intervals or error bars. Tables and figures show single percentage values only." 
43 }, 44 "significance_tests": { 45 "applies": true, 46 "answer": false, 47 "justification": "The paper makes comparative claims about which attacks and defenses are more effective (e.g., 'Vijil Prompt Injection exhibited the highest susceptibility') but provides no statistical significance tests. Comparisons are based solely on comparing raw percentages." 48 }, 49 "effect_sizes_reported": { 50 "applies": true, 51 "answer": true, 52 "justification": "The paper provides ASR percentages with baseline context. Table 3 reports baseline ASR, new ASR, and delta (percentage change). Section 5 gives specific ASR values for each technique against each guardrail, providing enough context to assess magnitude (e.g., 'Emoji Smuggling achieved 100% ASR', 'Protect AI v2 reduced prompt injection ASR to 20.26%')." 53 }, 54 "sample_size_justified": { 55 "applies": true, 56 "answer": false, 57 "justification": "The jailbreak dataset contains only 78 prompts, which is quite small for drawing broad conclusions. No justification for this sample size is provided, nor is there any acknowledgment that 78 prompts may be too few for reliable ASR estimates." 58 }, 59 "variance_reported": { 60 "applies": true, 61 "answer": false, 62 "justification": "No variance, standard deviation, or spread measures are reported. All ASR results appear to be from single experimental runs with no indication of multiple runs or seed variation." 63 } 64 }, 65 "evaluation_design": { 66 "baselines_included": { 67 "applies": true, 68 "answer": true, 69 "justification": "Table A.1 reports detection baselines for each guardrail system on the unperturbed adversarial samples. The evasion results are compared against these baseline detection rates, showing how much each attack degrades detection." 
70 }, 71 "baselines_contemporary": { 72 "applies": true, 73 "answer": true, 74 "justification": "The six guardrail systems tested are contemporary: Azure Prompt Shield (2024), ProtectAI v1 (November 2023), ProtectAI v2 (April 2024), Meta Prompt Guard (2024), Vijil (2025), NeMo Guard (2024). These represent current state-of-the-art protection systems." 75 }, 76 "ablation_study": { 77 "applies": true, 78 "answer": false, 79 "justification": "The paper tests multiple attack techniques independently but does not perform ablation studies to isolate which components of the attacks contribute most to evasion success. For example, the word importance transferability experiment (Section 5.3) is a separate experiment, not a systematic ablation of the AML pipeline." 80 }, 81 "multiple_metrics": { 82 "applies": true, 83 "answer": false, 84 "justification": "The paper uses only a single metric: Attack Success Rate (ASR), defined as the rate at which modified prompts are misclassified as benign. No other metrics such as semantic similarity preservation, query efficiency, or perceptibility scores are reported." 85 }, 86 "human_evaluation": { 87 "applies": true, 88 "answer": false, 89 "justification": "No human evaluation of the perturbed prompts is included. The paper claims perturbations 'maintain adversarial utility' and are 'imperceptible' but provides no human judgment to verify these claims. Section 8 (Limitations) acknowledges that 'more rigorous quantitative analyses are needed to determine how perturbations affect the success rate and intended behavior of modified prompt injections or jailbreaks.'" 90 }, 91 "held_out_test_set": { 92 "applies": true, 93 "answer": true, 94 "justification": "The prompt injection dataset uses a test set split: 'From its test set (2,060 examples), we selected only adversarial samples (650 examples), finally filtering out jailbreak samples totaling 476 prompt injection prompts' (Section 4). The jailbreak dataset is a separate repository of 78 prompts." 
95 }, 96 "per_category_breakdown": { 97 "applies": true, 98 "answer": true, 99 "justification": "Results are broken down per attack technique and per guardrail system, shown in Figures 2-5 and Tables A.5-A.8. This allows readers to see which specific attacks are effective against which specific guardrails." 100 }, 101 "failure_cases_discussed": { 102 "applies": true, 103 "answer": true, 104 "justification": "The paper discusses cases where attacks were less effective: 'Deletion Characters' was the least effective (26.82% for prompt injections, 7.95% for jailbreaks), and Pruthi showed decreased ASR with transferability (-18.18% and -1.01%). The Limitations section acknowledges adversarial prompt efficacy concerns." 105 }, 106 "negative_results_reported": { 107 "applies": true, 108 "answer": true, 109 "justification": "The paper reports that some attack techniques had low success rates (e.g., Pruthi, Alzantot on jailbreaks), and that transferability decreased ASR for Pruthi (-18.18% for jailbreaks, -1.01% for prompt injections in Table 3). It also notes that AML evasion had lower overall success than character injection." 110 } 111 }, 112 "claims_and_evidence": { 113 "abstract_claims_supported": { 114 "applies": true, 115 "answer": true, 116 "justification": "The abstract claims that 'both methods can be used to evade detection while maintaining adversarial utility achieving in some instances up to 100% evasion success' — this is supported by the results (Emoji Smuggling achieves 100% ASR). The claim about word importance ranking transferability is supported by Table 3. The qualifier 'in some instances' appropriately hedges the claim." 117 }, 118 "causal_claims_justified": { 119 "applies": true, 120 "answer": true, 121 "justification": "The paper's causal claims are about attacks causing evasion, which is directly demonstrated through controlled experiments: applying specific perturbations (the treatment) and measuring whether classification changes (the outcome). 
This is an adequate experimental design for these causal claims." 122 }, 123 "generalization_bounded": { 124 "applies": true, 125 "answer": false, 126 "justification": "The title claims 'Bypassing LLM Guardrails' generally, but the study tests only 6 specific guardrail systems. The abstract says 'vulnerabilities within current LLM protection mechanisms' without bounding this to the tested systems. Section 8 partially addresses this by noting 'it limits the generalizability of our findings' regarding the black-box scope, but the title and conclusion still overgeneralize." 127 }, 128 "alternative_explanations_discussed": { 129 "applies": true, 130 "answer": true, 131 "justification": "The Discussion section (Section 6) considers alternative explanations: differences in training data exposure (Section 6.1), differences in tokenizer handling (Section 6.1), and design differences in input size and token support (Section 6.3). Section 6.1 explains why success varies across models." 132 } 133 }, 134 "setup_transparency": { 135 "model_versions_specified": { 136 "applies": true, 137 "answer": false, 138 "justification": "The guardrail models are named but specific version hashes or dates are not given for most. ProtectAI v1 and v2 have release dates ('25th November 2023' and '21st April 2024'). Meta Prompt Guard is described as fine-tuned from mDeBERTa-v3-base (86M parameters). GPT-4o-mini is mentioned without a snapshot date. Azure Prompt Shield has no version identifier." 139 }, 140 "prompts_provided": { 141 "applies": false, 142 "answer": false, 143 "justification": "The paper does not use prompting in the traditional sense — it tests classification-based guardrail systems, not prompt-based LLM evaluations. The 'prompts' are the adversarial inputs themselves, which are described via the datasets used and example perturbations shown in the Appendix." 
144 }, 145 "hyperparameters_reported": { 146 "applies": true, 147 "answer": false, 148 "justification": "No hyperparameters are reported for the AML evasion techniques (e.g., perturbation budget, similarity thresholds, number of iterations). TextAttack is mentioned as the framework but no configuration details are provided. The character injection techniques have no tunable parameters, but the AML techniques do and these are not documented." 149 }, 150 "scaffolding_described": { 151 "applies": false, 152 "answer": false, 153 "justification": "No agentic scaffolding is used. The paper tests classification-based guardrail systems with automated attack techniques, not agentic pipelines." 154 }, 155 "data_preprocessing_documented": { 156 "applies": true, 157 "answer": true, 158 "justification": "Section 4 describes the data pipeline: 'From its test set (2,060 examples), we selected only adversarial samples (650 examples), finally filtering out jailbreak samples totaling 476 prompt injection prompts.' The jailbreak dataset source and size (78 prompts) are specified. The filtering steps and counts are documented." 159 } 160 }, 161 "limitations_and_scope": { 162 "limitations_section_present": { 163 "applies": true, 164 "answer": true, 165 "justification": "Section 8 is titled 'Limitations' and provides substantive discussion of three specific limitations: black-box target scope, transferability mechanisms, and adversarial prompt efficacy." 166 }, 167 "threats_to_validity_specific": { 168 "applies": true, 169 "answer": true, 170 "justification": "The limitations are specific to this study: 'Our study focused solely on Azure Prompt Shield as the representative black-box target' (Section 8), and 'the underlying mechanisms driving this transferability, particularly regarding word importance, remain unclear' — these are specific to the experimental design, not generic disclaimers." 
171 }, 172 "scope_boundaries_stated": { 173 "applies": true, 174 "answer": true, 175 "justification": "Section 8 explicitly states scope boundaries: the black-box evaluation is limited to Azure Prompt Shield only, transferability mechanisms are not fully understood, and the efficacy of perturbed prompts against actual LLMs has not been rigorously tested. These are specific things the paper did NOT test." 176 } 177 }, 178 "data_integrity": { 179 "raw_data_available": { 180 "applies": true, 181 "answer": true, 182 "justification": "The evaded prompt samples are released on HuggingFace (footnote 2), the prompt injection dataset (safe-guard-prompt-injection) is public, and the jailbreak dataset (hackGPT) is publicly available on GitHub. This allows independent verification of the attack results." 183 }, 184 "data_collection_described": { 185 "applies": true, 186 "answer": true, 187 "justification": "Section 4 describes how the evaluation data was collected: the prompt injection dataset (Erdogan et al., 2024) with 10,296 examples, filtering to 476 prompt injections from the test set, and the jailbreak dataset (NoDataFound, 2024) with 78 prompts." 188 }, 189 "recruitment_methods_described": { 190 "applies": false, 191 "answer": false, 192 "justification": "No human participants were involved. The study uses publicly available datasets and automated attacks against guardrail systems." 193 }, 194 "data_pipeline_documented": { 195 "applies": true, 196 "answer": true, 197 "justification": "Section 4 documents the pipeline: dataset selection, filtering criteria (test set only, adversarial samples only, jailbreak samples removed), final sample counts (476 prompt injections, 78 jailbreaks), attack application via automated system/TextAttack, and evaluation against guardrails. The data flow is traceable." 
198 } 199 }, 200 "conflicts_of_interest": { 201 "funding_disclosed": { 202 "applies": true, 203 "answer": false, 204 "justification": "No funding information is disclosed anywhere in the paper. There is no acknowledgments section listing grants or sponsors, only a brief acknowledgment of guardrail vendors for the disclosure process (Section 10)." 205 }, 206 "affiliations_disclosed": { 207 "applies": true, 208 "answer": true, 209 "justification": "Author affiliations are listed: '1Mindgard, 2Lancaster University'. All authors have dual affiliations with Mindgard (an AI security company) and Lancaster University. The Mindgard affiliation is clearly listed." 210 }, 211 "funder_independent_of_outcome": { 212 "applies": true, 213 "answer": false, 214 "justification": "No funding source is disclosed. However, all authors are affiliated with Mindgard, an AI security company. Mindgard has a commercial interest in demonstrating that existing guardrails are vulnerable (their business is AI security testing). This potential conflict is not discussed. The funder independence cannot be assessed because no funding is disclosed." 215 }, 216 "financial_interests_declared": { 217 "applies": true, 218 "answer": false, 219 "justification": "No competing interests statement is provided. The authors are affiliated with Mindgard, an AI security company that would commercially benefit from demonstrating guardrail vulnerabilities, but this financial interest is not declared or discussed." 220 } 221 }, 222 "contamination": { 223 "training_cutoff_stated": { 224 "applies": false, 225 "answer": false, 226 "justification": "The paper tests evasion attacks against guardrail classification models, not pre-trained LLM capabilities on benchmarks. The study evaluates whether guardrails can detect perturbed malicious prompts, not whether a model has memorized test data." 
227 }, 228 "train_test_overlap_discussed": { 229 "applies": false, 230 "answer": false, 231 "justification": "Same as above — the paper does not evaluate a pre-trained model's knowledge on a benchmark. It tests whether classification-based guardrails can detect adversarially perturbed inputs." 232 }, 233 "benchmark_contamination_addressed": { 234 "applies": false, 235 "answer": false, 236 "justification": "Not applicable — the paper is a security evaluation of guardrail classifiers, not a benchmark evaluation of LLM capabilities. Data contamination in the traditional sense is not a relevant concern here." 237 } 238 }, 239 "human_studies": { 240 "pre_registered": { 241 "applies": false, 242 "answer": false, 243 "justification": "No human participants were involved in this study." 244 }, 245 "irb_or_ethics_approval": { 246 "applies": false, 247 "answer": false, 248 "justification": "No human participants were involved in this study." 249 }, 250 "demographics_reported": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants were involved in this study." 254 }, 255 "inclusion_exclusion_criteria": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants were involved in this study." 259 }, 260 "randomization_described": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants were involved in this study." 264 }, 265 "blinding_described": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants were involved in this study." 269 }, 270 "attrition_reported": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants were involved in this study." 274 } 275 }, 276 "cost_and_practicality": { 277 "inference_cost_reported": { 278 "applies": true, 279 "answer": false, 280 "justification": "No API costs, token consumption, or wall-clock time is reported for the evasion attacks or the guardrail evaluations. 
This is relevant because AML attacks require many queries to the target model, and the paper does not report query counts or costs." 281 }, 282 "compute_budget_stated": { 283 "applies": true, 284 "answer": false, 285 "justification": "No computational budget is stated. The paper does not report GPU hours, API costs, hardware used, or total time to run experiments." 286 } 287 } 288 }, 289 "claims": [ 290 { 291 "claim": "Character injection techniques and AML evasion techniques can bypass LLM guardrail detection, achieving up to 100% evasion success in some instances.", 292 "evidence": "Figures 2-3 and Tables A.5-A.6 show Emoji Smuggling achieves 100% ASR across all tested guardrails for both prompt injections and jailbreaks. Multiple other character injection techniques achieve high ASRs (Section 5.1).", 293 "supported": "strong" 294 }, 295 { 296 "claim": "Vijil Prompt Injection exhibited the highest susceptibility to character injection attacks, with average ASRs of 87.95% for prompt injections and 91.67% for jailbreaks.", 297 "evidence": "Section 5.1 reports these figures, with per-technique breakdowns in Tables A.5 and A.6 supporting the claim.", 298 "supported": "strong" 299 }, 300 { 301 "claim": "AML evasion techniques exhibit lower overall success rates compared to character injection.", 302 "evidence": "Section 5.2 reports AML ASRs ranging from under 18% to 57.57%, compared to character injection techniques reaching up to 100%. This pattern is consistent across the results in Figures 4-5.", 303 "supported": "strong" 304 }, 305 { 306 "claim": "White-box word importance ranking transferability can enhance ASR against black-box targets.", 307 "evidence": "Table 3 shows 6 of 8 techniques improved jailbreak ASR and 7 of 8 improved prompt injection ASR when using Protect AI v2 to compute word importance rankings for attacking Azure Prompt Shield. 
However, improvements are modest (e.g., Bert-Attack prompt injection ASR improved from 65.34% to 73.11%).", 308 "supported": "moderate" 309 }, 310 { 311 "claim": "Perturbed prompts maintain adversarial utility while evading detection.", 312 "evidence": "The paper provides qualitative examples in Appendix Tables A.2-A.4 but acknowledges in Section 8 (Limitations) that 'more rigorous quantitative analyses are needed to determine how perturbations affect the success rate and intended behavior.' No systematic evaluation of prompt efficacy post-perturbation is conducted.", 313 "supported": "weak" 314 } 315 ], 316 "methodology_tags": [ 317 "benchmark-eval" 318 ], 319 "key_findings": "Character injection techniques (especially emoji smuggling, Unicode tags, and number substitution) can achieve near-complete evasion of LLM guardrail systems, with emoji smuggling reaching 100% ASR across all tested guardrails. Adversarial ML evasion techniques are less effective overall but still achieve significant bypass rates, particularly against white-box targets. Word importance ranking computed from white-box models can transfer to improve attack success against black-box guardrails, though improvements are modest. The study reveals that the gap between what guardrail classifiers can detect and what LLMs can interpret creates an exploitable vulnerability.", 320 "red_flags": [ 321 { 322 "flag": "Company evaluating competitor products", 323 "detail": "All authors are affiliated with Mindgard, an AI security company. The paper demonstrates that competitor guardrail products (Azure, Meta, ProtectAI, Vijil, NeMo) are vulnerable to attacks, which directly benefits Mindgard's commercial interests. No conflict of interest statement is included." 324 }, 325 { 326 "flag": "Small jailbreak dataset", 327 "detail": "The jailbreak evaluation uses only 78 prompts (from the hackGPT repository), which is small for drawing generalizable conclusions about attack effectiveness. 
ASR percentages on 78 samples have wide implicit confidence intervals (e.g., 11.54% = 9/78 samples)." 328 }, 329 { 330 "flag": "No uncertainty quantification", 331 "detail": "All results are single-run point estimates with no confidence intervals, error bars, or variance measures. Given the small jailbreak dataset size, the reported ASR percentages may not be stable estimates." 332 }, 333 { 334 "flag": "Adversarial utility not rigorously verified", 335 "detail": "The paper claims perturbed prompts 'maintain adversarial utility' but provides no systematic evaluation of whether perturbed prompts actually achieve their malicious intent when passed to an LLM. The Limitations section acknowledges this gap." 336 }, 337 { 338 "flag": "Missing attack cost analysis", 339 "detail": "AML evasion techniques require multiple queries to the target model for word importance ranking and perturbation refinement, but the number of queries needed per successful evasion is never reported, making practical threat assessment difficult." 340 } 341 ], 342 "cited_papers": [ 343 { 344 "title": "Breaking down the defenses: A comparative survey of attacks on large language models", 345 "authors": ["Arijit Ghosh Chowdhury", "Md Mofijul Islam", "Vaibhav Kumar", "Faysal Hossain Shezan", "Vaibhav Kumar", "Vinija Jain", "Aman Chadha"], 346 "year": 2024, 347 "arxiv_id": "2403.04786", 348 "relevance": "Comprehensive survey of attacks on LLMs, directly relevant to understanding the threat landscape for LLM guardrails." 349 }, 350 { 351 "title": "Safeguarding large language models: A survey", 352 "authors": ["Yi Dong", "Ronghui Mu", "Yanghao Zhang", "Siqi Sun", "Tianle Zhang", "Changshun Wu", "Gaojie Jin", "Yi Qi", "Jinwei Hu", "Jie Meng", "Saddek Bensalem", "Xiaowei Huang"], 353 "year": 2024, 354 "arxiv_id": "2406.02622", 355 "relevance": "Survey on LLM safeguarding techniques including guardrails, directly relevant to the defense mechanisms evaluated in this paper." 
356 }, 357 { 358 "title": "Bad characters: Imperceptible NLP attacks", 359 "authors": ["Nicholas Boucher", "Ilia Shumailov", "Ross Anderson", "Nicolas Papernot"], 360 "year": 2021, 361 "arxiv_id": "2106.09898", 362 "relevance": "Foundational work on character-level adversarial attacks against NLP models, a core technique category evaluated in this paper." 363 }, 364 { 365 "title": "Automatic and universal prompt injection attacks against large language models", 366 "authors": ["Xiaogeng Liu", "Zhiyuan Yu", "Yizhe Zhang", "Ning Zhang", "Chaowei Xiao"], 367 "year": 2024, 368 "arxiv_id": "2403.04957", 369 "relevance": "Automatic prompt injection attack methods against LLMs, directly relevant to the prompt injection threat model." 370 }, 371 { 372 "title": "AutoDAN: Generating stealthy jailbreak prompts on aligned large language models", 373 "authors": ["Xiaogeng Liu", "Nan Xu", "Muhao Chen", "Chaowei Xiao"], 374 "year": 2024, 375 "arxiv_id": "2310.04451", 376 "relevance": "Automated jailbreak generation method relevant to understanding the jailbreak attack landscape." 377 }, 378 { 379 "title": "Fundamental limitations of alignment in large language models", 380 "authors": ["Yotam Wolf", "Noam Wies", "Oshri Avnery", "Yoav Levine", "Amnon Shashua"], 381 "year": 2024, 382 "relevance": "Theoretical analysis of alignment limitations in LLMs, providing foundational context for why guardrails may be fundamentally bypassable." 383 }, 384 { 385 "title": "Emoji attack: Enhancing jailbreak attacks against judge LLM detection", 386 "authors": ["Zhipeng Wei", "Yuqi Liu", "N. Benjamin Erichson"], 387 "year": 2025, 388 "arxiv_id": "2411.01077", 389 "relevance": "Directly related work on emoji-based attacks against LLM detection systems, one of the most effective techniques found in this paper." 
390 }, 391 { 392 "title": "TextAttack: A framework for adversarial attacks, data augmentation, and adversarial training in NLP", 393 "authors": ["John Morris", "Eli Lifland", "Jin Yong Yoo", "Jake Grigsby", "Di Jin", "Yanjun Qi"], 394 "year": 2020, 395 "relevance": "The framework used to implement the AML evasion techniques in this paper, relevant to understanding reproducibility of adversarial NLP attacks." 396 }, 397 { 398 "title": "Improved large language model jailbreak detection via pretrained embeddings", 399 "authors": ["Erick Galinkin", "Martin Sablotny"], 400 "year": 2024, 401 "arxiv_id": "2412.01547", 402 "relevance": "Describes the NeMo Guard jailbreak detection system, one of the six guardrails evaluated in this paper." 403 }, 404 { 405 "title": "The Llama 3 herd of models", 406 "authors": ["Abhimanyu Dubey et al."], 407 "year": 2024, 408 "arxiv_id": "2407.21783", 409 "relevance": "Describes Meta's Prompt Guard classifier used in guardrail systems, one of the six guardrails evaluated in this paper." 410 }, 411 { 412 "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena", 413 "authors": ["Lianmin Zheng", "Wei-Lin Chiang", "Ying Sheng"], 414 "year": 2024, 415 "relevance": "LLM evaluation methodology paper relevant to understanding how LLM-based content safety systems are assessed." 416 }, 417 { 418 "title": "'Prompter Says': A linguistic approach to understanding and detecting jailbreak attacks against large-language models", 419 "authors": ["Dylan Lee", "Shaoyuan Xie", "Shagoto Rahman", "Kenneth Pat", "David Lee", "Qi Alfred Chen"], 420 "year": 2024, 421 "relevance": "Linguistic approach to jailbreak detection, relevant to understanding alternative detection methods for LLM guardrails." 422 } 423 ] 424 }