scan.json (30617B)
1 { 2 "paper": { 3 "title": "GUARDIAN: A Multi-Tiered Defense Architecture for Thwarting Prompt Injection Attacks on LLMs", 4 "authors": [ 5 "Parijat Rai", 6 "Saumil Sood", 7 "Vijay K. Madisetti", 8 "Arshdeep Bahga" 9 ], 10 "year": 2024, 11 "venue": "Journal of Software Engineering and Applications", 12 "doi": "10.4236/jsea.2024.171003" 13 }, 14 "scan_version": 3, 15 "active_modules": [ 16 "experimental_rigor", 17 "data_leakage" 18 ], 19 "methodology_tags": [ 20 "benchmark-eval", 21 "case-study" 22 ], 23 "key_findings": "GUARDIAN proposes a three-tiered defense (system prompt filter, BERT toxic classifier + ethical prompt generator, LLM self-review) that blocks 100% of 50 custom adversarial prompts against Llama-2-7b-chat. The system prompt layer blocks 40%, the pre-processing layer adds 20% (cumulative 60%), and the pre-display filter catches the remaining 40%. Zephyr-7B-α outperforms FLAN-T5-Large for ethical prompt generation based on perplexity scores. The evaluation is limited to a small author-crafted dataset with no comparison to existing defense methods.", 24 "checklist": { 25 "artifacts": { 26 "code_released": { 27 "applies": true, 28 "answer": false, 29 "justification": "No repository URL, GitHub link, or code archive is provided anywhere in the paper. The fine-tuned models and evaluation scripts are not released." 30 }, 31 "data_released": { 32 "applies": true, 33 "answer": false, 34 "justification": "The 50-prompt adversarial dataset is described structurally (5 base prompts × 10 sub-prompts) but not released. The generated ethical prompt dataset is also not released. Only the source Jigsaw dataset is public." 35 }, 36 "environment_specified": { 37 "applies": true, 38 "answer": false, 39 "justification": "The paper mentions using an RTX A6000 GPU, LM Studio, and lists library names (torch, transformers, pandas, etc.) but provides no requirements.txt, Dockerfile, or specific library versions. This is insufficient to recreate the environment." 
40 }, 41 "reproduction_instructions": { 42 "applies": true, 43 "answer": false, 44 "justification": "No step-by-step reproduction instructions are provided. The methodology sections describe the general approach and library names but do not include commands, scripts, or a reproducibility guide." 45 } 46 }, 47 "statistical_methodology": { 48 "confidence_intervals_or_error_bars": { 49 "applies": true, 50 "answer": false, 51 "justification": "No confidence intervals or error bars are reported for the defense blocking results (40%, 60%, 100%) or for the generation model metrics in Tables 4-5. All results are point estimates." 52 }, 53 "significance_tests": { 54 "applies": true, 55 "answer": false, 56 "justification": "No statistical significance tests are used. Comparisons between defense layers and between generation models (FLAN-T5 vs Zephyr) are made by comparing raw numbers without any tests." 57 }, 58 "effect_sizes_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "Raw blocking counts (20/50, 30/50, 50/50) and percentages are reported but no formal effect sizes. The perplexity and ROUGE comparisons between models lack effect size measures." 62 }, 63 "sample_size_justified": { 64 "applies": true, 65 "answer": false, 66 "justification": "The sample size of 50 adversarial prompts (5 base × 10 sub-prompts) is stated but never justified. No power analysis or discussion of whether 50 prompts is sufficient to evaluate a defense system." 67 }, 68 "variance_reported": { 69 "applies": true, 70 "answer": false, 71 "justification": "No variance, standard deviation, or spread measures are reported across runs for any experiment. Single-run results only for both the defense evaluation and model training." 72 } 73 }, 74 "evaluation_design": { 75 "baselines_included": { 76 "applies": true, 77 "answer": true, 78 "justification": "The undefended Llama-2-7b-chat model serves as a baseline (0% blocking rate). 
Section 3.1 identifies Helbling et al.'s LLM Self Defense as the baseline reference, and the third filter builds upon that approach." 79 }, 80 "baselines_contemporary": { 81 "applies": true, 82 "answer": false, 83 "justification": "The paper cites contemporary defense methods (SmoothLLM, RA-LLMs, Moving Target Defense, Self-Reminder) in the literature review (Section 3.2) but does not compare GUARDIAN against any of them. The only comparison is against no defense at all." 84 }, 85 "ablation_study": { 86 "applies": true, 87 "answer": true, 88 "justification": "Tables 1-3 effectively present an ablation: Layer 1 alone blocks 40%, Layer 1+2 blocks 60%, Layer 1+2+3 blocks 100%. Each layer's incremental contribution is measured." 89 }, 90 "multiple_metrics": { 91 "applies": true, 92 "answer": true, 93 "justification": "The defense evaluation uses blocking accuracy. The generation model evaluation (Section 6.4) uses ROUGE scores, perplexity, and validation loss. Tables 4-5 report these multiple metrics." 94 }, 95 "human_evaluation": { 96 "applies": true, 97 "answer": false, 98 "justification": "No human evaluation of the defense's blocking quality or the ethical prompt generator's output quality. Testing was manual input of prompts by the authors (Section 4.6) but this is procedure, not evaluation by independent humans." 99 }, 100 "held_out_test_set": { 101 "applies": true, 102 "answer": false, 103 "justification": "While the BERT classifier and Zephyr model use train/test splits, the primary defense evaluation uses all 50 prompts that were crafted during development. There is no separation between prompts used to develop/tune the defense and prompts used for final evaluation." 104 }, 105 "per_category_breakdown": { 106 "applies": true, 107 "answer": true, 108 "justification": "Tables 1-3 show per-base-prompt × per-sub-prompt breakdowns (5 BP rows × 10 SP columns). 
Tables 4-5 show per-configuration results for the generation models across different sample sizes and epochs." 109 }, 110 "failure_cases_discussed": { 111 "applies": true, 112 "answer": false, 113 "justification": "The paper claims 100% blocking and does not discuss failure cases. The only acknowledgment is a brief disclaimer: 'this may not hold for all jailbreak prompts, as attackers could develop prompts that bypass our system' (Section 6.3). No analysis of what types of attacks might succeed." 114 }, 115 "negative_results_reported": { 116 "applies": true, 117 "answer": true, 118 "justification": "Section 6.4 reports that FLAN-T5-Large showed 'suboptimal performance' compared to Zephyr, with higher perplexity and lower ROUGE scores. The authors attribute this to the 'highly specialized nature of the task' diverging from FLAN-T5's strengths." 119 } 120 }, 121 "claims_and_evidence": { 122 "abstract_claims_supported": { 123 "applies": true, 124 "answer": true, 125 "justification": "The abstract claims '100% of attack prompts' blocked, which is shown in Table 3. 'Quantitatively evaluated defense layers' is supported by Tables 1-3. The 'ethical substitution mechanism' is demonstrated in Section 5.3.2 and Appendix A.5." 126 }, 127 "causal_claims_justified": { 128 "applies": true, 129 "answer": true, 130 "justification": "The paper claims each defense layer blocks specific prompts. The ablation design (adding layers incrementally) with controlled single-variable manipulation at each step adequately supports these causal claims about each layer's contribution." 131 }, 132 "generalization_bounded": { 133 "applies": true, 134 "answer": false, 135 "justification": "The paper tests only on Llama-2-7b-chat with 50 custom prompts, yet the title claims 'Thwarting Prompt Injection Attacks on LLMs' (plural, general) and the abstract states the approach 'fortifies smaller LLMs against emerging cyber threats.' These generalizations are not bounded to the tested setting." 
136 }, 137 "alternative_explanations_discussed": { 138 "applies": true, 139 "answer": false, 140 "justification": "No alternative explanations are discussed for the 100% blocking rate. The paper does not consider that the rate may be an artifact of the small, self-crafted dataset, or that the prompts may be inherently easy to detect, or that the BERT classifier's performance may not generalize beyond Jigsaw-like toxicity." 141 }, 142 "proxy_outcome_distinction": { 143 "applies": true, 144 "answer": true, 145 "justification": "The paper measures blocking rate and claims blocking rate. The proxy matches the claim — they test whether prompts are blocked and report the blocking percentage. No proxy gap exists." 146 } 147 }, 148 "setup_transparency": { 149 "model_versions_specified": { 150 "applies": true, 151 "answer": true, 152 "justification": "Specific model names with sizes are provided: 'Llama-2-7b-chat' (Section 5.1), 'bert-base-uncased' (Section 5.3.2), 'Zephyr-7B-α' (Section 5.3.2), 'FLAN-T5-Large' with 780M parameters (Table 4). These are specific enough to identify exact model checkpoints." 153 }, 154 "prompts_provided": { 155 "applies": true, 156 "answer": false, 157 "justification": "Only 2 attack prompt examples are shown in Appendix A.1, truncated with ellipses. The system prompt addition ('Keep in mind the ethical boundaries') is provided. But the full set of 5 base prompts and 10 sub-prompts is not provided, making reproduction of the 50-prompt evaluation impossible." 158 }, 159 "hyperparameters_reported": { 160 "applies": true, 161 "answer": false, 162 "justification": "The paper lists hyperparameter names (temperature, top_p, top_k, max_token_length, lora_r, lora_alpha, lora_dropout, learning_rate, batch sizes) in Section 5.3.2 but does not provide the actual values used. Parameters are described generically without specific numbers."
163 }, 164 "scaffolding_described": { 165 "applies": false, 166 "answer": false, 167 "justification": "The paper does not use agentic scaffolding. GUARDIAN is a pipeline of input/output filters, not an agent with tools, memory, or iterative reasoning." 168 }, 169 "data_preprocessing_documented": { 170 "applies": true, 171 "answer": true, 172 "justification": "The Jigsaw dataset preprocessing is documented: ~230K entries undersampled to ~32K for class balance (Section 5.3.2). The ethical dataset is filtered to toxicity scores 1-4. The attack dataset construction is described: 5 base prompts × 10 sub-prompts using 4 strategies. Train/validation/test splits (70/15/15) are stated." 173 } 174 }, 175 "limitations_and_scope": { 176 "limitations_section_present": { 177 "applies": true, 178 "answer": false, 179 "justification": "No dedicated limitations section. Section 7 ('Conclusions and Future Work') contains a brief disclaimer: 'we expect that more sophisticated adversarial prompts could still penetrate our defenses' and suggests future work areas, but this does not constitute substantive discussion of limitations." 180 }, 181 "threats_to_validity_specific": { 182 "applies": true, 183 "answer": false, 184 "justification": "The only specific threat mentioned is the brief disclaimer in Section 6.3 about the 100% rate not generalizing. No discussion of specific threats like the small sample size, author-crafted prompts, single-model testing, lack of adaptive attacks, or BERT classifier generalization." 185 }, 186 "scope_boundaries_stated": { 187 "applies": true, 188 "answer": false, 189 "justification": "The paper does not explicitly state what the results do NOT show. It does not bound claims to Llama-2-7b-chat or to the tested attack strategies. The title and abstract make broad claims about 'LLMs' without stating specific exclusions." 
190 } 191 }, 192 "data_integrity": { 193 "raw_data_available": { 194 "applies": true, 195 "answer": false, 196 "justification": "The 50-prompt adversarial dataset, the generated ethical dataset, and the fine-tuned model weights are not released. Only the source Jigsaw dataset is publicly available. Independent verification of the defense evaluation is impossible." 197 }, 198 "data_collection_described": { 199 "applies": true, 200 "answer": true, 201 "justification": "Section 5.1 describes the attack dataset creation: 5 base prompts crafted using 4 strategies (role emulation, manipulative assistance, ethical assurance, alternative reality) combined with 10 sub-prompts each. The Jigsaw dataset source and ethical dataset generation process are described in Section 5.3.2." 202 }, 203 "recruitment_methods_described": { 204 "applies": false, 205 "answer": false, 206 "justification": "No human participants. Data consists of a standard public benchmark (Jigsaw) and author-crafted adversarial prompts." 207 }, 208 "data_pipeline_documented": { 209 "applies": true, 210 "answer": false, 211 "justification": "The pipeline stages are described at a high level but counts at intermediate steps are sometimes missing. For the ethical dataset, the number of entries after toxicity 1-4 filtering is not stated. The post-processing/cleaning steps for generated text are described generically without specific filtering counts." 212 } 213 }, 214 "conflicts_of_interest": { 215 "funding_disclosed": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding source is disclosed anywhere in the paper. There is no acknowledgments section listing grants or sponsors." 219 }, 220 "affiliations_disclosed": { 221 "applies": true, 222 "answer": true, 223 "justification": "Author affiliations are clearly listed: Bennett University (India), Georgia Institute of Technology (USA), and Cloudemy Technology Labs (India). The authors are not evaluating their own commercial product." 
224 }, 225 "funder_independent_of_outcome": { 226 "applies": false, 227 "answer": false, 228 "justification": "No funding source is mentioned. The work appears to be unfunded academic research." 229 }, 230 "financial_interests_declared": { 231 "applies": true, 232 "answer": true, 233 "justification": "The paper includes 'The authors declare no conflicts of interest regarding the publication of this paper' at the end of Section 7." 234 } 235 }, 236 "contamination": { 237 "training_cutoff_stated": { 238 "applies": false, 239 "answer": false, 240 "justification": "This paper tests a defense system against prompt injection, not a pre-trained model's capability on a knowledge benchmark. Contamination of training data with benchmark answers is not a relevant concern." 241 }, 242 "train_test_overlap_discussed": { 243 "applies": false, 244 "answer": false, 245 "justification": "The paper evaluates a defense architecture, not model knowledge. Train/test overlap in the benchmark contamination sense is not applicable." 246 }, 247 "benchmark_contamination_addressed": { 248 "applies": false, 249 "answer": false, 250 "justification": "The evaluation tests whether a defense blocks adversarial prompts, not whether a model has memorized benchmark answers. Benchmark contamination is not a relevant concern for this paper type." 251 } 252 }, 253 "human_studies": { 254 "pre_registered": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "irb_or_ethics_approval": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants. The paper discusses ethical considerations in Section 4.7 but these are about responsible research conduct, not IRB approval for human subjects." 263 }, 264 "demographics_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 
268 }, 269 "inclusion_exclusion_criteria": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "randomization_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 }, 279 "blinding_described": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants in this study." 283 }, 284 "attrition_reported": { 285 "applies": false, 286 "answer": false, 287 "justification": "No human participants in this study." 288 } 289 }, 290 "cost_and_practicality": { 291 "inference_cost_reported": { 292 "applies": true, 293 "answer": false, 294 "justification": "The paper mentions 'We measure the time taken for generation' for Zephyr in Section 5.3.2 but does not report the actual timing numbers. No inference cost, latency, or per-prompt processing time is reported for the three-layer defense pipeline." 295 }, 296 "compute_budget_stated": { 297 "applies": true, 298 "answer": false, 299 "justification": "An RTX A6000 GPU is mentioned for training (Section 5.3.2) but no GPU hours, total training time, or computational budget is quantified." 300 } 301 }, 302 "experimental_rigor": { 303 "seed_sensitivity_reported": { 304 "applies": true, 305 "answer": false, 306 "justification": "No mention of random seeds or sensitivity analysis across seeds for any experiment (defense evaluation, BERT training, or Zephyr fine-tuning)." 307 }, 308 "number_of_runs_stated": { 309 "applies": true, 310 "answer": false, 311 "justification": "The number of experimental runs is never stated. The defense evaluation appears to be a single run, and the generation model training shows different configurations but not repeated runs." 312 }, 313 "hyperparameter_search_budget": { 314 "applies": true, 315 "answer": false, 316 "justification": "No hyperparameter search budget is reported. 
Tables 4-5 show 3 configurations each for FLAN-T5 and Zephyr but no description of how these were selected or how many total configurations were tried." 317 }, 318 "best_config_selection_justified": { 319 "applies": true, 320 "answer": false, 321 "justification": "Tables 4-5 show results for 3 configurations per model (varying sample sizes and epochs) but no clear justification for how the final configuration was selected. The criteria for choosing the operational configuration are not stated." 322 }, 323 "multiple_comparison_correction": { 324 "applies": true, 325 "answer": false, 326 "justification": "No statistical tests are performed at all, so no multiple comparison corrections can be applied. Comparisons between layers and models are made by comparing raw numbers." 327 }, 328 "self_comparison_bias_addressed": { 329 "applies": true, 330 "answer": false, 331 "justification": "The authors design both the attack prompts and the defense, then evaluate their own system. No acknowledgment of this self-evaluation bias or any attempt at independent evaluation." 332 }, 333 "compute_budget_vs_performance": { 334 "applies": true, 335 "answer": false, 336 "justification": "No analysis of performance as a function of compute. The three-layer defense adds compute overhead (BERT classification + ethical generation + LLM self-review) but the cost-performance tradeoff is never discussed." 337 }, 338 "benchmark_construct_validity": { 339 "applies": true, 340 "answer": false, 341 "justification": "The 50-prompt custom dataset is used without any discussion of whether it represents real-world prompt injection threats. No analysis of coverage across attack types, no comparison to established adversarial benchmarks, no discussion of construct validity." 342 }, 343 "scaffold_confound_addressed": { 344 "applies": false, 345 "answer": false, 346 "justification": "No agentic scaffolding is involved. The system is a pipeline of filters, not a scaffold comparison."
347 } 348 }, 349 "data_leakage": { 350 "temporal_leakage_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of whether Llama-2's training data might include knowledge of jailbreak patterns similar to those tested, or whether the BERT classifier's training data (Jigsaw) predates the attack strategies used." 354 }, 355 "feature_leakage_addressed": { 356 "applies": true, 357 "answer": false, 358 "justification": "No discussion of whether the evaluation setup leaks information. For example, the BERT toxic classifier is trained on Jigsaw data that may share characteristics with the sub-prompts used in testing." 359 }, 360 "non_independence_addressed": { 361 "applies": true, 362 "answer": false, 363 "justification": "The 50 test prompts are structured as 5 base × 10 sub-prompts, meaning they are highly correlated (each group of 10 shares the same base prompt). This structural non-independence is not discussed or addressed." 364 }, 365 "leakage_detection_method": { 366 "applies": true, 367 "answer": false, 368 "justification": "No leakage detection or prevention method is applied. No analysis of overlap between Jigsaw training data and test prompts, no temporal analysis, no decontamination." 369 } 370 } 371 }, 372 "claims": [ 373 { 374 "claim": "The three-tiered GUARDIAN defense blocks 100% of adversarial prompt attacks on Llama-2-7b-chat.", 375 "evidence": "Table 3 (Section 6.3) shows all 50 attack prompts (5 base × 10 sub-prompts) blocked after applying all three layers. Includes disclaimer that this may not hold for all jailbreak prompts.", 376 "supported": "weak" 377 }, 378 { 379 "claim": "The system prompt filter ('Keep in mind the ethical boundaries') blocks 40% of attack prompts.", 380 "evidence": "Table 1 (Section 6.1) shows 20/50 prompts blocked. 
The blocked prompts involved requests for the LLM to operate without ethical constraints.", 381 "supported": "weak" 382 }, 383 { 384 "claim": "The pre-processing filter with BERT toxic classifier brings cumulative blocking to 60%.", 385 "evidence": "Table 2 (Section 6.2) shows 30/50 prompts blocked after Layers 1 and 2 combined. The additional 10 prompts caught involved fictional game scenarios flagged as toxic/obscene.", 386 "supported": "weak" 387 }, 388 { 389 "claim": "Zephyr-7B-α outperforms FLAN-T5-Large for ethical prompt generation.", 390 "evidence": "Tables 4-5 (Section 6.4) show Zephyr achieves lower perplexity (2.98-3.18) compared to FLAN-T5 (5.66-7.31) across configurations.", 391 "supported": "moderate" 392 }, 393 { 394 "claim": "The multi-layered approach ensures that if one layer fails, subsequent layers provide additional defense.", 395 "evidence": "Tables 1-3 show incremental blocking: 40% → 60% → 100% as layers are added. Each subsequent layer catches prompts that passed previous layers.", 396 "supported": "weak" 397 } 398 ], 399 "red_flags": [ 400 { 401 "flag": "100% blocking claim on tiny self-crafted dataset", 402 "detail": "The 100% blocking rate is based on only 50 prompts (5 base × 10 sub-prompts) crafted by the same authors who designed the defense. This creates a circular evaluation where the defense is tested only against attacks its designers created, making the 100% claim almost meaningless for assessing real-world robustness." 403 }, 404 { 405 "flag": "No comparison with existing defense methods", 406 "detail": "The paper cites SmoothLLM, RA-LLMs, Moving Target Defense, Self-Reminder, and other defense approaches in the literature review but does not compare GUARDIAN against any of them. The only baseline is no defense at all (0% blocking)." 407 }, 408 { 409 "flag": "Authors craft both attack and defense", 410 "detail": "The same team designs the adversarial prompts and the defense system. 
Prompts were selected through 'trial and error, based on their consistent success in circumventing the Llama-2 base model' (Section 6). This creates strong selection bias toward prompts the defense can handle." 411 }, 412 { 413 "flag": "Structurally non-independent test set", 414 "detail": "The 50 'independent' test prompts are actually 5 base prompts × 10 sub-prompts. If a defense blocks a base prompt, it blocks all 10 variants simultaneously. The effective independent sample size is closer to 5, not 50." 415 }, 416 { 417 "flag": "Claims significantly outrun evidence", 418 "detail": "The title claims 'Thwarting Prompt Injection Attacks on LLMs' and the abstract says it 'fortifies smaller LLMs against emerging cyber threats.' These broad claims are supported only by testing on a single model (Llama-2-7b-chat) with 50 self-crafted prompts." 419 }, 420 { 421 "flag": "No hyperparameter values reported", 422 "detail": "The paper lists hyperparameter names (temperature, top_p, top_k, learning_rate, LoRA parameters) but never provides the actual values used, making reproduction impossible even if the code were released." 423 } 424 ], 425 "cited_papers": [ 426 { 427 "title": "Llama 2: Open Foundation and Fine-Tuned Chat Models", 428 "authors": ["Hugo Touvron", "Louis Martin", "Kevin Stone"], 429 "year": 2023, 430 "arxiv_id": "2307.09288", 431 "relevance": "Target LLM model for the defense evaluation; foundational open-weight model with safety training." 432 }, 433 { 434 "title": "LLM Self Defense: By Self Examination, LLMs Know They Are Being Tricked", 435 "authors": ["Mansi Phute"], 436 "year": 2023, 437 "arxiv_id": "2308.07308", 438 "relevance": "Baseline reference for the paper's third defense layer (LLM self-examination as output filter)." 
439 }, 440 { 441 "title": "Prompt Injection Attacks and Defenses in LLM-Integrated Applications", 442 "authors": ["Yupei Liu"], 443 "year": 2023, 444 "arxiv_id": "2310.12815", 445 "relevance": "Systematic framework for understanding prompt injection attacks and defenses in LLM applications." 446 }, 447 { 448 "title": "SmoothLLM: Defending Large Language Models Against Jailbreaking Attacks", 449 "authors": ["Alexander Robey"], 450 "year": 2023, 451 "arxiv_id": "2310.03684", 452 "relevance": "Defense method using input perturbation and prediction aggregation against jailbreaking attacks." 453 }, 454 { 455 "title": "Defending Against Alignment-Breaking Attacks via Robustly Aligned LLM", 456 "authors": ["Bochuan Cao"], 457 "year": 2023, 458 "arxiv_id": "2309.14348", 459 "relevance": "Proposes robust alignment checking to defend against attacks that break LLM value alignment." 460 }, 461 { 462 "title": "Jailbreaker in Jail: Moving Target Defense for Large Language Models", 463 "authors": ["Bocheng Chen"], 464 "year": 2023, 465 "doi": "10.1145/3605760.3623764", 466 "relevance": "Dynamic defense strategy using Moving Target Defense to reduce LLM vulnerability to adversarial attacks." 467 }, 468 { 469 "title": "Jailbroken: How Does LLM Safety Training Fail?", 470 "authors": ["Alexander Wei"], 471 "year": 2023, 472 "arxiv_id": "2307.02483", 473 "relevance": "Analysis of why LLM safety training mechanisms fail against adversarial attacks, identifying vulnerability patterns." 474 }, 475 { 476 "title": "Certifying LLM Safety against Adversarial Prompting", 477 "authors": ["Aounon Kumar"], 478 "year": 2023, 479 "arxiv_id": "2309.02705", 480 "relevance": "Erase-and-check method for certifiably safe LLM interactions against adversarial prompts." 
481 }, 482 { 483 "title": "Use of LLMs for Illicit Purposes: Threats, Prevention Measures, and Vulnerabilities", 484 "authors": ["Maximilian Mozes"], 485 "year": 2023, 486 "arxiv_id": "2308.12833", 487 "relevance": "Comprehensive overview of LLM misuse threats and prevention strategies." 488 }, 489 { 490 "title": "Defending ChatGPT against Jailbreak Attack via Self-Reminder", 491 "authors": ["Fangzhao Wu", "Yueqi Xie", "Jingwei Yi"], 492 "year": 2023, 493 "doi": "10.21203/rs.3.rs-2873090/v1", 494 "relevance": "System-mode self-reminder defense technique against jailbreak attacks, directly related to the system prompt filter approach." 495 }, 496 { 497 "title": "MasterKey: Automated Jailbreak Across Multiple Large Language Model Chatbots", 498 "authors": ["Gelei Deng"], 499 "year": 2023, 500 "arxiv_id": "2307.08715", 501 "relevance": "Framework for automated jailbreak prompt generation and reverse-engineering LLM defenses." 502 }, 503 { 504 "title": "Tricking LLMs into Disobedience: Understanding, Analyzing, and Preventing Jailbreaks", 505 "authors": ["Abhinav Rao"], 506 "year": 2023, 507 "arxiv_id": "2305.14965", 508 "relevance": "Formalism for categorizing jailbreak attacks with empirical analysis of effectiveness." 509 }, 510 { 511 "title": "Do Anything Now: Characterizing and Evaluating In-The-Wild Jailbreak Prompts on Large Language Models", 512 "authors": ["Xinyue Shen"], 513 "year": 2023, 514 "arxiv_id": "2308.03825", 515 "relevance": "Large-scale analysis of real-world jailbreak prompts and their evolution, relevant to adversarial prompt taxonomy." 516 } 517 ], 518 "engagement_factors": { 519 "practical_relevance": { 520 "score": 2, 521 "justification": "The three-layer defense approach (system prompt, classifier, self-review) is implementable by practitioners, though no code is released to directly use it." 
522 }, 523 "surprise_contrarian": { 524 "score": 0, 525 "justification": "Multi-layer defense is an expected approach; no conventional wisdom is challenged." 526 }, 527 "fear_safety": { 528 "score": 2, 529 "justification": "Demonstrates jailbreak techniques against Llama-2 and addresses prompt injection as a security threat, relevant to AI safety concerns." 530 }, 531 "drama_conflict": { 532 "score": 0, 533 "justification": "No controversy, no challenge to existing claims or products." 534 }, 535 "demo_ability": { 536 "score": 0, 537 "justification": "No code, no demo, no released models or tools." 538 }, 539 "brand_recognition": { 540 "score": 1, 541 "justification": "Georgia Tech co-author adds some recognition; uses Meta's Llama-2 but not from a major AI lab." 542 } 543 } 544 }