scan.json (32571B)
1 { 2 "paper": { 3 "title": "Red Teaming the Mind of the Machine: A Systematic Evaluation of Prompt Injection and Jailbreak Vulnerabilities in LLMs", 4 "authors": ["Chetan Pathade"], 5 "year": 2025, 6 "venue": "arXiv.org", 7 "arxiv_id": "2505.04806", 8 "doi": "10.48550/arXiv.2505.04806" 9 }, 10 "scan_version": 3, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "Tested 1,400+ adversarial prompts against GPT-4, Claude 2, Mistral 7B, and Vicuna, reporting attack success rates of 87.2%, 82.5%, 71.3%, and 69.4% respectively. Roleplay-based attacks achieved the highest ASR (89.6%), and prompts successful on one model transferred to others at rates exceeding 50%. Defense frameworks (PromptShield, Palisade, Signed-Prompt) were tested but specific coverage numbers are not reported. The paper categorizes failure modes into partial refusals (34%), hidden compliance (22%), no output (18%), and misleading responses (15%).", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "No repository URL, code archive, or any link to source code is provided anywhere in the paper. The semi-automated red-teaming script using LangChain is described but not released." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": false, 24 "justification": "The dataset of 1,400+ adversarial prompts is described as curated from public sources but no download link, archive, or data release is provided. The paper does not even include example prompts." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "No requirements.txt, Dockerfile, or environment specification is provided. The paper mentions LangChain, Sentence-BERT, and Hugging Face Inference but provides no version or dependency details." 
30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No step-by-step reproduction instructions are provided. The methodology section describes the pipeline at a high level but lacks sufficient detail for independent replication." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "All results are reported as single point estimates (e.g., '87.2% ASR', '64.1% generalizability'). No confidence intervals, error bars, or uncertainty measures appear anywhere in the paper." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper makes comparative claims (e.g., 'GPT-4 demonstrated the highest vulnerability') solely by comparing raw percentages. No statistical significance tests are reported for any comparison." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": false, 51 "justification": "Only raw ASR percentages are reported. No formal effect sizes (Cohen's d, odds ratios) or contextual baseline comparisons are provided to indicate the magnitude of differences between models or attack types." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "The paper states '1,400+ adversarial prompts' were used but provides no justification for this number, no power analysis, and no discussion of whether this sample is sufficient for the claims made." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "No variance, standard deviation, or spread measures are reported for any metric. Results appear to be from single runs with no indication of result stability." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "Four models are compared against each other (GPT-4, Claude 2, Mistral 7B, Vicuna), providing cross-model baselines. 
Defense frameworks (PromptShield, Palisade, Signed-Prompt) are also tested as baseline defenses." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": false, 73 "justification": "Claude 2 (July 2023) was substantially outdated by 2025 when Claude 3/3.5 were available. Vicuna-13B is similarly dated. GPT-4 March 2024 is more recent but GPT-4 Turbo/GPT-4o existed. The models do not represent the state of the art at time of publication." 74 }, 75 "ablation_study": { 76 "applies": false, 77 "answer": false, 78 "justification": "This is a vulnerability evaluation study, not a system with removable components. The attack categories serve as experimental conditions rather than ablatable system components." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Four metrics are used: Attack Success Rate (ASR), Prompt Generalizability, Time-to-Bypass, and Failure Mode Classification (Section III.C)." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": false, 88 "justification": "All output scoring is automated using keyword spotting, GPT-based meta-evaluation, and Sentence-BERT semantic distance (Section III.D). No human evaluation of model outputs is reported." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": false, 93 "justification": "All 1,400+ prompts appear to have been tested against all models with no train/test split. The defense evaluation used 'a subset of successful jailbreaks' but selection criteria are not described." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Results are broken down by model (Table I), attack category (Section IV.B: roleplay 89.6%, logic traps 81.4%, encoding 76.2%, multi-turn 68.7%), scenario (Section IV.C: political 85.5%, legal 79.4%, etc.), and failure mode (Table II)." 
99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Section IV.E provides a taxonomy of five failure modes, four of which are itemized here: partial refusals (34%), hidden compliance (22%), no output (18%), and misleading responses (15%). Table II lists common triggers for each mode." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "Multi-turn attacks yielded lower effectiveness (68.7%). Malicious code requests had a 58.3% success rate, lower than other categories. Models blocked many direct malware requests. Prompts exceeding 150 tokens showed a dip in success." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Abstract claims about categorizing 1,400+ prompts, analyzing success against four LLMs, examining generalizability, and proposing mitigation strategies are all addressed in the paper body, though the evidence quality is uneven." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": false, 120 "justification": "The paper makes implicit causal claims: 'model scale and alignment tuning complexity both contribute to attack surface depth' (Section IV.A), 'roleplay dynamics bypass filters by deflecting responsibility' (Section IV.B). These causal attributions are not supported by controlled experiments — the study is observational with no causal identification strategy." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The title claims a 'Systematic Evaluation' of LLMs broadly, and the conclusion states 'prompt injection remains an open frontier in LLM safety' and 'current LLM safety mechanisms are insufficiently robust.' These generalizations go well beyond the four specific models tested with specific model snapshots." 
126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "No alternative explanations for the results are discussed. The high GPT-4 ASR could be an artifact of the scoring methodology, prompt selection, or the specific model snapshot — none of these alternatives are considered." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": false, 135 "justification": "ASR is measured via keyword spotting, GPT-based meta-evaluation, and Sentence-BERT distance from refusal templates, then presented as 'vulnerability.' The gap between this automated proxy and actual harmful model behavior is never discussed. The validity of the hybrid scoring method is not established." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": false, 142 "justification": "The paper states 'GPT-4 (OpenAI, March 2024 snapshot)' and 'Claude 2 (Anthropic, July 2023 API version)' but does not provide precise version identifiers (e.g., 'gpt-4-0314' or specific API version strings). Mistral 7B and Vicuna-13B are given without checkpoint hashes or specific releases." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": false, 147 "justification": "No actual adversarial prompt text is provided anywhere in the paper. Attack categories are described in natural language (e.g., 'roleplay dynamics,' 'logic traps') but no concrete prompt examples are shown. The reader cannot reconstruct any prompt sent to the models." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": false, 152 "justification": "No temperature, top-p, max tokens, or other inference parameters are reported for any of the four models tested. Section III.B mentions 'system context initialized per the model's recommended safety guidelines' without specifying what those settings were." 
153 }, 154 "scaffolding_described": { 155 "applies": true, 156 "answer": false, 157 "justification": "A 'semi-automated red-teaming script using the LangChain framework' is mentioned (Section III.D), but the pipeline is described only at a high level. No workflow diagrams, code snippets, retry logic, or detailed tool descriptions are provided." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": false, 162 "justification": "The paper says prompts were 'manually validated, categorized into attack types, and annotated for content sensitivity' but provides no filtering criteria, inter-annotator agreement, or counts at each processing stage. How many prompts were collected initially vs. retained is not stated." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section VII 'Limitations and Future Work' provides a dedicated discussion of study limitations including static model checkpoints, evolving attack tactics, and cultural/linguistic coverage gaps." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": true, 174 "justification": "The limitations section mentions specific threats: 'Our evaluation used static model checkpoints and may not account for updates or real-time moderation layers applied in production APIs' and 'cultural and linguistic diversity in prompts remains underrepresented.' These are specific to this study rather than purely generic." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": false, 179 "justification": "While limitations are mentioned, the paper does not explicitly state what the results do NOT show. The conclusion still generalizes broadly ('prompt injection is not an edge-case anomaly but a fundamental issue in current-generation LLMs') without bounding this to the four tested models." 
180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "No raw data is released. The 1,400+ adversarial prompts, model outputs, and scoring results are not available for independent verification." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": false, 191 "justification": "Sources are listed (GitHub, JailbreakChat, Reddit, Discord, JailbreakBench, PromptBench) but the collection procedure lacks detail: no query terms, no time windows, no counts per source, no inclusion/exclusion criteria for selecting prompts from these sources." 192 }, 193 "recruitment_methods_described": { 194 "applies": true, 195 "answer": false, 196 "justification": "The prompt selection process from multiple public sources is described only at the source level. How prompts were identified, selected, and deduplicated across these sources is not documented. Selection bias toward certain types of prompts or communities is not discussed." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": false, 201 "justification": "The pipeline from raw prompt collection to final ASR computation has undocumented gaps. No numbers are provided at each stage (e.g., how many prompts collected per source, how many passed manual validation, how many were tested per model)." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": false, 207 "answer": false, 208 "justification": "The author is identified as an 'Independent Researcher' with no institutional affiliation. This appears to be clearly unfunded independent work." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "The author's affiliation is listed as 'Independent Researcher, San Jose, CA, USA' with a CMU alumni email. The author has no disclosed affiliation with any of the evaluated model providers." 
214 }, 215 "funder_independent_of_outcome": { 216 "applies": false, 217 "answer": false, 218 "justification": "No funder is identified; this appears to be unfunded independent research." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests statement or financial disclosure is present in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": false, 229 "answer": false, 230 "justification": "This is a red-teaming study testing adversarial prompt effectiveness against safety mechanisms, not evaluating model knowledge on benchmarks. Training data contamination of jailbreak prompts is not the same concern." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": false, 234 "answer": false, 235 "justification": "This is a red-teaming study testing defense mechanisms rather than model knowledge on benchmarks." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": false, 239 "answer": false, 240 "justification": "This is a red-teaming study testing defense mechanisms rather than model knowledge on benchmarks." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants are involved in this study. All testing is automated prompt injection against LLM APIs." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants. The study tests LLM responses to adversarial prompts." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 
263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": false, 284 "justification": "No inference costs are reported despite testing 1,400+ prompts across four models including paid APIs (GPT-4, Claude 2). Total API costs are not mentioned." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "No computational budget is stated. The paper does not report GPU hours for local inference (Mistral, Vicuna), API costs, or total experiment runtime." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "No mention of random seeds, temperature sensitivity, or stochastic variation across runs. LLM outputs are stochastic but no sensitivity analysis is reported." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "The paper does not state how many times each prompt was tested against each model. It is unclear whether results are from single runs or averaged across multiple attempts." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "No hyperparameter search budget is reported for the scoring methodology (keyword thresholds, Sentence-BERT distance thresholds) or inference settings." 
307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": false, 311 "justification": "The hybrid scoring method (keyword spotting + GPT meta-evaluation + Sentence-BERT) is presented without justification for this specific configuration or comparison to alternatives." 312 }, 313 "multiple_comparison_correction": { 314 "applies": true, 315 "answer": false, 316 "justification": "Multiple comparisons are made across 4 models, 4+ attack categories, and 4+ scenarios, but no statistical tests are performed at all, let alone corrections for multiple comparisons." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The author curated the adversarial prompt dataset and designed the scoring methodology, then evaluated attack success using that same methodology. The potential for self-comparison bias in prompt selection and scoring is not discussed." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": false, 326 "justification": "No compute budget is reported, so performance cannot be assessed as a function of compute. Time-to-Bypass is reported but not in relation to computational cost." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "The paper does not discuss whether its ASR metric (scored via keyword spotting, GPT evaluation, and BERT distance) actually measures what it claims — true model vulnerability. The construct validity of the automated harm detection pipeline is unexamined." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": true, 335 "answer": false, 336 "justification": "GPT-4 and Claude 2 are tested via API with production moderation layers, while Mistral and Vicuna run via local Hugging Face inference without those layers. This infrastructure difference could confound model comparisons but is not discussed." 
337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "The adversarial prompts are sourced from public repositories that models may have encountered during training. A model trained on JailbreakChat prompts might have learned to resist those specific attacks, biasing ASR downward. This temporal issue is not discussed." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether the evaluation setup leaks information. The system prompts initialized per 'recommended safety guidelines' could interact with prompt effectiveness in ways not controlled for." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "Prompts drawn from the same forums or repositories likely share structural patterns, but independence of test examples is not discussed. Duplicates or near-duplicates across sources could inflate sample size." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No leakage detection or decontamination method is applied. The paper does not check whether models have seen the specific adversarial prompts during training." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "GPT-4 has the highest vulnerability with an ASR of 87.2%, followed by Claude 2 (82.5%), Mistral 7B (71.3%), and Vicuna (69.4%).", 365 "evidence": "Table I in Section IV.A reports model-wise ASR metrics. However, no error bars, confidence intervals, or statistical tests support these specific numbers.", 366 "supported": "weak" 367 }, 368 { 369 "claim": "Roleplay-based prompt injections achieve the highest attack success rate at 89.6%, followed by logic traps (81.4%), encoding tricks (76.2%), and multi-turn dialogues (68.7%).", 370 "evidence": "Section IV.B describes attack category performance with percentages. 
No statistical tests, variance measures, or per-model breakdowns for these category-level rates are provided.", 371 "supported": "weak" 372 }, 373 { 374 "claim": "Jailbreak prompts that succeed on GPT-4 transfer to Claude 2 in 64.1% of cases and to Vicuna in 59.7% of cases.", 375 "evidence": "Table I and Section IV.D provide transferability percentages. The transferability matrix (Figure 4) is referenced but no detailed cross-model analysis or statistical significance is provided.", 376 "supported": "weak" 377 }, 378 { 379 "claim": "Political content prompts succeeded 85.5% of the time, the highest among scenario-specific domains.", 380 "evidence": "Section IV.C lists scenario-specific ASR: political 85.5%, legal 79.4%, explicit 76.1%, malicious code 58.3%. No breakdown by model or statistical analysis accompanies these numbers.", 381 "supported": "weak" 382 }, 383 { 384 "claim": "Prompts in the 101–150 token range achieve the highest success rate (80.3%), suggesting a 'sweet spot' for adversarial prompts.", 385 "evidence": "Section IV.F reports this finding with reference to Figure 5. No statistical analysis of the relationship between prompt length and success, and no confound analysis (e.g., attack type correlation with length).", 386 "supported": "weak" 387 }, 388 { 389 "claim": "Current defense frameworks have limited coverage against evolved jailbreak attacks.", 390 "evidence": "Section III.E describes testing against PromptShield, Palisade, and Signed-Prompt, but specific defense bypass rates or coverage numbers are not reported in the results section.", 391 "supported": "unsupported" 392 } 393 ], 394 "red_flags": [ 395 { 396 "flag": "No uncertainty quantification", 397 "detail": "All results are single point estimates with no error bars, confidence intervals, standard deviations, or statistical tests. With 1,400+ prompts and stochastic LLM outputs, the absence of any variance measurement undermines the precision of all reported numbers." 
398 }, 399 { 400 "flag": "No data or code released", 401 "detail": "Neither the 1,400+ adversarial prompts, the model outputs, the scoring results, nor the evaluation code are released. The study is completely non-reproducible and non-verifiable." 402 }, 403 { 404 "flag": "No actual prompts shown", 405 "detail": "Despite testing 1,400+ prompts, not a single actual prompt example appears in the paper. This makes it impossible to assess the quality of the dataset, the appropriateness of the categorization, or the validity of the reported ASR." 406 }, 407 { 408 "flag": "Unvalidated scoring methodology", 409 "detail": "The hybrid scoring method (keyword spotting + GPT-based meta-evaluation + Sentence-BERT distance) is never validated against human judgments. There is no inter-rater agreement, no false positive/negative analysis, and no discussion of scoring threshold sensitivity." 410 }, 411 { 412 "flag": "Counterintuitive results without explanation", 413 "detail": "GPT-4 reportedly has higher vulnerability (87.2%) than open-source models without robust safety layers (Mistral 71.3%, Vicuna 69.4%). This counterintuitive finding is attributed to GPT-4's 'powerful but permissive instruction-following nature' without rigorous analysis. The scoring methodology itself could produce this pattern." 414 }, 415 { 416 "flag": "Infrastructure confound in model comparison", 417 "detail": "GPT-4 and Claude 2 are tested via APIs with production moderation layers, while Mistral 7B and Vicuna use local Hugging Face inference without such layers. This fundamental difference in testing infrastructure is not controlled for or discussed." 418 }, 419 { 420 "flag": "Outdated model selection", 421 "detail": "Claude 2 (July 2023) was substantially outdated by the 2025 publication date when Claude 3/3.5 were available. Results on deprecated models have limited relevance to current LLM safety." 
422 }, 423 { 424 "flag": "Suspiciously clean numbers", 425 "detail": "All key metrics are reported to exactly one decimal place with no variance. The ASR figures (87.2%, 82.5%, 71.3%, 69.4%) and transferability rates appear too clean for a study with 1,400+ prompts across stochastic models." 426 }, 427 { 428 "flag": "Missing figures", 429 "detail": "Section IV states 'The figures, included in the appendix or digital supplement, visually illustrate comparative vulnerability trends.' Figures 1-6 are captioned but described as separate from the main text, raising questions about whether the visualizations were actually produced." 430 }, 431 { 432 "flag": "Defense evaluation without specific results", 433 "detail": "Section III.E describes testing jailbreaks against PromptShield, Palisade, and Signed-Prompt defense frameworks, but the results section never reports specific defense bypass rates or coverage metrics." 434 } 435 ], 436 "cited_papers": [ 437 { 438 "title": "Prompt Injection attack against LLM-integrated Applications", 439 "authors": ["Yi Liu", "Gelei Deng", "Yuekang Li", "Kailong Wang", "Zihao Wang", "Xiaofeng Wang", "Tianwei Zhang", "Yepang Liu", "Haoyu Wang", "Yan Zheng", "Yang Liu"], 440 "arxiv_id": "2306.05499", 441 "relevance": "Foundational work on prompt injection attacks against LLM-integrated applications, directly relevant to AI security." 442 }, 443 { 444 "title": "Jailbreaking ChatGPT via Prompt Engineering: An Empirical Study", 445 "authors": ["Yi Liu", "Gelei Deng", "Zhengzi Xu", "Yuekang Li"], 446 "arxiv_id": "2305.13860", 447 "relevance": "Empirical evaluation of jailbreak strategies against ChatGPT, directly comparable methodology to this paper." 
448 }, 449 { 450 "title": "Automatic and Universal Prompt Injection Attacks against Large Language Models", 451 "authors": ["Xiaogeng Liu", "Zhiyuan Yu", "Yizhe Zhang", "Ning Zhang", "Chaowei Xiao"], 452 "arxiv_id": "2403.04957", 453 "relevance": "Proposes automated prompt injection attack generation, key reference for attack taxonomy and LLM safety." 454 }, 455 { 456 "title": "Benchmarking and Defending against Indirect Prompt Injection Attacks on Large Language Models", 457 "authors": ["Jingwei Yi", "Yueqi Xie", "Bin Zhu", "Emre Kiciman", "Guangzhong Sun", "Xing Xie", "Fangzhao Wu"], 458 "arxiv_id": "2312.14197", 459 "relevance": "Benchmark and defense framework for indirect prompt injection, key survey-relevant evaluation methodology." 460 }, 461 { 462 "title": "Formalizing and Benchmarking Prompt Injection Attacks and Defenses", 463 "authors": ["Yupei Liu", "Yuqi Jia", "Runpeng Geng", "Jinyuan Jia", "Neil Zhenqiang Gong"], 464 "year": 2024, 465 "relevance": "Formal framework for prompt injection attacks and defenses presented at USENIX Security, establishes evaluation standards." 466 }, 467 { 468 "title": "SecAlign: Defending Against Prompt Injection with Preference Optimization", 469 "authors": ["Sizhe Chen", "Arman Zharmagambetov", "Saeed Mahloujifar", "Kamalika Chaudhuri", "David Wagner", "Chuan Guo"], 470 "arxiv_id": "2410.05451", 471 "relevance": "Proposes preference optimization-based defense against prompt injection, relevant defense methodology." 472 }, 473 { 474 "title": "Defeating Prompt Injections by Design", 475 "authors": ["Edoardo Debenedetti", "Ilia Shumailov", "Tianqi Fan", "Jamie Hayes", "Nicholas Carlini", "Daniel Fabian"], 476 "arxiv_id": "2503.18813", 477 "relevance": "Architectural approach to preventing prompt injection, key reference for defense-by-design strategies." 
478 }, 479 { 480 "title": "Bypassing Prompt Injection and Jailbreak Detection in LLM Guardrails", 481 "authors": ["William Hackett", "Lewis Birch", "Stefan Trawicki", "Neeraj Suri", "Peter Garraghan"], 482 "arxiv_id": "2504.11168", 483 "relevance": "Demonstrates bypass techniques against guardrail systems, directly relevant to LLM safety evaluation." 484 }, 485 { 486 "title": "PromptShield: Deployable Detection for Prompt Injection Attacks", 487 "authors": ["Dennis Jacob", "Hend Alzahrani", "Zhanhao Hu", "Basel Alomair", "David Wagner"], 488 "arxiv_id": "2501.15145", 489 "relevance": "Deployable prompt injection detection framework, one of the defense systems evaluated in this paper." 490 }, 491 { 492 "title": "Palisade – Prompt Injection Detection Framework", 493 "authors": ["Sahasra Kokkula", "Somanathan R", "Nandavardhan R", "Aashishkumar", "G Divya"], 494 "arxiv_id": "2410.21146", 495 "relevance": "Prompt injection detection framework evaluated as a defense baseline in this paper." 496 }, 497 { 498 "title": "UniGuardian: A Unified Defense for Detecting Prompt Injection, Backdoor Attacks and Adversarial Attacks in Large Language Models", 499 "authors": ["Huawei Lin", "Yingjie Lao", "Tong Geng", "Tan Yu", "Weijie Zhao"], 500 "arxiv_id": "2502.13141", 501 "relevance": "Unified defense framework against multiple LLM attack types including prompt injection." 502 }, 503 { 504 "title": "Operationalizing a Threat Model for Red-Teaming Large Language Models (LLMs)", 505 "authors": ["Apurv Verma", "Satyapriya Krishna"], 506 "arxiv_id": "2407.14937", 507 "year": 2024, 508 "relevance": "Threat modeling framework for LLM red-teaming, directly relevant to systematic adversarial evaluation methodology." 509 } 510 ], 511 "engagement_factors": { 512 "practical_relevance": { 513 "score": 1, 514 "justification": "Categorizes attack types and proposes defenses at a conceptual level, but releases no tools, code, data, or actionable artifacts a practitioner could use." 
515 }, 516 "surprise_contrarian": { 517 "score": 1, 518 "justification": "LLM jailbreak vulnerability is well-established; the GPT-4 > open-source vulnerability ranking is counterintuitive but inadequately supported." 519 }, 520 "fear_safety": { 521 "score": 2, 522 "justification": "Reports high attack success rates on commercial LLMs (87.2% for GPT-4, 82.5% for Claude 2), contributing to AI safety concern discourse." 523 }, 524 "drama_conflict": { 525 "score": 1, 526 "justification": "No controversy or conflict beyond the general 'LLMs are unsafe' framing; no named companies criticized." 527 }, 528 "demo_ability": { 529 "score": 0, 530 "justification": "No code, data, demo, or prompts released — nothing for a reader to try." 531 }, 532 "brand_recognition": { 533 "score": 1, 534 "justification": "Solo independent researcher; tests well-known models (GPT-4, Claude) but is not affiliated with a recognized lab." 535 } 536 } 537 }