scan.json (25857B)
1 { 2 "paper": { 3 "title": "An Early Categorization of Prompt Injection Attacks on Large Language Models", 4 "authors": [ 5 "Sippo Rossi", 6 "Alisia Marianne Michel", 7 "Raghava Rao Mukkamala", 8 "Jason Bennett Thatcher" 9 ], 10 "year": 2024, 11 "venue": "arXiv", 12 "arxiv_id": "2402.00898", 13 "doi": "10.48550/arXiv.2402.00898" 14 }, 15 "scan_version": 3, 16 "active_modules": ["survey_methodology"], 17 "methodology_tags": ["qualitative", "meta-analysis"], 18 "key_findings": "The paper identifies 17 distinct prompt injection attack variations organized into two main branches: 6 classes of direct injections (double character, virtualization, obfuscation, payload splitting, adversarial suffix, instruction manipulation) and 4 classes of indirect injections (active, passive, user-driven, virtual prompt injection). Direct injections primarily aim to bypass content safety filters, while indirect injections have more varied goals resembling traditional cyberattacks. The authors note that even premier AI labs have difficulty fully defending against prompt injections, and that computational approaches to generating new attacks are becoming more mature.", 19 "checklist": { 20 "artifacts": { 21 "code_released": { 22 "applies": true, 23 "answer": false, 24 "justification": "No code repository or analysis scripts are released. The paper contains no links to any code artifacts." 25 }, 26 "data_released": { 27 "applies": true, 28 "answer": false, 29 "justification": "No structured dataset of the collected prompt injections is released. The categorization exists only in the paper's tables and appendix, with external links to sources rather than a downloadable dataset." 30 }, 31 "environment_specified": { 32 "applies": true, 33 "answer": false, 34 "justification": "No environment specifications are provided. The paper mentions testing on ChatGPT and GPT-3 interfaces but provides no technical setup details." 
35 }, 36 "reproduction_instructions": { 37 "applies": true, 38 "answer": false, 39 "justification": "No reproduction instructions are provided. The methodology section (Section 3) describes the general approach but does not provide step-by-step instructions for reproducing the categorization or the verification tests." 40 } 41 }, 42 "statistical_methodology": { 43 "confidence_intervals_or_error_bars": { 44 "applies": false, 45 "answer": false, 46 "justification": "Qualitative survey/categorization paper with no quantitative experiments or statistical results." 47 }, 48 "significance_tests": { 49 "applies": false, 50 "answer": false, 51 "justification": "Qualitative survey/categorization paper with no comparative quantitative claims." 52 }, 53 "effect_sizes_reported": { 54 "applies": false, 55 "answer": false, 56 "justification": "Qualitative survey/categorization paper with no quantitative experiments." 57 }, 58 "sample_size_justified": { 59 "applies": false, 60 "answer": false, 61 "justification": "Qualitative survey/categorization paper with no quantitative experiments." 62 }, 63 "variance_reported": { 64 "applies": false, 65 "answer": false, 66 "justification": "Qualitative survey/categorization paper with no quantitative experiments." 67 } 68 }, 69 "evaluation_design": { 70 "baselines_included": { 71 "applies": true, 72 "answer": false, 73 "justification": "The paper mentions that Shen et al. (2023) proposed a categorization for jailbreak communities and that Greshake et al. (2023) identified the direct/indirect distinction, but does not systematically compare their categorization against these prior frameworks." 74 }, 75 "baselines_contemporary": { 76 "applies": true, 77 "answer": false, 78 "justification": "No systematic comparison with any prior categorization or survey is provided." 79 }, 80 "ablation_study": { 81 "applies": false, 82 "answer": false, 83 "justification": "Not applicable to a survey/categorization paper — no system components to ablate." 
84 }, 85 "multiple_metrics": { 86 "applies": false, 87 "answer": false, 88 "justification": "Not applicable to a survey/categorization paper — no quantitative evaluation performed." 89 }, 90 "human_evaluation": { 91 "applies": false, 92 "answer": false, 93 "justification": "Not applicable to a survey/categorization paper — no system outputs to evaluate." 94 }, 95 "held_out_test_set": { 96 "applies": false, 97 "answer": false, 98 "justification": "Not applicable to a survey/categorization paper — no predictive model or test set." 99 }, 100 "per_category_breakdown": { 101 "applies": true, 102 "answer": true, 103 "justification": "Tables 2, 3, and 4 provide detailed per-class breakdowns of all 17 prompt injection types across 6 direct and 4 indirect classes, with descriptions, objectives, targets, and sources for each." 104 }, 105 "failure_cases_discussed": { 106 "applies": true, 107 "answer": true, 108 "justification": "Section 5.5 discusses prompt injections that could not be verified ('we preferred omitting questionable or contested prompt injections') and Section 3 notes that 'due to the rapid patching of these vulnerabilities, they do not always work without additional prompt engineering or in some cases they do not work at all anymore.'" 109 }, 110 "negative_results_reported": { 111 "applies": true, 112 "answer": true, 113 "justification": "The paper reports that some prompt injections no longer work due to patching, that indirect prompt injections could not be ethically tested, and that some found injections were omitted due to insufficient documentation (Section 3, Section 5.4, Section 5.5)." 114 } 115 }, 116 "claims_and_evidence": { 117 "abstract_claims_supported": { 118 "applies": true, 119 "answer": true, 120 "justification": "The abstract claims: (1) overview of prompt injection threat — supported by Sections 2 and 4; (2) categorization — supported by Tables 2-4 and Section 4; (3) implications discussion — supported by Section 5. 
All abstract claims are substantiated in the paper." 121 }, 122 "causal_claims_justified": { 123 "applies": false, 124 "answer": false, 125 "justification": "The paper makes no causal claims. It is a descriptive categorization/taxonomy paper that catalogs and organizes existing prompt injection types." 126 }, 127 "generalization_bounded": { 128 "applies": true, 129 "answer": false, 130 "justification": "The title claims broad applicability to 'Large Language Models' but Section 5.5 acknowledges 'most prompt injections have been demonstrated on only one or two LLM interfaces, with ChatGPT and GPT-3 or GPT-4 being by far the most common targets.' The authors assume injections 'could either directly or with moderate to significant altering be applied to other chatbots' without evidence." 131 }, 132 "alternative_explanations_discussed": { 133 "applies": true, 134 "answer": false, 135 "justification": "The paper does not discuss alternative categorization frameworks in depth, alternative explanations for why certain injection classes work, or whether the observed patterns could be artifacts of the sources reviewed (e.g., ChatGPT dominance in sources biasing the taxonomy)." 136 }, 137 "proxy_outcome_distinction": { 138 "applies": true, 139 "answer": true, 140 "justification": "The paper's claims match the granularity of its observations. It claims to categorize known prompt injection types and directly presents that categorization — no proxy gap exists between what was observed and what is claimed." 141 } 142 }, 143 "setup_transparency": { 144 "model_versions_specified": { 145 "applies": true, 146 "answer": false, 147 "justification": "The paper refers to 'ChatGPT', 'GPT-3', 'GPT-4', and 'Bing AI' without specifying exact model versions, snapshot dates, or API versions. No version identifiers like 'gpt-3.5-turbo-0613' are provided." 
148 }, 149 "prompts_provided": { 150 "applies": true, 151 "answer": false, 152 "justification": "Section 5.4 explicitly states: 'besides the example in Table 1, we refrained from providing the prompt texts for the various prompt injection classes directly.' The actual prompts used for testing are not included in the paper." 153 }, 154 "hyperparameters_reported": { 155 "applies": true, 156 "answer": false, 157 "justification": "No hyperparameters (temperature, top-p, etc.) are reported for the ChatGPT or GPT-3 testing. The paper does not describe any API settings used during verification." 158 }, 159 "scaffolding_described": { 160 "applies": false, 161 "answer": false, 162 "justification": "No agentic scaffolding is used. The paper involves manual testing of prompts through chatbot interfaces." 163 }, 164 "data_preprocessing_documented": { 165 "applies": true, 166 "answer": true, 167 "justification": "Section 3 describes the search and filtering process: keyword searches on Google Scholar, arXiv, GitHub, Medium, Twitter for 'prompt injection' and 'jailbreak'; reviewing jailbreakchat.com and Reddit; academic review from May-September 2023; verification through multiple sources and own testing. Section 2.2 states 123 papers were found via Google Scholar. Filtering criteria are described (must discuss prompt injections specifically, must have multiple sources or be verified by tests)." 168 } 169 }, 170 "limitations_and_scope": { 171 "limitations_section_present": { 172 "applies": true, 173 "answer": true, 174 "justification": "Section 5.5 is titled 'Limitations' and provides substantive discussion of specific limitations across multiple paragraphs." 
175 }, 176 "threats_to_validity_specific": { 177 "applies": true, 178 "answer": true, 179 "justification": "Section 5.5 identifies threats specific to this study: (1) categorization is not exhaustive due to rapidly evolving landscape, (2) some prompt injections were omitted due to inability to verify, (3) most injections demonstrated on only ChatGPT/GPT with uncertain generalizability to other LLMs." 180 }, 181 "scope_boundaries_stated": { 182 "applies": true, 183 "answer": true, 184 "justification": "The paper states specific scope boundaries: indirect prompt injections were not tested for ethical reasons (Section 5.4), temporal scope is May-September 2023, only English-language sources were reviewed (Section 2.2), and the categorization may become obsolete as defenses improve (Section 5.6)." 185 } 186 }, 187 "data_integrity": { 188 "raw_data_available": { 189 "applies": true, 190 "answer": false, 191 "justification": "No raw data is released. The collected prompt injection examples, sources reviewed, and verification results are not available as downloadable data. Only summary tables appear in the paper." 192 }, 193 "data_collection_described": { 194 "applies": true, 195 "answer": true, 196 "justification": "Section 3 describes data collection in detail: keyword searches across multiple platforms (Google, Google Scholar, arXiv, GitHub, Medium, Twitter), specific websites reviewed (jailbreakchat.com, Reddit channels ChatGPTJailbreak and ChatGPT), time period (May-June 2023 initial, September 2023 supplementary), and a verification process requiring multiple sources or own testing." 197 }, 198 "recruitment_methods_described": { 199 "applies": false, 200 "answer": false, 201 "justification": "No human participants. The study reviews publicly available sources (academic papers, websites, forums) — no recruitment involved." 
202 }, 203 "data_pipeline_documented": { 204 "applies": true, 205 "answer": false, 206 "justification": "While Figure 1 shows the categorization process and Section 3 describes the general pipeline, key counts are missing. The paper states 123 academic papers were found and 17 prompt injection types were identified, but does not document how many total prompt injection examples were initially collected, how many were filtered at each stage, or how many were rejected during verification." 207 } 208 }, 209 "conflicts_of_interest": { 210 "funding_disclosed": { 211 "applies": true, 212 "answer": false, 213 "justification": "No funding acknowledgment or statement appears in the paper." 214 }, 215 "affiliations_disclosed": { 216 "applies": true, 217 "answer": true, 218 "justification": "Author affiliations are clearly listed: Copenhagen Business School (Rossi, Michel, Mukkamala) and Temple University (Thatcher). No evaluated product is affiliated with these institutions." 219 }, 220 "funder_independent_of_outcome": { 221 "applies": false, 222 "answer": false, 223 "justification": "No funding disclosed; appears to be unfunded academic research from two universities." 224 }, 225 "financial_interests_declared": { 226 "applies": true, 227 "answer": false, 228 "justification": "No competing interests or financial interests statement appears in the paper." 229 } 230 }, 231 "contamination": { 232 "training_cutoff_stated": { 233 "applies": false, 234 "answer": false, 235 "justification": "Survey/categorization paper that does not evaluate a pre-trained model's capability on any benchmark." 236 }, 237 "train_test_overlap_discussed": { 238 "applies": false, 239 "answer": false, 240 "justification": "Survey/categorization paper that does not evaluate a pre-trained model on any benchmark." 
241 }, 242 "benchmark_contamination_addressed": { 243 "applies": false, 244 "answer": false, 245 "justification": "Survey/categorization paper that does not evaluate a pre-trained model on any benchmark." 246 } 247 }, 248 "human_studies": { 249 "pre_registered": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study. It is a literature and web review of prompt injection attacks." 253 }, 254 "irb_or_ethics_approval": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants. The ethics section (5.4) addresses ethical concerns of publishing prompt injection details, not human subjects research." 258 }, 259 "demographics_reported": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "inclusion_exclusion_criteria": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "randomization_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "blinding_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 }, 279 "attrition_reported": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants in this study." 283 } 284 }, 285 "cost_and_practicality": { 286 "inference_cost_reported": { 287 "applies": false, 288 "answer": false, 289 "justification": "Survey/categorization paper with no computational method of its own." 290 }, 291 "compute_budget_stated": { 292 "applies": false, 293 "answer": false, 294 "justification": "Survey/categorization paper with no significant computational work." 
295 } 296 }, 297 "survey_methodology": { 298 "prisma_or_structured_protocol": { 299 "applies": true, 300 "answer": false, 301 "justification": "The paper describes a search strategy (Section 3) with keywords and platforms but does not follow PRISMA or any formal review protocol. No flow diagram with counts at each stage, no protocol registration, and the search strategy is not fully reproducible (no exact queries documented)." 302 }, 303 "quality_assessment_of_sources": { 304 "applies": true, 305 "answer": false, 306 "justification": "The paper does not assess the quality of its source papers or non-academic sources using any rubric. Sources are verified through corroboration across multiple sources and the authors' own testing, but no quality scoring is applied. All verified sources are treated equally regardless of their methodological rigor." 307 }, 308 "publication_bias_discussed": { 309 "applies": true, 310 "answer": false, 311 "justification": "No discussion of publication bias. The paper does not consider whether its sources (heavily biased toward ChatGPT/GPT, toward successful attacks, and toward attacks that generate attention online) might skew the resulting categorization." 
312 } 313 } 314 }, 315 "claims": [ 316 { 317 "claim": "Seventeen distinct prompt injection attack variations were identified, organized into 6 direct and 4 indirect classes.", 318 "evidence": "Section 4 and Tables 2-4 present the full categorization with descriptions, objectives, and sources for each of the 17 examples across 10 classes.", 319 "supported": "moderate" 320 }, 321 { 322 "claim": "Direct prompt injections primarily aim to bypass security measures in LLM interfaces to produce malicious outputs.", 323 "evidence": "Table 2 shows all 6 direct injection classes share the objective of bypassing security measures; Section 4.1 provides descriptions and demonstrations.", 324 "supported": "moderate" 325 }, 326 { 327 "claim": "Indirect prompt injections have more varied goals resembling traditional cyberattacks, including data exfiltration and misinformation.", 328 "evidence": "Table 3 describes 4 indirect injection classes with distinct objectives: stealing data, tricking LLMs into malicious actions, tricking users, and manipulating training data.", 329 "supported": "moderate" 330 }, 331 { 332 "claim": "Developing a fully safe LLM interface is difficult if not impossible.", 333 "evidence": "Section 5.1 argues this based on the observation that even premier AI labs have difficulty blocking prompt injections and that computational approaches (Zou et al., 2023) are maturing. No direct empirical evidence is provided for the impossibility claim.", 334 "supported": "weak" 335 }, 336 { 337 "claim": "Security measures can be bypassed with relatively simple prompt injections.", 338 "evidence": "Section 4.1 states this and Table 1 provides one example of a simple persona-based bypass. 
However, most prompt texts are not provided and systematic success rates are not reported.", 339 "supported": "weak" 340 } 341 ], 342 "red_flags": [ 343 { 344 "flag": "Heavy reliance on non-academic sources", 345 "detail": "The categorization draws heavily from Reddit, jailbreakchat.com, blogs, and social media posts — sources without peer review. The paper acknowledges this but the resulting taxonomy may reflect what is popular online rather than what is technically important." 346 }, 347 { 348 "flag": "No quality assessment of sources", 349 "detail": "The survey treats all verified prompt injection sources equally without assessing their methodological quality, reliability, or significance. This obscures the variable quality of the underlying sources." 350 }, 351 { 352 "flag": "No quantitative analysis of attack effectiveness", 353 "detail": "No success rates, no systematic comparison of attack effectiveness across models, no metrics on how many attempts were needed. The categorization is purely qualitative with no quantitative validation." 354 }, 355 { 356 "flag": "Actual prompts withheld", 357 "detail": "For ethical reasons (Section 5.4), the paper does not provide the actual prompt texts used for testing. While ethically motivated, this makes independent verification of the categorization impossible." 358 }, 359 { 360 "flag": "ChatGPT/GPT testing bias", 361 "detail": "Most prompt injections were tested only on ChatGPT and GPT-3/4. The categorization may reflect vulnerabilities specific to OpenAI's products rather than universal LLM weaknesses, despite the broad title." 362 } 363 ], 364 "cited_papers": [ 365 { 366 "title": "Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection", 367 "authors": ["K. Greshake", "S. Abdelnabi", "S. Mishra", "C. Endres", "T. Holz", "M. 
Fritz"], 368 "year": 2023, 369 "arxiv_id": "2302.12173", 370 "relevance": "Foundational work on indirect prompt injection attacks against LLM-integrated applications, including data exfiltration and social engineering scenarios." 371 }, 372 { 373 "title": "Ignore previous prompt: Attack techniques for language models", 374 "authors": ["F. Perez", "I. Ribeiro"], 375 "year": 2022, 376 "arxiv_id": "2211.09527", 377 "relevance": "Early systematic study of prompt injection attack techniques including goal hijacking and prompt leaking against language models." 378 }, 379 { 380 "title": "Exploiting programmatic behavior of LLMs: Dual-use through standard security attacks", 381 "authors": ["D. Kang", "X. Li", "I. Stoica", "C. Guestrin", "M. Zaharia", "T. Hashimoto"], 382 "year": 2023, 383 "arxiv_id": "2302.05733", 384 "relevance": "Demonstrates payload splitting and obfuscation attacks that exploit the programmatic behavior of LLMs for dual-use purposes." 385 }, 386 { 387 "title": "Universal and transferable adversarial attacks on aligned language models", 388 "authors": ["A. Zou", "Z. Wang", "J.Z. Kolter", "M. Fredrikson"], 389 "year": 2023, 390 "arxiv_id": "2307.15043", 391 "relevance": "Introduces computationally generated adversarial suffixes that transfer across multiple LLMs including ChatGPT, Bard, and Claude." 392 }, 393 { 394 "title": "Do anything now: Characterizing and evaluating in-the-wild jailbreak prompts on large language models", 395 "authors": ["X. Shen", "Z. Chen", "M. Backes", "Y. Shen", "Y. Zhang"], 396 "year": 2023, 397 "arxiv_id": "2308.03825", 398 "relevance": "Large-scale characterization and categorization of real-world jailbreak prompts collected from online communities." 399 }, 400 { 401 "title": "Virtual prompt injection for instruction-tuned large language models", 402 "authors": ["J. Yan", "V. Yadav", "S. Li", "L. Chen", "Z. Tang", "H. Wang", "V. Srinivasan", "X. Ren", "H. 
Jin"], 403 "year": 2023, 404 "arxiv_id": "2307.16888", 405 "relevance": "Demonstrates training-time prompt injection where poisoned instruction-tuning data causes misaligned model behavior without runtime prompt manipulation." 406 }, 407 { 408 "title": "Multi-step jailbreaking privacy attacks on ChatGPT", 409 "authors": ["H. Li", "D. Guo", "W. Fan", "M. Xu", "Y. Song"], 410 "year": 2023, 411 "arxiv_id": "2304.05197", 412 "relevance": "Demonstrates multi-step jailbreaking techniques for extracting private information from ChatGPT." 413 }, 414 { 415 "title": "Evaluating the instruction-following robustness of large language models to prompt injection", 416 "authors": ["Z. Li", "B. Peng", "P. He", "X. Yan"], 417 "year": 2023, 418 "arxiv_id": "2308.10819", 419 "relevance": "Proposes benchmarks for evaluating LLM robustness against prompt injection attacks, relevant to standardized safety testing." 420 }, 421 { 422 "title": "Latent jailbreak: A benchmark for evaluating text safety and output robustness of large language models", 423 "authors": ["H. Qiu", "S. Zhang", "A. Li", "H. He", "Z. Lan"], 424 "year": 2023, 425 "arxiv_id": "2307.08487", 426 "relevance": "Proposes a benchmark specifically for evaluating text safety and output robustness of LLMs against jailbreak-style attacks." 427 }, 428 { 429 "title": "Use of LLMs for illicit purposes: Threats, prevention measures, and vulnerabilities", 430 "authors": ["M. Mozes", "X. He", "B. Kleinberg", "L.D. Griffin"], 431 "year": 2023, 432 "arxiv_id": "2308.12833", 433 "relevance": "Comprehensive review of LLM misuse threats including prompt injection, relevant to understanding the security landscape." 434 }, 435 { 436 "title": "On the opportunities and risks of foundation models", 437 "authors": ["R. Bommasani", "D.A. Hudson", "E. 
Adeli"], 438 "year": 2021, 439 "arxiv_id": "2108.07258", 440 "relevance": "Foundational position paper on risks and opportunities of large foundation models, providing context for security concerns." 441 } 442 ], 443 "engagement_factors": { 444 "practical_relevance": { 445 "score": 2, 446 "justification": "Provides a categorization that developers can use as a checklist for LLM interface security, though no tools or code are released." 447 }, 448 "surprise_contrarian": { 449 "score": 0, 450 "justification": "Confirms widely known concerns about prompt injection vulnerabilities rather than challenging conventional wisdom." 451 }, 452 "fear_safety": { 453 "score": 2, 454 "justification": "Systematically documents AI security vulnerabilities including data exfiltration and training data poisoning scenarios." 455 }, 456 "drama_conflict": { 457 "score": 1, 458 "justification": "Highlights the cat-and-mouse dynamic between attackers and LLM developers but presents no major controversy." 459 }, 460 "demo_ability": { 461 "score": 0, 462 "justification": "No code, demo, or tool released. Actual attack prompts are deliberately withheld for ethical reasons." 463 }, 464 "brand_recognition": { 465 "score": 1, 466 "justification": "Discusses ChatGPT, GPT-4, and Bing AI prominently, but the paper itself is from Copenhagen Business School and Temple University, not a major AI lab." 467 } 468 } 469 }