scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (17071B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "survey",
      4   "paper": {
      5     "title": "An Early Categorization of Prompt Injection Attacks on Large Language Models",
      6     "authors": [
      7       "Sippo Rossi",
      8       "Alisia Marianne Michel",
      9       "Raghava Rao Mukkamala",
     10       "Jason Bennett Thatcher"
     11     ],
     12     "year": 2024,
     13     "venue": "arXiv.org",
     14     "arxiv_id": "2402.00898",
     15     "doi": "10.48550/arXiv.2402.00898"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The abstract claims to provide an overview and categorization of prompt injections and discuss implications — all three are delivered in the paper's body with Tables 2/3 and Sections 5.1–5.3.",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": false,
     27         "answer": false,
     28         "justification": "The paper is a descriptive taxonomy with no causal claims of the form 'X causes Y'; no study design question arises.",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "Authors explicitly state the categorization is 'not exhaustive,' that most attacks were demonstrated on only one or two LLM interfaces (mainly ChatGPT/GPT-3), and that generalizing to other interfaces requires 'moderate to significant altering.'",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": false,
     39         "answer": false,
     40         "justification": "The paper is a taxonomy with no hypothesis testing; there are no empirical findings for which alternative explanations would be relevant.",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper documents the existence and structure of attack types; what is measured (documented examples and tests) matches exactly what is claimed (a taxonomy of prompt injection classes).",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Section 5.5 is explicitly titled 'Limitations' and spans a full paragraph with multiple distinct concerns.",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Authors identify specific threats: the categorization is incomplete because new attack types emerge continuously; most examples were demonstrated on ChatGPT/GPT-3 only and may not transfer to other interfaces; some injections could not be verified due to rapid patching.",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "The paper explicitly states it does not catalog which injections have been patched, that indirect injections were not empirically tested due to ethical concerns, and that the categorization excludes injections backed by only one source.",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "There is no funding acknowledgement or disclosure anywhere in the paper.",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Author affiliations are clearly stated on the first page: Copenhagen Business School and Temple University — both academic institutions with no apparent LLM product connection.",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "No funding is disclosed, so this criterion is not applicable.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "There is no competing interests statement or declaration of financial interests anywhere in the paper.",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "The paper defines 'prompt injection' by analogy to SQL injection in the introduction, and explicitly defines 'direct' vs. 'indirect' prompt injections, as well as each of the 10 subclasses in Tables 2 and 3.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The contribution is stated explicitly in Section 1: '(1) describe, document, and provide a comprehensive list of known types of prompt injections; (2) provide a checklist for developers and end users.'",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The paper's literature review (Section 2.2) directly engages with Perez & Ribeiro (2022), Greshake et al. (2023), Kang et al. (2023), Zou et al. (2023), and Shen et al. (2023), showing how this work extends the prior direct/indirect dichotomy into a finer-grained taxonomy.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "survey": {
    119       "search_and_selection": {
    120         "search_strategy_reproducible": {
    121           "applies": true,
    122           "answer": true,
    123           "justification": "The paper describes searching Google Scholar, Google, arXiv, GitHub, Medium, and Twitter/X with the keywords 'prompt injection' and 'jailbreak' over two rounds (May–June 2023 and September 2023), and names specific community sites (jailbreakchat.com, Reddit channels).",
    124           "source": "haiku"
    125         },
    126         "inclusion_exclusion_explicit": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "For academic papers: published May 2022–September 2023, in English, discussing prompt injections as adversarial/security threat. For non-academic: must be documented by multiple sources or independently verified by the authors' own tests.",
    130           "source": "haiku"
    131         },
    132         "prisma_or_structured_protocol": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "No mention of PRISMA or any other structured review protocol; no flow diagram of paper screening stages is provided.",
    136           "source": "haiku"
    137         },
    138         "search_terms_provided": {
    139           "applies": true,
    140           "answer": true,
    141           "justification": "The paper explicitly states it used 'prompt injection' and 'jailbreak' as search keywords across all databases.",
    142           "source": "haiku"
    143         },
    144         "databases_listed": {
    145           "applies": true,
    146           "answer": true,
    147           "justification": "Six databases/platforms are named: Google Scholar, Google, arXiv, GitHub, Medium, and Twitter (X), plus jailbreakchat.com and two Reddit communities.",
    148           "source": "haiku"
    149         },
    150         "screening_process_documented": {
    151           "applies": true,
    152           "answer": false,
    153           "justification": "The paper reports finding 123 papers in the academic search and then focusing on those discussing prompt injections 'specifically,' but provides no count-by-stage screening table or flow of how the 123 papers were reduced to the cited subset.",
    154           "source": "haiku"
    155         },
    156         "review_scope_justified": {
    157           "applies": true,
    158           "answer": true,
    159           "justification": "The authors justify the temporal scope (from May 2022 onward) by noting that this is when prompt injections were first discovered and reported; the topic's novelty and its reliance on preprints and non-academic sources are also explained.",
    160           "source": "haiku"
    161         }
    162       },
    163       "synthesis_quality": {
    164         "conflicting_findings_acknowledged": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "The paper presents different attack types as complementary categories and never discusses cases where reviewed sources contradicted each other or offered conflicting empirical results.",
    168           "source": "haiku"
    169         },
    170         "quality_assessment_of_sources": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "No quality rubric or risk-of-bias assessment is applied to the reviewed papers; the only quality filter for non-academic sources is 'multiple corroborating sources and credible screenshots.'",
    174           "source": "haiku"
    175         },
    176         "publication_bias_discussed": {
    177           "applies": true,
    178           "answer": false,
    179           "justification": "Publication bias is never mentioned; the paper notes most sources are preprints but does not discuss how that skews the evidence base.",
    180           "source": "haiku"
    181         },
    182         "quantitative_synthesis_present": {
    183           "applies": true,
    184           "answer": false,
    185           "justification": "The synthesis is purely narrative and taxonomic; no meta-analysis, vote counting, or effect-size aggregation is attempted.",
    186           "source": "haiku"
    187         },
    188         "recommendations_supported_by_evidence": {
    189           "applies": true,
    190           "answer": true,
    191           "justification": "Developer and end-user recommendations in Sections 5.1–5.2 are directly tied to the identified attack classes (e.g., 'avoid sensitive data in system prompts' follows directly from the documented instruction-manipulation class).",
    192           "source": "haiku"
    193         }
    194       }
    195     }
    196   },
    197   "claims": [
    198     {
    199       "claim": "Prompt injections can be divided into two broad branches: direct (6 classes) and indirect (4 classes), with 17 distinct variations identified.",
    200       "evidence": "Tables 2, 3, and 4 enumerate and define all 17 variations with sources for each.",
    201       "supported": "moderate"
    202     },
    203     {
    204       "claim": "Direct prompt injections primarily aim to bypass content-filtering safeguards, while indirect injections have broader and more varied cyber-attack-like objectives.",
    205       "evidence": "Objectives column in Tables 2 and 3 consistently shows 'bypass security measures' for direct and data exfiltration/manipulation goals for indirect.",
    206       "supported": "strong"
    207     },
    208     {
    209       "claim": "Most documented prompt injection attacks have been demonstrated on ChatGPT, GPT-3, or GPT-4, limiting generalizability to other LLM interfaces.",
    210       "evidence": "Target column in Table 4 shows ChatGPT or GPT-3/4 for 14 of 17 examples; acknowledged as limitation in Section 5.5.",
    211       "supported": "strong"
    212     },
    213     {
    214       "claim": "Developing a fully safe LLM interface against prompt injection is 'difficult if not impossible.'",
    215       "evidence": "Cited as the view of premier AI labs and supported by reference to adversarial suffix attacks (Zou et al., 2023), but no systematic evidence is presented.",
    216       "supported": "weak"
    217     },
    218     {
    219       "claim": "Virtual prompt injection can misalign a large share of outputs with a very small number of poisoned training examples.",
    220       "evidence": "Attributed entirely to Yan et al. (2023) without independent verification in this paper.",
    221       "supported": "moderate"
    222     }
    223   ],
    224   "methodology_tags": [
    225     "qualitative",
    226     "case-study"
    227   ],
    228   "key_findings": "The paper proposes an early taxonomy of 17 prompt injection attack variations grouped into 10 classes across two branches: 6 classes of direct injections (double character, virtualization, obfuscation, payload splitting, adversarial suffix, instruction manipulation) and 4 classes of indirect injections (active, passive, user-driven, virtual). Direct injections primarily bypass content filters while indirect injections enable data exfiltration, misinformation, social engineering, and training-data poisoning. The review is based on a mixed-method literature survey that combines an academic search (123 papers found, then narrowed to those specifically discussing prompt injections) with non-academic sources (Reddit, jailbreakchat.com), with partial empirical verification on ChatGPT and GPT-3. The authors conclude that fully preventing prompt injection is currently infeasible and recommend defensive design principles analogous to SQL-injection-safe database practices.",
    229   "red_flags": [
    230     {
    231       "flag": "No PRISMA or structured protocol",
    232       "detail": "The survey methodology is described narratively but follows no recognized systematic review protocol, making it difficult to assess selection bias or reproducibility rigorously."
    233     },
    234     {
    235       "flag": "Heavy reliance on non-peer-reviewed sources",
    236       "detail": "A significant portion of the evidence base is Reddit posts, blog entries, and jailbreakchat.com, with no quality rubric applied to distinguish reliable from unreliable demonstrations."
    237     },
    238     {
    239       "flag": "No screening flow or stage counts",
    240       "detail": "123 academic papers are found but no documentation is provided on how many were excluded at each stage, making the final included set opaque."
    241     },
    242     {
    243       "flag": "Single-platform generalizability",
    244       "detail": "14 of 17 catalogued attack examples target ChatGPT/GPT-3/4 only; the claim that categories apply generally to other LLM interfaces is asserted, not demonstrated."
    245     },
    246     {
    247       "flag": "No funding disclosure",
    248       "detail": "No acknowledgement or funding statement appears anywhere in the paper."
    249     },
    250     {
    251       "flag": "No quality assessment of sources",
    252       "detail": "No risk-of-bias or quality rating is applied to reviewed papers, treating a preprint and a peer-reviewed venue paper as equivalent evidence."
    253     }
    254   ],
    255   "cited_papers": [
    256     {
    257       "title": "Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection",
    258       "relevance": "Primary academic source for indirect prompt injection taxonomy; directly foundational to the paper's categorization"
    259     },
    260     {
    261       "title": "Ignore Previous Prompt: Attack Techniques for Language Models",
    262       "relevance": "First systematic academic treatment of goal hijacking and prompt leaking; foundational reference"
    263     },
    264     {
    265       "title": "Exploiting Programmatic Behavior of LLMs: Dual-Use Through Standard Security Attacks",
    266       "relevance": "Introduces payload splitting and obfuscation attacks that form two of the six direct injection classes"
    267     },
    268     {
    269       "title": "Universal and Transferable Adversarial Attacks on Aligned Language Models",
    270       "relevance": "Source for adversarial suffix attacks — one of the six direct injection classes and a key automated attack vector"
    271     },
    272     {
    273       "title": "Do Anything Now: Characterizing and Evaluating In-the-Wild Jailbreak Prompts on Large Language Models",
    274       "relevance": "Provides community-level categorization of jailbreak prompts used as prior art for the taxonomy"
    275     },
    276     {
    277       "title": "Virtual Prompt Injection for Instruction-Tuned Large Language Models",
    278       "relevance": "Sole source for the virtual prompt injection class; empirical evidence that few poisoned examples cause large output shifts"
    279     },
    280     {
    281       "title": "Multi-Step Jailbreaking Privacy Attacks on ChatGPT",
    282       "relevance": "Prior academic work on jailbreaking used to situate the direct injection categories"
    283     },
    284     {
    285       "title": "Evaluating the Instruction-Following Robustness of Large Language Models to Prompt Injection",
    286       "relevance": "Early benchmark for measuring LLM robustness to prompt injection — cited in future-work discussion on standardized tests"
    287     }
    288   ],
    289   "engagement_factors": {
    290     "practical_relevance": {
    291       "score": 2,
    292       "justification": "Provides a developer checklist and end-user guidelines directly derived from the taxonomy, though guidance is high-level."
    293     },
    294     "surprise_contrarian": {
    295       "score": 1,
    296       "justification": "Confirms and organizes known threats rather than presenting surprising or counter-intuitive findings."
    297     },
    298     "fear_safety": {
    299       "score": 3,
    300       "justification": "Directly concerns AI safety risks: malware generation, data exfiltration, training-data poisoning — all with real demonstrated examples."
    301     },
    302     "drama_conflict": {
    303       "score": 2,
    304       "justification": "The 'cat-and-mouse' framing and concrete examples (grandma jailbreak, zero-day malware via ChatGPT) carry inherent drama."
    305     },
    306     "demo_ability": {
    307       "score": 2,
    308       "justification": "Many direct injection examples can be attempted (though patching means success varies); Appendix A lists sources with live prompt examples."
    309     },
    310     "brand_recognition": {
    311       "score": 1,
    312       "justification": "Authors are from Copenhagen Business School and Temple University — no famous AI lab affiliation, though ChatGPT/GPT-4/Bing AI are named products throughout."
    313     }
    314   },
    315   "hn_data": {
    316     "threads": [],
    317     "top_points": 0,
    318     "total_points": 0,
    319     "total_comments": 0
    320   }
    321 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs