ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v4.json (16247B)


      1 {
      2   "scan_version": 4,
      3   "paper_type": "survey",
      4   "paper": {
      5     "title": "An Early Categorization of Prompt Injection Attacks on Large Language Models",
      6     "authors": [
      7       "Sippo Rossi",
      8       "Alisia Marianne Michel",
      9       "Raghava Rao Mukkamala",
     10       "Jason Bennett Thatcher"
     11     ],
     12     "year": 2024,
     13     "venue": "arXiv.org",
     14     "arxiv_id": "2402.00898",
     15     "doi": "10.48550/arXiv.2402.00898"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The abstract claims: (1) overview of prompt injection threat — supported by Sections 2 and 4; (2) categorization — supported by Tables 2-4 and Section 4; (3) implications discussion — supported by Section 5. All abstract claims are substantiated in the paper.",
     23         "source": "opus"
     24       },
     25       "causal_claims_justified": {
     26         "applies": false,
     27         "answer": false,
     28         "justification": "The paper makes no causal claims. It is a descriptive categorization/taxonomy paper that catalogs and organizes existing prompt injection types.",
     29         "source": "opus"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The title claims broad applicability to 'Large Language Models' but Section 5.5 acknowledges 'most prompt injections have been demonstrated on only one or two LLM interfaces, with ChatGPT and GPT-3 or GPT-4 being by far the most common targets.' The authors assume injections 'could either directly or with moderate to significant altering be applied to other chatbots' without evidence.",
     35         "source": "opus"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper does not discuss alternative categorization frameworks in depth, alternative explanations for why certain injection classes work, or whether the observed patterns could be artifacts of the sources reviewed (e.g., ChatGPT dominance in sources biasing the taxonomy).",
     41         "source": "opus"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper's claims match the granularity of its observations. It claims to categorize known prompt injection types and directly presents that categorization — no proxy gap exists between what was observed and what is claimed.",
     47         "source": "opus"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Section 5.5 is titled 'Limitations' and provides substantive discussion of specific limitations across multiple paragraphs.",
     55         "source": "opus"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Section 5.5 identifies threats specific to this study: (1) categorization is not exhaustive due to rapidly evolving landscape, (2) some prompt injections were omitted due to inability to verify, (3) most injections demonstrated on only ChatGPT/GPT with uncertain generalizability to other LLMs.",
     61         "source": "opus"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "The paper states specific scope boundaries: indirect prompt injections were not tested for ethical reasons (Section 5.4), temporal scope is May-September 2023, only English-language sources were reviewed (Section 2.2), and the categorization may become obsolete as defenses improve (Section 5.6).",
     67         "source": "opus"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding acknowledgment or statement appears in the paper.",
     75         "source": "opus"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Author affiliations are clearly listed: Copenhagen Business School (Rossi, Michel, Mukkamala) and Temple University (Thatcher). No evaluated product is affiliated with these institutions.",
     81         "source": "opus"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "No funding disclosed; appears to be unfunded academic research from two universities.",
     87         "source": "opus"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests or financial interests statement appears in the paper.",
     93         "source": "opus"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Prompt injection is defined by analogy to SQL injection in the introduction; direct vs. indirect injections are defined; LLMs are distinguished from AI chatbots in Section 2.1.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The contribution is stated explicitly as two-fold: a comprehensive documented list of known prompt injection types, and a categorization checklist for developers and end users.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section 2.2 reviews prior academic work including Greshake et al. (2023), Perez and Ribeiro (2022), Shen et al. (2023), and Zou et al. (2023), building the taxonomy directly on these foundations.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "survey": {
    119       "search_and_selection": {
    120         "search_strategy_reproducible": {
    121           "applies": true,
    122           "answer": false,
     123           "justification": "The methodology mentions searching Google, Google Scholar, arXiv, GitHub, Medium, and Twitter but provides no Boolean queries, result counts, or exact search strings sufficient for replication.",
    124           "source": "haiku"
    125         },
    126         "inclusion_exclusion_explicit": {
    127           "applies": true,
    128           "answer": false,
    129           "justification": "Academic inclusion criteria are stated (English, post-May 2022, adversarial framing) but not applied in a documented fashion; non-academic inclusion relies on ad hoc multi-source corroboration without formal criteria.",
    130           "source": "haiku"
    131         },
    132         "prisma_or_structured_protocol": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "No PRISMA flowchart or other structured review protocol is used; the methodology is described narratively without formal screening stages or counts.",
    136           "source": "haiku"
    137         },
    138         "search_terms_provided": {
    139           "applies": true,
    140           "answer": true,
    141           "justification": "The paper explicitly states that searches used 'prompt injection' and 'jailbreak' as keywords across all listed databases.",
    142           "source": "haiku"
    143         },
    144         "databases_listed": {
    145           "applies": true,
    146           "answer": true,
     147           "justification": "Six sources are explicitly named: Google, Google Scholar, arXiv, GitHub, Medium, and Twitter (now X).",
    148           "source": "haiku"
    149         },
    150         "screening_process_documented": {
    151           "applies": true,
    152           "answer": false,
    153           "justification": "The paper states 123 academic papers were found but provides no breakdown of how many were excluded at each stage, no counts for non-academic sources screened, and no screening flowchart.",
    154           "source": "haiku"
    155         },
    156         "review_scope_justified": {
    157           "applies": true,
    158           "answer": true,
    159           "justification": "The temporal scope (May 2022 onward) is justified by the emergence of prompt injections at that time; inclusion of non-academic sources is justified by the pre-publication and rapidly evolving nature of the field.",
    160           "source": "haiku"
    161         }
    162       },
    163       "synthesis_quality": {
    164         "conflicting_findings_acknowledged": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "No conflicting findings across reviewed sources are discussed; the paper presents a single unified taxonomy without noting where sources disagree on categorization or definitions.",
    168           "source": "haiku"
    169         },
    170         "quality_assessment_of_sources": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "No formal quality rubric is applied; non-academic sources are included if backed by 'multiple sources with credible screenshots', which is an informal and inconsistently documented criterion.",
    174           "source": "haiku"
    175         },
    176         "publication_bias_discussed": {
    177           "applies": true,
    178           "answer": false,
    179           "justification": "Publication bias is not discussed; the authors note that most literature is preprints rather than peer-reviewed but do not address how the direction or completeness of findings might be skewed.",
    180           "source": "haiku"
    181         },
    182         "quantitative_synthesis_present": {
    183           "applies": true,
    184           "answer": false,
    185           "justification": "No quantitative synthesis is performed; the paper produces a descriptive taxonomy of 17 attack types with no counts of prevalence, attack success rates, or effect aggregation.",
    186           "source": "haiku"
    187         },
    188         "recommendations_supported_by_evidence": {
    189           "applies": true,
    190           "answer": true,
    191           "justification": "Recommendations in Sections 5.1-5.2 are directly tied to specific documented attack classes — e.g., recommending caution with copied prompts is grounded in the documented user-driven injection class.",
    192           "source": "haiku"
    193         }
    194       }
    195     }
    196   },
    197   "claims": [
    198     {
    199       "claim": "Prompt injection attacks can be organized into two branches (direct/indirect) with 10 classes covering 17 documented variants.",
    200       "evidence": "Tables 2-3 and Appendix A document each class and variant with sources; the taxonomy is the core deliverable.",
    201       "supported": "moderate"
    202     },
    203     {
    204       "claim": "Developing a fully safe LLM interface is difficult if not impossible.",
    205       "evidence": "Asserted in Section 5.1 based on premier AI labs failing to prevent attacks; no systematic evidence or success rate data is provided.",
    206       "supported": "weak"
    207     },
    208     {
    209       "claim": "Indirect prompt injections can exfiltrate private user data via browser plugins and email clients.",
    210       "evidence": "Cited to Burgess (2023) and demonstrated via Greshake et al. (2023) examples; evidence is secondary and journalistic rather than peer-reviewed.",
    211       "supported": "moderate"
    212     },
    213     {
    214       "claim": "Virtual prompt injection can misalign a large share of LLM outputs with very few poisoned training examples.",
    215       "evidence": "Attributed to Yan et al. (2023) who demonstrated the attack empirically; the claim is borrowed from cited work.",
    216       "supported": "moderate"
    217     },
    218     {
    219       "claim": "Simple prompt injections can bypass LLM safety measures to produce malicious content including malware.",
    220       "evidence": "Illustrated by the grandmother example in Table 1 and Mulgrew (2023) blog post; verification limited to ChatGPT and GPT-3.",
    221       "supported": "weak"
    222     }
    223   ],
    224   "methodology_tags": [
    225     "qualitative",
    226     "case-study"
    227   ],
    228   "key_findings": "The paper taxonomizes prompt injection attacks into two branches — direct (6 classes: double character, virtualization, obfuscation, payload splitting, adversarial suffix, instruction manipulation) and indirect (4 classes: active, passive, user-driven, virtual prompt injection) — documenting 17 specific verified examples. The review draws heavily on non-peer-reviewed sources due to the topic's novelty at time of writing. The authors conclude that LLM interfaces cannot currently be made fully secure and call for standardized vulnerability test suites and developer best practices analogous to SQL injection defense patterns.",
    229   "red_flags": [
    230     {
    231       "flag": "Non-peer-reviewed sources dominate",
    232       "detail": "The survey heavily relies on blogs, tweets, Reddit posts, and jailbreakchat.com with informal multi-source corroboration as the quality check, with no formal rubric applied."
    233     },
    234     {
    235       "flag": "No PRISMA or structured protocol",
    236       "detail": "No formal screening protocol is used; 123 academic papers are mentioned but no exclusion counts or screening stages are documented."
    237     },
    238     {
    239       "flag": "No competing taxonomy comparison",
    240       "detail": "Shen et al. (2023) already proposed a jailbreak categorization; this paper notes it but does not compare or validate its own scheme against the existing one."
    241     },
    242     {
    243       "flag": "No funding or competing interests disclosure",
    244       "detail": "Neither a funding acknowledgment nor a competing interests statement appears anywhere in the paper."
    245     },
    246     {
    247       "flag": "No quantitative synthesis",
     248       "detail": "Despite identifying 123 academic papers, no quantitative aggregation of attack prevalence, success rates, or defense effectiveness is provided."
    249     }
    250   ],
    251   "cited_papers": [
    252     {
    253       "title": "Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection",
    254       "relevance": "Core reference establishing the direct/indirect injection taxonomy that this paper extends"
    255     },
    256     {
    257       "title": "Ignore Previous Prompt: Attack Techniques for Language Models",
    258       "relevance": "Foundational paper identifying goal hijacking and prompt leaking attack classes"
    259     },
    260     {
    261       "title": "Universal and Transferable Adversarial Attacks on Aligned Language Models",
    262       "relevance": "Key reference for the adversarial suffix attack class (computational bypass of alignment)"
    263     },
    264     {
    265       "title": "Virtual Prompt Injection for Instruction-Tuned Large Language Models",
    266       "relevance": "Core reference for the virtual prompt injection class via training data poisoning"
    267     },
    268     {
    269       "title": "Exploiting Programmatic Behavior of LLMs: Dual-use through standard security attacks",
    270       "relevance": "Reference for payload splitting and obfuscation attack classes"
    271     },
    272     {
    273       "title": "Do Anything Now: Characterizing and Evaluating In-the-Wild Jailbreak Prompts on Large Language Models",
    274       "relevance": "Prior competing taxonomy for jailbreak communities that this paper builds on"
    275     },
    276     {
    277       "title": "Evaluating the Instruction-Following Robustness of Large Language Models to Prompt Injection",
    278       "relevance": "Benchmark approach for evaluating LLM robustness to prompt injection attacks"
    279     }
    280   ],
    281   "engagement_factors": {
    282     "practical_relevance": {
    283       "score": 2,
    284       "justification": "Provides a categorization that developers can use as a checklist for LLM interface security, though no tools or code are released."
    285     },
    286     "surprise_contrarian": {
    287       "score": 0,
    288       "justification": "Confirms widely known concerns about prompt injection vulnerabilities rather than challenging conventional wisdom."
    289     },
    290     "fear_safety": {
    291       "score": 2,
    292       "justification": "Systematically documents AI security vulnerabilities including data exfiltration and training data poisoning scenarios."
    293     },
    294     "drama_conflict": {
    295       "score": 1,
    296       "justification": "Highlights the cat-and-mouse dynamic between attackers and LLM developers but presents no major controversy."
    297     },
    298     "demo_ability": {
    299       "score": 0,
    300       "justification": "No code, demo, or tool released. Actual attack prompts are deliberately withheld for ethical reasons."
    301     },
    302     "brand_recognition": {
    303       "score": 1,
     304       "justification": "Discusses ChatGPT, GPT-4, and Bing AI prominently, but the paper itself is from Copenhagen Business School and Temple University, not a major AI lab."
    305     }
    306   },
    307   "hn_data": {
    308     "threads": [],
    309     "top_points": 0,
    310     "total_points": 0,
    311     "total_comments": 0
    312   }
    313 }

Impressum · Datenschutz