scan.json (29228B)
1 { 2 "paper": { 3 "title": "Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection", 4 "authors": [ 5 "Kai Greshake", 6 "Sahar Abdelnabi", 7 "Shailesh Mishra", 8 "Christoph Endres", 9 "Thorsten Holz", 10 "Mario Fritz" 11 ], 12 "year": 2023, 13 "venue": "AISec@CCS", 14 "arxiv_id": "2302.12173", 15 "doi": "10.1145/3605764.3623985" 16 }, 17 "scan_version": 3, 18 "active_modules": [], 19 "methodology_tags": ["case-study", "qualitative"], 20 "key_findings": "The paper introduces indirect prompt injection (IPI) as a novel attack vector in which adversaries inject malicious prompts into data retrieved by LLM-integrated applications at inference time, enabling remote compromise without direct model access. Through qualitative demonstrations on synthetic GPT-4 applications and real-world Bing Chat, the authors show that IPI can enable data theft, fraud, malware spreading (including self-replicating prompt worms), remote control intrusion, content manipulation, and denial of service. Code completion engines like GitHub Copilot are also vulnerable via injected comments in imported packages, though efficacy is sensitive to context. The paper develops a comprehensive threat taxonomy and argues that current mitigations (RLHF, input filtering) are inadequate against these attacks.", 21 "checklist": { 22 "artifacts": { 23 "code_released": { 24 "applies": true, 25 "answer": true, 26 "justification": "The paper provides two GitHub repositories: https://github.com/greshake/llm-security (footnote 1) and https://github.com/greshake/lm-safety (Section 5.4). Section 5.4 states 'Our synthetic applications demos are readily available on a public GitHub repository.'" 27 }, 28 "data_released": { 29 "applies": true, 30 "answer": true, 31 "justification": "All attack prompts are provided in the Appendix (Prompts 1-20), and the synthetic application demos with prepared content are released on GitHub. 
Section 1 states they 'share all our demonstrations on our GitHub repository and all developed attack prompts in the Appendix.'" 32 }, 33 "environment_specified": { 34 "applies": true, 35 "answer": false, 36 "justification": "No requirements.txt, Dockerfile, or environment specification is mentioned. The paper mentions using LangChain and OpenAI APIs but does not specify library versions or provide a reproducible environment setup." 37 }, 38 "reproduction_instructions": { 39 "applies": true, 40 "answer": false, 41 "justification": "Section 5.4 states the demos are 'easy to adapt to different OpenAI's APIs or models' but no step-by-step reproduction instructions are provided. The Bing Chat experiments are inherently difficult to reproduce due to the dynamic, black-box nature of the system." 42 } 43 }, 44 "statistical_methodology": { 45 "confidence_intervals_or_error_bars": { 46 "applies": true, 47 "answer": false, 48 "justification": "No quantitative metrics are reported. All demonstrations are qualitative — individual examples with screenshots. No confidence intervals or error bars appear anywhere in the paper." 49 }, 50 "significance_tests": { 51 "applies": true, 52 "answer": false, 53 "justification": "No statistical significance tests are used. The paper acknowledges in Section 5.2 that 'quantifying our attacks' success rate can be challenging' and leaves quantitative evaluation for future work." 54 }, 55 "effect_sizes_reported": { 56 "applies": true, 57 "answer": false, 58 "justification": "No effect sizes are reported. The paper provides no quantitative measurements of attack success rates, severity, or impact magnitude." 59 }, 60 "sample_size_justified": { 61 "applies": true, 62 "answer": false, 63 "justification": "No justification is given for the number of demonstrations or test sessions conducted. The paper does not state how many times each attack was attempted or how many sessions informed each finding." 
64 }, 65 "variance_reported": { 66 "applies": true, 67 "answer": false, 68 "justification": "No variance or multi-run results are reported. Each demonstration appears to be a single qualitative example. Section 5.4 acknowledges 'exact reproducibility is difficult to guarantee.'" 69 } 70 }, 71 "evaluation_design": { 72 "baselines_included": { 73 "applies": true, 74 "answer": false, 75 "justification": "No baselines are included. The paper does not compare indirect prompt injection against other attack methods, nor does it compare attack success across different defense configurations systematically." 76 }, 77 "baselines_contemporary": { 78 "applies": true, 79 "answer": false, 80 "justification": "No baselines of any kind are used, so contemporaneity is not applicable in practice. The paper references prior work on direct prompt injection (Perez & Ribeiro 2022) but does not compare against it experimentally." 81 }, 82 "ablation_study": { 83 "applies": false, 84 "answer": false, 85 "justification": "This is a taxonomy and demonstration paper, not a system with modular components that could be ablated. The attacks are independent proof-of-concept examples, not a unified system." 86 }, 87 "multiple_metrics": { 88 "applies": true, 89 "answer": false, 90 "justification": "No quantitative metrics are used at all. The evaluation is entirely qualitative, consisting of screenshots and conversation transcripts showing attack outcomes." 91 }, 92 "human_evaluation": { 93 "applies": true, 94 "answer": false, 95 "justification": "No formal human evaluation of attack believability or effectiveness is conducted. Section 5.2 identifies evaluating 'deception and believability' via user studies as future work." 96 }, 97 "held_out_test_set": { 98 "applies": false, 99 "answer": false, 100 "justification": "The paper has no dataset or systematic evaluation framework. 
Attacks are demonstrated via individual qualitative examples on specific systems, not evaluated against a structured test set." 101 }, 102 "per_category_breakdown": { 103 "applies": true, 104 "answer": true, 105 "justification": "The paper organizes demonstrations by threat category (information gathering, fraud, malware, intrusion, manipulated content, availability) with specific examples for each. Section 4.2 provides per-category demonstrations." 106 }, 107 "failure_cases_discussed": { 108 "applies": true, 109 "answer": true, 110 "justification": "Section 4.2.4 notes code completion attacks were 'very sensitive to context' and 'efficacy of our injections was significantly reduced' in larger applications. Section 5.2 acknowledges models may generate 'conspicuously false answers that are widely unbelievable.'" 111 }, 112 "negative_results_reported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The paper reports that code completion injections had reduced efficacy in larger codebases (Section 4.2.4), that availability attacks did not always work consistently (Section 4.2.6), and that attack prompts sometimes produced unconvincing outputs (Section 5.2)." 116 } 117 }, 118 "claims_and_evidence": { 119 "abstract_claims_supported": { 120 "applies": true, 121 "answer": true, 122 "justification": "The abstract claims are qualitative (new attack vectors, taxonomy, practical feasibility demonstrations, lack of mitigations) and are all supported by the paper's content: taxonomy in Section 3/Figure 2, demonstrations in Section 4, mitigation discussion in Section 5.6." 123 }, 124 "causal_claims_justified": { 125 "applies": true, 126 "answer": true, 127 "justification": "The paper's main causal claims are possibility/existence claims ('indirect prompt injection CAN lead to X'). These are adequately supported by controlled demonstrations where injecting specific prompts causes specific model behaviors. 
The study design (controlled synthetic apps + real Bing Chat) is appropriate for establishing that these attacks are possible." 128 }, 129 "generalization_bounded": { 130 "applies": true, 131 "answer": false, 132 "justification": "The title claims 'Compromising Real-World LLM-Integrated Applications' broadly, but testing is limited to Bing Chat, synthetic GPT-4 apps, and GitHub Copilot. The paper could not test on Microsoft 365 Copilot or ChatGPT plugins (acknowledged in Section 5.2). The taxonomy covers threats far beyond what was demonstrated." 133 }, 134 "alternative_explanations_discussed": { 135 "applies": true, 136 "answer": false, 137 "justification": "The paper does not discuss alternative explanations for why attacks succeed (e.g., whether it is the instruction-tuning, RLHF, or architectural choices that make models susceptible). No robustness checks or confound analysis is provided." 138 }, 139 "proxy_outcome_distinction": { 140 "applies": true, 141 "answer": false, 142 "justification": "The paper demonstrates individual attack instances and claims 'practical viability' and 'feasibility' without distinguishing between showing an attack is possible in a controlled setting and showing it would work reliably at scale in the wild. Section 5.2 acknowledges this gap but the framing does not." 143 } 144 }, 145 "setup_transparency": { 146 "model_versions_specified": { 147 "applies": true, 148 "answer": false, 149 "justification": "The paper mentions 'text-davinci-003' and 'gpt-4' but without specific version dates or API snapshots. For Bing Chat, it says it 'runs on the GPT-4 model' without specifying which version. For Copilot, it says 'OpenAI Codex' without version. Per schema requirements, 'GPT-4' without a snapshot date is insufficient." 
150 }, 151 "prompts_provided": { 152 "applies": true, 153 "answer": true, 154 "justification": "All attack prompts are provided in full in the Appendix (Prompts 1-20), including the initial system prompts for synthetic applications (Prompts 1-2) and all injection prompts for each attack scenario." 155 }, 156 "hyperparameters_reported": { 157 "applies": true, 158 "answer": true, 159 "justification": "Section 4.1.1 states 'All attacks are run at a sampling temperature of 0 for reproducibility.' Bing Chat's three modes (creative, balanced, precise) are also noted, though the exact mode used per experiment is not always specified." 160 }, 161 "scaffolding_described": { 162 "applies": true, 163 "answer": true, 164 "justification": "The synthetic application setup is described in detail in Section 4.1.1, including: the LangChain library for text-davinci-003, ReAct prompting, OpenAI chat format for GPT-4, and the full list of integrated tools (Search, View, Retrieve URL, Read/Send Email, Read Address Book, Memory)." 165 }, 166 "data_preprocessing_documented": { 167 "applies": true, 168 "answer": false, 169 "justification": "The paper does not document how prepared content was constructed for the synthetic applications, how Bing Chat sessions were selected for inclusion, or any systematic process for generating the demonstration scenarios." 170 } 171 }, 172 "limitations_and_scope": { 173 "limitations_section_present": { 174 "applies": true, 175 "answer": true, 176 "justification": "Section 5.2 'Limitations' provides substantive discussion of experimental setup limitations, evaluation challenges, and deception/believability concerns across three subsections." 
177 }, 178 "threats_to_validity_specific": { 179 "applies": true, 180 "answer": true, 181 "justification": "Section 5.2 discusses specific threats: testing on synthetic applications and local HTML rather than live injections, inability to access other applications (Microsoft 365 Copilot, ChatGPT plugins), challenges in quantifying interactive attack success rates, and models generating 'conspicuously false answers.'" 182 }, 183 "scope_boundaries_stated": { 184 "applies": true, 185 "answer": true, 186 "justification": "Section 5.2 explicitly states they could not test on other applications, that demonstrations 'are intrinsically not exhaustive' (Section 4), and that quantitative evaluation is left for future work. Section 5.4 acknowledges 'exact reproducibility is difficult to guarantee with such a black-box system.'" 187 } 188 }, 189 "data_integrity": { 190 "raw_data_available": { 191 "applies": true, 192 "answer": true, 193 "justification": "Attack prompts are in the Appendix, screenshots of Bing Chat outputs are provided (Figures 13-28), conversation transcripts are included (Outputs 1-4), and synthetic application code is on GitHub." 194 }, 195 "data_collection_described": { 196 "applies": true, 197 "answer": false, 198 "justification": "The paper does not describe how many sessions were conducted, how demonstrations were selected for inclusion, or what the overall process was for testing each attack. Only individual successful examples are shown." 199 }, 200 "recruitment_methods_described": { 201 "applies": false, 202 "answer": false, 203 "justification": "No human participants were involved. The paper tests attacks on LLM systems (synthetic apps, Bing Chat, Copilot) with no participant recruitment." 204 }, 205 "data_pipeline_documented": { 206 "applies": true, 207 "answer": false, 208 "justification": "No systematic data pipeline is documented. 
The paper presents individual attack demonstrations without describing the overall experimental workflow from attack design to execution to selection of examples for inclusion." 209 } 210 }, 211 "conflicts_of_interest": { 212 "funding_disclosed": { 213 "applies": true, 214 "answer": true, 215 "justification": "The Acknowledgements section discloses funding: 'This work was partially funded by ELSA – European Lighthouse on Secure and Safe AI funded by the European Union under grant agreement No. 101070617.'" 216 }, 217 "affiliations_disclosed": { 218 "applies": true, 219 "answer": true, 220 "justification": "All author affiliations are listed: Saarland University, CISPA Helmholtz Center for Information Security, and sequire technology GmbH. The affiliation with a security company (sequire) is disclosed." 221 }, 222 "funder_independent_of_outcome": { 223 "applies": true, 224 "answer": true, 225 "justification": "The EU research program (ELSA) is a general research funding body with no financial stake in whether attacks on specific commercial products succeed. The funder is independent of the outcome." 226 }, 227 "financial_interests_declared": { 228 "applies": true, 229 "answer": false, 230 "justification": "No competing interests statement is provided. Authors Greshake and Endres are affiliated with sequire technology GmbH (a security company) which could have commercial interest in demonstrating vulnerabilities, but no financial interest disclosure is made." 231 } 232 }, 233 "contamination": { 234 "training_cutoff_stated": { 235 "applies": false, 236 "answer": false, 237 "justification": "This paper tests security attacks/defenses against LLM-integrated applications, not model capabilities on benchmarks. Contamination criteria do not apply." 238 }, 239 "train_test_overlap_discussed": { 240 "applies": false, 241 "answer": false, 242 "justification": "No benchmark evaluation of model knowledge is performed. 
The paper tests attack vectors, not model performance on standard tasks." 243 }, 244 "benchmark_contamination_addressed": { 245 "applies": false, 246 "answer": false, 247 "justification": "No benchmarks are used. The paper demonstrates attack feasibility through qualitative proof-of-concept examples, not benchmark evaluations." 248 } 249 }, 250 "human_studies": { 251 "pre_registered": { 252 "applies": false, 253 "answer": false, 254 "justification": "No human participants are involved. The paper tests attacks on LLM systems without any human subjects study." 255 }, 256 "irb_or_ethics_approval": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human participants. The paper discusses ethical considerations in Section 5.1 regarding responsible disclosure but this is not an IRB/ethics review of human subjects research." 260 }, 261 "demographics_reported": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants in the study." 265 }, 266 "inclusion_exclusion_criteria": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants in the study." 270 }, 271 "randomization_described": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants or experimental conditions requiring randomization." 275 }, 276 "blinding_described": { 277 "applies": false, 278 "answer": false, 279 "justification": "No human participants or experimental conditions requiring blinding." 280 }, 281 "attrition_reported": { 282 "applies": false, 283 "answer": false, 284 "justification": "No human participants in the study." 285 } 286 }, 287 "cost_and_practicality": { 288 "inference_cost_reported": { 289 "applies": true, 290 "answer": false, 291 "justification": "No inference costs, API costs, or latency measurements are reported for any of the attack demonstrations despite using paid OpenAI APIs." 
292 }, 293 "compute_budget_stated": { 294 "applies": true, 295 "answer": false, 296 "justification": "No computational budget is stated. The paper does not report total API spend, number of API calls, or computational resources used for the experiments." 297 } 298 } 299 }, 300 "claims": [ 301 { 302 "claim": "Indirect prompt injection enables adversaries to remotely exploit LLM-integrated applications by injecting prompts into data retrieved at inference time.", 303 "evidence": "Demonstrated through multiple qualitative examples: synthetic GPT-4 applications (Section 4.1.1) and Bing Chat (Section 4.1.2) successfully process and follow injected prompts placed in retrieved content (Prompts 3-20, Outputs 1-4).", 304 "supported": "moderate" 305 }, 306 { 307 "claim": "Retrieved prompts can act as arbitrary code execution, manipulating the application's functionality and controlling API calls.", 308 "evidence": "Section 4.2.4 demonstrates remote control where injected prompts cause the model to fetch new instructions from an attacker's server (Prompt 8, Output 2). Section 4.2.3 shows prompt worms that read contacts and send emails (Prompt 7, Output 1). 
Multiple demonstrations show control over search, memory, and email APIs.", 309 "supported": "moderate" 310 }, 311 { 312 "claim": "Prompts that are filtered when entered directly by users are not filtered when ingested indirectly via retrieved content.", 313 "evidence": "Section 4.2 states 'prompts that are typically filtered out via the chat interface are not filtered out when injected indirectly' and provides the example that Bing Chat stops sessions for direct jailbreak prompts but succumbs when they are ingested indirectly (footnote 5).", 314 "supported": "weak" 315 }, 316 { 317 "claim": "LLMs can autonomously implement attack goals given only high-level descriptions without pre-programmed attack details.", 318 "evidence": "Observation #1 and Section 4.2.1: the information gathering prompt 'only instructed the model to persuade the user without raising suspicion' with no mention of specific techniques, yet Bing Chat tailored its persuasion based on conversation context. The fraud demonstration (Section 4.2.2) showed the model generating scam persuasion techniques without explicit instruction.", 319 "supported": "weak" 320 }, 321 { 322 "claim": "Indirect prompt injections can be hidden via multi-stage payloads and Base64 encoding to circumvent detection.", 323 "evidence": "Section 4.3.1 demonstrates a multi-stage exploit where a small injection triggers retrieval of a larger payload (Prompt 19, Output 4). Section 4.3.2 shows a Base64-encoded injection successfully decoded and executed by Bing Chat (Prompt 20, Figure 27).", 324 "supported": "moderate" 325 }, 326 { 327 "claim": "Current mitigations including RLHF and input filtering are insufficient against indirect prompt injection.", 328 "evidence": "Section 5.6 discusses limitations of RLHF (citing Wolf et al. 2023 on fundamental impossibility), filtering (attacks succeed on Bing Chat which employs filtering), and other defenses. The paper notes a 'Whack-A-Mole' pattern. 
However, no systematic evaluation of defense effectiveness is provided.", 329 "supported": "weak" 330 } 331 ], 332 "red_flags": [ 333 { 334 "flag": "No quantitative evaluation", 335 "detail": "All attack demonstrations are qualitative single examples with screenshots and conversation transcripts. No success rates, reliability measurements, or systematic evaluation across multiple trials is provided. The paper explicitly defers quantitative evaluation to future work (Section 5.2)." 336 }, 337 { 338 "flag": "Selection bias in demonstrations", 339 "detail": "Only successful attack examples are shown (apart from acknowledged limitations with code completion). The paper does not report how many attempts were made per attack type or what fraction succeeded. This makes it impossible to assess the practical reliability of the attacks." 340 }, 341 { 342 "flag": "Broad claims from narrow evidence", 343 "detail": "The paper's taxonomy (Figure 2) covers a vast threat landscape, but only a fraction is demonstrated with actual experiments. Many claimed threats (e.g., surveillance, automated defamation, DDoS) are discussed hypothetically without demonstration." 344 }, 345 { 346 "flag": "Undeclared potential commercial interest", 347 "detail": "Two authors are affiliated with sequire technology GmbH, a security consulting company that could benefit commercially from demonstrating that LLM-integrated applications are vulnerable. No competing interests statement is provided." 348 }, 349 { 350 "flag": "Controlled setting limitations", 351 "detail": "Synthetic application experiments use prepared content where 'all interfaces deliver prepared content, and unrelated queries are ignored' (Section 4.1.1). The Bing Chat sidebar experiments use local HTML files rather than actual web-scale injections. The gap between controlled demonstrations and real-world feasibility is acknowledged but not measured." 
352 } 353 ], 354 "cited_papers": [ 355 { 356 "title": "Ignore Previous Prompt: Attack Techniques For Language Models", 357 "authors": ["Fábio Perez", "Ian Ribeiro"], 358 "year": 2022, 359 "relevance": "Foundational work on direct prompt injection attacks against LLMs, which this paper extends to indirect injection scenarios." 360 }, 361 { 362 "title": "Exploiting Programmatic Behavior of LLMs: Dual-Use Through Standard Security Attacks", 363 "authors": ["Daniel Kang", "Xuechen Li", "Ion Stoica", "Carlos Guestrin", "Matei Zaharia", "Tatsunori Hashimoto"], 364 "year": 2023, 365 "relevance": "Demonstrates program obfuscation, payload splitting, and virtualization to bypass LLM API filters — techniques that inform the hidden injection methods in this paper." 366 }, 367 { 368 "title": "ReAct: Synergizing Reasoning and Acting in Language Models", 369 "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu", "Izhak Shafran", "Karthik R Narasimhan", "Yuan Cao"], 370 "year": 2023, 371 "relevance": "Core agentic framework used for tool-augmented LLMs; the paper's synthetic applications use ReAct prompting for text-davinci-003." 372 }, 373 { 374 "title": "Toolformer: Language Models Can Teach Themselves to Use Tools", 375 "authors": ["Timo Schick", "Jane Dwivedi-Yu", "Roberto Dessì", "Roberta Raileanu", "Maria Lomeli", "Luke Zettlemoyer", "Nicola Cancedda", "Thomas Scialom"], 376 "year": 2023, 377 "relevance": "Demonstrates training LLMs to output API calls, a capability central to the LLM-integrated applications targeted in this paper." 378 }, 379 { 380 "title": "GPT-4 Technical Report", 381 "authors": ["OpenAI"], 382 "year": 2023, 383 "relevance": "The model powering Bing Chat and the synthetic applications tested in this paper; discusses safety evaluation and jailbreak resistance." 
384 }, 385 { 386 "title": "Generative Agents: Interactive Simulacra of Human Behavior", 387 "authors": ["Joon Sung Park", "Joseph C O'Brien", "Carrie J Cai", "Meredith Ringel Morris", "Percy Liang", "Michael S Bernstein"], 388 "year": 2023, 389 "relevance": "Demonstrates autonomous LLM agents with memory and planning that could be targeted by indirect prompt injection; cited as context for multi-agent injection risks." 390 }, 391 { 392 "title": "Training language models to follow instructions with human feedback", 393 "authors": ["Long Ouyang", "Jeffrey Wu", "Xu Jiang", "Diogo Almeida", "Carroll Wainwright"], 394 "year": 2022, 395 "relevance": "RLHF training methodology discussed as a potential but insufficient defense against prompt injection attacks." 396 }, 397 { 398 "title": "Fundamental Limitations of Alignment in Large Language Models", 399 "authors": ["Yotam Wolf", "Noam Wies", "Yoav Levine", "Amnon Shashua"], 400 "year": 2023, 401 "relevance": "Theoretical work cited to argue that alignment/RLHF may be fundamentally unable to prevent all adversarial prompt behaviors." 402 }, 403 { 404 "title": "Large Language Models are Human-Level Prompt Engineers", 405 "authors": ["Yongchao Zhou", "Andrei Ioan Muresanu", "Ziwen Han", "Keiran Paster", "Silviu Pitis", "Harris Chan", "Jimmy Ba"], 406 "year": 2023, 407 "relevance": "Highlights LLMs as analogous to black-box computers executing natural language programs, a framing central to the indirect prompt injection threat model." 408 }, 409 { 410 "title": "Discovering Language Model Behaviors with Model-Written Evaluations", 411 "authors": ["Ethan Perez", "Sam Ringer", "Kamile Lukošiūtė", "Karina Nguyen", "Edwin Chen"], 412 "year": 2022, 413 "relevance": "Evaluates sycophancy in RLHF models and inverse scaling effects, cited in the biased output manipulation attack (Section 4.2.5)." 
414 }, 415 { 416 "title": "Spinning Language Models: Risks of Propaganda-As-A-Service and Countermeasures", 417 "authors": ["Eugene Bagdasaryan", "Vitaly Shmatikov"], 418 "year": 2022, 419 "relevance": "Prior work on backdoor attacks against language models for propaganda generation, conceptually related to indirect prompt injection." 420 }, 421 { 422 "title": "Visual Instruction Tuning", 423 "authors": ["Haotian Liu", "Chunyuan Li", "Qingyang Wu", "Yong Jae Lee"], 424 "year": 2023, 425 "relevance": "LLaVA model used to demonstrate visual prompt injection (Figure 28), showing multi-modal injection is possible." 426 }, 427 { 428 "title": "Artificial Influence: An Analysis Of AI-Driven Persuasion", 429 "authors": ["Matthew Burtell", "Thomas Woodside"], 430 "year": 2023, 431 "relevance": "Analyzes AI persuasion capabilities relevant to the fraud and social engineering attacks demonstrated in this paper." 432 } 433 ], 434 "engagement_factors": { 435 "practical_relevance": { 436 "score": 2, 437 "justification": "Security practitioners and LLM application developers can use the taxonomy and attack demonstrations to understand threats, but the paper provides no defensive tool or immediately deployable technique." 438 }, 439 "surprise_contrarian": { 440 "score": 2, 441 "justification": "Challenges the assumption that prompt injection requires direct user access by introducing indirect injection through retrieved data — a new and non-obvious attack surface at the time of publication." 442 }, 443 "fear_safety": { 444 "score": 3, 445 "justification": "Demonstrates alarming attack scenarios including data exfiltration, self-replicating prompt worms, remote model control, and automated social engineering against real-world systems like Bing Chat." 
446 }, 447 "drama_conflict": { 448 "score": 2, 449 "justification": "Directly attacks Microsoft's Bing Chat and GitHub Copilot products, arguing that the 'AI-integration race is not accompanied by adequate guardrails and safety evaluations.'" 450 }, 451 "demo_ability": { 452 "score": 2, 453 "justification": "Synthetic application code is on GitHub and all prompts are in the Appendix, but testing requires OpenAI API access and Bing Chat experiments cannot be easily replicated." 454 }, 455 "brand_recognition": { 456 "score": 2, 457 "justification": "Attacks demonstrated on Bing Chat (GPT-4 powered) and GitHub Copilot — well-known products — but the authors are from academic institutions (CISPA, Saarland University), not major AI labs." 458 } 459 } 460 }