ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (29949B)


      1 {
      2   "paper": {
      3     "title": "Defending Against Indirect Prompt Injection Attacks With Spotlighting",
      4     "authors": [
      5       "Keegan Hines",
      6       "Gary Lopez",
      7       "Matthew Hall",
      8       "Federico Zarfati",
      9       "Yonatan Zunger",
     10       "Emre Kıcıman"
     11     ],
     12     "year": 2024,
     13     "venue": "CAMLIS",
     14     "arxiv_id": "2403.14720",
     15     "doi": "10.48550/arXiv.2403.14720"
     16   },
     17   "scan_version": 3,
     18   "active_modules": ["experimental_rigor", "data_leakage"],
     19   "methodology_tags": ["benchmark-eval"],
     20   "key_findings": "Spotlighting — a family of prompt engineering techniques (delimiting, datamarking, encoding) — reduces indirect prompt injection attack success rate from >50% to <2% on GPT-family models. Datamarking (interleaving special tokens in input text) has no detrimental impact on NLP task performance (SQuAD, IMDB, SuperGLUE), while encoding (base64) achieves the lowest ASR but requires high-capacity models like GPT-4 to maintain task performance. The paper draws an analogy to in-band vs. out-of-band signaling in telecommunications, arguing that the fundamental fix requires a multi-channel architecture for LLMs.",
     21   "checklist": {
     22     "artifacts": {
     23       "code_released": {
     24         "applies": true,
     25         "answer": false,
     26         "justification": "No code repository URL, GitHub link, or archive is provided anywhere in the paper. The techniques are described but no implementation is released."
     27       },
     28       "data_released": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "The synthetic corpus of 1000 attack documents is not released. Standard NLP benchmarks (SQuAD, IMDB, SuperGLUE) are public, but the custom attack dataset central to the evaluation is not available."
     32       },
     33       "environment_specified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No environment specifications, requirements files, or dependency information is provided. The paper mentions model names but no software environment details."
     37       },
     38       "reproduction_instructions": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No step-by-step reproduction instructions are included. The system prompts are provided as examples, but there are no scripts, commands, or procedures to replicate the full experimental pipeline."
     42       }
     43     },
     44     "statistical_methodology": {
     45       "confidence_intervals_or_error_bars": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "All results are reported as point estimates (e.g., 'ASR is reduced to 3.10%', 'ASR is reduced to 0.0%'). No confidence intervals, error bars, or uncertainty ranges appear in figures or text."
     49       },
     50       "significance_tests": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "Comparative claims like 'datamarking leads a strong reduction in ASR' and 'encoding approach outperforms datamarking' are made without any statistical significance tests — only raw percentage comparisons."
     54       },
     55       "effect_sizes_reported": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Effect sizes are reported with baseline context throughout: 'ASR is reduced from approximately 50% to below 3%' (Section 5.1), 'ASR is reduced to 0.0%' from ~40% (Figure 4), and NLP task performance with/without transformation (Figure 7). The reader can assess magnitude."
     59       },
     60       "sample_size_justified": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "The corpus size of 1000 documents is stated (Section 4.2) but never justified. No power analysis or rationale for why 1000 is sufficient."
     64       },
     65       "variance_reported": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No variance, standard deviation, or multiple-run results are reported. All experiments appear to be single-run. The paper mentions examining 'the effect of temperature on XPIA susceptibility' but provides no data on run-to-run variance."
     69       }
     70     },
     71     "evaluation_design": {
     72       "baselines_included": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Multiple baselines are included: no defense (Figure 1), instruction-only defense (Figure 2), delimiting (Figure 3), and progressive spotlighting variants. The progression from no defense through each technique provides a clear comparison structure."
     76       },
     77       "baselines_contemporary": {
     78         "applies": true,
     79         "answer": false,
     80         "justification": "The baselines are all ablation-like variants of the authors' own approach (no defense, instructions-only, delimiting) or trivial baselines. No comparison against other published XPIA defense methods from the literature, despite citing prior work [2] that explored defense approaches."
     81       },
     82       "ablation_study": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "The three spotlighting variants (delimiting → datamarking → encoding) function as a progressive ablation, showing the incremental benefit of each transformation. The instruction-only condition isolates the effect of prompt instructions from the spotlighting transformations."
     86       },
     87       "multiple_metrics": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Two distinct metric families are used: Attack Success Rate (ASR) for defense effectiveness, and NLP task performance across four benchmarks (SQuAD Q&A accuracy, IMDB Sentiment, SuperGLUE WIC, SuperGLUE BoolQ) for task impact."
     91       },
     92       "human_evaluation": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "No human evaluation is performed. Attack success is determined by automated keyword detection. NLP task performance is measured by automated benchmark metrics only."
     96       },
     97       "held_out_test_set": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "For the primary ASR evaluation on the 1000-document synthetic corpus, there is no mention of held-out test splits. The defense techniques are hand-designed rather than tuned, but no explicit separation of development and evaluation data is described."
    101       },
    102       "per_category_breakdown": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Results are broken down by model (text-davinci-003, GPT-3.5-Turbo, GPT-4) and by task (summarization, Q&A) across Figures 1-6. NLP task impact is shown per-benchmark (SQuAD, IMDB, WIC, BoolQ) in Figures 7-8."
    106       },
    107       "failure_cases_discussed": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The appendix (Section 8.1) discusses gray-area cases where the model notices but doesn't fall for the attack. Section 5.2 shows encoding degrades GPT-3.5-Turbo performance significantly. Section 5.4 discusses adversary scenarios where defenses could be subverted."
    111       },
    112       "negative_results_reported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Several negative results are reported: delimiting provides only modest improvement and is not recommended (Section 5.1); encoding severely degrades GPT-3.5-Turbo task performance (Figure 8, bottom); few-shot examples risk overfitting and label leakage (Section 8.2)."
    116       }
    117     },
    118     "claims_and_evidence": {
    119       "abstract_claims_supported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "The abstract claims 'spotlighting reduces the attack success rate from greater than 50% to below 2%' — this is supported by Figures 4-6 showing ASR reductions to 0-3% across models and tasks. The claim of 'minimal impact on task efficacy' is supported by Figure 7 for datamarking, though encoding impacts GPT-3.5 (acknowledged in Section 5.2)."
    123       },
    124       "causal_claims_justified": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "The causal claim 'spotlighting reduces ASR' is supported by controlled single-variable manipulation: the same model, same dataset, same task, with only the spotlighting transformation varying. This is adequate causal design for the claim."
    128       },
    129       "generalization_bounded": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The title 'Defending Against Indirect Prompt Injection Attacks With Spotlighting' is generic, but experiments use only GPT-family models and only synthetic keyword payload attacks. The abstract says 'Using GPT-family models' but broader claims about XPIA defense are not bounded to this narrow attack type."
    133       },
    134       "alternative_explanations_discussed": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The paper does not discuss alternative explanations for why spotlighting works beyond the telecommunications analogy (Section 6). It does not consider whether results are specific to keyword attacks, whether the attack corpus has distributional properties that favor the defense, or other confounds."
    138       },
    139       "proxy_outcome_distinction": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "The appendix (Section 8.1) explicitly discusses the gap between the keyword-detection proxy (ASR) and actual attack success, distinguishing strict ASR from 'Affected Success Rate' (AffSR) and providing examples of gray-area cases. The proxy nature of the measurement is acknowledged."
    143       }
    144     },
    145     "setup_transparency": {
    146       "model_versions_specified": {
    147         "applies": true,
    148         "answer": true,
    149         "justification": "Section 4.1 specifies: 'text-davinci-003, GPT-3.5Turbo (June 2023 version) and GPT-4 (June 2023 version).' Versions are identified by snapshot date."
    150       },
    151       "prompts_provided": {
    152         "applies": true,
    153         "answer": true,
    154         "justification": "Full system prompt text is provided for delimiting (Section 3.2), datamarking (Section 3.3), encoding (Section 3.4), instruction-only defense (Section 4.2), and few-shot (Section 8.2). These are actual prompt templates with clear placeholder notation."
    155       },
    156       "hyperparameters_reported": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 4.1 states: 'All experiments are conducted with temperature set to 1.0. We examined the effect of temperature on XPIA susceptibility and found no notable impact.' Temperature is the critical sampling parameter for these experiments."
    160       },
    161       "scaffolding_described": {
    162         "applies": false,
    163         "answer": false,
    164         "justification": "No agentic scaffolding is used. The approach is direct prompt engineering applied to single LLM calls."
    165       },
    166       "data_preprocessing_documented": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The paper states 'we generated a synthetic dataset of 1000 documents that contain prompt injection attacks' (Section 4.2) with 'variations on a simple keyword payload attack' but provides no details on the generation procedure, templates used, document content, or variation methodology."
    170       }
    171     },
    172     "limitations_and_scope": {
    173       "limitations_section_present": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "There is no dedicated 'Limitations' or 'Threats to Validity' section. Limitation-like content is scattered across Section 5.3 (recommendations), Section 5.4 (adversary considerations), Section 6 (discussion), and the appendix, but no consolidated section exists."
    177       },
    178       "threats_to_validity_specific": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Specific threats are discussed throughout: adversaries can subvert delimiting if they know the system prompt (Section 5.4); encoding severely degrades weaker models (Section 5.2); few-shot examples risk overfitting to known attack patterns (Section 8.2); attacks without whitespace could bypass datamarking (Section 5.4)."
    182       },
    183       "scope_boundaries_stated": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "The paper does not explicitly state what was NOT tested. It doesn't bound claims to keyword attacks only, doesn't acknowledge the absence of testing against sophisticated/adaptive attacks, and doesn't specify which attack types or real-world scenarios are excluded from the evaluation."
    187       }
    188     },
    189     "data_integrity": {
    190       "raw_data_available": {
    191         "applies": true,
    192         "answer": false,
    193         "justification": "Neither the 1000-document attack corpus nor the raw model responses are available for independent verification."
    194       },
    195       "data_collection_described": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "The attack corpus generation is described only as 'variations on a simple keyword payload attack' with the keyword 'canary' (Section 4.2). No details on how the 1000 documents were generated, what variation strategies were used, or what the non-attack document content looks like."
    199       },
    200       "recruitment_methods_described": {
    201         "applies": false,
    202         "answer": false,
    203         "justification": "No human participants. Data is synthetic and from standard benchmarks."
    204       },
    205       "data_pipeline_documented": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "The pipeline from document generation through spotlighting transformation to ASR measurement is described conceptually but not in reproducible detail. The attack document generation process, exact transformation implementations, and response classification logic are not documented."
    209       }
    210     },
    211     "conflicts_of_interest": {
    212       "funding_disclosed": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "No funding acknowledgment section. All authors are from Microsoft, which implies corporate funding, but no explicit funding disclosure is provided."
    216       },
    217       "affiliations_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "All authors are listed with 'Microsoft' affiliation on the first page. The affiliation is clear and prominent."
    221       },
    222       "funder_independent_of_outcome": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "Microsoft has a major partnership with OpenAI and sells Azure OpenAI services. Effective prompt injection defenses increase the commercial viability of Microsoft's LLM products. The employer has a direct financial interest in showing these defenses work."
    226       },
    227       "financial_interests_declared": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No competing interests or financial disclosure statement is included in the paper."
    231       }
    232     },
    233     "contamination": {
    234       "training_cutoff_stated": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "This paper tests defense techniques against prompt injection, not model knowledge or capability on benchmarks. The benchmark evaluations (SQuAD, IMDB, SuperGLUE) serve only to verify the defense doesn't degrade performance; the comparison is within-model (with vs. without spotlighting), so any training contamination affects both conditions equally."
    238       },
    239       "train_test_overlap_discussed": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "The paper evaluates defense effectiveness, not model capability. Any benchmark contamination would affect both the with-spotlighting and without-spotlighting conditions equally, making it irrelevant to the claims."
    243       },
    244       "benchmark_contamination_addressed": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "The paper tests defenses rather than model knowledge. The primary evaluation uses a custom synthetic attack corpus, and NLP benchmarks are used only for differential comparison."
    248       }
    249     },
    250     "human_studies": {
    251       "pre_registered": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants in this study. All experiments are automated with synthetic data and benchmark datasets."
    255       },
    256       "irb_or_ethics_approval": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants."
    260       },
    261       "demographics_reported": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants."
    265       },
    266       "inclusion_exclusion_criteria": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants."
    270       },
    271       "randomization_described": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants."
    275       },
    276       "blinding_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants."
    280       },
    281       "attrition_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants."
    285       }
    286     },
    287     "cost_and_practicality": {
    288       "inference_cost_reported": {
    289         "applies": true,
    290         "answer": false,
    291         "justification": "No inference costs, API costs, token consumption, or latency overhead is reported for any of the spotlighting techniques despite the encoding approach requiring the model to decode base64, which may incur additional compute."
    292       },
    293       "compute_budget_stated": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No total computational budget, API spend, or hardware details are provided despite running experiments across 3 models, multiple tasks, and 1000-document corpora."
    297       }
    298     },
    299     "experimental_rigor": {
    300       "seed_sensitivity_reported": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "No mention of multiple random seeds or seed sensitivity analysis. All results appear to be from single runs despite using temperature=1.0, which introduces stochastic variation."
    304       },
    305       "number_of_runs_stated": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "The number of experimental runs is never stated. Results are presented as single point estimates with no indication of how many times each experiment was conducted."
    309       },
    310       "hyperparameter_search_budget": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "No hyperparameter search budget is reported. The choice of marking tokens, prompt phrasing, and encoding method appear to be hand-selected without documenting what alternatives were tried or how selections were made."
    314       },
    315       "best_config_selection_justified": {
    316         "applies": true,
    317         "answer": true,
    318         "justification": "Section 5.3 provides justified recommendations: encoding is recommended for high-capacity models based on lowest ASR (Figure 6), datamarking for general use based on strong ASR reduction without task impact (Figures 4, 7). The progression of results across configurations justifies the selection."
    319       },
    320       "multiple_comparison_correction": {
    321         "applies": false,
    322         "answer": false,
    323         "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable."
    324       },
    325       "self_comparison_bias_addressed": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "The authors evaluate their own proposed techniques without acknowledging the potential bias of self-evaluation. No independent evaluation or acknowledgment of author-evaluation bias is provided."
    329       },
    330       "compute_budget_vs_performance": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "No discussion of compute differences between spotlighting approaches. Encoding requires the model to decode base64, which uses more tokens and compute, but this cost is never quantified or compared against the defense benefit."
    334       },
    335       "benchmark_construct_validity": {
    336         "applies": true,
    337         "answer": true,
    338         "justification": "Section 4.2 and Appendix 8.1 discuss construct validity of the ASR metric, distinguishing strict ASR from Affected Success Rate (AffSR), providing concrete examples of gray-area cases, and explaining why keyword detection may not capture all attack outcomes."
    339       },
    340       "scaffold_confound_addressed": {
    341         "applies": false,
    342         "answer": false,
    343         "justification": "No agentic scaffolding is used. Experiments are direct prompt engineering on single LLM calls."
    344       }
    345     },
    346     "data_leakage": {
    347       "temporal_leakage_addressed": {
    348         "applies": true,
    349         "answer": false,
    350         "justification": "No discussion of temporal leakage. The NLP benchmarks (SQuAD 2016, IMDB 2011, SuperGLUE 2019) predate the models' training, but this is not discussed."
    351       },
    352       "feature_leakage_addressed": {
    353         "applies": true,
    354         "answer": false,
    355         "justification": "No discussion of whether the evaluation setup leaks information. The keyword 'canary' in attack payloads could be a distinctive signal that doesn't generalize to real attacks."
    356       },
    357       "non_independence_addressed": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "No discussion of independence. The 1000 attack documents are 'variations on a simple keyword payload attack' — the degree of similarity between documents and whether results are inflated by non-independence is not addressed."
    361       },
    362       "leakage_detection_method": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "No leakage detection or prevention methods are applied. No canary strings, membership inference, or decontamination pipelines are used."
    366       }
    367     }
    368   },
    369   "claims": [
    370     {
    371       "claim": "Spotlighting reduces attack success rate from greater than 50% to below 2% with minimal impact on task efficacy.",
    372       "evidence": "Figures 4-6 show ASR reductions across models and tasks: datamarking reduces GPT-3.5-Turbo ASR to 3.10% (summarization) and 8.0% (Q&A); encoding reduces ASR to 0.0% (summarization) and 1.8% (Q&A). Figure 7 shows no detrimental NLP task impact for datamarking.",
    373       "supported": "moderate"
    374     },
    375     {
    376       "claim": "Datamarking has no detrimental impact on underlying NLP tasks.",
    377       "evidence": "Figure 7 shows GPT-3.5-Turbo performance across SQuAD Q&A, IMDB Sentiment, SuperGLUE WIC, and SuperGLUE BoolQ is essentially unchanged with datamarking applied.",
    378       "supported": "moderate"
    379     },
    380     {
    381       "claim": "Encoding is the most effective spotlighting method but requires high-capacity models.",
    382       "evidence": "Figure 6 shows encoding achieves 0.0% ASR in summarization. Figure 8 shows GPT-4 handles encoding well across benchmarks, while GPT-3.5-Turbo suffers significant task performance degradation with encoding.",
    383       "supported": "strong"
    384     },
    385     {
    386       "claim": "Delimiting has only modest effects on reducing ASR and is not recommended.",
    387       "evidence": "Figure 3 shows delimiting reduces GPT-3.5-Turbo ASR by roughly half (from ~60% to ~30%), which is significantly less effective than datamarking or encoding.",
    388       "supported": "strong"
    389     },
    390     {
    391       "claim": "Few-shot learning can reduce ASR to below 5% but should be used with caution due to overfitting risks.",
    392       "evidence": "Figure 9 and Appendix 8.2 show ASR reduced below 5% with few-shot examples, but the authors warn about correlation between few-shot examples and test data, calling it 'a contemporary version of the classic overfitting problem.'",
    393       "supported": "moderate"
    394     }
    395   ],
    396   "red_flags": [
    397     {
    398       "flag": "Company evaluating its own product ecosystem",
    399       "detail": "All authors are from Microsoft, which has a major commercial partnership with OpenAI. Effective prompt injection defenses directly increase the commercial viability of Microsoft's Azure OpenAI offerings. No conflict of interest statement is included."
    400     },
    401     {
    402       "flag": "Only synthetic keyword attacks tested",
    403       "detail": "The entire ASR evaluation uses a single attack type: keyword payload attacks where the model is urged to output 'canary'. No sophisticated, adaptive, or semantically diverse attacks are tested. Real-world XPIA would involve far more varied and subtle attack payloads, making the reported near-zero ASR potentially misleading."
    404     },
    405     {
    406       "flag": "No error bars or variance on any result",
    407       "detail": "All experiments use temperature=1.0 (introducing stochastic variation) but report only single-point estimates. With 1000 attack documents and stochastic decoding, the sampling uncertainty could be substantial relative to the reported values, especially for small ASR values near 0%."
    408     },
    409     {
    410       "flag": "Attack corpus generation not documented",
    411       "detail": "The 1000-document synthetic attack corpus is described only as 'variations on a simple keyword payload attack' with no details on the generation procedure, diversity of variations, or document content. This makes it impossible to assess whether the attack corpus provides a meaningful evaluation."
    412     }
    413   ],
    414   "cited_papers": [
    415     {
    416       "title": "Benchmarking and Defending Against Indirect Prompt Injection Attacks on Large Language Models",
    417       "authors": ["J. Yi", "Y. Xie", "B. Zhu", "K. Hines", "E. Kiciman", "G. Sun", "X. Xie", "F. Wu"],
    418       "year": 2023,
    419       "arxiv_id": "2312.14197",
    420       "relevance": "Direct predecessor to this work — establishes the XPIA benchmark and explores early defense techniques including delimiting."
    421     },
    422     {
    423       "title": "More than you've asked for: A Comprehensive Analysis of Novel Prompt Injection Threats to Application-Integrated Large Language Models",
    424       "authors": ["K. Greshake", "S. Abdelnabi", "S. Mishra", "C. Endres", "T. Holz", "M. Fritz"],
    425       "year": 2023,
    426       "arxiv_id": "2302.12173",
    427       "relevance": "Foundational analysis of indirect prompt injection threats in LLM applications, establishing the XPIA threat model."
    428     },
    429     {
    430       "title": "GPT-4 Technical Report",
    431       "authors": ["OpenAI"],
    432       "year": 2023,
    433       "arxiv_id": "2303.08774",
    434       "relevance": "Technical report for GPT-4, one of the primary models evaluated in this paper's experiments."
    435     },
    436     {
    437       "title": "Llama 2: Open foundation and fine-tuned chat models",
    438       "authors": ["H. Touvron", "L. Martin", "K. Stone"],
    439       "year": 2023,
    440       "arxiv_id": "2307.09288",
    441       "relevance": "Major open-source LLM relevant to understanding the landscape of models vulnerable to prompt injection."
    442     },
    443     {
    444       "title": "Constitutional AI: Harmlessness from AI feedback",
    445       "authors": ["Y. Bai", "S. Kadavath", "S. Kundu"],
    446       "year": 2022,
    447       "arxiv_id": "2212.08073",
    448       "relevance": "Alignment training approach relevant to understanding model safety and instruction-following behavior exploited by prompt injection."
    449     },
    450     {
    451       "title": "Universal and Transferable Adversarial Attacks on Aligned Language Models",
    452       "authors": ["A. Zou", "Z. Wang", "N. Carlini", "M. Nasr", "J. Z. Kolter", "M. Fredrikson"],
    453       "year": 2023,
    454       "arxiv_id": "2307.15043",
    455       "relevance": "Demonstrates adversarial attacks on aligned LLMs, including token-based attacks referenced in the UPIA vs XPIA distinction."
    456     },
    457     {
    458       "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
    459       "authors": ["J. Wei", "X. Wang", "D. Schuurmans"],
    460       "year": 2023,
    461       "arxiv_id": "2201.11903",
    462       "relevance": "Foundational prompting technique that enables the kind of instruction-following behavior exploited by prompt injection attacks."
    463     },
    464     {
    465       "title": "Language models are few-shot learners",
    466       "authors": ["T. B. Brown", "B. Mann", "N. Ryder"],
    467       "year": 2020,
    468       "arxiv_id": "2005.14165",
    469       "relevance": "GPT-3 paper establishing the foundation for the GPT model family evaluated in this work."
    470     },
    471     {
    472       "title": "Code Llama: Open Foundation Models for Code",
    473       "authors": ["B. Roziere", "J. Gehring", "F. Gloeckle"],
    474       "year": 2023,
    475       "arxiv_id": "2308.12950",
    476       "relevance": "Open code generation model relevant to the broader landscape of LLMs vulnerable to prompt injection in code-processing contexts."
    477     },
    478     {
    479       "title": "Tree of Thoughts: Deliberate Problem Solving with Large Language Models",
    480       "authors": ["S. Yao", "D. Yu", "J. Zhao"],
    481       "year": 2023,
    482       "arxiv_id": "2305.10601",
    483       "relevance": "Advanced reasoning framework for LLMs that increases agent capability and thereby the risk surface for prompt injection attacks."
    484     }
    485   ],
    486   "engagement_factors": {
    487     "practical_relevance": {
    488       "score": 3,
    489       "justification": "Spotlighting techniques (especially datamarking) are immediately usable by any developer building LLM applications — full example prompts are provided."
    490     },
    491     "surprise_contrarian": {
    492       "score": 1,
    493       "justification": "Confirms the intuition that marking input data helps LLMs distinguish instructions from data; the telecom analogy is a novel framing, but the finding is expected."
    494     },
    495     "fear_safety": {
    496       "score": 2,
    497       "justification": "Highlights that baseline XPIA success rates exceed 50% on major models, underscoring a significant AI security vulnerability, though it then proposes a mitigation."
    498     },
    499     "drama_conflict": {
    500       "score": 0,
    501       "justification": "No controversy or conflict; straightforward defense paper with positive results."
    502     },
    503     "demo_ability": {
    504       "score": 1,
    505       "justification": "Prompt templates are provided for manual replication, but no code, tool, or demo is released."
    506     },
    507     "brand_recognition": {
    508       "score": 2,
    509       "justification": "From Microsoft Research, evaluating GPT-3.5/GPT-4 — recognizable lab and models."
    510     }
    511   }
    512 }

Impressum · Datenschutz