ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (28341B)


      1 {
      2   "paper": {
      3     "title": "Can Indirect Prompt Injection Attacks Be Detected and Removed?",
      4     "authors": [
      5       "Yulin Chen",
      6       "Haoran Li",
      7       "Yuan Sui",
      8       "Yufei He",
      9       "Yue Liu",
     10       "Yangqiu Song",
     11       "Bryan Hooi"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv",
     15     "arxiv_id": "2502.16580"
     16   },
     17   "checklist": {
     18     "artifacts": {
     19       "code_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The paper states in the introduction footnote: 'Code is publicly available at https://github.com/LukeChen-go/indirect-pia-detection.' A working URL is provided."
     23       },
     24       "data_released": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The benchmark and training data are constructed from publicly available datasets (SQuAD validation, TriviaQA validation, Stanford-Alpaca). The injected instructions were crafted using GPT-4o. The code repository presumably includes the benchmark construction, and the underlying QA datasets are public."
     28       },
     29       "environment_specified": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "Appendix A.1 states: 'We conduct our experiments using PyTorch 2.1.0. The experiments are performed on a single NVIDIA H100-96G GPU.' Training hyperparameters (learning rate 1e-5, epochs 1, max length 1280 with DeepSpeed) and generation parameters (do_sample false, max_new_tokens 256, max_length 8192) are specified."
     33       },
     34       "reproduction_instructions": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "While code is released and hyperparameters are listed, the paper does not provide step-by-step reproduction instructions or a README with commands to run. The information is spread across the paper and appendix but no explicit reproduction guide is mentioned."
     38       }
     39     },
     40     "statistical_methodology": {
     41       "confidence_intervals_or_error_bars": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "All results in Tables 1-5 report only point estimates (e.g., '99.12%', '97.20%') with no confidence intervals, error bars, or uncertainty quantification."
     45       },
     46       "significance_tests": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "The paper makes comparative claims (e.g., 'the segmentation method demonstrates better overall performance', 'stronger models exhibit less severe over-defense') but provides no statistical significance tests. Comparisons are made by looking at raw numbers only."
     50       },
     51       "effect_sizes_reported": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "The paper reports raw percentages (e.g., true positive rates, removal rates, ASR) but does not report standardized effect sizes. While absolute performance numbers are given, there is no formal effect size reporting such as Cohen's d or odds ratios."
     55       },
     56       "sample_size_justified": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "Each benchmark (Inj-SQuAD and Inj-TriviaQA) contains 900 samples and the training sets contain 18,891 and 19,000 samples, but there is no justification for why these sizes were chosen or whether they are sufficient for the claims made."
     60       },
     61       "variance_reported": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "No variance, standard deviation, or results from multiple experimental runs are reported. All results appear to be from single runs with no spread measures."
     65       }
     66     },
     67     "evaluation_design": {
     68       "baselines_included": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "The paper includes multiple baselines: existing LLMs (Llama3-8B-Instruct, Qwen2-7B-Instruct), open-source detection models (Llama-Guard3-8B, Protect-AI-detector, Prompt-Guard), and defense baselines (Sandwich, Instructional, StruQ). Results are compared in Tables 3 and 5."
     72       },
     73       "baselines_contemporary": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Baselines include recent models and methods: Prompt-Guard (Meta 2024), Protect-AI-detector (2024), StruQ (Chen et al. 2024a), Llama3-8B-Instruct, and Qwen2-7B-Instruct. These are contemporary and relevant to the task."
     77       },
     78       "ablation_study": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "The paper investigates multiple factors: injection position effects on detection (Figure 6, Section 5.2), injection rate trade-offs (Figure 3), in-domain vs. out-of-domain evaluation (Table 1), and model size effects (Qwen2-0.5B vs. Qwen2-1.5B). These serve as ablation-like analyses of design choices."
     82       },
     83       "multiple_metrics": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "The paper uses multiple metrics: true positive rate (detection), false positive rate (over-defense), removal rate, and attack success rate (ASR) for defense evaluation, as described in Section 3.3."
     87       },
     88       "human_evaluation": {
     89         "applies": false,
     90         "answer": false,
     91         "justification": "Human evaluation is not relevant here. The paper evaluates automated detection and removal systems where ground truth is known (whether injected instructions are present and whether probes appear in responses). Automated metrics are appropriate."
     92       },
     93       "held_out_test_set": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "The paper uses separate evaluation benchmarks (Inj-SQuAD and Inj-TriviaQA from validation sets) distinct from the training data (constructed from SQuAD and TriviaQA training sets). Out-of-domain evaluation (trained on SQuAD, tested on TriviaQA) is also performed."
     97       },
     98       "per_category_breakdown": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Results are broken down by attack type (Naive, Ignore, Escape, Fakecom, Combined), injection position (head, middle, tail), model type, and in-domain vs. out-of-domain scenarios. Tables 2, 3, and 5 provide detailed breakdowns."
    102       },
    103       "failure_cases_discussed": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The paper discusses failure cases: over-defense problems (Section 5.2, Table 1), position generalization failures (Figure 6), extraction method struggles with head/middle positions (Section 5.3), and limited generalization against Fakecom attacks (Section 5.3)."
    107       },
    108       "negative_results_reported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Several negative results are reported: existing detection models fail at indirect injection detection (Table 5), extraction removal performs poorly at head/middle positions, reducing injection rate does not optimally solve over-defense (Figure 3), and both StruQ and filtering methods struggle with Fakecom attacks."
    112       }
    113     },
    114     "claims_and_evidence": {
    115       "abstract_claims_supported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The abstract claims are supported: (1) existing models struggle at indirect detection — supported by Table 5; (2) over-defense is position/domain dependent — supported by Table 1 and Figure 3; (3) segmentation shows better overall removal, extraction better at tail — supported by Table 2; (4) combined filtering is effective — supported by Table 3."
    119       },
    120       "causal_claims_justified": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "The paper's causal claims are primarily from controlled experiments: 'stronger models exhibit less severe over-defense' (comparing model sizes with same training), position effects (training on specific positions), injection rate effects (Figure 3). These are based on controlled single-variable manipulations."
    124       },
    125       "generalization_bounded": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "The title asks 'Can Indirect Prompt Injection Attacks Be Detected and Removed?' broadly, but experiments are limited to QA-based document scenarios with SQuAD/TriviaQA. The paper does not adequately bound its claims to this specific setting. The conclusions section discusses the findings broadly without qualifying that they apply only to this particular benchmark setup."
    129       },
    130       "alternative_explanations_discussed": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper does not discuss alternative explanations for its findings. For instance, the over-defense problem on out-of-domain data could be due to domain shift rather than model capacity, but this is not explored. No threats-to-validity or alternative explanations section is present."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper refers to models as 'Llama3-8B-Instruct', 'Qwen2-7B-Instruct', 'Qwen2-0.5B', 'Qwen2-1.5B', 'DeBERTa', 'Llama-Guard3-8B', 'Llama3.2-3B' without specifying exact version snapshots or model card dates. GPT-4o is used for crafting injections but no version/snapshot is given."
    141       },
    142       "prompts_provided": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "The paper provides actual prompt templates in Tables 8-14 showing the exact format for attack methods and defense baselines. The detection prompt format is also implied through the system/user/data structure shown."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Appendix A.1 reports: learning rate 1e-5, epochs 1, max length 1280, DeepSpeed for training; do_sample false, max_new_tokens 256, max_length 8192 for generation. Training data split ratios (40% clean, 15% head, 30% middle, 15% tail) are specified in Section 3.2."
    151       },
    152       "scaffolding_described": {
    153         "applies": false,
    154         "answer": false,
    155         "justification": "No agentic scaffolding is used. The detection and removal methods are straightforward model inference pipelines without agent loops, tool use, or multi-step reasoning scaffolds."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 3 describes the benchmark and training data construction in detail: source datasets (SQuAD/TriviaQA), sample counts (900 per benchmark, 18,891/19,000 training), injection categories (advertisement/phishing/propaganda, 100 each), injection positions and rates, and the distinction between detection and extraction training data (Ddet and Dext)."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "The paper includes a dedicated 'Limitations' section after the conclusion, discussing that the removal methods are 'simple and easy to implement, but their performance is not entirely satisfactory' and that direct prompt injection was not considered."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": false,
    172         "justification": "The limitations section is brief and mostly forward-looking ('leaving room for future exploration'). It does not discuss specific threats to validity of the current results, such as the narrow benchmark domain (only QA tasks), the artificial nature of the injected instructions (GPT-4o generated), or the limited attack diversity in training."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "The limitations mention not considering direct prompt injection and that removal methods need improvement, but do not explicitly state what the results do NOT show. For example, the paper does not state that results may not generalize to non-QA domains, multi-turn conversations, or real-world attack scenarios with more sophisticated injections."
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "The underlying datasets (SQuAD, TriviaQA) are publicly available, code is released at the GitHub repository, and the benchmark construction process is documented. The crafted injected instructions should be reproducible from the description."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Sections 3.1-3.2 describe how the benchmark and training data were constructed: source datasets, injection categories with counts (Table 6), the process for creating injected instructions using GPT-4o, the tuple structure (p, d, a, x, y), and the training data split ratios."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "No human participants are involved. The data is constructed from existing public datasets (SQuAD, TriviaQA, Stanford-Alpaca) and GPT-4o generated injections. This criterion does not apply."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The data pipeline is documented in Sections 3.1-3.2: starting from QA datasets, constructing clean document/injection pairs, splitting into detection training data (Ddet with 40/15/30/15% ratios) and extraction training data (Dext with tripled samples for three positions), and the benchmark construction with 900 samples per dataset."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "The Acknowledgment section states: 'The work described in this paper was conducted in full or in part by Dr. Haoran Li, JC STEM Early Career Research Fellow, supported by The Hong Kong Jockey Club Charities Trust.'"
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Author affiliations are clearly listed: National University of Singapore and HKUST. No authors are affiliated with companies whose products are being evaluated."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "The Hong Kong Jockey Club Charities Trust is a philanthropic organization with no financial stake in prompt injection detection outcomes. The funder is independent of the research findings."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "The paper contains no statement of competing or financial interests. The Ethical Considerations section discusses adherence to codes of ethics but does not include a declaration of financial interests."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "The paper evaluates LLMs (Llama3-8B-Instruct, Qwen2-7B-Instruct) on detection tasks but does not state their training data cutoff dates. While the benchmark is custom-constructed, the models' ability to detect injections could be influenced by training data composition."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "The paper does not discuss whether the evaluated LLMs may have seen SQuAD or TriviaQA data during pre-training, which could affect their detection performance on these domains. The in-domain vs. out-of-domain analysis partially addresses this for trained models but not for the pre-trained LLMs."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "SQuAD (2016) and TriviaQA (2017) were published well before the training cutoffs of the evaluated models. While the injected instructions are custom-crafted, the underlying documents may have been seen during pre-training. This contamination risk is not discussed."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants are involved in this study. It is a benchmark evaluation of automated detection and removal methods."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants are involved. The Ethical Considerations section addresses responsible conduct but not IRB approval, which is not needed."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants are involved in this study."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants are involved in this study."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants are involved in this study."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants are involved in this study."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants are involved in this study."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": true,
    281         "answer": false,
    282         "justification": "The paper proposes filtering pipelines that add detection and removal steps before LLM inference but does not report the latency or computational cost of these additional steps. No wall-clock time, API costs, or throughput numbers are provided."
    283       },
    284       "compute_budget_stated": {
    285         "applies": true,
    286         "answer": false,
    287         "justification": "While the paper mentions using a single NVIDIA H100-96G GPU (Appendix A.1), it does not state the total training time, GPU hours, or computational budget for the experiments."
    288       }
    289     }
    290   },
    291   "claims": [
    292     {
    293       "claim": "Existing instructed LLMs and open-source detection models struggle to effectively detect indirect prompt injection attacks, while specifically trained models show satisfactory performance.",
    294       "evidence": "Table 5 shows that Llama3-8B-Instruct achieves only 78.74% average accuracy on Inj-SQuAD and Qwen2-7B-Instruct only 42.54%, while Llama-Guard3-8B reaches at most 39.11%. The trained DeBERTa reaches 99.12% when over-defense is accounted for. (Section 5.2)",
    295       "supported": "strong"
    296     },
    297     {
    298       "claim": "The over-defense problem rarely occurs with in-domain documents but occurs with out-of-domain documents, where stronger models and more fluent documents are less prone to this issue.",
    299       "evidence": "Table 1 shows in-domain false positive rates at 0.0-0.44% vs. out-of-domain rates up to 27.33%. Qwen2-1.5B shows fewer over-defense problems than Qwen2-0.5B. SQuAD documents (more fluent) trigger less over-defense than TriviaQA. (Section 5.2)",
    300       "supported": "strong"
    301     },
    302     {
    303       "claim": "The segmentation removal method demonstrates better overall performance than extraction, but extraction excels at removing tail-position injections.",
    304       "evidence": "Table 2 shows extraction-Qwen2-1.5B achieves >=94.66% removal at tail position but only 42.33-67.77% for Fakecom/Combined at head/middle. Segment-DeBERTa achieves 79.88-100% across all positions. (Section 5.3)",
    305       "supported": "strong"
    306     },
    307     {
    308       "claim": "Combining detection and removal as filtering methods is effective for defending against indirect prompt injection attacks, generally outperforming prompt-engineering and fine-tuning baselines.",
    309       "evidence": "Table 3 shows Segment filtering achieves ASR of 0.11% in most configurations (vs. None baseline 4-96%, Sandwich 2.77-57.22%, Instructional 2.22-95.66%). However, both StruQ and filtering struggle against Fakecom attacks at certain positions. (Section 5.3)",
    310       "supported": "moderate"
    311     },
    312     {
    313       "claim": "Detection models trained on data with a single injection position struggle to detect attacks at other positions.",
    314       "evidence": "Figure 6 shows models trained on head-only injection achieve high detection for head attacks but drop significantly for middle and tail attacks (e.g., DeBERTa-OOD drops from 100% head to 20% middle). (Section 5.2)",
    315       "supported": "strong"
    316     },
    317     {
    318       "claim": "The removal methods will not eliminate key information in clean data despite over-defense of detection models.",
    319       "evidence": "Table 4 shows QA task accuracy remains comparable across defense methods: None (77.77-80.11%), Segment (77.77-80.00%), Extraction (77.77-80.00%), with only StruQ showing noticeable degradation (75.22-76.11%). (Section 5.3)",
    320       "supported": "moderate"
    321     }
    322   ],
    323   "methodology_tags": [
    324     "benchmark-eval"
    325   ],
    326   "key_findings": "The paper demonstrates that existing LLMs and open-source detection models are largely ineffective at detecting indirect prompt injection attacks, but models specifically trained on crafted datasets can achieve high detection accuracy (>97%). The study reveals an over-defense trade-off that depends on domain match, model capacity, and injection rate. For removal, segmentation outperforms extraction overall, though extraction excels at tail-position injections. Combined detection-plus-removal filtering achieves near-zero attack success rates in most scenarios, outperforming prior prompt-engineering and fine-tuning defenses.",
    327   "red_flags": [
    328     {
    329       "flag": "No statistical rigor",
    330       "detail": "All results are single-run point estimates with no confidence intervals, error bars, variance across runs, or statistical significance tests. Given the stochastic nature of some components and the relatively small benchmark sizes (900 samples), this is a significant omission."
    331     },
    332     {
    333       "flag": "Narrow benchmark domain",
    334       "detail": "The evaluation is limited to QA-based scenarios using SQuAD and TriviaQA documents. Real-world indirect prompt injection occurs in diverse contexts (emails, web searches, code, etc.) but generalization to these settings is not tested or discussed."
    335     },
    336     {
    337       "flag": "Training only on Naive attacks",
    338       "detail": "Detection and extraction models are trained only on 'Naive attack' data, yet the paper claims generalization to other attack types. The acknowledged failure on Fakecom attacks suggests this training strategy is insufficient, but the implications are understated."
    339     },
    340     {
    341       "flag": "Missing inference cost analysis",
    342       "detail": "Adding a detection-then-removal pipeline has practical implications for latency and cost that are never quantified, making it difficult to assess real-world deployability."
    343     }
    344   ],
    345   "cited_papers": [
    346     {
    347       "title": "Not what you've signed up for: Compromising real-world llm-integrated applications with indirect prompt injection",
    348       "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra", "Christoph Endres", "Thorsten Holz", "Mario Fritz"],
    349       "year": 2023,
    350       "relevance": "Foundational work on indirect prompt injection attacks against LLM-integrated applications, directly related to the security evaluation of agentic AI systems."
    351     },
    352     {
    353       "title": "Struq: Defending against prompt injection with structured queries",
    354       "authors": ["Sizhe Chen", "Julien Piet", "Chawin Sitawarin", "David Wagner"],
    355       "year": 2024,
    356       "arxiv_id": "2402.06363",
    357       "relevance": "Key defense baseline using adversarial fine-tuning to separate instructions from data, directly compared in this paper's experiments."
    358     },
    359     {
    360       "title": "Defending against indirect prompt injection attacks with spotlighting",
    361       "authors": ["Keegan Hines", "Gary Lopez", "Matthew Hall", "Federico Zarfati", "Yonatan Zunger", "Emre Kiciman"],
    362       "year": 2024,
    363       "arxiv_id": "2403.14720",
    364       "relevance": "Proposes spotlighting as a defense against indirect prompt injection, relevant to the landscape of prompt injection defenses."
    365     },
    366     {
    367       "title": "Ignore previous prompt: Attack techniques for language models",
    368       "authors": ["Fábio Perez", "Ian Ribeiro"],
    369       "year": 2022,
    370       "arxiv_id": "2211.09527",
    371       "relevance": "Early systematic study of prompt injection attack techniques used as a baseline attack method in this paper."
    372     },
    373     {
    374       "title": "Evaluating the instruction-following robustness of large language models to prompt injection",
    375       "authors": ["Zekun Li", "Baolin Peng", "Pengcheng He", "Xifeng Yan"],
    376       "year": 2023,
    377       "relevance": "Evaluates LLM robustness to prompt injection, providing methodology and benchmarks relevant to the survey scope."
    378     },
    379     {
    380       "title": "Automatic and universal prompt injection attacks against large language models",
    381       "authors": ["Xiaogeng Liu", "Zhiyuan Yu", "Yizhe Zhang", "Ning Zhang", "Chaowei Xiao"],
    382       "year": 2024,
    383       "arxiv_id": "2403.04957",
    384       "relevance": "Develops automated prompt injection optimization techniques, representing the evolving attack landscape for LLM security."
    385     },
    386     {
    387       "title": "Formalizing and benchmarking prompt injection attacks and defenses",
    388       "authors": ["Yupei Liu", "Yuqi Jia", "Runpeng Geng", "Jinyuan Jia", "Neil Zhenqiang Gong"],
    389       "year": 2024,
    390       "relevance": "Provides formal frameworks for prompt injection evaluation, relevant to benchmarking methodology for LLM security."
    391     },
    392     {
    393       "title": "The instruction hierarchy: Training llms to prioritize privileged instructions",
    394       "authors": ["Eric Wallace", "Kai Xiao", "Reimar Leike", "Lilian Weng", "Johannes Heidecke", "Alex Beutel"],
    395       "year": 2024,
    396       "arxiv_id": "2404.13208",
    397       "relevance": "Proposes instruction hierarchy for LLM defense against injection, a fundamental approach to the instruction-data separation problem."
    398     },
    399     {
    400       "title": "InjecAgent: Benchmarking indirect prompt injections in tool-integrated large language model agents",
    401       "authors": ["Qiusi Zhan", "Zhixiang Liang", "Zifan Ying", "Daniel Kang"],
    402       "year": 2024,
    403       "arxiv_id": "2403.02691",
    404       "relevance": "Benchmarks indirect prompt injection in agentic LLM settings with tool use, directly relevant to agentic AI security evaluation."
    405     },
    406     {
    407       "title": "Can llms separate instructions from data? and what do we even mean by that?",
    408       "authors": ["Egor Zverev", "Sahar Abdelnabi", "Soroush Tabesh", "Mario Fritz", "Christoph H Lampert"],
    409       "year": 2024,
    410       "arxiv_id": "2403.06833",
    411       "relevance": "Examines the fundamental question of instruction-data separation in LLMs, directly relevant to understanding prompt injection vulnerabilities."
    412     },
    413     {
    414       "title": "Jatmo: Prompt injection defense by task-specific finetuning",
    415       "authors": ["Julien Piet", "Maha Alrashed", "Chawin Sitawarin", "Sizhe Chen", "Zeming Wei", "Elizabeth Sun", "Basel Alomair", "David Wagner"],
    416       "year": 2023,
    417       "arxiv_id": "2312.17673",
    418       "relevance": "Proposes task-specific fine-tuning as a prompt injection defense, an alternative approach to the detection-based methods studied in this paper."
    419     },
    420     {
    421       "title": "Benchmarking and defending against indirect prompt injection attacks on large language models",
    422       "authors": ["Jingwei Yi", "Yueqi Xie", "Bin Zhu", "Keegan Hines", "Emre Kiciman", "Guangzhong Sun", "Xing Xie", "Fangzhao Wu"],
    423       "year": 2023,
    424       "arxiv_id": "2312.14197",
    425       "relevance": "Provides benchmarks and defenses for indirect prompt injection, directly related to the evaluation of LLM security mechanisms."
    426     },
    427     {
    428       "title": "Prompt injection attack against llm-integrated applications",
    429       "authors": ["Yi Liu", "Gelei Deng", "Yuekang Li", "Kailong Wang"],
    430       "year": 2023,
    431       "arxiv_id": "2306.05499",
    432       "relevance": "Studies prompt injection attacks against LLM-integrated applications, relevant to understanding security vulnerabilities in agentic AI systems."
    433     }
    434   ]
    435 }

Impressum · Datenschutz