ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (26878B)


      1 {
      2   "paper": {
      3     "title": "Attention is All You Need to Defend Against Indirect Prompt Injection Attacks in LLMs",
      4     "authors": ["Yinan Zhong", "Qianhao Miao", "Yanjiao Chen", "Jiangyi Deng", "Yushi Cheng", "Wenyuan Xu"],
      5     "year": 2025,
      6     "venue": "NDSS Symposium 2026",
      7     "arxiv_id": "2512.08417"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No GitHub link, Zenodo archive, or repository URL is provided in the paper. The paper mentions the FIPI dataset 'will be made publicly available' (Section I) but does not provide a download link."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The paper states the FIPI dataset will be 'open-sourced to support further research' (Abstract) and 'will be made publicly available' (Section I), but no actual download link or repository URL is provided. This is a promise of future release, not actual release."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The paper mentions PyTorch and NVIDIA A100 GPUs (Section VI-A) but does not provide a requirements.txt, Dockerfile, or detailed library version list sufficient to recreate the environment."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No README, reproduction scripts, or step-by-step instructions are provided. The paper describes the experimental setup in Section VI-A but does not provide sufficient detail for an independent researcher to reproduce the results without significant guessing."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "All results (Tables I-XII) report only point estimates (e.g., '99.05% accuracy') with no confidence intervals, error bars, or uncertainty measures."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims RENNERVATE 'outperforms' all baselines but provides no statistical significance tests (no p-values, t-tests, or bootstrap tests). Comparisons are based solely on raw numerical differences."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper reports percentage improvements with baseline context throughout. For example, Section VI-B states ASR reductions: 'reducing the ASRs by 85.80% on ChatGLM' from specific baseline values (85.9% to 0.1%). Tables II-III show both undefended ASR and defended ASR, allowing readers to compute effect sizes."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No justification is provided for why the FIPI dataset contains 100,000 injected and 10,000 benign instances, or why 1,000 instances were used for sanitization evaluation. No power analysis is discussed."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No standard deviations, variance, or multi-run statistics are reported anywhere in the paper. All results appear to be single-run numbers."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The paper compares against 15 baselines: 4 classifier-based detection methods (Prompt-Guard, ProtectAI-v2, Attention Tracker, TaskTracker), 5 LLM-based detection methods, 3 prompt-modification prevention methods (Sandwich, Spotlighting, Instructional), StruQ (model-modification), and 2 LLM-based sanitization baselines (Section VI-A)."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Baselines include recent work: Attention Tracker (2024), TaskTracker (2025), ProtectAI-v2 (2024), StruQ (2024), SecAlign (2024). These represent the current state of the art in IPI defense."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Section VI-E presents a thorough ablation study examining three components: 2-step attentive pooling, token-level detector, and mean filter. Each is replaced or removed individually, with results in Table X across 6 datasets."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper uses 6 evaluation metrics: Accuracy, False Positive Rate, False Negative Rate (for detection), and Attack Success Rate, Win Rate, Jaccard Similarity (for sanitization), as described in Section VI-A."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Manual evaluation is performed in two places: (1) 1,000 randomly selected FIPI instances are manually evaluated for quality (Section VI-A, Step 5), and (2) 200 randomly sampled instances are manually evaluated to validate the DeepSeek LLM-as-a-judge approach (Section VI-C, achieving 94.50% accuracy)."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Section VI-A Step 5 explicitly separates training and testing data: 5,000 injected + 5,000 benign instances for testing, with the remaining 100,000 for training. 'There is no overlap between training and testing data, as they originate from different user instruction-clean data pairs.'"
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Tables II and III break down sanitization performance by attack type (Naive, Escape Characters, Context Ignoring, Fake Completion, Combined). Table V breaks down detection performance by unseen dataset. Table VII breaks down performance by attack method and dataset."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Figure 7 shows and discusses two illustrative failure cases: a false positive (benign role-playing prompt misclassified as injection) and a false negative (naive IPI attack undetected). Section VI-C also discusses utility degradation in certain scenarios (Jfleg-RTE) with explanations."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports utility degradation in certain unseen dataset scenarios (Table IV, Win Rates falling below 10% for J-R and S-M), and explicitly discusses why: 'RENNERVATE struggles to fully eliminate injected-task-specific content' (Section VI-C). The Discussion section also acknowledges limitations with injected content."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims RENNERVATE 'outperforms 15 commercial and academic IPI defense methods, achieving high precision on 5 LLMs and 6 datasets' and is 'transferable to unseen attacks and robust against adaptive adversaries.' These are supported by Tables I-IX, which show results on 5 LLMs, 6 datasets, unseen attacks, and adaptive adversaries."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper's causal claims are primarily through ablation studies (Section VI-E), which use controlled single-variable manipulation to show that removing each component (2-step attentive pooling, token-level detector, mean filter) degrades performance. This is adequate causal design for component contribution claims."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper bounds generalization by specifying the 5 LLMs tested (ChatGLM, Dolly, Falcon, LLaMA2, LLaMA3), the specific datasets used, and the attack types evaluated. The Discussion section explicitly acknowledges limitations including the requirement for attention weight access and issues with injected-task-specific content."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper discusses alternative explanations for observed patterns. For instance, utility degradation in unseen dataset scenarios is attributed to the 'dual nature of injections' containing task-specific content (Section VI-C). The Discussion section considers whether model-modification methods like StruQ are better suited in some contexts, and addresses limitations of the attention-based approach."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper specifies model families and sizes (e.g., 'ChatGLM-6B', 'Dolly-7B', 'Falcon-7B', 'LLaMA2-7B', 'LLaMA3-8B') but does not provide exact snapshot dates or version identifiers. 'GPT-3.5-Turbo' is used without a snapshot date. 'DeepSeek' is referenced without specifying v2, v2.5, or a version identifier (only Table XIII in the appendix mentions 'DeepSeek-v2.5')."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "The Appendix (Sections B, C, D) provides full prompt templates for all LLM-based baselines (Naive Detection, Response-Based Detection, Known-Answer Detection, LLM-Based Sanitization) and the DeepSeek determination prompt. The probe-witness pairs are described in detail."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section VI-A reports detailed hyperparameters: m=32 response tokens, N=2 residual blocks, hidden dim=512, Adam optimizer with lr=1e-3, annealing rate=0.3, batch size=128, mean filter kernel size k=5, threshold=5. GCG uses 500 iterations, Neural Exec uses 250 iterations. Section VI-F provides sensitivity analysis."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "RENNERVATE is not an agentic scaffolding system. It is a detection/sanitization framework that processes attention features through a neural network. No agentic scaffolding is used."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section VI-A describes the 5-step FIPI construction pipeline in detail: (1) preparing benign instances by rewriting duplicates, (2) creating probe-witness pairs, (3) employing IPI attacks with specified distribution ratios, (4) constructing injected instances with randomized positions and token-level labels, (5) splitting into train/test with specific counts."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section VII 'Discussion & Limitations' explicitly discusses limitations including the requirement for attention weight access, challenges with injected-task-specific content, and real-world deployment constraints."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "The Discussion section identifies specific threats: (1) RENNERVATE requires internal attention weights, limiting applicability to API-only users; (2) injected-task-specific content may not be fully removed, leading to utility degradation in grammar correction tasks; (3) model-modification methods may be more effective but require LLM retraining."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section VII explicitly states scope boundaries: the approach requires white-box access to attention features, it may not fully remove injected-task-specific content, and it was not tested on multi-modal IPI attacks (identified as future work in Section VIII). The threat model in Section IV bounds what adversary and defender capabilities are assumed."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "No raw data is made available for independent verification. The FIPI dataset is promised for future release but not currently accessible."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section VI-A describes the 5-step data collection pipeline in detail: starting from the SEP dataset (9,160 pairs), using GPT-3.5-Turbo to rewrite duplicates, manually designing 100 probe-witness pairs, employing 7 attack methods with specified distribution ratios, and annotating token-level labels."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants were recruited. The data consists of synthetically constructed prompt injection examples built from the existing SEP benchmark."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The full pipeline is documented in Section VI-A Steps 1-5 with specific counts: 9,160 original pairs → 10,000 rewritten pairs → 100,000 injected instances + 10,000 benign instances → 5,000+5,000 test, remainder for training. The 1,000-instance quality check is also described."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "The Acknowledgment section states: 'This work is supported by National Natural Science Foundation of China Grant 62271280 and Zhejiang Key Laboratory of Electrical Technology and System on Renewable Energy.'"
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "All authors are listed as affiliated with Zhejiang University. The paper evaluates open-source models (ChatGLM, Dolly, Falcon, LLaMA2, LLaMA3) with no apparent conflict between authors and the models evaluated."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "The funders (National Natural Science Foundation of China and Zhejiang Key Laboratory) are academic/governmental bodies with no apparent financial stake in the outcome of IPI defense research."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "The paper does not evaluate pre-trained model capabilities on a benchmark. It trains a separate detector on attention features to classify injections. The LLMs are used as fixed feature extractors, not as models being benchmarked for their learned knowledge."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "The paper is not benchmarking pre-trained model knowledge; the LLMs serve only as fixed feature extractors. It trains a custom detector and explicitly separates training/test data (Section VI-A Step 5). Contamination in the traditional sense (model saw test data during pre-training) is not applicable here."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "The evaluation tests a defense system's detection/sanitization capabilities, not a pre-trained model's knowledge. Benchmark contamination of the underlying LLMs is not relevant to the claims being made."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
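    237         "justification": "No human participants were involved. The study uses synthetically constructed datasets and primarily automated evaluation; the manual checks it reports (1,000 FIPI instances for quality in Section VI-A and 200 instances to validate the LLM judge in Section VI-C) are annotation checks, not a human-subjects study."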
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants were involved."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants were involved."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants were involved."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants were involved."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants were involved."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants were involved."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No inference cost, latency, or tokens consumed per example are reported. Section VII mentions the system 'can run efficiently on a single NVIDIA RTX 3090 GPU' but does not quantify inference time or cost per example."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "The paper mentions training on 'two NVIDIA A100 GPUs' (Section VI-A) but does not state total GPU hours, training time, or API costs for the LLM-based baselines. The computational budget for GCG and Neural Exec optimization iterations is also not quantified in terms of wall-clock time."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "RENNERVATE achieves 97.88-99.58% IPI detection accuracy across 5 LLMs, outperforming all 15 baselines.",
    286       "evidence": "Table I shows accuracy of 99.05% (ChatGLM), 97.88% (Dolly), 99.58% (Falcon), 99.43% (LLaMA2), 99.37% (LLaMA3), compared to the best baseline TaskTracker at 95.07% on LLaMA3.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "RENNERVATE reduces Attack Success Rate to near 0% across all 5 LLMs while preserving text utility.",
    291       "evidence": "Tables II-III show total ASR reductions from 60.80-85.90% to 0.00-0.20%. Table IV shows Win Rates of 43.60-46.78% on FIPI (close to 50% parity), and Jaccard Similarity scores of 0.9-1.0 (Figures 4-5).",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "RENNERVATE transfers effectively to 5 unseen datasets and 2 unseen attack methods.",
    296       "evidence": "Table V shows 80.20-100.0% detection accuracy on unseen datasets. Table VII shows 92.50-100% detection accuracy against unseen GCG and Neural Exec attacks, with ASRs reduced to 0-7%.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "RENNERVATE is robust against black-box (PAIR, TAP) and white-box (adaptive GCG) adaptive adversaries.",
    301       "evidence": "Table VIII shows PAIR ASR reduced from 94-100% to 0-19%, TAP from 95-100% to 0-9%. Table IX shows adaptive GCG ASR reduced from 93-99.5% to 0-5%.",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "The 2-step attentive pooling mechanism improves generalization to unseen datasets.",
    306       "evidence": "Table X shows removing 2-step attentive pooling causes accuracy drops of up to 12.65% on unseen datasets (J-R), despite slight improvement on FIPI training distribution.",
    307       "supported": "strong"
    308     },
    309     {
    310       "claim": "RENNERVATE has a compact parameter size of 0.5-0.8M parameters.",
    311       "evidence": "Table XIII in the Appendix compares parameter sizes: RENNERVATE at 0.5-0.8M vs. Prompt-Guard at 86M, ProtectAI-v2 at 98M, GPT-3.5 at >=175B.",
    312       "supported": "strong"
    313     }
    314   ],
    315   "methodology_tags": ["benchmark-eval"],
    316   "key_findings": "RENNERVATE is a defense framework against Indirect Prompt Injection (IPI) attacks that leverages attention features for token-level detection and sanitization. It achieves 97.88-99.58% detection accuracy across 5 LLMs, outperforming 15 commercial and academic baselines, while reducing attack success rates to near 0% with minimal utility loss. The system demonstrates transferability to unseen datasets and unseen attack methods (including gradient-based attacks like GCG and Neural Exec), and robustness against adaptive adversaries using both black-box (PAIR, TAP) and white-box strategies. A key limitation is that it requires access to the target LLM's attention weights and may not fully remove injected-task-specific content.",
    317   "red_flags": [
    318     {
    319       "flag": "No variance or multi-run statistics",
    320       "detail": "All results appear to be single-run numbers with no standard deviations, confidence intervals, or multi-seed experiments reported. For a machine learning system with stochastic training, this makes it impossible to assess result stability."
    321     },
    322     {
    323       "flag": "No statistical significance tests",
    324       "detail": "Claims of outperformance over baselines are based purely on comparing point estimates. No statistical tests are performed to determine whether differences are significant."
    325     },
    326     {
    327       "flag": "No code or data released",
    328       "detail": "Despite claims that the FIPI dataset will be open-sourced, no repository URL or download link is provided. Code for RENNERVATE is also not released. This prevents independent verification of all reported results."
    329     },
    330     {
    331       "flag": "No inference cost quantification",
    332       "detail": "The paper does not report inference latency, throughput, or cost per example for RENNERVATE, despite requiring attention feature extraction from the target LLM for every input. This makes practical deployment assessment difficult."
    333     }
    334   ],
    335   "cited_papers": [
    336     {
    337       "title": "Formalizing and benchmarking prompt injection attacks and defenses",
    338       "authors": ["Yupei Liu", "Yuqi Jia", "Runpeng Geng", "Jinyuan Jia", "Neil Zhenqiang Gong"],
    339       "year": 2024,
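    340       "relevance": "Foundational benchmark that formalizes prompt injection attacks and defenses. Section VI-A builds FIPI from the SEP dataset, so this work underpins the attack/defense formalization rather than FIPI's source data."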
    341     },
    342     {
    343       "title": "StruQ: Defending against prompt injection with structured queries",
    344       "authors": ["Sizhe Chen", "Julien Piet", "Chawin Sitawarin", "David Wagner"],
    345       "year": 2024,
    346       "arxiv_id": "2402.06363",
    347       "relevance": "Model-modification IPI defense baseline achieving comparable performance to RENNERVATE on LLaMA2."
    348     },
    349     {
    350       "title": "Aligning LLMs to be robust against prompt injection",
    351       "authors": ["Sizhe Chen", "Arman Zharmagambetov", "Saeed Mahloujifar", "Kamalika Chaudhuri", "Chuan Guo"],
    352       "year": 2024,
    353       "arxiv_id": "2410.05451",
    354       "relevance": "SecAlign approach using alignment techniques for IPI defense during fine-tuning."
    355     },
    356     {
    357       "title": "Attention Tracker: Detecting prompt injection attacks in LLMs",
    358       "authors": ["Kuo-Han Hung", "Ching-Yun Ko", "Ambrish Rawat", "I-Hsin Chung", "Winston H Hsu", "Pin-Yu Chen"],
    359       "year": 2024,
    360       "arxiv_id": "2411.00348",
    361       "relevance": "Prior attention-based IPI detection method used as a baseline, demonstrating the attention feature approach for prompt injection."
    362     },
    363     {
    364       "title": "Get my drift? Catching LLM task drift with activation deltas",
    365       "authors": ["Sahar Abdelnabi", "Aideen Fay", "Giovanni Cherubin", "Ahmed Salem", "Mario Fritz", "Andrew Paverd"],
    366       "year": 2025,
    367       "relevance": "TaskTracker method using LLM activations for task drift detection, strongest classifier baseline."
    368     },
    369     {
    370       "title": "Universal and transferable adversarial attacks on aligned language models",
    371       "authors": ["Andy Zou", "Zifan Wang", "Nicholas Carlini", "Milad Nasr", "J Zico Kolter", "Matt Fredrikson"],
    372       "year": 2023,
    373       "arxiv_id": "2307.15043",
    374       "relevance": "GCG attack method, used as an unseen attack for evaluating defense transferability."
    375     },
    376     {
    377       "title": "Neural Exec: Learning (and learning from) execution triggers for prompt injection attacks",
    378       "authors": ["Dario Pasquini", "Martin Strohmeier", "Carmela Troncoso"],
    379       "year": 2024,
    380       "relevance": "Gradient-based IPI attack, used as an unseen attack for evaluating defense transferability."
    381     },
    382     {
    383       "title": "Jailbreaking black box large language models in twenty queries",
    384       "authors": ["Patrick Chao", "Alexander Robey", "Edgar Dobriban", "Hamed Hassani", "George J Pappas", "Eric Wong"],
    385       "year": 2023,
    386       "arxiv_id": "2310.08419",
    387       "relevance": "PAIR adaptive attack method adapted for IPI evaluation of defense robustness."
    388     },
    389     {
    390       "title": "Tree of attacks: Jailbreaking black-box LLMs automatically",
    391       "authors": ["Anay Mehrotra", "Manolis Zampetakis", "Paul Kassianik", "Blaine Nelson", "Hyrum Anderson", "Yaron Singer", "Amin Karbasi"],
    392       "year": 2023,
    393       "arxiv_id": "2312.02119",
    394       "relevance": "TAP adaptive attack extending PAIR with tree search, used for robustness evaluation of IPI defenses."
    395     },
    396     {
    397       "title": "Can LLMs separate instructions from data? And what do we even mean by that?",
    398       "authors": ["Egor Zverev", "Sahar Abdelnabi", "Soroush Tabesh", "Mario Fritz", "Christoph H Lampert"],
    399       "year": 2024,
    400       "arxiv_id": "2403.06833",
    401       "relevance": "SEP benchmark for IPI evaluation, used as the foundation dataset for constructing FIPI."
    402     },
    403     {
    404       "title": "Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection",
    405       "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra", "Christoph Endres", "Thorsten Holz", "Mario Fritz"],
    406       "year": 2023,
    407       "relevance": "Seminal work on indirect prompt injection attacks against LLM-integrated applications."
    408     },
    409     {
    410       "title": "Benchmarking and defending against indirect prompt injection attacks on large language models",
    411       "authors": ["Jingwei Yi", "Yueqi Xie", "Bin Zhu", "Emre Kiciman", "Guangzhong Sun", "Xing Xie", "Fangzhao Wu"],
    412       "year": 2023,
    413       "arxiv_id": "2312.14197",
    414       "relevance": "BIPIA benchmark and adversarial training defense for IPI attacks."
    415     },
    416     {
    417       "title": "InjecGuard: Benchmarking and mitigating over-defense in prompt injection guardrail models",
    418       "authors": ["Hao Li", "Xiaogeng Liu", "Chaowei Xiao"],
    419       "year": 2024,
    420       "arxiv_id": "2410.22770",
    421       "relevance": "Addresses the false positive problem in IPI detection, relevant to defense evaluation methodology."
    422     }
    423   ]
    424 }

Impressum · Datenschutz