ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (27744B)


      1 {
      2   "paper": {
      3     "title": "When AI Meets the Web: Prompt Injection Risks in Third-Party AI Chatbot Plugins",
      4     "authors": [
      5       "Yigitcan Kaya",
      6       "Anton Landerer",
      7       "Stijn Pletinckx",
      8       "Michelle Zimmermann",
      9       "Christopher Kruegel",
     10       "Giovanni Vigna"
     11     ],
     12     "year": 2025,
     13     "venue": "IEEE Symposium on Security and Privacy 2026",
     14     "arxiv_id": "2511.05797"
     15   },
     16   "scan_version": 2,
     17   "active_modules": ["experimental_rigor", "data_leakage"],
     18   "methodology_tags": ["observational", "benchmark-eval"],
     19   "key_findings": "Large-scale study of 17 third-party chatbot plugins across 10,000+ websites reveals that 8 plugins (used by 8,000 websites) fail to enforce integrity of conversation history in HTTP requests, enabling adversaries to forge system/assistant messages and boost attack success 3-8x. 15 plugins indiscriminately scrape third-party content (e.g., customer reviews) for RAG without distinguishing trusted from untrusted sources, and 13% of sampled e-commerce sites already expose chatbots to this indirect injection risk. Hardened system prompts reduce task hijacking but are ineffective against tool hijacking attacks, which succeed at high rates even with user-role-only injection.",
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "No repository URL, code archive, or link to released source code is mentioned anywhere in the paper."
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "No dataset release is mentioned. The collected website lists, extracted system prompts, and experimental results are not made publicly available."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "No environment specification, dependency list, or setup details are provided beyond mentioning a local WordPress instance and OpenAI API."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No reproduction instructions or scripts are provided. The experimental setup is described narratively but lacks step-by-step replication guidance."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "Tables 3 and 4 report raw counts of successful attacks out of 10 trials. No confidence intervals or error bars are provided."
     48       },
     49       "significance_tests": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The paper makes comparative claims (e.g., 'injections via the user role are less effective') based on comparing raw counts without any statistical significance tests."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "The paper provides contextual effect sizes: '3–8×' improvement from history forging (abstract), '25–100% of cases, compared to just 0–25%' (§1), and percentage breakdowns throughout §7 results."
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "No justification for why 10 repetitions per configuration, 300 websites for system prompt extraction, or 100 e-commerce sites for real-world exposure assessment were chosen."
     63       },
     64       "variance_reported": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "Results are reported as counts out of 10 trials. No standard deviation, variance, or spread measure is reported across experimental runs."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "User-role injection serves as the baseline (representing secure plugin behavior) against which system and assistant role injections are compared in Tables 3 and 4."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Experiments use 11 contemporary commercial LLMs including GPT-4o, GPT-4.1, o4-mini, Claude Sonnet 4, and Gemini 2.5 Pro — all current models as of 2025."
     80       },
     81       "ablation_study": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "The paper systematically varies injection role (system/assistant/user), system prompt design (insecure/hardened/hardened-specific), RAG insertion mode (5 modes), and content wrapping, isolating each factor's contribution."
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Three distinct attack tasks are evaluated: system prompt extraction (SPE), task hijacking (TaH), and tool hijacking (ToH) for direct injection; context hijacking for indirect injection."
     90       },
     91       "human_evaluation": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "System prompt extraction success was manually labeled by checking for key features (imperative instructions, role assignments, behavioral guidelines). Spot-checks of 200 websites and manual validation of 100 e-commerce sites were performed."
     95       },
     96       "held_out_test_set": {
     97         "applies": false,
     98         "answer": false,
     99         "justification": "This is a security evaluation with controlled experiments, not an ML task with train/test splits. The concept of held-out test sets does not apply."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Tables 3 and 4 provide detailed breakdowns per model, per injection role, per system prompt type, and per content insertion mode. Averages by provider are also reported."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The paper discusses chatbot deployment failures (§4.2: quota limits, deprecated models), attack failures (refusal to comply), and cases where defenses work (user-role injections blocked by hardened prompts)."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Key negative results include: hardened system prompts do not protect against tool hijacking (§7.1), content wrapping has model-dependent effectiveness (§7.2), and ignore+instruct prompts were least effective (Appendix B)."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "Abstract claims are well-supported: 8 plugins lacking integrity checks (Table 2), 3-8x boost in attack success (Table 3), 13% e-commerce exposure (§5.3), 15 plugins with scraping tools (Table 2)."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "Causal claims ('this oversight amplifies the impact') are justified through controlled experiments that systematically vary single factors (injection role, prompt type) while holding others constant (§7)."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "The paper bounds findings to the 17 studied plugins and 11 tested LLMs. It acknowledges the dataset 'may underestimate the full extent' (§3) and notes custom tools 'cannot be evaluated without invasive probing' (§6.2)."
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": true,
    136         "justification": "§8.3 discusses distribution shift as an alternative explanation for why defenses appear stronger in controlled settings. The paper also considers that model capability vs. robustness may be a trade-off (§7.2 SA mode analysis)."
    137       },
    138       "proxy_outcome_distinction": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "The paper directly measures attack success (whether the LLM produced the adversary-intended response) and claims vulnerability. The measurement matches the claimed outcome with no proxy gap."
    142       }
    143     },
    144     "setup_transparency": {
    145       "model_versions_specified": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "Models are listed as '4o-mini', '4o', '4.1-mini', '4.1', 'o4-mini', 'haiku-3.5', 'sonnet-3.5', 'sonnet-4', '2.0-flash', '2.5-flash', '2.5-pro' — marketing names without snapshot dates or API version identifiers."
    149       },
    150       "prompts_provided": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Appendix A (Listings 1-11) provides the full text of all injected prompts, trigger messages, system prompt templates, and tool instruction formats used in experiments."
    154       },
    155       "hyperparameters_reported": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Temperature 0.5 is stated for all experiments (§7.1, §7.2). Chunking parameters for RAG (600-token chunks, 300-token overlaps, text-embedding-3-large) are specified in §7."
    159       },
    160       "scaffolding_described": {
    161         "applies": false,
    162         "answer": false,
    163         "justification": "No agentic scaffolding is used. The experiments test direct LLM API interactions through plugin interfaces."
    164       },
    165       "data_preprocessing_documented": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "§3 documents plugin selection criteria (5 exclusion rules), website detection via HTML markers, Common Crawl scanning methodology, and §4.2 describes spot-check validation methodology with counts."
    169       }
    170     },
    171     "limitations_and_scope": {
    172       "limitations_section_present": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "There is no dedicated limitations or threats-to-validity section. Limitations are scattered across §5.3 ('our findings likely underestimate'), §3, and the meta-review's §D.4."
    176       },
    177       "threats_to_validity_specific": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Specific threats are discussed: dataset may underestimate ecosystem (§3), 62.5% functionality rate means many chatbots are non-functional (§4.2), 5-query limit per chatbot may miss exposures (§5.3), and the meta-review notes defense robustness against adaptive attacks is unclear (§D.4)."
    181       },
    182       "scope_boundaries_stated": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "The paper explicitly excludes enterprise solutions (Zendesk, Tidio, Intercom) in §3, states 'we do not aim to develop new prompt injection methods' (§7), and excludes attacks leveraging classical web vulnerabilities (§5 threat model)."
    186       }
    187     },
    188     "data_integrity": {
    189       "raw_data_available": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "No raw data (website lists, extracted system prompts, experimental logs) is released for independent verification."
    193       },
    194       "data_collection_described": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "§3 details the data collection: Common Crawl August 2024 snapshot scanning for HTML markers, WHOIS lookups for domain registration dates, Cloudflare API for content categorization, and manual spot-checks of 200 websites."
    198       },
    199       "recruitment_methods_described": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "Website sampling is described: Common Crawl scanning with specific plugin HTML markers, keyword search of WP marketplace yielding 116 results filtered by 5 exclusion criteria to 7 WP plugins, plus search engine/blog post discovery for 10 generic plugins (§3)."
    203       },
    204       "data_pipeline_documented": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "The pipeline is documented: CC snapshot → HTML marker matching → 10,417 unique sites → spot-check validation (200 sites) → automated validation (P1/P2/P4) → functionality rates reported. System prompt extraction: 300 sites → injection → manual labeling → 219 extractions (73% success) (§6.1)."
    208       }
    209     },
    210     "conflicts_of_interest": {
    211       "funding_disclosed": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "§10 discloses funding from NSF (grant 2229876), Department of Homeland Security, IBM, and U.S. Intelligence Community Postdoctoral Fellowship."
    215       },
    216       "affiliations_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "All authors are affiliated with UC Santa Barbara. No author is affiliated with the evaluated plugins or LLM providers."
    220       },
    221       "funder_independent_of_outcome": {
    222         "applies": true,
    223         "answer": true,
    224         "justification": "NSF, DHS, and the IC postdoctoral fellowship are government/academic funders, and IBM is an industry funder; none has a financial stake in whether specific chatbot plugins or LLMs are found to be vulnerable."
    225       },
    226       "financial_interests_declared": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No competing interests or financial interests statement is included in the paper."
    230       }
    231     },
    232     "contamination": {
    233       "training_cutoff_stated": {
    234         "applies": false,
    235         "answer": false,
    236         "justification": "This paper tests prompt injection attack success rates against LLMs, not model knowledge on benchmarks. Contamination of training data is not relevant to whether an LLM follows injected instructions."
    237       },
    238       "train_test_overlap_discussed": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "The paper tests security vulnerabilities (attack success), not model knowledge that could be contaminated by training data overlap."
    242       },
    243       "benchmark_contamination_addressed": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No benchmark contamination concern exists — the study evaluates attack robustness, not learned knowledge on test problems."
    247       }
    248     },
    249     "human_studies": {
    250       "pre_registered": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants. The study analyzes websites and LLM outputs, not human subjects."
    254       },
    255       "irb_or_ethics_approval": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants. The paper does discuss ethical considerations for web scanning (§6.1) but this is not a human subjects study."
    259       },
    260       "demographics_reported": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in the study."
    264       },
    265       "inclusion_exclusion_criteria": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants."
    269       },
    270       "randomization_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants."
    274       },
    275       "blinding_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants."
    279       },
    280       "attrition_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants."
    284       }
    285     },
    286     "cost_and_practicality": {
    287       "inference_cost_reported": {
    288         "applies": true,
    289         "answer": false,
    290         "justification": "The paper runs experiments with 11 commercial LLMs across hundreds of configurations but does not report API costs, tokens consumed, or total experimental cost. Only UGCBuster cost (~$0.05/page) is mentioned."
    291       },
    292       "compute_budget_stated": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "No total computational budget is stated for the controlled experiments or the Common Crawl scanning pipeline."
    296       }
    297     },
    298     "experimental_rigor": {
    299       "seed_sensitivity_reported": {
    300         "applies": true,
    301         "answer": false,
    302         "justification": "Results are reported as counts out of 10 runs at temperature 0.5. No seed sensitivity analysis or results across different random seeds are provided."
    303       },
    304       "number_of_runs_stated": {
    305         "applies": true,
    306         "answer": true,
    307         "justification": "Explicitly stated: 'we set the sampling temperature to 0.5 and repeat the experiment 10 times' (§7.1) and 'Each configuration is run 10 times' (§7.2)."
    308       },
    309       "hyperparameter_search_budget": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "'We generate five variants per goal and select the most effective based on trials with a subset of models' (§7) — but the number of models in the subset and total compute for selection are not reported."
    313       },
    314       "best_config_selection_justified": {
    315         "applies": true,
    316         "answer": false,
    317         "justification": "The paper states prompt variants were selected based on 'trials with a subset of models' but does not specify which models, how many trials, or show results for non-selected variants."
    318       },
    319       "multiple_comparison_correction": {
    320         "applies": true,
    321         "answer": false,
    322         "justification": "The paper makes many comparisons across 11 models, 3 prompt types, 3 roles, and 3 attack tasks without any statistical tests or multiple comparison corrections."
    323       },
    324       "self_comparison_bias_addressed": {
    325         "applies": false,
    326         "answer": false,
    327         "justification": "The authors evaluate third-party commercial LLMs and plugins, not their own system. There is no self-comparison bias to address for the main experiments."
    328       },
    329       "compute_budget_vs_performance": {
    330         "applies": false,
    331         "answer": false,
    332         "justification": "The experiments test attack success rates, not model performance as a function of compute. Compute differences between models are not a confound for security evaluation."
    333       },
    334       "benchmark_construct_validity": {
    335         "applies": true,
    336         "answer": true,
    337         "justification": "The paper grounds experiments in real-world observations: attack tasks (SPE, TaH, ToH) are motivated by real incidents (§7.1), system prompts are derived from actual deployments (§6.1), and tool configurations mirror real plugin defaults (§6.2)."
    338       },
    339       "scaffold_confound_addressed": {
    340         "applies": false,
    341         "answer": false,
    342         "justification": "No agentic scaffolding is involved. LLMs are called directly through plugin APIs."
    343       }
    344     },
    345     "data_leakage": {
    346       "temporal_leakage_addressed": {
    347         "applies": false,
    348         "answer": false,
    349         "justification": "The study tests attack robustness, not model knowledge. Temporal leakage (model having seen test solutions during training) is not relevant to prompt injection success rates."
    350       },
    351       "feature_leakage_addressed": {
    352         "applies": false,
    353         "answer": false,
    354         "justification": "Not applicable — the study does not evaluate model capability on a knowledge benchmark where feature leakage would be meaningful."
    355       },
    356       "non_independence_addressed": {
    357         "applies": false,
    358         "answer": false,
    359         "justification": "Not applicable — no train/test split with potential non-independence issues."
    360       },
    361       "leakage_detection_method": {
    362         "applies": false,
    363         "answer": false,
    364         "justification": "Not applicable — no benchmark knowledge evaluation where leakage detection would be relevant."
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "8 of 17 plugins (used by 8,000 websites) fail to enforce integrity of conversation history in HTTP requests, enabling adversaries to forge messages in elevated roles.",
    371       "evidence": "Table 2 documents which plugins allow injection into system and assistant roles. §5.2 details the vulnerability mechanism with examples in Figure 3.",
    372       "supported": "strong"
    373     },
    374     {
    375       "claim": "Direct prompt injection into non-user roles boosts attack success by 3-8x compared to user-role-only injection.",
    376       "evidence": "Table 3 shows system/assistant role injections achieve ~60%/~30% SPE success vs ~3% for user role; TaH achieves ~98%/~59% vs ~10% for user role under insecure prompts.",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "~13% of e-commerce websites have already exposed their chatbots to third-party content via RAG ingestion of customer reviews.",
    381       "evidence": "§5.3: Manual analysis of 100 randomly selected e-commerce sites using P1; 41 allow reviews, 36 display them, 13 chatbots incorporated review content in responses.",
    382       "supported": "moderate"
    383     },
    384     {
    385       "claim": "Hardened system prompts effectively block task hijacking but offer limited protection against tool hijacking attacks.",
    386       "evidence": "Table 3: TaH drops from ~98% to ~24% (system role) with hardened-specific prompts, but ToH increases in some configurations. Tool hijacking succeeds at 20-100% even via user role for Gemini models.",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "The chatbot plugin ecosystem grew nearly 50% in 2025 alone; the number of detected sites rose from 10,417 (Aug 2024) to 17,474 (Apr 2025).",
    391       "evidence": "Table 1 and Figure 2 show longitudinal growth across Common Crawl snapshots from January 2023 to April 2025.",
    392       "supported": "strong"
    393     },
    394     {
    395       "claim": "Gemini models are the least robust against prompt injection attacks across all tested scenarios.",
    396       "evidence": "Tables 3 and 4: Gemini models show highest attack success rates, with user-role injections succeeding consistently and indirect attacks via tool role reaching up to 100% success.",
    397       "supported": "strong"
    398     },
    399     {
    400       "claim": "Tool instruction hardening reduces tool hijacking attack success rates by 40-75%.",
    401       "evidence": "§8.2 states this result from repeating §7 experiments with LLM-hardened tool instructions, but detailed results are not shown in the paper.",
    402       "supported": "moderate"
    403     }
    404   ],
    405   "red_flags": [
    406     {
    407       "flag": "No artifacts released",
    408       "detail": "Despite analyzing 10,000+ websites and running extensive controlled experiments, no code, data, website lists, or experimental logs are released. This limits reproducibility and independent verification."
    409     },
    410     {
    411       "flag": "No statistical tests",
    412       "detail": "All comparisons are based on raw counts out of 10 trials without confidence intervals, significance tests, or variance measures. With only 10 repetitions per configuration, observed differences could be due to sampling noise."
    413     },
    414     {
    415       "flag": "Prompt selection on test models",
    416       "detail": "Attack prompts were selected based on effectiveness 'with a subset of models' (§7), introducing potential selection bias. The subset of models and non-selected prompt variants are not reported."
    417     },
    418     {
    419       "flag": "Plugin anonymization limits verification",
    420       "detail": "Plugins are anonymized as P1-P17, preventing independent verification of the claimed vulnerabilities. While reasonable for responsible disclosure, it reduces transparency."
    421     }
    422   ],
    423   "cited_papers": [
    424     {
    425       "title": "Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection",
    426       "authors": ["K. Greshake", "S. Abdelnabi", "S. Mishra", "C. Endres", "T. Holz", "M. Fritz"],
    427       "year": 2023,
    428       "arxiv_id": "2302.12173",
    429       "relevance": "Foundational work on indirect prompt injection in LLM-integrated applications, directly relevant to the threat model studied."
    430     },
    431     {
    432       "title": "Formalizing and benchmarking prompt injection attacks and defenses",
    433       "authors": ["Y. Liu", "Y. Jia", "R. Geng", "J. Jia", "N. Z. Gong"],
    434       "year": 2024,
    435       "relevance": "Formalizes prompt injection attacks and evaluates defenses, providing the benchmark framework this paper builds upon."
    436     },
    437     {
    438       "title": "The instruction hierarchy: Training LLMs to prioritize privileged instructions",
    439       "authors": ["E. Wallace", "K. Xiao", "R. Leike", "L. Weng", "J. Heidecke", "A. Beutel"],
    440       "year": 2024,
    441       "arxiv_id": "2404.13208",
    442       "relevance": "Describes the instruction hierarchy defense that plugins undermine by allowing injection into privileged roles."
    443     },
    444     {
    445       "title": "Struq: Defending against prompt injection with structured queries",
    446       "authors": ["S. Chen", "J. Piet", "C. Sitawarin", "D. Wagner"],
    447       "year": 2024,
    448       "arxiv_id": "2402.06363",
    449       "relevance": "Proposes structured query defense against prompt injection; content wrapping evaluated in this paper is related."
    450     },
    451     {
    452       "title": "DataSentinel: A game-theoretic detection of prompt injection attacks",
    453       "authors": ["Y. Liu", "Y. Jia", "J. Jia", "D. Song", "N. Z. Gong"],
    454       "year": 2025,
    455       "relevance": "Game-theoretic prompt injection detection, tested as an alternative prompt type in this paper's experiments (Appendix B)."
    456     },
    457     {
    458       "title": "AgentDojo: A dynamic environment to evaluate prompt injection attacks and defenses for LLM agents",
    459       "authors": ["E. Debenedetti", "J. Zhang", "M. Balunovic", "L. Beurer-Kellner", "M. Fischer", "F. Tramèr"],
    460       "year": 2024,
    461       "relevance": "Dynamic benchmark for evaluating prompt injection in agentic settings; complementary to this paper's real-world plugin focus."
    462     },
    463     {
    464       "title": "PoisonedRAG: Knowledge corruption attacks to retrieval-augmented generation of large language models",
    465       "authors": ["W. Zou", "R. Geng", "B. Wang", "J. Jia"],
    466       "year": 2024,
    467       "arxiv_id": "2402.07867",
    468       "relevance": "Demonstrates RAG poisoning attacks that this paper shows are practically viable through plugin scraping vulnerabilities."
    469     },
    470     {
    471       "title": "A new era in LLM security: Exploring security concerns in real-world LLM-based systems",
    472       "authors": ["F. Wu", "N. Zhang", "S. Jha", "P. McDaniel", "C. Xiao"],
    473       "year": 2024,
    474       "arxiv_id": "2402.18649",
    475       "relevance": "Surveys security concerns in real-world LLM systems, providing context for the plugin vulnerability ecosystem."
    476     },
    477     {
    478       "title": "Jatmo: Prompt injection defense by task-specific finetuning",
    479       "authors": ["J. Piet", "M. Alrashed", "C. Sitawarin", "S. Chen", "Z. Wei", "E. Sun", "B. Alomair", "D. Wagner"],
    480       "year": 2024,
    481       "relevance": "Proposes task-specific finetuning as prompt injection defense; represents the class of advanced defenses the paper argues plugins won't adopt."
    482     },
    483     {
    484       "title": "Retrieval-augmented generation for knowledge-intensive NLP tasks",
    485       "authors": ["P. Lewis", "E. Perez", "A. Piktus"],
    486       "year": 2020,
    487       "relevance": "Foundational RAG paper; RAG is the primary mechanism through which plugins ingest external content, creating indirect injection risk."
    488     },
    489     {
    490       "title": "Ignore this title and HackAPrompt: Exposing systemic vulnerabilities of LLMs through a global prompt hacking competition",
    491       "authors": ["S. V. Schulhoff"],
    492       "year": 2023,
    493       "relevance": "Global competition exposing LLM prompt vulnerabilities; informs the attack strategy design in this paper."
    494     },
    495     {
    496       "title": "A closer look at system prompt robustness",
    497       "authors": ["N. Mu", "J. Lu", "M. Lavery", "D. Wagner"],
    498       "year": 2025,
    499       "arxiv_id": "2502.12197",
    500       "relevance": "Analyzes system prompt robustness with realistic prompt datasets; complements this paper's real-world system prompt characterization."
    501     }
    502   ]
    503 }

Impressum · Datenschutz