ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (28068B)


      1 {
      2   "paper": {
      3     "title": "Defense against Prompt Injection Attacks via Mixture of Encodings",
      4     "authors": [
      5       "Ruiyi Zhang",
      6       "David Sullivan",
      7       "Kyle Jackson",
      8       "Pengtao Xie",
      9       "Mei Chen"
     10     ],
     11     "year": 2025,
     12     "venue": "North American Chapter of the Association for Computational Linguistics",
     13     "arxiv_id": "2504.07467",
     14     "doi": "10.48550/arXiv.2504.07467"
     15   },
     16   "scan_version": 3,
     17   "active_modules": [
     18     "experimental_rigor",
     19     "data_leakage"
     20   ],
     21   "methodology_tags": [
     22     "benchmark-eval"
     23   ],
     24   "key_findings": "The mixture of encodings defense (combining plaintext, Base64, and Caesar cipher encodings) achieves among the lowest prompt injection attack success rates while maintaining high NLP task performance, unlike single-encoding defenses which degrade helpfulness. On GPT-4o, the method achieves 0-1.5% ASR across all attack datasets while retaining near-baseline NLP performance. The 3.46x inference cost overhead is the main trade-off. Results generalize to Qwen-2.5-72B-Instruct.",
     25   "checklist": {
     26     "artifacts": {
     27       "code_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "GitHub repository provided: https://github.com/ruz048/MoEMEnT (stated in Section 1)."
     31       },
     32       "data_released": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "All datasets used are publicly available standard benchmarks: BIPIA, MMLU, SQuAD, Hellaswag, MGSM, SamSum, WMT, IMDB, WildGuard, WebQ. Sources cited in Appendix F."
     36       },
     37       "environment_specified": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No environment specifications, requirements.txt, or dependency details are provided in the paper."
     41       },
     42       "reproduction_instructions": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "No step-by-step reproduction instructions are provided in the paper. Only a GitHub link is given without description of how to run experiments."
     46       }
     47     },
     48     "statistical_methodology": {
     49       "confidence_intervals_or_error_bars": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "Tables 1, 2, 5, and 6 report only point estimates with no confidence intervals or error bars."
     53       },
     54       "significance_tests": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "Claims of 'outperforms' are made by comparing raw numbers in tables. No statistical significance tests (p-values, t-tests, etc.) are used."
     58       },
     59       "effect_sizes_reported": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "Raw metric values are shown in tables, but the paper never reports effect sizes or quantifies the magnitude of improvement. Claims like 'significantly outperforms' lack quantification."
     63       },
     64       "sample_size_justified": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "Dataset sizes are listed in tables but never justified. For the Qwen experiments, 3,000 samples were 'randomly selected' from each dataset with no rationale for this number."
     68       },
     69       "variance_reported": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "No standard deviations, variance, or multi-run results are reported anywhere in the paper. All results appear to be single-run."
     73       }
     74     },
     75     "evaluation_design": {
     76       "baselines_included": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Multiple baselines compared: No Defense, Datamark, Ignoring, Base64, and Caesar cipher (Tables 1-2, described in Appendix E)."
     80       },
     81       "baselines_contemporary": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Baselines include Hines et al. 2024 (spotlighting/Base64), Yi et al. 2023 (BIPIA/Datamark/Ignoring), and Liu et al. 2024b. These are recent and relevant to the prompt injection defense field."
     85       },
     86       "ablation_study": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "The three individual components (plaintext/No Defense, Base64, Caesar) are each evaluated separately as baselines, and the full mixture is compared against them. Appendix B also reports preliminary experiments with alternative encodings (Atbash, ASCII, Morse, Base32, Base58) and why they were rejected."
     90       },
     91       "multiple_metrics": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Two distinct evaluation dimensions: attack success rate (safety) and NLP task performance (helpfulness) across 9 different tasks with different metrics (accuracy, F1, BLEU, ROUGE, etc.)."
     95       },
     96       "human_evaluation": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "No human evaluation is included. All evaluation is automated via benchmark metrics. Human evaluation of output quality or attack detection would be relevant but is not performed."
    100       },
    101       "held_out_test_set": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The paper states they use 'validation or test splits' from 9 datasets for the helpfulness benchmark (Section 5.1)."
    105       },
    106       "per_category_breakdown": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Results broken down per-dataset for safety (Email, Table, Abstract, Code in Table 1) and per-task for helpfulness (9 NLP tasks in Table 2)."
    110       },
    111       "failure_cases_discussed": {
    112         "applies": true,
    113         "answer": false,
    114         "justification": "Base64 failure modes are discussed (Figure 3b, Appendix A), but the authors do not discuss where their own mixture-of-encodings method fails. Table 2 shows degradation on some tasks (e.g., WebQ drops from 29.7 to 25.3 for GPT-4o) but this is not discussed."
    115       },
    116       "negative_results_reported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Appendix B reports negative results: Atbash achieved only 1.6 BLEU on WMT and 3.5% on MGSM; Base32, Base58, ASCII, and Morse code all had specific weaknesses. These are encodings tried and abandoned."
    120       }
    121     },
    122     "claims_and_evidence": {
    123       "abstract_claims_supported": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "Abstract claims of 'one of the lowest attack success rates' and 'high performance across all NLP tasks, outperforming existing character encoding-based defense methods' are supported by Tables 1 and 2. The hedging with 'one of' is appropriate since Caesar alone sometimes beats it on GPT-4."
    127       },
    128       "causal_claims_justified": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "The causal claim is that the mixture-of-encodings approach causes improved safety-helpfulness trade-off. The controlled experimental design (same models, same data, varying only the defense method) is adequate for this type of claim."
    132       },
    133       "generalization_bounded": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "The abstract claims the method works 'across all NLP tasks' but only 9 specific tasks are tested. The title and abstract are general ('prompt injection attacks') without bounding claims to the specific models (GPT-4, GPT-4o, Qwen) or attack types (50 from BIPIA) tested."
    137       },
    138       "alternative_explanations_discussed": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "No alternative explanations for the results are discussed. The paper does not consider why the mixture strategy works beyond the intuition that different encodings create disagreement. No confounds are addressed."
    142       },
    143       "proxy_outcome_distinction": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "The paper measures attack success rate and NLP task accuracy, and frames these as safety and helpfulness respectively. The measurements match the granularity of the claims without proxy gaps."
    147       }
    148     },
    149     "setup_transparency": {
    150       "model_versions_specified": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Exact model versions specified in Section 5.2: 'GPT-4 (turbo-2024-04-09)', 'GPT-4o (2024-05-13)', and 'Qwen-2.5-72B-Instruct'."
    154       },
    155       "prompts_provided": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Meta-prompts MP1 and MP2 are provided in Table 3 (Appendix D) with actual prompt text. The prompt structure P1-P4 is formalized in Sections 3-4."
    159       },
    160       "hyperparameters_reported": {
    161         "applies": true,
    162         "answer": false,
    163         "justification": "Caesar cipher shift of 3 is stated, but LLM API parameters (temperature, top-p, max tokens) are not reported anywhere in the paper."
    164       },
    165       "scaffolding_described": {
    166         "applies": false,
    167         "answer": false,
    168         "justification": "No agentic scaffolding is used. The method is a prompt-level encoding and aggregation strategy."
    169       },
    170       "data_preprocessing_documented": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "The paper lists datasets and sizes but does not describe how external text was preprocessed before encoding, how attack injections were inserted into external data, or filtering steps. For Qwen, only 'randomly selecting 3,000 samples' is stated without selection criteria."
    174       }
    175     },
    176     "limitations_and_scope": {
    177       "limitations_section_present": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Section 7 is titled 'Limitation' and discusses computational overhead (3.46x inference cost), noting the method is 'less suitable for time-sensitive applications.' Appendix H provides detailed cost comparison."
    181       },
    182       "threats_to_validity_specific": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "The limitation about inference cost is specific to this method: 'additional computational overhead introduced by processing multiple input prompts' with a concrete cost comparison in Table 4 (3.46x). They also note parallelization as mitigation."
    186       },
    187       "scope_boundaries_stated": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "No explicit statements about what the results do NOT show. The paper does not bound its claims to specific model families, attack types, or settings. No discussion of settings where the method might not work beyond time-sensitive applications."
    191       }
    192     },
    193     "data_integrity": {
    194       "raw_data_available": {
    195         "applies": true,
    196         "answer": false,
    197         "justification": "Raw experimental outputs (model responses, per-example results) are not available. Only aggregate results are reported in tables."
    198       },
    199       "data_collection_described": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "Data sources are clearly described: BIPIA benchmark with 50 attack types (30 text-based, 20 code-based) across 4 datasets (Appendix F). Helpfulness benchmark uses 9 public datasets with sizes stated."
    203       },
    204       "recruitment_methods_described": {
    205         "applies": false,
    206         "answer": false,
    207         "justification": "No human participants. All data comes from standard public benchmarks."
    208       },
    209       "data_pipeline_documented": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "The pipeline from raw datasets to final results is not fully documented. How attack prompts are constructed from BIPIA, how responses are classified as successful attacks, and evaluation procedures are not detailed."
    213       }
    214     },
    215     "conflicts_of_interest": {
    216       "funding_disclosed": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "No funding disclosure or acknowledgments section is present. A footnote notes that the work was done as Ruiyi's internship project at Microsoft, but there is no formal funding statement."
    220       },
    221       "affiliations_disclosed": {
    222         "applies": true,
    223         "answer": true,
    224         "justification": "Author affiliations clearly listed: UC San Diego and Microsoft. Footnote explicitly states 'This work was done as Ruiyi's internship project at Microsoft.'"
    225       },
    226       "funder_independent_of_outcome": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "The work was done at Microsoft (internship), and Microsoft has a commercial interest in LLM safety solutions and is a major investor in OpenAI whose models are primarily evaluated. The funder is not independent of the outcome."
    230       },
    231       "financial_interests_declared": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No competing interests statement or financial disclosures are present in the paper."
    235       }
    236     },
    237     "contamination": {
    238       "training_cutoff_stated": {
    239         "applies": true,
    240         "answer": false,
    241         "justification": "Training cutoff dates for GPT-4, GPT-4o, and Qwen-2.5-72B are not stated, despite evaluating on benchmarks like MMLU (2021) and SQuAD (2016) that predate these models."
    242       },
    243       "train_test_overlap_discussed": {
    244         "applies": true,
    245         "answer": false,
    246         "justification": "No discussion of whether benchmark examples appeared in model training data. This is relevant because contamination could differentially affect plaintext vs. encoded conditions."
    247       },
    248       "benchmark_contamination_addressed": {
    249         "applies": true,
    250         "answer": false,
    251         "justification": "MMLU (2021), SQuAD (2016), Hellaswag (2019), and other benchmarks were published well before GPT-4/GPT-4o training. Contamination could inflate no-defense scores relative to encoded conditions (where memorized answers may not help), but this is not discussed."
    252       }
    253     },
    254     "human_studies": {
    255       "pre_registered": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in this study."
    259       },
    260       "irb_or_ethics_approval": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in this study."
    264       },
    265       "demographics_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in this study."
    269       },
    270       "inclusion_exclusion_criteria": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in this study."
    274       },
    275       "randomization_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in this study."
    279       },
    280       "blinding_described": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants in this study."
    284       },
    285       "attrition_reported": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "No human participants in this study."
    289       }
    290     },
    291     "cost_and_practicality": {
    292       "inference_cost_reported": {
    293         "applies": true,
    294         "answer": true,
    295         "justification": "Table 4 (Appendix H) reports relative inference costs: their method is 3.46x the baseline. Cost formula explained as '(output tokens × 4 + input tokens)' following LLM API pricing conventions."
    296       },
    297       "compute_budget_stated": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "No total API spend, GPU hours, or absolute computational budget is stated. Only relative cost multipliers are provided."
    301       }
    302     },
    303     "experimental_rigor": {
    304       "seed_sensitivity_reported": {
    305         "applies": true,
    306         "answer": false,
    307         "justification": "No mention of random seeds or seed sensitivity. Results appear to be from single runs."
    308       },
    309       "number_of_runs_stated": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "The number of experimental runs is never stated. It is unclear whether results are from single runs or averaged over multiple runs."
    313       },
    314       "hyperparameter_search_budget": {
    315         "applies": true,
    316         "answer": false,
    317         "justification": "No hyperparameter search budget reported. The Caesar shift of 3 is used without justification for this choice. Alternative encodings were explored (Appendix B) but no systematic search budget is described."
    318       },
    319       "best_config_selection_justified": {
    320         "applies": true,
    321         "answer": true,
    322         "justification": "Appendix B provides explicit justification for selecting Base64 and Caesar over alternatives (Atbash, ASCII, Morse, Base32, Base58), with specific performance numbers showing why alternatives were rejected."
    323       },
    324       "multiple_comparison_correction": {
    325         "applies": false,
    326         "answer": false,
    327         "justification": "No statistical tests are performed at all, so multiple comparison correction is not applicable."
    328       },
    329       "self_comparison_bias_addressed": {
    330         "applies": true,
    331         "answer": false,
    332         "justification": "The authors implement and evaluate their own method and baselines without acknowledging self-evaluation bias. No independent evaluation or discussion of this potential bias."
    333       },
    334       "compute_budget_vs_performance": {
    335         "applies": true,
    336         "answer": true,
    337         "justification": "Table 4 reports relative inference costs (3.46x for their method), and the limitations section discusses the cost-performance trade-off explicitly: 'the significant performance gain of our method justifies this trade-off.'"
    338       },
    339       "benchmark_construct_validity": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "No discussion of whether BIPIA actually measures real-world prompt injection defense effectiveness, or whether the 9 NLP benchmarks adequately represent 'helpfulness.' The validity of ASR as a security metric is not questioned."
    343       },
    344       "scaffold_confound_addressed": {
    345         "applies": false,
    346         "answer": false,
    347         "justification": "No scaffolding is involved. The method is a prompt-level encoding strategy."
    348       }
    349     },
    350     "data_leakage": {
    351       "temporal_leakage_addressed": {
    352         "applies": true,
    353         "answer": false,
    354         "justification": "Not discussed. GPT-4 and GPT-4o were trained after MMLU (2021), SQuAD (2016), Hellaswag (2019), and BIPIA (2023) were published, creating temporal leakage risk."
    355       },
    356       "feature_leakage_addressed": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "Not discussed. The encoding itself could leak information (e.g., the no-defense condition may benefit from memorized benchmark answers that are unavailable in encoded conditions)."
    360       },
    361       "non_independence_addressed": {
    362         "applies": true,
    363         "answer": false,
    364         "justification": "Not discussed. No analysis of whether BIPIA attack patterns overlap with model training data."
    365       },
    366       "leakage_detection_method": {
    367         "applies": true,
    368         "answer": false,
    369         "justification": "No leakage detection or prevention method is applied."
    370       }
    371     }
    372   },
    373   "claims": [
    374     {
    375       "claim": "The mixture of encodings method achieves one of the lowest attack success rates under prompt injection attacks.",
    376       "evidence": "Table 1 shows ASR of 1.20-6.79% on GPT-4 and 0-1.50% on GPT-4o across 4 datasets, competitive with or better than Base64 and Caesar individually.",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "The method maintains high NLP task performance, outperforming existing character encoding-based defense methods on helpfulness.",
    381       "evidence": "Table 2 shows the method significantly outperforms Base64 and Caesar on 9 NLP tasks. E.g., GPT-4 MMLU: 77.2% (ours) vs 44.6% (Base64) vs 63.1% (Caesar). MGSM: 36.8% vs 19.1% vs 7.3%.",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "The method reaches comparable performance to LLMs without any defense mechanism on helpfulness benchmarks.",
    386       "evidence": "Table 2 shows close performance to no-defense on most tasks (e.g., GPT-4 MMLU: 77.2 vs 83.0, MGSM: 36.8 vs 38.6), but noticeable drops remain on some tasks.",
    387       "supported": "moderate"
    388     },
    389     {
    390       "claim": "The approach generalizes to open-source models.",
    391       "evidence": "Tables 5 and 6 show results on Qwen-2.5-72B-Instruct on 3 safety datasets and 3 NLP tasks, showing the same pattern of improved safety-helpfulness trade-off.",
    392       "supported": "moderate"
    393     },
    394     {
    395       "claim": "The inference cost overhead (3.46x) is justified by performance gains.",
    396       "evidence": "Table 4 shows cost comparison; Tables 1-2 show performance gains. The limitations section discusses this trade-off and notes parallel processing as mitigation.",
    397       "supported": "weak"
    398     }
    399   ],
    400   "red_flags": [
    401     {
    402       "flag": "No error bars or variance",
    403       "detail": "All results appear to be from single runs with no uncertainty quantification. Tables 1, 2, 5, and 6 report only point estimates. With no variance information, it is impossible to assess whether observed differences are meaningful or within noise."
    404     },
    405     {
    406       "flag": "No statistical tests",
    407       "detail": "Claims of 'outperforms' and 'significantly outperforms' are based solely on comparing raw numbers without any statistical significance testing. With single-run results and many comparisons, some apparent differences could be noise."
    408     },
    409     {
    410       "flag": "Undisclosed conflicts of interest",
    411       "detail": "The work was done as a Microsoft internship. Microsoft has commercial interest in LLM safety solutions and is a major investor in OpenAI (whose models are primarily evaluated). No COI statement or funding disclosure is provided."
    412     },
    413     {
    414       "flag": "Benchmark contamination ignored",
    415       "detail": "Helpfulness benchmarks (MMLU 2021, SQuAD 2016, Hellaswag 2019) predate GPT-4/GPT-4o training. Contamination could differentially affect conditions: the no-defense baseline could benefit from memorized answers while encoded conditions cannot, systematically overstating the helpfulness cost of encoding."
    416     },
    417     {
    418       "flag": "Own method's failures not discussed",
    419       "detail": "Table 2 shows degradation on multiple tasks (e.g., GPT-4o WebQ: 25.3 vs 29.7 no-defense, GPT-4 MMLU: 77.2 vs 83.0) but these are not acknowledged or discussed as failure modes."
    420     }
    421   ],
    422   "cited_papers": [
    423     {
    424       "title": "GPT-4 Technical Report",
    425       "authors": [
    426         "Josh Achiam",
    427         "Steven Adler",
    428         "Barret Zoph"
    429       ],
    430       "year": 2023,
    431       "relevance": "Foundational LLM evaluated in the experiments; important context for LLM capability and safety evaluation."
    432     },
    433     {
    434       "title": "Formalizing and Benchmarking Prompt Injection Attacks and Defenses",
    435       "authors": [
    436         "Yupei Liu",
    437         "Yuqi Jia",
    438         "Runpeng Geng",
    439         "Jinyuan Jia",
    440         "Neil Zhenqiang Gong"
    441       ],
    442       "year": 2024,
    443       "relevance": "Formalizes prompt injection attacks and defenses, providing the framework this paper builds on."
    444     },
    445     {
    446       "title": "Defending against Indirect Prompt Injection Attacks with Spotlighting",
    447       "authors": [
    448         "Keegan Hines",
    449         "Gary Lopez",
    450         "Matthew Hall",
    451         "Federico Zarfati",
    452         "Yonatan Zunger",
    453         "Emre Kiciman"
    454       ],
    455       "year": 2024,
    456       "arxiv_id": "2403.14720",
    457       "relevance": "Proposes Base64 defense (spotlighting) which is the primary baseline this paper extends."
    458     },
    459     {
    460       "title": "Benchmarking and Defending against Indirect Prompt Injection Attacks on Large Language Models",
    461       "authors": [
    462         "Jingwei Yi",
    463         "Yueqi Xie",
    464         "Bin Zhu"
    465       ],
    466       "year": 2023,
    467       "relevance": "Creates the BIPIA benchmark used for safety evaluation and proposes datamark/ignoring defenses used as baselines."
    468     },
    469     {
    470       "title": "The Instruction Hierarchy: Training LLMs to Prioritize Privileged Instructions",
    471       "authors": [
    472         "Eric Wallace",
    473         "Kai Xiao",
    474         "Reimar H. Leike",
    475         "Lilian Weng",
    476         "Johannes Heidecke",
    477         "Alex Beutel"
    478       ],
    479       "year": 2024,
    480       "relevance": "Proposes training-based defense against prompt injection via instruction hierarchy, complementary approach to encoding-based defenses."
    481     },
    482     {
    483       "title": "Tensor Trust: Interpretable Prompt Injection Attacks from an Online Game",
    484       "authors": [
    485         "Sam Toyer",
    486         "Olivia Watkins",
    487         "Ethan Adrian Mendes"
    488       ],
    489       "year": 2024,
    490       "relevance": "Introduces prompt injection attack methods and a dataset from adversarial human interaction, relevant to LLM safety evaluation."
    491     },
    492     {
    493       "title": "Baseline Defenses for Adversarial Attacks against Aligned Language Models",
    494       "authors": [
    495         "Neel Jain",
    496         "Avi Schwarzschild",
    497         "Yuxin Wen"
    498       ],
    499       "year": 2024,
    500       "relevance": "Proposes baseline defense methods against adversarial attacks on LLMs, directly relevant to the prompt injection defense landscape."
    501     },
    502     {
    503       "title": "Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection",
    504       "authors": [
    505         "Kai Greshake",
    506         "Sahar Abdelnabi",
    507         "Shailesh Mishra"
    508       ],
    509       "year": 2023,
    510       "relevance": "Demonstrates real-world prompt injection vulnerabilities in LLM-integrated applications, establishing the threat model this paper defends against."
    511     },
    512     {
    513       "title": "Jailbroken: How Does LLM Safety Training Fail?",
    514       "authors": [
    515         "Alexander Wei",
    516         "Nika Haghtalab",
    517         "Jacob Steinhardt"
    518       ],
    519       "year": 2023,
    520       "relevance": "Analyzes LLM safety training failures including understanding of encoded text, relevant to the encoding-based defense approach."
    521     },
    522     {
    523       "title": "GPT-4 is Too Smart to be Safe: Stealthy Chat with LLMs via Cipher",
    524       "authors": [
    525         "Youliang Yuan",
    526         "Wenxiang Jiao",
    527         "Wenxuan Wang"
    528       ],
    529       "year": 2024,
    530       "relevance": "Demonstrates LLM understanding of ciphers including Caesar, directly motivating the use of Caesar cipher as an encoding in this defense."
    531     },
    532     {
    533       "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks",
    534       "authors": [
    535         "Patrick Lewis",
    536         "Ethan Perez",
    537         "Aleksandara Piktus"
    538       ],
    539       "year": 2020,
    540       "arxiv_id": "2005.11401",
    541       "relevance": "Foundational RAG paper establishing the paradigm of LLMs accessing external content, which creates the vulnerability prompt injection exploits."
    542     }
    543   ],
    544   "engagement_factors": {
    545     "practical_relevance": {
    546       "score": 2,
    547       "justification": "Proposes a usable defense technique against prompt injection with open-source code, applicable to anyone building LLM-powered apps with external data."
    548     },
    549     "surprise_contrarian": {
    550       "score": 1,
    551       "justification": "The ensemble-of-encodings idea is novel but the finding that combining defenses improves robustness is not particularly surprising."
    552     },
    553     "fear_safety": {
    554       "score": 2,
    555       "justification": "Prompt injection is a major security concern for deployed LLM applications, and the paper demonstrates concrete attack/defense scenarios."
    556     },
    557     "drama_conflict": {
    558       "score": 0,
    559       "justification": "No controversy, no challenge to specific companies or claims — straightforwardly proposes an improvement over existing defenses."
    560     },
    561     "demo_ability": {
    562       "score": 2,
    563       "justification": "Code is publicly available on GitHub (MoEMEnT) and the technique can be reproduced with API access, though it requires benchmark setup."
    564     },
    565     "brand_recognition": {
    566       "score": 1,
    567       "justification": "Microsoft internship project evaluating on GPT-4/GPT-4o, but the authors and lab are not widely known and the venue is academic NLP."
    568     }
    569   }
    570 }

Impressum · Datenschutz