scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (32632B)
      1 {
      2   "paper": {
      3     "title": "Formalizing and Benchmarking Prompt Injection Attacks and Defenses",
      4     "authors": [
      5       "Yupei Liu",
      6       "Yuqi Jia",
      7       "Runpeng Geng",
      8       "Jinyuan Jia",
      9       "Neil Zhenqiang Gong"
     10     ],
     11     "year": 2023,
     12     "venue": "USENIX Security Symposium",
     13     "arxiv_id": "2310.12815"
     14   },
     15   "scan_version": 3,
     16   "active_modules": ["experimental_rigor", "data_leakage"],
     17   "methodology_tags": ["benchmark-eval"],
     18   "key_findings": "The paper proposes the first formal framework for prompt injection attacks and shows that a Combined Attack (merging escape characters, context ignoring, and fake completion) achieves the highest average ASV of 0.75 on GPT-4, outperforming all individual attacks (0.62–0.70). Systematic evaluation of 10 defenses across 10 LLMs and 7 tasks finds no existing defense is sufficient: prevention-based defenses have limited effectiveness or incur large utility losses, and detection-based defenses either miss many attacks or have high false positive rates. Larger LLMs are more vulnerable to prompt injection (Pearson r=0.63 between model size and ASV).",
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper provides a public GitHub repository: 'we make our platform public at https://github.com/liu00222/Open-Prompt-Injection' (Section 1, abstract)."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "All 7 datasets used are publicly available standard benchmarks: SST2, MRPC, HSOL, RTE, SMS Spam, Jfleg, and Gigaword. The paper specifies which splits are used for each purpose (Section 6.1, Appendix A)."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No requirements.txt, Dockerfile, or detailed environment setup is provided in the paper. The paper mentions using Azure OpenAI Studio API and specific model names but does not specify library versions or environment dependencies."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The paper releases its platform on GitHub but does not include step-by-step reproduction instructions in the paper itself. No 'Reproducing Results' section, README commands, or scripts are described in the paper text."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "All results (ASV, MR, PNA-T, FPR, FNR) in Tables 4–9 and throughout the paper are reported as point estimates without confidence intervals or error bars."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper claims 'Combined Attack outperforms other attacks' and makes numerous comparisons across attacks, defenses, and models, but no statistical significance tests (p-values, t-tests, etc.) are used. All comparisons are based solely on comparing raw numbers."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Raw ASV/MR values are reported with full baseline context. For example, Table 4 shows Combined Attack (0.75) vs Naive Attack (0.62) vs Fake Completion (0.70) on GPT-4. Per-task breakdowns in Tables 5–6 and defense comparisons in Tables 7–8 provide magnitude context for all differences."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "The paper uses 100 examples per task for target and injected data, and randomly samples 100 pairs for ASV/MR computation (Section 6.1), but provides no justification for why 100 was chosen and no power analysis."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No variance, standard deviation, or spread measures are reported. For open-source LLMs, a fixed seed produces deterministic single-run results. For closed-source LLMs, temperature is set to 0.1 and the paper only notes 'non-determinism has a small impact' without quantifying it (Section 6.1)."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Five attacks are compared (Naive, Escape Characters, Context Ignoring, Fake Completion, Combined Attack) and 10 defenses are benchmarked. The 'No defense' baseline is included for defense evaluation (Table 7a)."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "All attacks and defenses are from 2022–2023 works (references [4,8,9,11,14,23,25,30,31,34,35,40,43,50,51]), which were contemporary at the time of publication."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The Combined Attack evaluation itself serves as an ablation: individual components (Escape Characters, Context Ignoring, Fake Completion) are evaluated separately and then in combination (Table 4, Figure 2). Additional ablations study the impact of the number of in-context learning examples (Figure 4), the number of tokens in the injected data (Figure 7), and the number of tokens in the injected instruction (Figure 8)."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Five evaluation metrics are used: PNA (reported as PNA-T and PNA-I), ASV, and MR for attacks and prevention-based defenses, and FPR and FNR for detection-based defenses (Section 6.1, Equations 2–6)."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "All evaluation is fully automated using accuracy, ROUGE-1, and GLEU score metrics. No human evaluation of attack success or defense quality is performed."
     94       },
     95       "held_out_test_set": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The paper carefully ensures no overlap between target data, injected data, in-context learning examples, and clean data for PPL threshold selection. Appendix A details the data splitting procedure across all datasets."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Extensive per-category breakdowns are provided: per-target-task, per-injected-task, and per-LLM results in Tables 5–6, Tables 12–20, and per-defense breakdowns in Tables 21–32."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper discusses where attacks fail (e.g., summarization as injected task achieves lowest MR of 0.67 in Table 6b), where defenses fail (all defenses shown insufficient in Tables 7–8), and specific failure modes of each detection method (Section 6.3)."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Key negative findings are prominent: all existing defenses are insufficient (Section 6.3), paraphrasing sacrifices utility (PNA-T decreases by 0.14, Table 7b), naive LLM-based detection has very high FPR (up to 0.93, Table 8b), and PPL detection misses nearly all attacks (FNR up to 1.00, Table 8a)."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Abstract claims are supported: the framework is formalized (Section 4), the new Combined Attack is designed and shown effective (Tables 4–6), systematic evaluation covers 5 attacks, 10 defenses, 10 LLMs, 7 tasks (Section 6), and no existing defense is shown sufficient (Tables 7–8)."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The main causal claim — that combining attack strategies improves effectiveness — is justified through controlled single-variable manipulation: all attacks are tested on the same data/models with only the attack strategy varying (Table 4, Figure 2). Ablation studies (Figures 4, 7, 8) also use controlled manipulation."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Claims are bounded to the tested setting: '5 prompt injection attacks and 10 defenses with 10 LLMs and 7 tasks.' The paper specifies which models, datasets, and tasks were tested and does not claim results generalize beyond these. The threat model is explicitly scoped (Section 3)."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper does not discuss alternative explanations for its findings. For example, it speculates that larger models are more vulnerable because they are 'more powerful in following instructions' (Section 6.2) but does not consider other factors. Section 8 discusses future work, not alternative explanations for observed results."
    136       },
    137       "proxy_outcome_distinction": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "The paper's metrics directly measure what is claimed. ASV measures whether the LLM accomplishes the injected task (attack success), MR compares responses with/without attack, and FPR/FNR directly measure detection accuracy. No proxy gap exists between measurements and claims."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "Some models have specific versions (Vicuna-33b-v1.3, PaLM 2 text-bison-001, Llama-2-13b-chat), but the primary model GPT-4 is referenced without a snapshot date or API version. GPT-3.5-Turbo and Bard also lack version specificity (Table 3, Section 6.1)."
    148       },
    149       "prompts_provided": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Table 11 in Appendix provides the exact instruction prompt and injected instruction text for all 7 tasks. Table 1 shows example compromised data for each attack. The GPT-4 API message format is also specified (Section 6.1)."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": false,
    157         "justification": "The paper reports temperature=0.1 for closed-source LLMs and fixed random seed for open-source LLMs (Section 6.1), but does not report top-p, max_tokens, or other API parameters that significantly affect output."
    158       },
    159       "scaffolding_described": {
    160         "applies": false,
    161         "answer": false,
    162         "justification": "No agentic scaffolding is used. The paper directly queries LLMs with prompts containing instruction + data, with no tools, retry logic, or multi-step workflows."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Appendix A provides detailed data selection procedures: which dataset splits are used, how 100 examples are sampled, how label conflicts are handled when target and injected tasks are the same classification task, and how in-context examples are selected without overlap."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 8 'Discussion and Limitations' provides substantive discussion covering four specific limitations: lack of optimization-based attacks, unexplored fine-tuning defenses, absence of recovery mechanisms, and limited detection prompt exploration."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 8 discusses specific threats: 'All existing prompt injection attacks are limited to heuristics' (not optimization-based), known-answer detection was 'limited to a specific detection prompt,' and fine-tuned LLMs 'may still be vulnerable to new attacks that were not considered during fine-tuning.' These are specific to this study."
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "Section 3 explicitly scopes the attacker's background knowledge ('we assume the attacker does not know such internal details'), Section 7 distinguishes prompt injection from jailbreaking, and Section 8 states specific things not tested: optimization-based attacks, fine-tuning defenses, recovery mechanisms, and alternative detection prompts."
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The underlying benchmark datasets are publicly available, and the platform code is released, but raw experimental outputs (individual model responses, intermediate results) are not provided for independent verification."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Section 6.1 and Appendix A describe in detail which datasets are used, how examples are sampled (100 per task, uniform random without replacement), how label conflicts are resolved, and how data splits are assigned."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "No human participants. All data comes from standard public NLP benchmarks (SST2, MRPC, HSOL, RTE, SMS Spam, Jfleg, Gigaword)."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "The full pipeline is documented: dataset selection → example sampling (100 per task) → label conflict resolution for same-task scenarios → separate sampling for in-context examples and PPL thresholds with no-overlap guarantees → 100 random pair sampling for ASV/MR computation (Section 6.1, Appendix A)."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Acknowledgements section lists NSF grants (No. 2112562, 1937786, 2131859, 2125977, 1937787), ARO grant (No. W911NF2110182), and Microsoft Azure credits."
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Author affiliations are clearly stated: Penn State University and Duke University. No authors are affiliated with the companies whose models are evaluated."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "Primary funding is from NSF and ARO, which are government agencies independent of the evaluated products. Microsoft Azure credits provided computing resources but the paper's finding that GPT-4 is vulnerable to injection is not favorable to Microsoft."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests or financial interests statement is included in the paper."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "The paper tests prompt injection attack/defense effectiveness, not model knowledge or capability on benchmarks. The core metric (ASV) measures whether the LLM follows injected instructions, not whether it knows correct benchmark answers."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "The paper evaluates attacks and defenses rather than model knowledge. Whether models have seen the NLP benchmark data does not undermine the core attack effectiveness measurements."
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "Contamination is structurally less relevant here: the paper measures instruction-following behavior under adversarial conditions, not model capability on benchmark tasks."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study. All experiments involve automated querying of LLMs on benchmark datasets."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "demographics_reported": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants."
    278       },
    279       "attrition_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No API costs, tokens consumed, or wall-clock time are reported despite querying 10 LLMs across 49 task combinations, 5 attacks, and 10 defenses."
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No total computational budget, GPU hours, or API spend is reported. The paper acknowledges using Azure credits but does not quantify the compute used."
    295       }
    296     },
    297     "experimental_rigor": {
    298       "seed_sensitivity_reported": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "For open-source LLMs, a single fixed seed is used. For closed-source LLMs, temperature=0.1 is used. No sensitivity analysis across multiple seeds is performed (Section 6.1)."
    302       },
    303       "number_of_runs_stated": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "The paper does not state the number of experimental runs. The fixed seed implies a single deterministic run for open-source models, but the paper never says 'one run' or 'averaged over K runs.'"
    307       },
    308       "hyperparameter_search_budget": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "No hyperparameter search is described. Attack parameters (escape characters, task-ignoring text, fake response) use fixed templates. The PPL detection threshold is set via a principled procedure, but no search budget is reported for other configuration choices."
    312       },
    313       "best_config_selection_justified": {
    314         "applies": true,
    315         "answer": true,
    316         "justification": "All five attacks are evaluated transparently without cherry-picking. Defense configurations follow original papers. The PPL detection threshold is selected via a principled FPR-based procedure on separate clean data (Section 5.2). No config selection bias is evident."
    317       },
    318       "multiple_comparison_correction": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The paper makes hundreds of comparisons across 10 LLMs, 49 task combinations, 5 attacks, and 10 defenses, but no statistical significance tests are performed at all, let alone corrections for multiple comparisons."
    322       },
    323       "self_comparison_bias_addressed": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "The authors implement their own versions of all attacks and defenses and propose the Combined Attack that outperforms all others. They do not acknowledge the bias of evaluating their own system against their own implementations of baselines."
    327       },
    328       "compute_budget_vs_performance": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The paper compares models ranging from 7B to 1.5T parameters and finds larger models are more vulnerable, but does not control for or report compute budget differences. No performance-vs-compute curves are provided."
    332       },
    333       "benchmark_construct_validity": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "The paper does not discuss whether ASV/MR on standard NLP tasks actually captures real-world prompt injection risk. The gap between benchmarked attack success on controlled tasks and real-world LLM application vulnerability is not addressed."
    337       },
    338       "scaffold_confound_addressed": {
    339         "applies": false,
    340         "answer": false,
    341         "justification": "No scaffolding is used. All experiments involve direct prompting of LLMs via API or local inference."
    342       }
    343     },
    344     "data_leakage": {
    345       "temporal_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "The paper tests GPT-4 and other models on datasets like SST2 (2013), MRPC (2005), and SMS Spam (2011), which are almost certainly in training data. This is not discussed. While the core attack metric (ASV) is less affected, PNA baseline performance could be artificially inflated."
    349       },
    350       "feature_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "Not discussed. The evaluation setup provides the LLM with full task instruction and data, which matches the intended use case, but potential information leakage through benchmark memorization is not considered."
    354       },
    355       "non_independence_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "Not discussed. Target data and injected data are sampled from the same datasets (though without overlap), and independence from training data is not verified."
    359       },
    360       "leakage_detection_method": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No leakage detection or prevention methods are applied. No canary strings, membership inference tests, or decontamination pipelines are used."
    364       }
    365     }
    366   },
    367   "claims": [
    368     {
    369       "claim": "The Combined Attack (merging escape characters, context ignoring, and fake completion) outperforms all individual prompt injection attacks, achieving an average ASV of 0.75 on GPT-4 versus 0.62–0.70 for other attacks.",
    370       "evidence": "Table 4 shows average ASVs: Naive 0.62, Escape Characters 0.66, Context Ignoring 0.65, Fake Completion 0.70, Combined 0.75. Figure 2 shows per-task breakdowns confirming Combined Attack leads in nearly all task combinations.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "No existing prevention-based defense is sufficient: they either have limited attack prevention or incur large utility losses.",
    375       "evidence": "Table 7a shows ASV/MR remain high under all defenses (e.g., sandwich prevention: avg ASV still 0.26–0.70 depending on task). Table 7b shows paraphrasing decreases PNA-T by 0.14 on average; delimiters decrease it by 0.08. Grammar correction PNA-T drops to 0.00–0.01 under paraphrasing and delimiters.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "No existing detection-based defense is sufficient: they either miss a large fraction of attacks or have high false positive rates.",
    380       "evidence": "Table 8a shows PPL detection has FNR up to 1.00; windowed PPL detection up to 0.99. Naive LLM-based detection achieves FNR=0.00 but FPR up to 0.93 (Table 8b). Known-answer detection is best but still has FNR up to 0.12 for grammar correction (Table 8a) and misses many attacks in specific scenarios (Table 9).",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "Larger LLMs are more vulnerable to prompt injection attacks, with Pearson correlation of 0.63 between model size and average ASV.",
    385       "evidence": "Figure 3 shows ASV and MR for each LLM ordered by model size. GPT-4 (1.5T) achieves highest ASV, while smaller models (7B) achieve lower ASV. Pearson correlation between ASV and model size is 0.63, MR and model size is 0.64 (Section 6.2).",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Adding in-context learning examples for the target task has a small impact on Combined Attack effectiveness.",
    390       "evidence": "Figure 4 shows ASV remains similar across 0–5 in-context learning examples for all target/injected task combinations on GPT-4 (Section 6.2).",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "Combined Attack is consistently effective across different LLMs, target tasks, and injected tasks, with average ASV=0.62 and MR=0.78 across all 10 LLMs and 49 task combinations.",
    395       "evidence": "Tables 5, 12–20 show detailed results per LLM. Table 6a shows similar ASV across target tasks (0.59–0.64). Table 6b shows variation by injected task (0.34–0.89). Section 6.2 reports the overall averages.",
    396       "supported": "strong"
    397     }
    398   ],
    399   "red_flags": [
    400     {
    401       "flag": "No error bars or uncertainty quantification",
    402       "detail": "All results across all tables and figures are single-run point estimates. For open-source LLMs, a single fixed seed is used; for closed-source, temperature=0.1 with no quantification of the 'small impact' of non-determinism. With 100 randomly sampled pairs per evaluation, sampling variance is not characterized."
    403     },
    404     {
    405       "flag": "No statistical significance tests",
    406       "detail": "Hundreds of comparisons are made across attacks, defenses, models, and tasks, but no significance tests are performed. Claims like 'Combined Attack outperforms other attacks' rely solely on comparing raw numbers."
    407     },
    408     {
    409       "flag": "Self-comparison bias",
    410       "detail": "The authors propose the Combined Attack and implement all baseline attacks and defenses themselves. The Combined Attack, which is their contribution, outperforms all others. They do not acknowledge the potential bias of evaluating their own system against their own implementations of prior work."
    411     },
    412     {
    413       "flag": "Benchmark construct validity not discussed",
    414       "detail": "ASV measured on controlled NLP tasks (sentiment analysis, spam detection, etc.) may not reflect real-world prompt injection risk in deployed LLM applications with more complex prompts, multi-turn interactions, and diverse data types."
    415     }
    416   ],
    417   "cited_papers": [
    418     {
    419       "title": "Not what you've signed up for: Compromising real-world llm-integrated applications with indirect prompt injection",
    420       "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra", "Christoph Endres", "Thorsten Holz", "Mario Fritz"],
    421       "year": 2023,
    422       "relevance": "Foundational paper on indirect prompt injection attacks against real-world LLM-integrated applications, directly related to the threat model formalized in this work."
    423     },
    424     {
    425       "title": "Ignore previous prompt: Attack techniques for language models",
    426       "authors": ["Fábio Perez", "Ian Ribeiro"],
    427       "year": 2022,
    428       "relevance": "Early work on context ignoring attacks against language models, one of the key attack strategies formalized and benchmarked in this paper."
    429     },
    430     {
    431       "title": "Baseline defenses for adversarial attacks against aligned language models",
    432       "authors": ["Neel Jain", "Avi Schwarzschild", "Yuxin Wen", "Gowthami Somepalli", "John Kirchenbauer"],
    433       "year": 2023,
    434       "relevance": "Proposes paraphrasing and retokenization defenses originally for jailbreaking, extended to prompt injection defense in this benchmark."
    435     },
    436     {
    437       "title": "Jatmo: Prompt injection defense by task-specific finetuning",
    438       "authors": ["Julien Piet", "Maha Alrashed", "Chawin Sitawarin", "Sizhe Chen", "Zeming Wei", "Elizabeth Sun", "Basel Alomair", "David Wagner"],
    439       "year": 2024,
    440       "relevance": "Proposes fine-tuning a non-instruction-tuned LLM as defense against prompt injection, a concurrent defense approach noted in the related work."
    441     },
    442     {
    443       "title": "Benchmarking and defending against indirect prompt injection attacks on large language models",
    444       "authors": ["Jingwei Yi", "Yueqi Xie", "Bin Zhu", "Keegan Hines", "Emre Kiciman", "Guangzhong Sun", "Xing Xie", "Fangzhao Wu"],
    445       "year": 2023,
    446       "relevance": "Concurrent work on benchmarking indirect prompt injection attacks and defenses, complementary to this paper's formalization and evaluation framework."
    447     },
    448     {
    449       "title": "Jailbroken: How does llm safety training fail?",
    450       "authors": ["Alexander Wei", "Nika Haghtalab", "Jacob Steinhardt"],
    451       "year": 2023,
    452       "relevance": "Analyzes how LLM safety training fails against jailbreaking attacks, distinct from but related to prompt injection attacks studied here."
    453     },
    454     {
    455       "title": "Universal and transferable adversarial attacks on aligned language models",
    456       "authors": ["Andy Zou", "Zifan Wang", "J. Zico Kolter", "Matt Fredrikson"],
    457       "year": 2023,
    458       "relevance": "Proposes optimization-based adversarial attacks on LLMs that transfer across models, related to the future work on optimization-based prompt injection discussed in this paper."
    459     },
    460     {
    461       "title": "StruQ: Defending against prompt injection with structured queries",
    462       "authors": ["Sizhe Chen", "Julien Piet", "Chawin Sitawarin", "David Wagner"],
    463       "year": 2024,
    464       "relevance": "Proposes structured query defense against prompt injection, a concurrent defense approach noted in the related work."
    465     },
    466     {
    467       "title": "Prompt injection attack against LLM-integrated applications",
    468       "authors": ["Yi Liu", "Gelei Deng", "Yuekang Li", "Kailong Wang", "Tianwei Zhang"],
    469       "year": 2023,
    470       "relevance": "Studies prompt injection attacks from the perspective of malicious users against LLM-integrated applications, complementary attack scenario to the one formalized here."
    471     },
    472     {
    473       "title": "Extracting training data from large language models",
    474       "authors": ["Nicholas Carlini", "Florian Tramèr", "Eric Wallace"],
    475       "year": 2021,
    476       "relevance": "Demonstrates privacy attacks on LLMs via training data extraction, part of the broader LLM security landscape contextualizing prompt injection."
    477     },
    478     {
    479       "title": "PromptBench: Towards evaluating the robustness of large language models on adversarial prompts",
    480       "authors": ["Kaijie Zhu", "Jindong Wang", "Jiaheng Zhou", "Zichen Wang"],
    481       "year": 2023,
    482       "relevance": "Benchmarks LLM robustness against adversarial prompts (distinct from prompt injection), complementary evaluation framework for LLM security."
    483     },
    484     {
    485       "title": "Evaluating the susceptibility of pre-trained language models via handcrafted adversarial examples",
    486       "authors": ["Hezekiah J. Branch", "Jonathan Rodriguez Cefalu", "Jeremy McHugh"],
    487       "year": 2022,
    488       "relevance": "Early evaluation of language model susceptibility to handcrafted adversarial inputs; its handcrafted attacks are among the attack approaches formalized in this benchmark."
    489     }
    490   ],
    491   "engagement_factors": {
    492     "practical_relevance": {
    493       "score": 3,
    494       "justification": "Releases an open-source benchmark platform (Open-Prompt-Injection) that practitioners can directly use to test their LLM applications against prompt injection attacks and evaluate defenses."
    495     },
    496     "surprise_contrarian": {
    497       "score": 1,
    498       "justification": "Confirms the widely held belief that prompt injection is a serious threat and existing defenses are insufficient, rather than challenging conventional wisdom."
    499     },
    500     "fear_safety": {
    501       "score": 3,
    502       "justification": "Systematically demonstrates that LLM-integrated applications are vulnerable to prompt injection (OWASP #1 threat) and no existing defense is sufficient, with higher vulnerability in larger/more capable models."
    503     },
    504     "drama_conflict": {
    505       "score": 1,
    506       "justification": "Straightforward security research without controversy or claims that challenge specific companies or products."
    507     },
    508     "demo_ability": {
    509       "score": 2,
    510       "justification": "GitHub repository with the benchmark platform is publicly available for cloning and running, though not a pip-installable package or live demo."
    511     },
    512     "brand_recognition": {
    513       "score": 2,
    514       "justification": "Published at USENIX Security (top-tier security venue) and evaluates GPT-4, PaLM 2, Bard, and other well-known models. Authors from Penn State and Duke."
    515     }
    516   }
    517 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs