ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v4.json (34592B)


      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Formalizing and Benchmarking Prompt Injection Attacks and Defenses",
      6     "authors": [
      7       "Yupei Liu",
      8       "Yuqi Jia",
      9       "Runpeng Geng",
     10       "Jinyuan Jia",
     11       "Neil Zhenqiang Gong"
     12     ],
     13     "year": 2023,
     14     "venue": "USENIX Security Symposium",
     15     "arxiv_id": "2310.12815",
     16     "doi": null
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Abstract claims are supported: the framework is formalized (Section 4), the new Combined Attack is designed and shown effective (Tables 4–6), systematic evaluation covers 5 attacks, 10 defenses, 10 LLMs, 7 tasks (Section 6), and no existing defense is shown sufficient (Tables 7–8).",
     24         "source": "opus"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The main causal claim — that combining attack strategies improves effectiveness — is justified through controlled single-variable manipulation: all attacks are tested on the same data/models with only the attack strategy varying (Table 4, Figure 2). Ablation studies (Figures 4, 7, 8) also use controlled manipulation.",
     30         "source": "opus"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "Claims are bounded to the tested setting: '5 prompt injection attacks and 10 defenses with 10 LLMs and 7 tasks.' The paper specifies which models, datasets, and tasks were tested and does not claim results generalize beyond these. The threat model is explicitly scoped (Section 3).",
     36         "source": "opus"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper does not discuss alternative explanations for its findings. For example, it speculates that larger models are more vulnerable because they are 'more powerful in following instructions' (Section 6.2) but does not consider other factors. Section 8 discusses future work, not alternative explanations for observed results.",
     42         "source": "opus"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper's metrics directly measure what is claimed. ASV measures whether the LLM accomplishes the injected task (attack success), MR compares responses with/without attack, and FPR/FNR directly measure detection accuracy. No proxy gap exists between measurements and claims.",
     48         "source": "opus"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 8 'Discussion and Limitations' provides substantive discussion covering four specific limitations: lack of optimization-based attacks, unexplored fine-tuning defenses, absence of recovery mechanisms, and limited detection prompt exploration.",
     56         "source": "opus"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Section 8 discusses specific threats: 'All existing prompt injection attacks are limited to heuristics' (not optimization-based), known-answer detection was 'limited to a specific detection prompt,' and fine-tuned LLMs 'may still be vulnerable to new attacks that were not considered during fine-tuning.' These are specific to this study.",
     62         "source": "opus"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Section 3 explicitly scopes the attacker's background knowledge ('we assume the attacker does not know such internal details'), Section 7 distinguishes prompt injection from jailbreaking, and Section 8 states specific things not tested: optimization-based attacks, fine-tuning defenses, recovery mechanisms, and alternative detection prompts.",
     68         "source": "opus"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Acknowledgements section lists NSF grants (No. 2112562, 1937786, 2131859, 2125977, 1937787), ARO grant (No. W911NF2110182), and Microsoft Azure credits.",
     76         "source": "opus"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are clearly stated: Penn State University and Duke University. No authors are affiliated with the companies whose models are evaluated.",
     82         "source": "opus"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Primary funding is from NSF and ARO, which are government agencies independent of the evaluated products. Microsoft Azure credits provided computing resources but the paper's finding that GPT-4 is vulnerable to injection is not favorable to Microsoft.",
     88         "source": "opus"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interests statement is included in the paper.",
     94         "source": "opus"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Key terms are formally defined: 'prompt injection attack' (Definition 1), 'target task,' 'injected task,' 'LLM-Integrated Application,' 'compromised data,' and all evaluation metrics (PNA, ASV, MR, FPR, FNR) are precisely defined with formal equations.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper explicitly states three contributions in the introduction: (1) attack formalization framework with new Combined Attack, (2) systematic benchmark for attacks across 5 attacks × 10 LLMs × 7 tasks, (3) evaluation of 10 defenses with open-source platform.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 7 situates the work relative to prior prompt injection attacks, defenses, jailbreaking, privacy, and poisoning attacks. The paper explicitly shows how existing attacks fit as special cases in the framework and extends prior jailbreaking defenses to the prompt injection setting.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "The paper provides a public GitHub repository: 'we make our platform public at https://github.com/liu00222/Open-Prompt-Injection' (Section 1, abstract).",
    125           "source": "opus"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "All 7 datasets used are publicly available standard benchmarks: SST2, MRPC, HSOL, RTE, SMS Spam, Jfleg, and Gigaword. The paper specifies which splits are used for each purpose (Section 6.1, Appendix A).",
    131           "source": "opus"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "No requirements.txt, Dockerfile, or detailed environment setup is provided in the paper. The paper mentions using Azure OpenAI Studio API and specific model names but does not specify library versions or environment dependencies.",
    137           "source": "opus"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "The paper releases a GitHub platform but does not include step-by-step reproduction instructions in the paper itself. No 'Reproducing Results' section, README commands, or scripts are described in the paper text.",
    143           "source": "opus"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "All results (ASV, MR, PNA-T, FPR, FNR) in Tables 4–9 and throughout the paper are reported as point estimates without confidence intervals or error bars.",
    151           "source": "opus"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "The paper claims 'Combined Attack outperforms other attacks' and makes numerous comparisons across attacks, defenses, and models, but no statistical significance tests (p-values, t-tests, etc.) are used. All comparisons are based solely on comparing raw numbers.",
    157           "source": "opus"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Raw ASV/MR values are reported with full baseline context. For example, Table 4 shows Combined Attack (0.75) vs Naive Attack (0.62) vs Fake Completion (0.70) on GPT-4. Per-task breakdowns in Tables 5–6 and defense comparisons in Tables 7–8 provide magnitude context for all differences.",
    163           "source": "opus"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The paper uses 100 examples per task for target and injected data, and randomly samples 100 pairs for ASV/MR computation (Section 6.1), but provides no justification for why 100 was chosen and no power analysis.",
    169           "source": "opus"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "No variance, standard deviation, or spread measures are reported. For open-source LLMs, a fixed seed produces deterministic single-run results. For closed-source LLMs, temperature is set to 0.1 and the paper only notes 'non-determinism has a small impact' without quantifying it (Section 6.1).",
    175           "source": "opus"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Five attacks are compared (Naive, Escape Characters, Context Ignoring, Fake Completion, Combined Attack) and 10 defenses are benchmarked. The 'No defense' baseline is included for defense evaluation (Table 7a).",
    183           "source": "opus"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "All attacks and defenses are from 2022–2023 works (references [4,8,9,11,14,23,25,30,31,34,35,40,43,50,51]), which were contemporary at the time of publication.",
    189           "source": "opus"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "The attack comparison itself serves as an ablation of the Combined Attack: its individual components (Escape Characters, Context Ignoring, Fake Completion) are evaluated separately and then in combination (Table 4, Figure 2). Additional ablations study the impact of the number of in-context learning examples (Figure 4), the number of tokens in the injected data (Figure 7), and the number of tokens in the injected instruction (Figure 8).",
    195           "source": "opus"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Five evaluation metrics are used: PNA (reported as PNA-T and PNA-I), ASV, and MR for attacks and prevention-based defenses, and FPR and FNR for detection-based defenses (Section 6.1, Equations 2–6).",
    201           "source": "opus"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": false,
    206           "justification": "All evaluation is fully automated using accuracy, ROUGE-1, and GLEU score metrics. No human evaluation of attack success or defense quality is performed.",
    207           "source": "opus"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "The paper carefully ensures no overlap between target data, injected data, in-context learning examples, and clean data for PPL threshold selection. Appendix A details the data splitting procedure across all datasets.",
    213           "source": "opus"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Extensive per-category breakdowns are provided: per-target-task, per-injected-task, and per-LLM results in Tables 5–6, Tables 12–20, and per-defense breakdowns in Tables 21–32.",
    219           "source": "opus"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "The paper discusses where attacks fail (e.g., summarization as injected task achieves lowest MR of 0.67 in Table 6b), where defenses fail (all defenses shown insufficient in Tables 7–8), and specific failure modes of each detection method (Section 6.3).",
    225           "source": "opus"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "Key negative findings are prominent: all existing defenses are insufficient (Section 6.3), paraphrasing sacrifices utility (PNA-T decreases by 0.14, Table 7b), naive LLM-based detection has very high FPR (up to 0.93, Table 8b), and PPL detection misses nearly all attacks (FNR up to 1.00, Table 8a).",
    231           "source": "opus"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "Some models have specific versions (Vicuna-33b-v1.3, PaLM 2 text-bison-001, Llama-2-13b-chat), but the primary model GPT-4 is referenced without a snapshot date or API version. GPT-3.5-Turbo and Bard also lack version specificity (Table 3, Section 6.1).",
    239           "source": "opus"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Table 11 in Appendix provides the exact instruction prompt and injected instruction text for all 7 tasks. Table 1 shows example compromised data for each attack. The GPT-4 API message format is also specified (Section 6.1).",
    245           "source": "opus"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "The paper reports temperature=0.1 for closed-source LLMs and fixed random seed for open-source LLMs (Section 6.1), but does not report top-p, max_tokens, or other API parameters that significantly affect output.",
    251           "source": "opus"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "No agentic scaffolding is used. The paper directly queries LLMs with prompts containing instruction + data, with no tools, retry logic, or multi-step workflows.",
    257           "source": "opus"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Appendix A provides detailed data selection procedures: which dataset splits are used, how 100 examples are sampled, how label conflicts are handled when target and injected tasks are the same classification task, and how in-context examples are selected without overlap.",
    263           "source": "opus"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "The underlying benchmark datasets are publicly available, and the platform code is released, but raw experimental outputs (individual model responses, intermediate results) are not provided for independent verification.",
    271           "source": "opus"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Section 6.1 and Appendix A describe in detail which datasets are used, how examples are sampled (100 per task, uniform random without replacement), how label conflicts are resolved, and how data splits are assigned.",
    277           "source": "opus"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants. All data comes from standard public NLP benchmarks (SST2, MRPC, HSOL, RTE, SMS Spam, Jfleg, Gigaword).",
    283           "source": "opus"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The full pipeline is documented: dataset selection → example sampling (100 per task) → label conflict resolution for same-task scenarios → separate sampling for in-context examples and PPL thresholds with no-overlap guarantees → 100 random pair sampling for ASV/MR computation (Section 6.1, Appendix A).",
    289           "source": "opus"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": false,
    295           "answer": false,
    296           "justification": "The paper tests prompt injection attack/defense effectiveness, not model knowledge or capability on benchmarks. The core metric (ASV) measures whether the LLM follows injected instructions, not whether it knows correct benchmark answers.",
    297           "source": "opus"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": false,
    301           "answer": false,
    302           "justification": "The paper evaluates attacks and defenses rather than model knowledge. Whether models have seen the NLP benchmark data does not undermine the core attack effectiveness measurements.",
    303           "source": "opus"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": false,
    307           "answer": false,
    308           "justification": "Contamination is structurally less relevant here: the paper measures instruction-following behavior under adversarial conditions, not model capability on benchmark tasks.",
    309           "source": "opus"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants in this study. All experiments involve automated querying of LLMs on benchmark datasets.",
    317           "source": "opus"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants.",
    323           "source": "opus"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants.",
    329           "source": "opus"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants.",
    335           "source": "opus"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants.",
    341           "source": "opus"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants.",
    347           "source": "opus"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants.",
    353           "source": "opus"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "No API costs, tokens consumed, or wall-clock time are reported despite querying 10 LLMs across 49 task combinations, 5 attacks, and 10 defenses.",
    361           "source": "opus"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "No total computational budget, GPU hours, or API spend is reported. The paper acknowledges using Azure credits but does not quantify the compute used.",
    367           "source": "opus"
    368         }
    369       },
    370       "experimental_rigor": {
    371         "seed_sensitivity_reported": {
    372           "applies": true,
    373           "answer": false,
    374           "justification": "For open-source LLMs, a single fixed seed is used. For closed-source LLMs, temperature=0.1 is used. No sensitivity analysis across multiple seeds is performed (Section 6.1).",
    375           "source": "opus"
    376         },
    377         "number_of_runs_stated": {
    378           "applies": true,
    379           "answer": false,
    380           "justification": "The paper does not explicitly state the number of experimental runs. Fixed seed implies single deterministic runs for open-source models, but this is never explicitly stated as 'one run' or 'averaged over K runs.'",
    381           "source": "opus"
    382         },
    383         "hyperparameter_search_budget": {
    384           "applies": true,
    385           "answer": false,
    386           "justification": "No hyperparameter search is described. Attack parameters (escape characters, task-ignoring text, fake response) use fixed templates. The PPL detection threshold is set via a principled procedure, but no search budget is reported for other configuration choices.",
    387           "source": "opus"
    388         },
    389         "best_config_selection_justified": {
    390           "applies": true,
    391           "answer": true,
    392           "justification": "All five attacks are evaluated transparently without cherry-picking. Defense configurations follow original papers. The PPL detection threshold is selected via a principled FPR-based procedure on separate clean data (Section 5.2). No config selection bias is evident.",
    393           "source": "opus"
    394         },
    395         "multiple_comparison_correction": {
    396           "applies": true,
    397           "answer": false,
    398           "justification": "The paper makes hundreds of comparisons across 10 LLMs, 49 task combinations, 5 attacks, and 10 defenses, but no statistical significance tests are performed at all, let alone corrections for multiple comparisons.",
    399           "source": "opus"
    400         },
    401         "self_comparison_bias_addressed": {
    402           "applies": true,
    403           "answer": false,
    404           "justification": "The authors implement their own versions of all attacks and defenses and propose the Combined Attack that outperforms all others. They do not acknowledge the bias of evaluating their own system against their own implementations of baselines.",
    405           "source": "opus"
    406         },
    407         "compute_budget_vs_performance": {
    408           "applies": true,
    409           "answer": false,
    410           "justification": "The paper compares models ranging from 7B to 1.5T parameters and finds larger models are more vulnerable, but does not control for or report compute budget differences. No performance-vs-compute curves are provided.",
    411           "source": "opus"
    412         },
    413         "benchmark_construct_validity": {
    414           "applies": true,
    415           "answer": false,
    416           "justification": "The paper does not discuss whether ASV/MR on standard NLP tasks actually captures real-world prompt injection risk. The gap between benchmarked attack success on controlled tasks and real-world LLM application vulnerability is not addressed.",
    417           "source": "opus"
    418         },
    419         "scaffold_confound_addressed": {
    420           "applies": false,
    421           "answer": false,
    422           "justification": "No scaffolding is used. All experiments involve direct prompting of LLMs via API or local inference.",
    423           "source": "opus"
    424         }
    425       },
    426       "data_leakage": {
    427         "temporal_leakage_addressed": {
    428           "applies": true,
    429           "answer": false,
    430           "justification": "The paper tests GPT-4 and other models on datasets like SST2 (2013), MRPC (2005), and SMS Spam (2011), which are almost certainly in training data. This is not discussed. While the core attack metric (ASV) is less affected, PNA baseline performance could be artificially inflated.",
    431           "source": "opus"
    432         },
    433         "feature_leakage_addressed": {
    434           "applies": true,
    435           "answer": false,
    436           "justification": "Not discussed. The evaluation setup provides the LLM with full task instruction and data, which matches the intended use case, but potential information leakage through benchmark memorization is not considered.",
    437           "source": "opus"
    438         },
    439         "non_independence_addressed": {
    440           "applies": true,
    441           "answer": false,
    442           "justification": "Not discussed. Target data and injected data are sampled from the same datasets (though without overlap), and independence from training data is not verified.",
    443           "source": "opus"
    444         },
    445         "leakage_detection_method": {
    446           "applies": true,
    447           "answer": false,
    448           "justification": "No leakage detection or prevention methods are applied. No canary strings, membership inference tests, or decontamination pipelines are used.",
    449           "source": "opus"
    450         }
    451       }
    452     }
    453   },
    454   "claims": [
    455     {
    456       "claim": "Combined Attack consistently outperforms all other prompt injection attacks across different LLMs and tasks",
    457       "evidence": "Table 4: Combined Attack achieves the highest average ASV (0.75) vs Fake Completion (0.70), Escape Characters (0.66), Context Ignoring (0.65), and Naive Attack (0.62) on GPT-4. Table 10 confirms the same ordering on PaLM 2.",
    458       "supported": "strong"
    459     },
    460     {
    461       "claim": "No existing prevention-based defense is sufficient against prompt injection attacks",
    462       "evidence": "Table 7a shows all prevention defenses still yield high ASV/MR; Table 7b shows paraphrasing incurs average PNA-T decrease of 0.14. No defense both prevents attacks and preserves utility at acceptable levels.",
    463       "supported": "strong"
    464     },
    465     {
    466       "claim": "Known-answer detection is the most effective existing detection method but still misses large fractions of compromised data in many settings",
    467       "evidence": "Table 8a shows known-answer detection has the lowest average FNR for most tasks, but Table 9 shows high FNRs when grammar correction is the target task (e.g., 0.53 for Naive Attack, 0.76 for Context Ignoring).",
    468       "supported": "strong"
    469     },
    470     {
    471       "claim": "Larger LLMs are more vulnerable to prompt injection attacks",
    472       "evidence": "Figure 3 and the associated analysis report Pearson r=0.63 between model size and average ASV, and r=0.64 for MR. However, this is a cross-model correlation that confounds size with architecture and training.",
    473       "supported": "moderate"
    474     },
    475     {
    476       "claim": "Prompt injection attacks remain effective regardless of the number of in-context learning examples for the target task",
    477       "evidence": "Figure 4 shows Combined Attack achieves similar ASV across 0-5 in-context learning examples for most target/injected task combinations on GPT-4.",
    478       "supported": "strong"
    479     }
    480   ],
    481   "methodology_tags": [
    482     "benchmark-eval"
    483   ],
    484   "key_findings": "The paper establishes the first systematic benchmark for prompt injection attacks in LLM-integrated applications, testing 5 attacks and 10 defenses across 10 LLMs and 7 NLP tasks. The proposed Combined Attack (merging escape characters, context ignoring, and fake completion strategies) achieves the highest attack success rate (average ASV=0.75 on GPT-4) across all tested configurations. No existing defense is sufficient: prevention-based defenses either fail to stop attacks or incur unacceptable utility losses on clean data (paraphrasing degrades target task PNA by 0.14 on average), while detection-based defenses either miss most attacks or generate excessive false positives. Counterintuitively, larger LLMs appear more vulnerable to prompt injection (Pearson r=0.63), suggesting instruction-following capability is a double-edged sword.",
    485   "red_flags": [
    486     {
    487       "flag": "No statistical significance testing",
    488       "detail": "All comparative claims (Combined Attack outperforms baselines, known-answer detection best among detectors) are based on point estimates with no confidence intervals, error bars, or significance tests across any of the extensive tables."
    489     },
    490     {
    491       "flag": "Benchmark contamination unaddressed",
    492       "detail": "Evaluation uses pre-existing NLP benchmarks (SST2, SMS Spam, etc.) that were almost certainly in GPT-4's and PaLM 2's training data, potentially inflating baseline task performance metrics (PNA-I). Not discussed as a limitation."
    493     },
    494     {
    495       "flag": "GPT-4 and Bard versions unspecified",
    496       "detail": "GPT-4 and Bard lack snapshot dates; results cannot be exactly reproduced as closed-source model behavior changes across API versions."
    497     },
    498     {
    499       "flag": "Causal claim from cross-model correlation",
    500       "detail": "The finding that 'larger LLMs are more vulnerable' is based on Pearson r=0.63 across 10 models where size is confounded with architecture, instruction-tuning method, RLHF, and provider differences."
    501     },
    502     {
    503       "flag": "Sample size unjustified",
    504       "detail": "100 examples per task and 100 randomly sampled pairs for ASV/MR computation are used without power analysis or justification for statistical adequacy."
    505     }
    506   ],
    507   "cited_papers": [
    508     {
    509       "title": "Ignore Previous Prompt: Attack Techniques for Language Models",
    510       "relevance": "Early prompt injection attack (Context Ignoring strategy), treated as a special case in this paper's framework"
    511     },
    512     {
    513       "title": "Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection",
    514       "relevance": "Foundational empirical work on indirect prompt injection in real-world deployed applications"
    515     },
    516     {
    517       "title": "Baseline Defenses for Adversarial Attacks Against Aligned Language Models",
    518       "relevance": "Source of paraphrasing and retokenization defenses, originally for jailbreaking, extended to prompt injection in this paper"
    519     },
    520     {
    521       "title": "Jatmo: Prompt Injection Defense by Task-Specific Finetuning",
    522       "relevance": "Concurrent defense approach using task-specific fine-tuning, noted as future work direction not evaluated in benchmark"
    523     },
    524     {
    525       "title": "Universal and Transferable Adversarial Attacks on Aligned Language Models",
    526       "relevance": "Jailbreaking attack work that provides a contrast with the prompt injection threat model"
    527     },
    528     {
    529       "title": "GPT-4 Technical Report",
    530       "relevance": "Primary LLM evaluated and used as default model in benchmark experiments"
    531     },
    532     {
    533       "title": "LLaMA 2: Open Foundation and Fine-tuned Chat Models",
    534       "relevance": "Open-source LLMs (7B and 13B chat variants) included in the benchmark evaluation"
    535     },
    536     {
    537       "title": "Detecting Language Model Attacks with Perplexity",
    538       "relevance": "Source of PPL and Windowed PPL detection defenses evaluated and found insufficient in the paper"
    539     }
    540   ],
    541   "engagement_factors": {
    542     "practical_relevance": {
    543       "score": 3,
    544       "justification": "Releases an open-source benchmark platform (Open-Prompt-Injection) that practitioners can directly use to test their LLM applications against prompt injection attacks and evaluate defenses."
    545     },
    546     "surprise_contrarian": {
    547       "score": 1,
    548       "justification": "Confirms the widely-held belief that prompt injection is a serious threat and existing defenses are insufficient, rather than challenging conventional wisdom."
    549     },
    550     "fear_safety": {
    551       "score": 3,
    552       "justification": "Systematically demonstrates that LLM-integrated applications are vulnerable to prompt injection (OWASP #1 threat) and no existing defense is sufficient, with higher vulnerability in larger/more capable models."
    553     },
    554     "drama_conflict": {
    555       "score": 1,
    556       "justification": "Straightforward security research without controversy or claims that challenge specific companies or products."
    557     },
    558     "demo_ability": {
    559       "score": 2,
    560       "justification": "GitHub repository with the benchmark platform is publicly available for cloning and running, though not a pip-installable package or live demo."
    561     },
    562     "brand_recognition": {
    563       "score": 2,
    564       "justification": "Published at USENIX Security (top-tier security venue) and evaluates GPT-4, PaLM 2, Bard, and other well-known models. Authors from Penn State and Duke."
    565     }
    566   },
    567   "hn_data": {
    568     "threads": [
    569       {
    570         "hn_id": "42051518",
    571         "title": "Enhancing Long Context Performance in LLMs Through Inner Loop Query Mechanism",
    572         "points": 2,
    573         "comments": 0,
    574         "url": "https://news.ycombinator.com/item?id=42051518"
    575       },
    576       {
    577         "hn_id": "41894717",
    578         "title": "Decoding Emotions: Unveiling Facial Expressions Through Acoustic Sensing",
    579         "points": 2,
    580         "comments": 0,
    581         "url": "https://news.ycombinator.com/item?id=41894717"
    582       },
    583       {
    584         "hn_id": "38515649",
    585         "title": "Teaching Robots to Build Simulations of Themselves",
    586         "points": 2,
    587         "comments": 0,
    588         "url": "https://news.ycombinator.com/item?id=38515649"
    589       },
    590       {
    591         "hn_id": "47012965",
    592         "title": "Show HN: Agent Hypervisor – Reality Virtualization for AI Agents",
    593         "points": 1,
    594         "comments": 0,
    595         "url": "https://news.ycombinator.com/item?id=47012965"
    596       },
    597       {
    598         "hn_id": "37960618",
    599         "title": "Prompt Injection Attacks and Defenses in LLM-Integrated Applications",
    600         "points": 1,
    601         "comments": 0,
    602         "url": "https://news.ycombinator.com/item?id=37960618"
    603       },
    604       {
    605         "hn_id": "42044202",
    606         "title": "VibeCheck: Discover and Quantify Qualitative Differences in LLMs",
    607         "points": 1,
    608         "comments": 0,
    609         "url": "https://news.ycombinator.com/item?id=42044202"
    610       },
    611       {
    612         "hn_id": "38476635",
    613         "title": "User-Like Bots for Cognitive Automation",
    614         "points": 1,
    615         "comments": 0,
    616         "url": "https://news.ycombinator.com/item?id=38476635"
    617       },
    618       {
    619         "hn_id": "12644412",
    620         "title": "Semantic Measures Comparison Language Units, Concepts from Text and Knowledge Base",
    621         "points": 1,
    622         "comments": 0,
    623         "url": "https://news.ycombinator.com/item?id=12644412"
    624       }
    625     ],
    626     "top_points": 2,
    627     "total_points": 11,
    628     "total_comments": 0
    629   }
    630 }

Impressum · Datenschutz