scan.json (30334B)
1 { 2 "paper": { 3 "title": "Separator Injection Attack: Uncovering Dialogue Biases in Large Language Models Caused by Role Separators", 4 "authors": [ 5 "Xitao Li", 6 "Haijun Wang", 7 "Jiang Wu", 8 "Ting Liu" 9 ], 10 "year": 2025, 11 "venue": "arXiv.org", 12 "arxiv_id": "2504.05689", 13 "doi": "10.48550/arXiv.2504.05689" 14 }, 15 "scan_version": 3, 16 "active_modules": [ 17 "experimental_rigor", 18 "data_leakage" 19 ], 20 "methodology_tags": [ 21 "benchmark-eval" 22 ], 23 "key_findings": "Role separators in conversational LLMs create a strong positional bias toward the nearest instruction, with an average PBI of -0.909 when a user separator is inserted between two tasks. The Separator Injection Attack (SIA) exploits this bias to improve manual prompt injection by an average of 18.2% ASR, and boosts the automatic method TAP to 100% ASR while reducing queries from 28 to 8.8. Existing defenses (token filtering, sandwich reminder, StruQ) are partially effective but each has exploitable weaknesses.", 24 "checklist": { 25 "artifacts": { 26 "code_released": { 27 "applies": true, 28 "answer": false, 29 "justification": "No code repository URL is provided anywhere in the paper. No GitHub link, Zenodo archive, or supplementary materials link is mentioned." 30 }, 31 "data_released": { 32 "applies": true, 33 "answer": true, 34 "justification": "The paper uses publicly available standard benchmarks: MRPC, RTE, SST2, SMS, OpenBookQA, CommonsenseQA, MMLU, ARC, JFLEG, HSOL, and Gigaword. All are well-known public datasets." 35 }, 36 "environment_specified": { 37 "applies": true, 38 "answer": false, 39 "justification": "The paper mentions 'NVIDIA A6000 GPUs with 48GB VRAM' for MGCG experiments (Appendix C.2), but provides no requirements.txt, Dockerfile, library versions, or detailed environment setup." 40 }, 41 "reproduction_instructions": { 42 "applies": true, 43 "answer": false, 44 "justification": "No step-by-step reproduction instructions are provided. 
While attack prompt templates are given in Table 12, there are no scripts, commands, or README-style instructions to replicate the experiments." 45 } 46 }, 47 "statistical_methodology": { 48 "confidence_intervals_or_error_bars": { 49 "applies": true, 50 "answer": false, 51 "justification": "The main results in Table 6 report only point estimates for ASR with no confidence intervals or error bars. Figure 3 shows min-max error bands for MGCG across seeds, but this covers only a small subset of experiments." 52 }, 53 "significance_tests": { 54 "applies": true, 55 "answer": false, 56 "justification": "No statistical significance tests are used. Claims that SIA 'improves' over baselines are based solely on comparing raw ASR numbers without any hypothesis testing." 57 }, 58 "effect_sizes_reported": { 59 "applies": true, 60 "answer": true, 61 "justification": "The paper reports percentage improvements with baseline context: 'SIA improves by 24.9% in the Tricky task and 24.5% in the Dangerous task' (Section 5.2). Table 6 shows absolute ASR values for both baselines and SIA variants, allowing readers to assess effect magnitude." 62 }, 63 "sample_size_justified": { 64 "applies": true, 65 "answer": false, 66 "justification": "The paper states 'Each dataset is sampled with 200 cases' (Section 5.1) but provides no justification for why 200 was chosen and no power analysis." 67 }, 68 "variance_reported": { 69 "applies": true, 70 "answer": false, 71 "justification": "For MGCG, three experiments with different seeds are reported with min-max error bands (Appendix C.2, Figure 3). However, the main manual attack results in Table 6 — the paper's primary contribution — report single-run numbers with no variance or spread measures." 72 } 73 }, 74 "evaluation_design": { 75 "baselines_included": { 76 "applies": true, 77 "answer": true, 78 "justification": "Three manual baselines (naive, combined, repeated) from Liu et al. 
(2024) and two automatic baselines (TAP, MGCG) are compared against SIA variants (Section 5.1)." 79 }, 80 "baselines_contemporary": { 81 "applies": true, 82 "answer": true, 83 "justification": "MGCG (Liu et al., 2024) and TAP (Mehrotra et al., 2023) are recent and represent competitive automatic prompt injection methods. Manual baselines are from 2022-2024 literature." 84 }, 85 "ablation_study": { 86 "applies": true, 87 "answer": true, 88 "justification": "Figure 2 tests the SIA variants (including SIA-base, SIA-Thank, SIA-Refuse, SIA-Reappear, and SIA-Follow) across four task types on the MRPC dataset, isolating which variant works best for each task." 89 }, 90 "multiple_metrics": { 91 "applies": true, 92 "answer": true, 93 "justification": "Multiple metrics are used across different experiments: ASR for attack evaluation, PBI and TBI for bias quantification (Section 3.2), accuracy for robustness analysis (Section 3.1), attention scores for interpretability (Section 3.3), and average queries for TAP efficiency." 94 }, 95 "human_evaluation": { 96 "applies": true, 97 "answer": false, 98 "justification": "No human evaluation is included. ASR is measured automatically by string matching (goal hijacking) or key presence (prompt leaking). Bias evaluation uses GPT-4o-mini as judge (Appendix B.1)." 99 }, 100 "held_out_test_set": { 101 "applies": true, 102 "answer": true, 103 "justification": "Appendix B.1 states: 'Our evaluation is conducted on the test sets, with a sample of 200 cases, but when the test set is unavailable, we opt to use the dev set as a substitute.'" 104 }, 105 "per_category_breakdown": { 106 "applies": true, 107 "answer": true, 108 "justification": "Table 6 provides per-model (Llama2, Llama3, Vicuna, Qwen2) and per-task (SK, SP, TK, DG) breakdowns. Figure 8 in the appendix shows per-dataset preference distributions." 
109 }, 110 "failure_cases_discussed": { 111 "applies": true, 112 "answer": true, 113 "justification": "Section 5.2 'Counter-intuitive Drop Analysis' discusses cases where SIA decreases ASR (red values in Table 6), attributing it to 'lost-in-the-middle' effects with repeated characters. Appendix B.2 includes a case study of incomplete role-flipping." 114 }, 115 "negative_results_reported": { 116 "applies": true, 117 "answer": true, 118 "justification": "Table 6 explicitly marks performance drops in red (e.g., Llama2 SK with naive -0.029, repeated method drops up to -8%). The paper discusses these as 'counter-intuitive drop outliers' rather than hiding them." 119 } 120 }, 121 "claims_and_evidence": { 122 "abstract_claims_supported": { 123 "applies": true, 124 "answer": true, 125 "justification": "Abstract claims 'average gain of 18.2% for manual methods' and 'enhances the attack success rate to 100% with automatic methods' are supported by Table 6 (average across manual baselines) and Figure 3 (TAP+SIA achieving 1.000 ASR)." 126 }, 127 "causal_claims_justified": { 128 "applies": true, 129 "answer": true, 130 "justification": "The paper's core causal claim is that SIA improves attack success. This is tested via controlled single-variable manipulation: each baseline method is tested with and without SIA, holding everything else constant (Table 6). The attention analysis (Section 3.3, Table 4) provides mechanistic evidence for the positional bias claim." 131 }, 132 "generalization_bounded": { 133 "applies": true, 134 "answer": false, 135 "justification": "The abstract and conclusion make broad claims about 'LLMs' exhibiting 'significant vulnerabilities,' but open-source experiments use only 7-8B parameter models. The title claims to uncover 'Dialogue Biases in Large Language Models' but does not test models above 8B parameters. Black-box experiments on ChatGPT/GPT-4o-mini partially mitigate this but are limited to the naive/combined/repeated baselines only." 
136 }, 137 "alternative_explanations_discussed": { 138 "applies": true, 139 "answer": false, 140 "justification": "The paper discusses 'lost-in-the-middle' as an explanation for counter-intuitive drops, but does not consider alternative explanations for the main findings. For instance, the positional bias could be an artifact of the specific benchmarks used, or the attention-based explanation could be confounded by other factors in the model architecture." 141 }, 142 "proxy_outcome_distinction": { 143 "applies": true, 144 "answer": true, 145 "justification": "The paper measures ASR directly (whether target strings appear in output), which directly matches the claim of attack success rate. No proxy gap exists between measurement and framing." 146 } 147 }, 148 "setup_transparency": { 149 "model_versions_specified": { 150 "applies": true, 151 "answer": false, 152 "justification": "Models are identified as 'Llama2-7B', 'Llama3-8B', 'Vicuna-7B', 'Qwen2-7B', 'ChatGPT', 'GPT-4o-mini', and 'Mistral-7B' — family names with sizes but no exact version strings, snapshot dates, or API versions. Model behavior changes across versions." 153 }, 154 "prompts_provided": { 155 "applies": true, 156 "answer": true, 157 "justification": "Table 12 (Appendix) provides the full text of all attack prompts (SIA variants), baseline injection prompts (naive, combined, repeated), evaluation prompts for correctness and bias judgment, and the special token stealing prompt for black-box attacks." 158 }, 159 "hyperparameters_reported": { 160 "applies": true, 161 "answer": false, 162 "justification": "MGCG hyperparameters (token length 50, batch size 128, top-k 128, 1000 steps) and TAP settings (branching factor 3, depth/width 10) are reported in Appendix C.2. However, temperature, top-p, and other sampling settings for the four main LLMs in the manual attack experiments are not reported, even though these settings significantly affect output." 
163 }, 164 "scaffolding_described": { 165 "applies": false, 166 "answer": false, 167 "justification": "No agentic scaffolding is used. The experiments involve direct prompting of LLMs." 168 }, 169 "data_preprocessing_documented": { 170 "applies": true, 171 "answer": false, 172 "justification": "The paper mentions sampling 200 cases per dataset and constructing pairwise sets (Table 11), but does not describe the sampling procedure (random? stratified?) or any preprocessing of the benchmark data before use." 173 } 174 }, 175 "limitations_and_scope": { 176 "limitations_section_present": { 177 "applies": true, 178 "answer": false, 179 "justification": "There is no dedicated Limitations or Threats to Validity section. The paper structure is Introduction, Background, Empirical Study, Separator Injection Attack, Experiment, and Conclusion. The conclusion briefly mentions the weakness is 'inevitable' but does not discuss study limitations." 180 }, 181 "threats_to_validity_specific": { 182 "applies": true, 183 "answer": false, 184 "justification": "No specific threats to validity are discussed anywhere in the paper. There is no consideration of potential confounds, evaluation limitations, or methodological weaknesses." 185 }, 186 "scope_boundaries_stated": { 187 "applies": true, 188 "answer": false, 189 "justification": "The paper does not explicitly state what the results do NOT show. No boundaries on model size generalization, task type limitations, or settings where SIA might not apply." 190 } 191 }, 192 "data_integrity": { 193 "raw_data_available": { 194 "applies": true, 195 "answer": false, 196 "justification": "No raw experimental data (model outputs, individual response logs) is released. Only aggregated ASR numbers are shown in tables." 197 }, 198 "data_collection_described": { 199 "applies": true, 200 "answer": true, 201 "justification": "Section 5.1 describes the experimental setup: 4 models × 7 datasets × 200 samples × 10 injection cases = 56,000 queries. 
Dataset sources and task types are clearly identified. Appendix B.1 describes dataset selection criteria." 202 }, 203 "recruitment_methods_described": { 204 "applies": false, 205 "answer": false, 206 "justification": "No human participants. All data comes from standard public NLP benchmarks." 207 }, 208 "data_pipeline_documented": { 209 "applies": true, 210 "answer": true, 211 "justification": "Section 5.1 documents the pipeline: benchmark selection, 200-case sampling per dataset, 10 injection cases per sample, evaluation via string matching (goal hijacking) or key presence (prompt leaking). Table 11 in the appendix documents dataset pair construction for bias experiments." 212 } 213 }, 214 "conflicts_of_interest": { 215 "funding_disclosed": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding information, acknowledgments section, or grant numbers are mentioned anywhere in the paper." 219 }, 220 "affiliations_disclosed": { 221 "applies": true, 222 "answer": true, 223 "justification": "All four authors are listed as affiliated with Xi'an Jiaotong University. They evaluate third-party models (Llama, Vicuna, Qwen, GPT), not their own products." 224 }, 225 "funder_independent_of_outcome": { 226 "applies": true, 227 "answer": false, 228 "justification": "No funding is disclosed, so independence of funder cannot be assessed." 229 }, 230 "financial_interests_declared": { 231 "applies": true, 232 "answer": false, 233 "justification": "No competing interests statement or financial disclosure appears in the paper." 234 } 235 }, 236 "contamination": { 237 "training_cutoff_stated": { 238 "applies": false, 239 "answer": false, 240 "justification": "This is a red-teaming study that tests prompt injection attacks against LLMs. The evaluation measures attack success rate, not model knowledge or capability on benchmarks. The standard NLP datasets serve as context for the attack, not as capability benchmarks." 
241 }, 242 "train_test_overlap_discussed": { 243 "applies": false, 244 "answer": false, 245 "justification": "The paper tests attack methodology, not model knowledge on benchmarks. Whether models have seen MRPC or ARC in training does not affect whether a prompt injection attack succeeds." 246 }, 247 "benchmark_contamination_addressed": { 248 "applies": false, 249 "answer": false, 250 "justification": "Benchmark contamination is not relevant to evaluating prompt injection attack effectiveness. The correctness of the model's answer to the normal task is not the metric of interest." 251 } 252 }, 253 "human_studies": { 254 "pre_registered": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study. All experiments involve automated evaluation of LLMs." 258 }, 259 "irb_or_ethics_approval": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants. The study involves prompting LLMs with standard benchmarks and attack strings." 263 }, 264 "demographics_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants." 268 }, 269 "inclusion_exclusion_criteria": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants." 273 }, 274 "randomization_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants." 278 }, 279 "blinding_described": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants." 283 }, 284 "attrition_reported": { 285 "applies": false, 286 "answer": false, 287 "justification": "No human participants." 288 } 289 }, 290 "cost_and_practicality": { 291 "inference_cost_reported": { 292 "applies": true, 293 "answer": false, 294 "justification": "Total query counts are stated (56,000 for manual, and per-case queries for TAP: 28 vs 8.8), but no API costs, dollar amounts, wall-clock time, or cost-per-example figures are reported." 
295 }, 296 "compute_budget_stated": { 297 "applies": true, 298 "answer": false, 299 "justification": "NVIDIA A6000 GPUs with 48GB VRAM are mentioned for MGCG (Appendix C.2), but total GPU hours, training time, or overall computational budget are not quantified." 300 } 301 }, 302 "experimental_rigor": { 303 "seed_sensitivity_reported": { 304 "applies": true, 305 "answer": false, 306 "justification": "For MGCG only, 'We conducted three experiments using different seeds and plotted the mean loss with a min-max error band' (Appendix C.2). However, the main manual attack results in Table 6 — comprising the bulk of the paper's claims — do not report any seed sensitivity." 307 }, 308 "number_of_runs_stated": { 309 "applies": true, 310 "answer": false, 311 "justification": "Three runs are stated for MGCG (Appendix C.2). No run count is stated for the main manual attack experiments or the bias study, which comprise the majority of reported results." 312 }, 313 "hyperparameter_search_budget": { 314 "applies": true, 315 "answer": false, 316 "justification": "No hyperparameter search is described. The preliminary experiment on MRPC (Figure 2) selects the best SIA variant per task, but no broader hyperparameter search budget is reported." 317 }, 318 "best_config_selection_justified": { 319 "applies": true, 320 "answer": true, 321 "justification": "Section 5.2 describes selecting SIA-base for prompt leaking and SIA-Reappear for goal hijacking based on preliminary results on the MRPC dataset (Figure 2), providing a clear selection rationale." 322 }, 323 "multiple_comparison_correction": { 324 "applies": true, 325 "answer": false, 326 "justification": "The paper makes numerous comparisons (4 models × 4 tasks × 3 baselines = 48 comparisons in Table 6 alone) without any correction for multiple comparisons." 
327 }, 328 "self_comparison_bias_addressed": { 329 "applies": true, 330 "answer": false, 331 "justification": "The authors evaluate their own SIA method against baseline implementations without acknowledging the bias of evaluating their own system or discussing whether baseline implementations were optimally tuned." 332 }, 333 "compute_budget_vs_performance": { 334 "applies": true, 335 "answer": true, 336 "justification": "Figure 3 shows TAP ASR as a function of cumulative queries (demonstrating SIA reduces queries from 28 to 8.8 for the same or better ASR) and MGCG loss as a function of training epochs, reporting performance relative to compute." 337 }, 338 "benchmark_construct_validity": { 339 "applies": true, 340 "answer": false, 341 "justification": "The paper does not discuss whether ASR on the chosen NLP benchmarks (MRPC, RTE, etc.) as attack contexts reflects real-world prompt injection risk. No discussion of whether these task-based attacks translate to practical application scenarios." 342 }, 343 "scaffold_confound_addressed": { 344 "applies": false, 345 "answer": false, 346 "justification": "No scaffolding is used. Direct prompting of LLMs only." 347 } 348 }, 349 "data_leakage": { 350 "temporal_leakage_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of whether models' training data includes the benchmark datasets or attack patterns. The standard benchmarks (MRPC 2005, SST2 2013, etc.) predate all tested models' training cutoffs." 354 }, 355 "feature_leakage_addressed": { 356 "applies": true, 357 "answer": false, 358 "justification": "No discussion of whether the evaluation setup leaks information. For the robustness experiments (Section 3.1), having seen benchmark answers in training could inflate accuracy scores." 
359 }, 360 "non_independence_addressed": { 361 "applies": true, 362 "answer": false, 363 "justification": "No discussion of whether train and test splits are independent or whether benchmark examples could appear in model training data." 364 }, 365 "leakage_detection_method": { 366 "applies": true, 367 "answer": false, 368 "justification": "No leakage detection or prevention methods are employed. No canary strings, membership inference, or decontamination analysis." 369 } 370 } 371 }, 372 "claims": [ 373 { 374 "claim": "SIA achieves an average ASR gain of 18.2% for manual prompt injection methods across models and tasks.", 375 "evidence": "Table 6 shows ASR improvements across 4 models, 4 tasks, and 3 baseline methods. Specific gains: 24.9% for Tricky, 24.5% for Dangerous, 8.7% for SecretKey, 14.7% for SystemPrompt (Section 5.2).", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "SIA enhances TAP's attack success rate to 100% while reducing average queries from 28 to 8.8 per case.", 380 "evidence": "Figure 3 (top) shows TAP+SIA achieving cumulative ASR of 1.000 with 8.8 queries vs. TAP alone at 0.857 with 28 queries, tested on Llama2 with target '[ACTION] sudo rm -rf /*' (Section 5.2, Appendix C.2).", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "Inserting a U-SEP between two tasks creates a strong positional bias toward the nearest instruction, with average PBI of -0.909.", 385 "evidence": "Table 2 shows PBI values across 4 models and 2 task types with U-SEP, averaging -0.909. Compared to 0.252 average PBI without separator (Section 3.2).", 386 "supported": "strong" 387 }, 388 { 389 "claim": "The generation separator plays a more critical role than the initial separator in affecting model behavior.", 390 "evidence": "Figure 1 shows Groups 1 and 3 (same generation separator) synchronize performance changes, as do Groups 2 and 4. 
Table 3 attention scores show more pronounced shifts at the latter separator position (Section 3.1, 3.3).", 391 "supported": "strong" 392 }, 393 { 394 "claim": "MGCG combined with SIA boosts ASR from 23.3% to 97.7%.", 395 "evidence": "Figure 3 (bottom) shows MGCG+SIA achieving lowest loss bound. 'MGCG with SIA achieves the lowest loss bound, and boosts the ASR from 23.3% to 97.7%' (Section 5.2). Tested with 3 seeds.", 396 "supported": "moderate" 397 }, 398 { 399 "claim": "Existing defenses (token filtering, sandwich reminder, StruQ) have significant weaknesses against SIA.", 400 "evidence": "Figure 5 and Section 5.4: sandwich reminder increases system prompt leakage from 12.4% to 25.3%. StruQ shows strong first-position bias (PBI 0.896, Table 8). Token filtering can be bypassed by inserting characters between separator tokens (Table 9 shows bypass still outperforms baselines).", 401 "supported": "moderate" 402 } 403 ], 404 "red_flags": [ 405 { 406 "flag": "No limitations section", 407 "detail": "The paper has no dedicated limitations or threats-to-validity section. For a security paper proposing a new attack, this is a significant omission — the scope of applicability, potential defenses, and experimental limitations are not explicitly acknowledged." 408 }, 409 { 410 "flag": "Narrow model size range with broad claims", 411 "detail": "Open-source experiments use only 7-8B parameter models (Llama2-7B, Llama3-8B, Vicuna-7B, Qwen2-7B), but claims extend to 'LLMs' generally. Larger models may have different robustness properties. Black-box tests on ChatGPT/GPT-4o-mini partially address this but are limited." 412 }, 413 { 414 "flag": "No statistical significance tests", 415 "detail": "Across 48+ comparisons in Table 6, improvements are assessed by raw ASR differences without any significance testing. Some reported 'improvements' are small enough to be within noise (e.g., 0.003, 0.007)." 
416 }, 417 { 418 "flag": "Main results appear to be single-run", 419 "detail": "While MGCG experiments use 3 seeds, the main manual attack results in Table 6 (the paper's primary contribution, covering 56,000 queries) do not report multiple runs or variance. ASR could vary across runs due to model sampling stochasticity." 420 }, 421 { 422 "flag": "No code release", 423 "detail": "Despite proposing a concrete attack framework with specific implementation details, no code repository is provided for independent verification or replication." 424 } 425 ], 426 "cited_papers": [ 427 { 428 "title": "Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection", 429 "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra", "Christoph Endres", "Thorsten Holz", "Mario Fritz"], 430 "year": 2023, 431 "relevance": "Foundational work on indirect prompt injection attacks against LLM-integrated applications, directly relevant to understanding prompt injection threat models." 432 }, 433 { 434 "title": "StruQ: Defending against prompt injection with structured queries", 435 "authors": ["Sizhe Chen", "Julien Piet", "Chawin Sitawarin", "David Wagner"], 436 "year": 2024, 437 "arxiv_id": "2402.06363", 438 "relevance": "Proposes structured query defense against prompt injection via fine-tuning on special tokens, evaluated as a defense baseline in this paper." 439 }, 440 { 441 "title": "Automatic and universal prompt injection attacks against large language models", 442 "authors": ["Xiaogeng Liu", "Zhiyuan Yu", "Yizhe Zhang", "Ning Zhang", "Chaowei Xiao"], 443 "year": 2024, 444 "arxiv_id": "2403.04957", 445 "relevance": "Introduces MGCG, a momentum-enhanced optimization attack for universal prompt injection, used as an automatic attack baseline." 
446 }, 447 { 448 "title": "Tree of attacks: Jailbreaking black-box LLMs automatically", 449 "authors": ["Anay Mehrotra", "Manolis Zampetakis", "Paul Kassianik", "Blaine Nelson", "Hyrum Anderson", "Yaron Singer", "Amin Karbasi"], 450 "year": 2023, 451 "arxiv_id": "2312.02119", 452 "relevance": "Introduces TAP, an automatic jailbreaking method using tree-of-thought reasoning, used as an automatic attack baseline combined with SIA." 453 }, 454 { 455 "title": "Ignore previous prompt: Attack techniques for language models", 456 "authors": ["Fábio Perez", "Ian Ribeiro"], 457 "year": 2022, 458 "arxiv_id": "2211.09527", 459 "relevance": "Early taxonomy of prompt injection attack techniques including goal hijacking and prompt leaking, foundational to the attack categories evaluated." 460 }, 461 { 462 "title": "The instruction hierarchy: Training LLMs to prioritize privileged instructions", 463 "authors": ["Eric Wallace", "Kai Xiao", "Reimar Leike", "Lilian Weng", "Johannes Heidecke", "Alex Beutel"], 464 "year": 2024, 465 "arxiv_id": "2404.13208", 466 "relevance": "Proposes instruction hierarchy as a defense against prompt injection by training models to distinguish privilege levels of instructions." 467 }, 468 { 469 "title": "Ignore this title and HackAPrompt: Exposing systemic vulnerabilities of LLMs through a global prompt hacking competition", 470 "authors": ["Sander Schulhoff", "Jeremy Pinto", "Anaum Khan", "Louis-François Bouchard", "Chenglei Si"], 471 "year": 2023, 472 "relevance": "Large-scale prompt injection competition revealing systemic LLM vulnerabilities; SecretKey task format used in this paper's evaluation." 
473 }, 474 { 475 "title": "Evaluating the instruction-following robustness of large language models to prompt injection", 476 "authors": ["Zekun Li", "Baolin Peng", "Pengcheng He", "Xifeng Yan"], 477 "year": 2023, 478 "arxiv_id": "2308.10819", 479 "relevance": "Evaluates LLM robustness to prompt injection from an instruction-following perspective, directly related to the multi-task instruction-following gap studied here." 480 }, 481 { 482 "title": "Benchmarking and defending against indirect prompt injection attacks on large language models", 483 "authors": ["Jingwei Yi", "Yueqi Xie", "Bin Zhu", "Keegan Hines", "Emre Kiciman", "Guangzhong Sun", "Xing Xie", "Fangzhao Wu"], 484 "year": 2023, 485 "arxiv_id": "2312.14197", 486 "relevance": "Comprehensive benchmark and defense evaluation for indirect prompt injection, providing datasets and evaluation methodology used in this paper." 487 }, 488 { 489 "title": "Universal and transferable adversarial attacks on aligned language models", 490 "authors": ["Andy Zou", "Zifan Wang", "Nicholas Carlini", "Milad Nasr", "J Zico Kolter", "Matt Fredrikson"], 491 "year": 2023, 492 "arxiv_id": "2307.15043", 493 "relevance": "Introduces GCG, the gradient-based adversarial suffix attack on which MGCG is built, foundational to automatic prompt injection methods." 494 } 495 ], 496 "engagement_factors": { 497 "practical_relevance": { 498 "score": 2, 499 "justification": "Security practitioners could use SIA patterns for red-teaming LLM applications, and the defense analysis informs deployment decisions." 500 }, 501 "surprise_contrarian": { 502 "score": 1, 503 "justification": "The use of role separators as attack vectors is somewhat known, but the systematic quantification of positional bias (PBI -0.909) and the 100% ASR with TAP add empirical grounding." 
504 }, 505 "fear_safety": { 506 "score": 2, 507 "justification": "Demonstrates prompt injection achieving up to 100% ASR and shows existing defenses have exploitable weaknesses, raising concerns about deployed LLM security." 508 }, 509 "drama_conflict": { 510 "score": 1, 511 "justification": "Notes 'weak awareness of the risks associated with special tokens' from platforms like OpenAI, Claude, and Langchain, implying provider negligence." 512 }, 513 "demo_ability": { 514 "score": 0, 515 "justification": "No code released, no demo available, no pip-installable tool." 516 }, 517 "brand_recognition": { 518 "score": 1, 519 "justification": "Tests on ChatGPT and GPT-4o-mini, and mentions Claude and OpenAI in security discussion, but authors are from Xi'an Jiaotong University." 520 } 521 } 522 }