scan.json (31248B)
1 { 2 "paper": { 3 "title": "Robustness via Referencing: Defending against Prompt Injection Attacks by Referencing the Executed Instruction", 4 "authors": [ 5 "Yulin Chen", 6 "Haoran Li", 7 "Yuan Sui", 8 "Yue Liu", 9 "Yufei He", 10 "Yangqiu Song", 11 "Bryan Hooi" 12 ], 13 "year": 2025, 14 "venue": "arXiv", 15 "arxiv_id": "2504.20472", 16 "doi": "10.48550/arXiv.2504.20472" 17 }, 18 "checklist": { 19 "artifacts": { 20 "code_released": { 21 "applies": true, 22 "answer": false, 23 "justification": "No code repository URL is provided in the paper. No GitHub link, Zenodo archive, or any other code release is mentioned." 24 }, 25 "data_released": { 26 "applies": true, 27 "answer": true, 28 "justification": "The paper uses publicly available datasets: AlpacaFarm [12], SQuAD [32], TriviaQA [20], and the injected datasets Inj-SQuAD and Inj-TriviaQA from [8]. All datasets are either standard public benchmarks or previously published." 29 }, 30 "environment_specified": { 31 "applies": true, 32 "answer": false, 33 "justification": "Appendix A mentions 'PyTorch 2.1.0' and 'single NVIDIA H100 GPU' with some generation settings, but no requirements.txt, Dockerfile, Python version, CUDA version, or other library versions are provided. Not enough detail to recreate the environment." 34 }, 35 "reproduction_instructions": { 36 "applies": true, 37 "answer": false, 38 "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The methodology is described but without actionable reproduction steps." 39 } 40 }, 41 "statistical_methodology": { 42 "confidence_intervals_or_error_bars": { 43 "applies": true, 44 "answer": false, 45 "justification": "All results in Tables 1-9 are reported as point estimates (percentages) with no confidence intervals, error bars, or ± notation." 
46 }, 47 "significance_tests": { 48 "applies": true, 49 "answer": false, 50 "justification": "The paper claims their method 'outperforms' baselines based solely on comparing numerical values in tables. No statistical significance tests (p-values, t-tests, etc.) are reported." 51 }, 52 "effect_sizes_reported": { 53 "applies": true, 54 "answer": true, 55 "justification": "The paper reports specific percentage improvements with baseline context, e.g., 'surpasses the baselines by at least 19.71% across all attacks and models' (Section 4.3.1), and Table 4 provides full baseline accuracy values for utility comparison." 56 }, 57 "sample_size_justified": { 58 "applies": true, 59 "answer": false, 60 "justification": "Dataset sizes are stated (208 for AlpacaFarm, 900 each for Inj-SQuAD and Inj-TriviaQA) but no justification is given for why these sizes are adequate for the claims made." 61 }, 62 "variance_reported": { 63 "applies": true, 64 "answer": false, 65 "justification": "No variance, standard deviation, or any spread measure is reported. Although do_sample is set to false (greedy decoding, Appendix A), making results deterministic, the paper does not discuss this as a reason for omitting variance or report any sensitivity analysis." 66 } 67 }, 68 "evaluation_design": { 69 "baselines_included": { 70 "applies": true, 71 "answer": true, 72 "justification": "Extensive baselines are included: four prompt-engineering defenses (Sandwich, Instructional, Reminder, Spotlight) and one fine-tuning defense (StruQ), plus a no-defense baseline. Described in Section 4.2." 73 }, 74 "baselines_contemporary": { 75 "applies": true, 76 "answer": true, 77 "justification": "Baselines include recent work: StruQ [6] (2024), Spotlight [16] (2024), and the datasets from [8] (2025). The baselines represent the current state of the art in prompt injection defense." 
78 }, 79 "ablation_study": { 80 "applies": true, 81 "answer": true, 82 "justification": "Section 4.4 includes two ablation studies: (1) impact of window size K on defense performance and utility (Figure 2), and (2) impact of removing in-context learning examples from the guideline prompt (Figure 3)." 83 }, 84 "multiple_metrics": { 85 "applies": true, 86 "answer": true, 87 "justification": "Two evaluation metrics are used: Attack Success Rate (ASR) for security and Accuracy for utility, evaluated on QA tasks (SQuAD, TriviaQA) and sentiment analysis (SST2)." 88 }, 89 "human_evaluation": { 90 "applies": true, 91 "answer": false, 92 "justification": "No human evaluation is included. All evaluations are automated: ASR is measured by checking if the response contains the injected instruction's answer, and accuracy is measured against golden answers." 93 }, 94 "held_out_test_set": { 95 "applies": true, 96 "answer": false, 97 "justification": "No explicit separation between development and test data is described. The prompt template and in-context examples appear to have been designed without a held-out validation set, and it is unclear whether any tuning decisions were made on the same datasets used for final evaluation." 98 }, 99 "per_category_breakdown": { 100 "applies": true, 101 "answer": true, 102 "justification": "Results are broken down by attack type (Naive, Ignore, Escape, Fakecom, Combined), by model, and by dataset (AlpacaFarm, Inj-SQuAD, Inj-TriviaQA) across all main tables." 103 }, 104 "failure_cases_discussed": { 105 "applies": true, 106 "answer": false, 107 "justification": "The case study in Section 4.5 and Appendix C presents three cases, all showing successful defense. No failure cases are analyzed despite non-zero ASR values in several conditions (e.g., 7.00% on Inj-TriviaQA)." 
108 }, 109 "negative_results_reported": { 110 "applies": true, 111 "answer": true, 112 "justification": "The ablation study in Figure 3 shows that removing in-context learning examples causes significant performance degradation on Qwen2-7B-Instruct (TriviaQA drops to 33.33%, SST2 drops to 23.17%), demonstrating a condition where the method fails." 113 } 114 }, 115 "claims_and_evidence": { 116 "abstract_claims_supported": { 117 "applies": true, 118 "answer": true, 119 "justification": "The abstract claims the method 'outperforms prompt-engineering-based baselines and is comparable to fine-tuning methods, reducing the ASR to 0% in some scenarios' — supported by Tables 1-3 showing 0% ASR in multiple cells. The claim of 'minimal impact on overall utility' is supported by Table 4." 120 }, 121 "causal_claims_justified": { 122 "applies": true, 123 "answer": true, 124 "justification": "The paper's ablation studies (Section 4.4) use controlled single-variable manipulation: removing in-context examples while keeping everything else constant demonstrates their causal contribution. The window size ablation similarly isolates one variable." 125 }, 126 "generalization_bounded": { 127 "applies": true, 128 "answer": false, 129 "justification": "The title claims 'Defending against Prompt Injection Attacks' broadly, but the method is tested on only 5 prompt-engineering attacks and 2 gradient-based attacks. The paper does not discuss attacks that could bypass the method (e.g., attacks that manipulate the tagging format, or adversarially crafted outputs that mimic the expected structure)." 130 }, 131 "alternative_explanations_discussed": { 132 "applies": true, 133 "answer": false, 134 "justification": "No alternative explanations for the results are discussed. The paper does not consider why the method works beyond the stated reasoning, does not include a threats-to-validity section, and does not address potential confounding factors." 
135 }, 136 "proxy_outcome_distinction": { 137 "applies": true, 138 "answer": true, 139 "justification": "The paper directly measures what it claims: ASR measures attack success (whether the defense prevents the model from following injected instructions), and accuracy measures utility. No proxy gap exists between measurements and claims." 140 } 141 }, 142 "setup_transparency": { 143 "model_versions_specified": { 144 "applies": true, 145 "answer": false, 146 "justification": "Models are identified by family names (e.g., 'Llama3-8B-Instruct', 'GPT-3.5-Turbo', 'GPT-4o-mini') without specific version snapshots or API dates. GPT-3.5-Turbo and GPT-4o-mini behavior changes across versions, and no snapshot date is provided." 147 }, 148 "prompts_provided": { 149 "applies": true, 150 "answer": true, 151 "justification": "The full system prompt including the algorithm description and output structure is provided in Section 3.2. Two complete in-context learning examples are provided in Appendix D. The complete input format with tagging scheme is also shown." 152 }, 153 "hyperparameters_reported": { 154 "applies": true, 155 "answer": true, 156 "justification": "Appendix A reports: do_sample=false, max_new_tokens=256, max_length=8192, K=32 (words per line). Hardware is specified as single NVIDIA H100 GPU with PyTorch 2.1.0." 157 }, 158 "scaffolding_described": { 159 "applies": false, 160 "answer": false, 161 "justification": "No agentic scaffolding is used. The method is a direct prompting approach with post-processing filtering." 162 }, 163 "data_preprocessing_documented": { 164 "applies": true, 165 "answer": true, 166 "justification": "The data preprocessing is thoroughly documented in Section 3.2 (Tagging and Splitting): data content is split into lines of maximum K=32 words, each prefixed with '[L X]' tags, and organized into Instruction Area and Data Area sections." 
167 } 168 }, 169 "limitations_and_scope": { 170 "limitations_section_present": { 171 "applies": true, 172 "answer": false, 173 "justification": "There is no dedicated limitations section. The paper goes from the ablation study (Section 4.4) and case study (Section 4.5) directly to Related Work (Section 5) and Conclusion (Section 6) without discussing limitations." 174 }, 175 "threats_to_validity_specific": { 176 "applies": true, 177 "answer": false, 178 "justification": "No threats to validity are discussed anywhere in the paper, either specific or generic." 179 }, 180 "scope_boundaries_stated": { 181 "applies": true, 182 "answer": false, 183 "justification": "No explicit scope boundaries are stated. The paper does not discuss what the results do NOT show, what types of attacks might bypass the defense, or what settings are excluded from the claims." 184 } 185 }, 186 "data_integrity": { 187 "raw_data_available": { 188 "applies": true, 189 "answer": false, 190 "justification": "No raw experimental data (model outputs, per-example results) is released. Only aggregated metrics in tables are provided." 191 }, 192 "data_collection_described": { 193 "applies": true, 194 "answer": true, 195 "justification": "Section 4.1 describes the datasets: AlpacaFarm with simple questions (208 samples) following [6]'s setup, and Inj-SQuAD/Inj-TriviaQA (900 samples each) from [8] with phishing, advertisement, and propaganda injection goals." 196 }, 197 "recruitment_methods_described": { 198 "applies": false, 199 "answer": false, 200 "justification": "No human participants. All datasets are standard benchmarks (AlpacaFarm, SQuAD, TriviaQA) or previously published injection datasets." 
201 }, 202 "data_pipeline_documented": { 203 "applies": true, 204 "answer": true, 205 "justification": "The pipeline from input to evaluation is documented: data content is split and tagged (Section 3.2), processed through the LLM with the defense prompt, filtered by tag matching, and evaluated against golden answers or injection targets." 206 } 207 }, 208 "conflicts_of_interest": { 209 "funding_disclosed": { 210 "applies": true, 211 "answer": false, 212 "justification": "No funding sources are disclosed. There is no acknowledgments section listing grants, corporate sponsors, or funding agencies." 213 }, 214 "affiliations_disclosed": { 215 "applies": true, 216 "answer": true, 217 "justification": "Author affiliations are clearly listed: National University of Singapore and The Hong Kong University of Science and Technology. The authors are not affiliated with the companies whose models are evaluated." 218 }, 219 "funder_independent_of_outcome": { 220 "applies": true, 221 "answer": false, 222 "justification": "Cannot be assessed since no funding information is disclosed. Without knowing the funding source, independence cannot be verified." 223 }, 224 "financial_interests_declared": { 225 "applies": true, 226 "answer": false, 227 "justification": "No competing interests or financial interests statement is present in the paper." 228 } 229 }, 230 "contamination": { 231 "training_cutoff_stated": { 232 "applies": false, 233 "answer": false, 234 "justification": "This paper tests prompt injection defenses, not model knowledge on benchmarks. The primary evaluation (ASR) measures whether the defense prevents the model from following injected instructions, not the model's pre-trained knowledge." 235 }, 236 "train_test_overlap_discussed": { 237 "applies": false, 238 "answer": false, 239 "justification": "The paper evaluates defense mechanisms against prompt injection attacks, not model capability on benchmark tasks. Train/test overlap is not relevant to the core claims." 
240 }, 241 "benchmark_contamination_addressed": { 242 "applies": false, 243 "answer": false, 244 "justification": "The paper tests defenses rather than model knowledge. The QA utility evaluation is secondary and contamination would not affect the defense effectiveness claims." 245 } 246 }, 247 "human_studies": { 248 "pre_registered": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants in this study." 252 }, 253 "irb_or_ethics_approval": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in this study." 257 }, 258 "demographics_reported": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in this study." 262 }, 263 "inclusion_exclusion_criteria": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in this study." 267 }, 268 "randomization_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in this study." 272 }, 273 "blinding_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in this study." 277 }, 278 "attrition_reported": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants in this study." 282 } 283 }, 284 "cost_and_practicality": { 285 "inference_cost_reported": { 286 "applies": true, 287 "answer": false, 288 "justification": "No inference cost, latency, or API costs are reported. The method adds overhead through longer prompts (tagging, in-context examples) and post-processing filtering, but this is not quantified." 289 }, 290 "compute_budget_stated": { 291 "applies": true, 292 "answer": false, 293 "justification": "Only 'single NVIDIA H100 GPU' is mentioned (Appendix A). No total GPU hours, wall-clock time, or API costs are provided." 
294 } 295 }, 296 "experimental_rigor": { 297 "seed_sensitivity_reported": { 298 "applies": true, 299 "answer": false, 300 "justification": "Results are not reported across multiple random seeds. Appendix A states do_sample=false (greedy decoding), making results deterministic, but this is not framed as a seed sensitivity analysis." 301 }, 302 "number_of_runs_stated": { 303 "applies": true, 304 "answer": false, 305 "justification": "The number of experimental runs is not explicitly stated. While greedy decoding implies single deterministic runs, this is not discussed." 306 }, 307 "hyperparameter_search_budget": { 308 "applies": true, 309 "answer": false, 310 "justification": "No hyperparameter search budget is reported. The window size K=32 and the prompt design appear to be chosen without systematic search. The ablation over window sizes (Figure 2) is done on the evaluation data, not a separate validation set." 311 }, 312 "best_config_selection_justified": { 313 "applies": true, 314 "answer": false, 315 "justification": "While the ablation study (Figure 2) shows performance across window sizes 8-128, the selection of K=32 is not justified through a proper validation/test split. Configuration selection appears to be done on the same data used for final evaluation." 316 }, 317 "multiple_comparison_correction": { 318 "applies": true, 319 "answer": false, 320 "justification": "The paper makes numerous comparisons across 5+ attack methods, 8 models, and 3 datasets without any statistical tests or multiple comparison corrections." 321 }, 322 "self_comparison_bias_addressed": { 323 "applies": true, 324 "answer": false, 325 "justification": "The authors implement their own versions of baselines (prompt-engineering defenses) and compare against their own method without acknowledging potential self-comparison bias. No independent evaluation or discussion of this bias." 
326 }, 327 "compute_budget_vs_performance": { 328 "applies": true, 329 "answer": false, 330 "justification": "No discussion of compute differences between methods. The proposed method requires longer prompts (with tagging and ICL examples) and post-processing filtering compared to simpler baselines. StruQ requires fine-tuning. These compute differences are not analyzed." 331 }, 332 "benchmark_construct_validity": { 333 "applies": true, 334 "answer": false, 335 "justification": "No discussion of whether the ASR metric and the chosen attack benchmarks adequately measure real-world prompt injection defense effectiveness. The representativeness of the 5 prompt-engineering and 2 gradient-based attacks is not examined." 336 }, 337 "scaffold_confound_addressed": { 338 "applies": false, 339 "answer": false, 340 "justification": "No agentic scaffolding is used. The method is a direct prompting approach." 341 } 342 }, 343 "data_leakage": { 344 "temporal_leakage_addressed": { 345 "applies": true, 346 "answer": false, 347 "justification": "Not discussed. The utility evaluation uses SQuAD (2016) and TriviaQA (2017), which predate all tested models' training cutoffs and may therefore appear in their training data. Model familiarity with QA answers could inflate utility accuracy numbers, but this is not addressed." 348 }, 349 "feature_leakage_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "Not discussed. The evaluation setup provides QA context to the model alongside the question, and no analysis of whether the evaluation format leaks information is provided." 353 }, 354 "non_independence_addressed": { 355 "applies": true, 356 "answer": false, 357 "justification": "Not discussed. No analysis of whether evaluation examples share structural similarities or whether the datasets have dependencies." 358 }, 359 "leakage_detection_method": { 360 "applies": true, 361 "answer": false, 362 "justification": "No concrete leakage detection or prevention method is applied." 
363 } 364 } 365 }, 366 "scan_version": 3, 367 "active_modules": [ 368 "experimental_rigor", 369 "data_leakage" 370 ], 371 "claims": [ 372 { 373 "claim": "The proposed method outperforms all prompt-engineering-based baselines by at least 19.71% ASR reduction in direct prompt injection scenarios.", 374 "evidence": "Table 1 shows ASR results on AlpacaFarm across three models and five attacks. The proposed method achieves maximum ASR of 2.88% vs. the best baseline (Spotlight) at 24.04% on Llama3-8B-Instruct Naive attack.", 375 "supported": "strong" 376 }, 377 { 378 "claim": "The method reduces ASR to 0% in some scenarios.", 379 "evidence": "Tables 1-3 show 0.00% ASR in multiple cells: e.g., Llama3-8B-Instruct with Ignore attack (0.00%), Fakecom (0.00%) on AlpacaFarm (Table 1).", 380 "supported": "strong" 381 }, 382 { 383 "claim": "The method is comparable to or outperforms the fine-tuning defense StruQ, especially on unknown attacks (Fakecom, Combined).", 384 "evidence": "Table 1 shows StruQ ASR on Qwen2-7B: Fakecom 16.35%, Combined 30.29%, vs. the proposed method's 1.92% and 1.92%. Tables 2-3 show even larger gaps on indirect attacks where StruQ's ASR reaches 93.33% on Qwen2-7B Fakecom.", 385 "supported": "strong" 386 }, 387 { 388 "claim": "The method has minimal impact on general model performance.", 389 "evidence": "Table 4 shows the method improves QA accuracy in most cases (e.g., Llama3-8B SQuAD from 83.56% to 87.78%) and reduces SST2 accuracy by only 1.53% on average.", 390 "supported": "strong" 391 }, 392 { 393 "claim": "The method is effective on larger models (70B, 405B parameters) with maximum ASR of 5.77% for direct and 2.22% for indirect attacks.", 394 "evidence": "Tables 5-6 show results on Llama3-70B, Llama3.1-70B, and Llama3.1-405B. 
Maximum ASR is 5.77% (direct, Llama3.1-405B Naive) and 2.22% (indirect, Llama3-70B Naive).", 395 "supported": "strong" 396 }, 397 { 398 "claim": "The method is effective on closed-source models (GPT-3.5-Turbo, GPT-4o-mini).", 399 "evidence": "Tables 7-8 show maximum ASR of 3.77% for direct and 3.89% for indirect attacks on closed-source models.", 400 "supported": "strong" 401 }, 402 { 403 "claim": "Window size for splitting has no significant impact on defense performance or utility.", 404 "evidence": "Figure 2 shows that, across three models with window sizes 8-128, the difference between best and worst defense performance is only 0.84% for Llama3-8B, and only 2% for utility.", 405 "supported": "moderate" 406 }, 407 { 408 "claim": "In-context learning examples are critical for maintaining structured output and model utility.", 409 "evidence": "Figure 3 shows that removing ICL examples causes dramatic accuracy drops: Qwen2-7B TriviaQA drops from 78.00% to 33.33%, SST2 from 94.04% to 23.17%.", 410 "supported": "strong" 411 } 412 ], 413 "methodology_tags": [ 414 "benchmark-eval" 415 ], 416 "key_findings": "The paper proposes a prompt injection defense that leverages rather than suppresses LLMs' instruction-following ability, by prompting models to generate responses with explicit references to executed instructions and then filtering by reference tags. The method reduces ASR to near-zero across 8 models (7B-405B, open and closed-source), 5 attack types, and 3 datasets, outperforming all prompt-engineering baselines and matching or exceeding the fine-tuning defense StruQ. 
The approach is particularly effective against unknown attacks (Fakecom, Combined) where StruQ struggles, and maintains or improves QA utility while incurring only ~1.5% average accuracy loss on sentiment analysis.", 417 "red_flags": [ 418 { 419 "flag": "No error bars or uncertainty quantification", 420 "detail": "All results across 9 tables are reported as single point estimates with no confidence intervals, standard deviations, or significance tests, despite making comparative claims across many conditions." 421 }, 422 { 423 "flag": "No limitations section", 424 "detail": "The paper lacks any discussion of limitations, threats to validity, or scope boundaries. No consideration of what types of attacks might bypass the defense (e.g., adversarial attacks that manipulate the tagging format or mimic the expected output structure)." 425 }, 426 { 427 "flag": "No adaptive attack analysis", 428 "detail": "The threat model assumes attackers have no knowledge of the defense system. No evaluation against adaptive attacks where the attacker knows about the tagging and filtering mechanism and specifically crafts attacks to exploit it." 429 }, 430 { 431 "flag": "Case studies show only successes", 432 "detail": "The three case studies in Section 4.5/Appendix C all demonstrate successful defense. No analysis of the failure cases despite non-zero ASR (up to 7%) in several conditions." 433 }, 434 { 435 "flag": "No code released", 436 "detail": "Despite proposing a prompt-engineering method that would be easy to share, no code repository is provided for reproduction." 437 } 438 ], 439 "cited_papers": [ 440 { 441 "title": "Struq: Defending against prompt injection with structured queries", 442 "authors": ["Sizhe Chen", "Julien Piet", "Chawin Sitawarin", "David Wagner"], 443 "year": 2024, 444 "arxiv_id": "2402.06363", 445 "relevance": "Fine-tuning-based prompt injection defense using adversarial training, serves as the primary fine-tuning baseline in this paper." 
446 }, 447 { 448 "title": "Defending against indirect prompt injection attacks with spotlighting", 449 "authors": ["Keegan Hines", "Gary Lopez", "Matthew Hall", "Federico Zarfati", "Yonatan Zunger", "Emre Kiciman"], 450 "year": 2024, 451 "arxiv_id": "2403.14720", 452 "relevance": "Prompt-engineering defense using special tokens to delineate data content, serves as a baseline defense method." 453 }, 454 { 455 "title": "Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection", 456 "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra", "Christoph Endres", "Thorsten Holz", "Mario Fritz"], 457 "year": 2023, 458 "relevance": "Foundational work on indirect prompt injection attacks targeting LLM-integrated applications." 459 }, 460 { 461 "title": "Ignore previous prompt: Attack techniques for language models", 462 "authors": ["Fábio Perez", "Ian Ribeiro"], 463 "year": 2022, 464 "arxiv_id": "2211.09527", 465 "relevance": "Early work on prompt injection attack techniques including the 'ignore' attack method used as a baseline." 466 }, 467 { 468 "title": "The instruction hierarchy: Training LLMs to prioritize privileged instructions", 469 "authors": ["Eric Wallace", "Kai Xiao", "Reimar Leike", "Lilian Weng", "Johannes Heidecke", "Alex Beutel"], 470 "year": 2024, 471 "arxiv_id": "2404.13208", 472 "relevance": "Fine-tuning approach to teach LLMs to prioritize system-level instructions over user-level ones for prompt injection defense." 473 }, 474 { 475 "title": "Universal and transferable adversarial attacks on aligned language models", 476 "authors": ["Andy Zou", "Zifan Wang", "Nicholas Carlini", "Milad Nasr", "J. Zico Kolter", "Matt Fredrikson"], 477 "year": 2023, 478 "arxiv_id": "2307.15043", 479 "relevance": "GCG attack method for gradient-based adversarial suffix optimization against LLMs, used as a gradient-based attack baseline." 
480 }, 481 { 482 "title": "AutoDAN: interpretable gradient-based adversarial attacks on large language models", 483 "authors": ["Sicheng Zhu", "Ruiyi Zhang", "Bang An", "Gang Wu", "Joe Barrow", "Zichao Wang", "Furong Huang", "Ani Nenkova", "Tong Sun"], 484 "year": 2023, 485 "arxiv_id": "2310.15140", 486 "relevance": "Gradient-based adversarial attack method for LLMs, used as a gradient-based attack baseline." 487 }, 488 { 489 "title": "Aligning LLMs to be robust against prompt injection", 490 "authors": ["Sizhe Chen", "Arman Zharmagambetov", "Saeed Mahloujifar", "Kamalika Chaudhuri", "Chuan Guo"], 491 "year": 2024, 492 "arxiv_id": "2410.05451", 493 "relevance": "Fine-tuning approach to align LLMs against prompt injection, related defense method." 494 }, 495 { 496 "title": "InjecAgent: Benchmarking indirect prompt injections in tool-integrated large language model agents", 497 "authors": ["Qiusi Zhan", "Zhixiang Liang", "Zifan Ying", "Daniel Kang"], 498 "year": 2024, 499 "arxiv_id": "2403.02691", 500 "relevance": "Benchmark for evaluating indirect prompt injection in agentic LLM systems." 501 }, 502 { 503 "title": "FATH: Authentication-based test-time defense against indirect prompt injection attacks", 504 "authors": ["Jiongxiao Wang", "Fangzhou Wu", "Wendi Li", "Jinsheng Pan", "Edward Suh", "Z Morley Mao", "Muhao Chen", "Chaowei Xiao"], 505 "year": 2024, 506 "arxiv_id": "2410.21492", 507 "relevance": "Authentication-based defense mechanism against indirect prompt injection attacks." 508 }, 509 { 510 "title": "Can indirect prompt injection attacks be detected and removed?", 511 "authors": ["Yulin Chen", "Haoran Li", "Yuan Sui", "Yufei He", "Yue Liu", "Yangqiu Song", "Bryan Hooi"], 512 "year": 2025, 513 "relevance": "Constructs the Inj-SQuAD and Inj-TriviaQA datasets used for indirect attack evaluation in this paper." 
514 }, 515 { 516 "title": "MELON: Indirect prompt injection defense via masked re-execution and tool comparison", 517 "authors": ["Kaijie Zhu", "Xianjun Yang", "Jindong Wang", "Wenbo Guo", "William Yang Wang"], 518 "year": 2025, 519 "arxiv_id": "2502.05174", 520 "relevance": "Defense method against indirect prompt injection using masked re-execution, related approach." 521 }, 522 { 523 "title": "Automatic and universal prompt injection attacks against large language models", 524 "authors": ["Xiaogeng Liu", "Zhiyuan Yu", "Yizhe Zhang", "Ning Zhang", "Chaowei Xiao"], 525 "year": 2024, 526 "arxiv_id": "2403.04957", 527 "relevance": "Universal prompt injection attack leveraging GCG optimization, relevant to the gradient-based attack evaluations." 528 } 529 ], 530 "engagement_factors": { 531 "practical_relevance": { 532 "score": 2, 533 "justification": "The method is a prompt-engineering defense applicable to any LLM application, requiring only prompt changes and output filtering — directly usable by practitioners." 534 }, 535 "surprise_contrarian": { 536 "score": 1, 537 "justification": "Proposes leveraging instruction-following ability rather than suppressing it, a modest reversal of the dominant defense paradigm, but not a deeply surprising result." 538 }, 539 "fear_safety": { 540 "score": 2, 541 "justification": "Directly addresses prompt injection attacks which are a major LLM security concern, including phishing, propaganda, and goal hijacking scenarios." 542 }, 543 "drama_conflict": { 544 "score": 0, 545 "justification": "No controversy, no critique of specific companies or products, straightforward academic contribution." 546 }, 547 "demo_ability": { 548 "score": 1, 549 "justification": "The full prompt is provided so someone could implement it manually, but no code repository, demo, or installable tool is released." 
550 }, 551 "brand_recognition": { 552 "score": 1, 553 "justification": "From NUS and HKUST — well-known academic institutions but not celebrity AI labs. Testing on popular models (Llama, GPT) adds some recognition." 554 } 555 } 556 }