scan.json (30391B)
1 { 2 "paper": { 3 "title": "FATH: Authentication-based Test-time Defense against Indirect Prompt Injection Attacks", 4 "authors": [ 5 "Jiongxiao Wang", 6 "Fangzhou Wu", 7 "Wendi Li", 8 "Jinsheng Pan", 9 "Edward Suh", 10 "Z. Morley Mao", 11 "Muhao Chen", 12 "Chaowei Xiao" 13 ], 14 "year": 2024, 15 "venue": "arXiv.org", 16 "arxiv_id": "2410.21492", 17 "doi": "10.48550/arXiv.2410.21492" 18 }, 19 "scan_version": 3, 20 "active_modules": ["experimental_rigor", "data_leakage"], 21 "methodology_tags": ["benchmark-eval"], 22 "key_findings": "FATH, an authentication-based test-time defense against indirect prompt injection, achieves near 0% attack success rate on GPT-3.5 across all tested attack methods including adaptive and optimization-based attacks. The method uses HMAC-based dynamic tags to label LLM responses and rule-based parsing to filter outputs, outperforming four existing test-time defenses. However, defense effectiveness is weaker on Llama3-8B (26-34% ASR under adaptive attacks), and the method incurs a Judge Score penalty of ~1.0-1.6 points, indicating reduced generation quality.", 23 "checklist": { 24 "artifacts": { 25 "code_released": { 26 "applies": true, 27 "answer": true, 28 "justification": "The abstract explicitly states: 'Our code is released at: https://github.com/Jayfeather1024/FATH'." 29 }, 30 "data_released": { 31 "applies": true, 32 "answer": true, 33 "justification": "All benchmark data is publicly available: Stanford Alpaca (Apache-2.0), QA from Zverev et al. 2024 (CC BY 4.0), CLF from OpenPromptInjection (CC BY 4.0), InjecAgent (MIT), and URLs generated by the Python 'fake' package (MIT). Appendix G documents all dataset details and licenses." 
34 }, 35 "environment_specified": { 36 "applies": true, 37 "answer": false, 38 "justification": "The paper mentions 'Meta-Llama-3-8B-Instruct with 1x NVIDIA A100 GPU' and 'gpt-3.5-turbo with OpenAI API' but provides no requirements.txt, Dockerfile, or detailed library version list for dependencies like the hmac package or Sentence Transformers." 39 }, 40 "reproduction_instructions": { 41 "applies": true, 42 "answer": false, 43 "justification": "While the GitHub repository is provided and prompt templates are given in appendices, the paper itself does not include step-by-step reproduction instructions (commands to run, scripts to execute, or a 'Reproducing Results' section)." 44 } 45 }, 46 "statistical_methodology": { 47 "confidence_intervals_or_error_bars": { 48 "applies": true, 49 "answer": false, 50 "justification": "All ASR values in Tables 2 and 3 are reported as point estimates (e.g., 0.08, 0.00, 0.26) with no confidence intervals or error bars." 51 }, 52 "significance_tests": { 53 "applies": true, 54 "answer": false, 55 "justification": "The paper claims FATH outperforms all baselines based on comparing raw ASR numbers without any statistical significance tests (no p-values, no t-tests, no bootstrap tests)." 56 }, 57 "effect_sizes_reported": { 58 "applies": true, 59 "answer": true, 60 "justification": "Results in Tables 2 and 3 show both baseline and defense ASR values, providing clear context for the magnitude of improvement (e.g., No Defense ASR 0.60 → FATH ASR 0.00 for GPT-3.5 URL Injection under Combined Attack)." 61 }, 62 "sample_size_justified": { 63 "applies": true, 64 "answer": false, 65 "justification": "The paper uses 100 text examples per injection task for OpenPromptInjection+ and 510 for InjecAgent without justifying these sample sizes or performing power analysis." 66 }, 67 "variance_reported": { 68 "applies": true, 69 "answer": false, 70 "justification": "No variance, standard deviation, or multiple-run results are reported. 
All results appear to be single-run point estimates." 71 } 72 }, 73 "evaluation_design": { 74 "baselines_included": { 75 "applies": true, 76 "answer": true, 77 "justification": "Table 2 compares FATH against four baseline test-time defenses: Instructional Prevention, Sandwich Prevention, Text Instruction Isolation, and ICL Defense, plus a No Defense setting." 78 }, 79 "baselines_contemporary": { 80 "applies": true, 81 "answer": true, 82 "justification": "Baselines are from Liu et al. 2023b and Yi et al. 2023, which are the most recent test-time defense methods for indirect prompt injection at the time of writing." 83 }, 84 "ablation_study": { 85 "applies": true, 86 "answer": true, 87 "justification": "Section 5.6 and Table 4 present ablation studies removing Authentication Tags and Security Policy individually, showing the contribution of each component." 88 }, 89 "multiple_metrics": { 90 "applies": true, 91 "answer": true, 92 "justification": "The paper reports both Attack Success Rate (ASR) for defense effectiveness and Judge Score for generation quality impact (Table 2)." 93 }, 94 "human_evaluation": { 95 "applies": true, 96 "answer": false, 97 "justification": "All evaluation is automated: ASR is computed programmatically and Judge Score uses GPT-3.5 as an LLM judge. No human evaluation of defense effectiveness or output quality." 98 }, 99 "held_out_test_set": { 100 "applies": true, 101 "answer": false, 102 "justification": "The paper does not describe a separate dev/test split. In-context examples are selected via semantic similarity to the user instruction, which means the ICL selection process operates on the same benchmark used for evaluation, without a clear separation." 103 }, 104 "per_category_breakdown": { 105 "applies": true, 106 "answer": true, 107 "justification": "Results are broken down by three injection task categories (URL, QA, CLF), by five attack methods plus adaptive attacks, and across two models (Table 2)." 
108 }, 109 "failure_cases_discussed": { 110 "applies": true, 111 "answer": false, 112 "justification": "While the paper reports that FATH has higher ASR under adaptive attacks on Llama3 (0.26-0.34 in Table 2), it does not analyze specific failure cases or explain why the defense fails in those instances." 113 }, 114 "negative_results_reported": { 115 "applies": true, 116 "answer": true, 117 "justification": "The paper reports that FATH reduces the Judge Score from 8.31 to 6.73 on Llama3 and from 7.94 to 6.91 on GPT-3.5, acknowledging 'a small decrease in the Judge Score.' It also reports that FATH under Llama3 still allows 26-34% ASR under adaptive attacks." 118 } 119 }, 120 "claims_and_evidence": { 121 "abstract_claims_supported": { 122 "applies": true, 123 "answer": true, 124 "justification": "The abstract claims 'near 0% ASR on GPT3.5 for various attack methods, surpassing all previous defenses,' which is confirmed by Table 2 showing 0.00-0.02 ASR across all attacks. The abstract also specifies 'under Llama3 and GPT3.5 models,' appropriately bounding the performance claim." 125 }, 126 "causal_claims_justified": { 127 "applies": true, 128 "answer": true, 129 "justification": "The paper makes causal claims that FATH's components (Authentication Tags, Security Policy) contribute to defense effectiveness. The ablation study in Section 5.6 provides controlled single-variable manipulation evidence, showing removal of Security Policy increases ASR by 30%+ under adaptive attacks (Table 4)." 130 }, 131 "generalization_bounded": { 132 "applies": true, 133 "answer": false, 134 "justification": "While the abstract bounds specific performance claims to 'Llama3 and GPT3.5,' the title and broader claims like 'effectively defend against indirect prompt injection attacks' and 'securing LLM-integrated applications' are not bounded to the two models tested. The Limitations section partially addresses this but the framing remains broader than the evidence." 
135 }, 136 "alternative_explanations_discussed": { 137 "applies": true, 138 "answer": false, 139 "justification": "The paper does not consider alternative explanations for the observed results. For example, it does not discuss whether the low ASR could be due to prompt complexity overwhelming the model rather than the authentication mechanism specifically, or whether the effectiveness is specific to the tested attack patterns." 140 }, 141 "proxy_outcome_distinction": { 142 "applies": true, 143 "answer": true, 144 "justification": "The paper measures Attack Success Rate, which directly corresponds to its claims about defense against prompt injection attacks. The Judge Score metric is also clearly framed as a measure of generation quality impact. No proxy-outcome gap exists." 145 } 146 }, 147 "setup_transparency": { 148 "model_versions_specified": { 149 "applies": true, 150 "answer": false, 151 "justification": "The paper specifies 'Meta-Llama-3-8B-Instruct' (a specific model version) but only 'gpt-3.5-turbo' without a snapshot date or API version for the OpenAI model. GPT-3.5-turbo changes behavior over time without a pinned version." 152 }, 153 "prompts_provided": { 154 "applies": true, 155 "answer": true, 156 "justification": "Full prompt templates are provided in Figures 3, 4, 7, 8, and appendices D, E, with the actual text used. The security policy, in-context examples, attack templates, and adaptive attack prompts are all included with explicit text." 157 }, 158 "hyperparameters_reported": { 159 "applies": true, 160 "answer": false, 161 "justification": "Section 5.2 states 'We set all parameters to default for model generation' without specifying the actual values of temperature, top-p, or other sampling parameters." 162 }, 163 "scaffolding_described": { 164 "applies": false, 165 "answer": false, 166 "justification": "FATH is a prompt-based defense method, not an agentic scaffolding system. 
It uses HMAC tag generation and rule-based parsing, but these are not agentic scaffolding." 167 }, 168 "data_preprocessing_documented": { 169 "applies": true, 170 "answer": true, 171 "justification": "Section 5.1 describes how OpenPromptInjection+ was constructed: selecting examples from Stanford Alpaca with both 'instruction' and 'input', defining three injection task categories (URL, QA, CLF), and details on dataset sources. Appendix G provides dataset licenses and sources." 172 } 173 }, 174 "limitations_and_scope": { 175 "limitations_section_present": { 176 "applies": true, 177 "answer": true, 178 "justification": "A dedicated 'Limitations' section appears after the Conclusion, discussing three specific limitations of FATH." 179 }, 180 "threats_to_validity_specific": { 181 "applies": true, 182 "answer": true, 183 "justification": "The Limitations section discusses threats specific to this study: (1) substantial manual effort for prompt design across applications, (2) reliance on instruction-following ability making FATH ineffective on weaker models like Alpaca, (3) limited benchmark coverage that does not include real tool usage scenarios." 184 }, 185 "scope_boundaries_stated": { 186 "applies": true, 187 "answer": true, 188 "justification": "The paper states specific boundaries: FATH 'may be reduced when applied to LLMs with comparatively weaker instruction-following abilities,' and current benchmarks 'can not provide real tool usage scenarios' so they 'directly provide external text information to simulate the results of tool execution.'" 189 } 190 }, 191 "data_integrity": { 192 "raw_data_available": { 193 "applies": true, 194 "answer": false, 195 "justification": "The paper does not release raw experimental outputs, LLM responses, or per-example evaluation logs for independent verification." 
196 }, 197 "data_collection_described": { 198 "applies": true, 199 "answer": true, 200 "justification": "Section 5.1 and Appendix G describe benchmark construction: 100 examples from Stanford Alpaca per injection task, 510 examples from InjecAgent for direct harm, with sources and licenses documented." 201 }, 202 "recruitment_methods_described": { 203 "applies": false, 204 "answer": false, 205 "justification": "No human participants. All data sources are standard public benchmarks and programmatically generated data." 206 }, 207 "data_pipeline_documented": { 208 "applies": true, 209 "answer": true, 210 "justification": "The paper documents the pipeline from data source selection (Section 5.1), through attack injection (Appendix C), defense prompt construction (Section 4 and appendices), to evaluation metric computation (Section 5.2). Dataset construction for OpenPromptInjection+ is described step by step." 211 } 212 }, 213 "conflicts_of_interest": { 214 "funding_disclosed": { 215 "applies": true, 216 "answer": false, 217 "justification": "No funding acknowledgment or grant information appears in the paper." 218 }, 219 "affiliations_disclosed": { 220 "applies": true, 221 "answer": true, 222 "justification": "Author affiliations are clearly listed: UW-Madison, Huazhong University, University of Rochester, NVIDIA, Cornell University, University of Michigan, UC-Davis." 223 }, 224 "funder_independent_of_outcome": { 225 "applies": true, 226 "answer": false, 227 "justification": "No funding information is disclosed, making it impossible to assess funder independence. One author (Edward Suh) is affiliated with NVIDIA, which has a commercial interest in LLM security." 228 }, 229 "financial_interests_declared": { 230 "applies": true, 231 "answer": false, 232 "justification": "No competing interests or financial interest declaration appears in the paper." 
233 } 234 }, 235 "contamination": { 236 "training_cutoff_stated": { 237 "applies": false, 238 "answer": false, 239 "justification": "This paper tests a defense method against prompt injection attacks, not model knowledge or capability on a benchmark. Contamination of training data with benchmark answers is not the relevant concern." 240 }, 241 "train_test_overlap_discussed": { 242 "applies": false, 243 "answer": false, 244 "justification": "The paper evaluates defense effectiveness, not model knowledge. Whether the model saw similar attack patterns during training is a different concern from traditional train/test contamination." 245 }, 246 "benchmark_contamination_addressed": { 247 "applies": false, 248 "answer": false, 249 "justification": "Same reasoning: the paper tests defense robustness, not whether models memorized benchmark answers." 250 } 251 }, 252 "human_studies": { 253 "pre_registered": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in this study." 257 }, 258 "irb_or_ethics_approval": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in this study." 262 }, 263 "demographics_reported": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in this study." 267 }, 268 "inclusion_exclusion_criteria": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in this study." 272 }, 273 "randomization_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in this study." 277 }, 278 "blinding_described": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants in this study." 282 }, 283 "attrition_reported": { 284 "applies": false, 285 "answer": false, 286 "justification": "No human participants in this study." 
287 } 288 }, 289 "cost_and_practicality": { 290 "inference_cost_reported": { 291 "applies": true, 292 "answer": false, 293 "justification": "FATH requires generating HMAC tags, running semantic search for ICL examples, and querying the LLM with substantially longer prompts (the defense prompt in Figure 3 is very long). No inference cost, latency, or API cost is reported." 294 }, 295 "compute_budget_stated": { 296 "applies": true, 297 "answer": false, 298 "justification": "The paper mentions '1x NVIDIA A100 GPU' for Llama3 but does not state total GPU hours, API costs, or wall-clock time for the experiments." 299 } 300 }, 301 "experimental_rigor": { 302 "seed_sensitivity_reported": { 303 "applies": true, 304 "answer": false, 305 "justification": "All results appear to be from single runs with no seed sensitivity analysis reported." 306 }, 307 "number_of_runs_stated": { 308 "applies": true, 309 "answer": false, 310 "justification": "The paper does not state how many runs produced the reported results. No mention of averaging across runs or number of trials." 311 }, 312 "hyperparameter_search_budget": { 313 "applies": true, 314 "answer": false, 315 "justification": "FATH involves design choices (number of ICL examples, tag generation, prompt template structure) but no hyperparameter search budget is reported." 316 }, 317 "best_config_selection_justified": { 318 "applies": true, 319 "answer": false, 320 "justification": "The paper presents one configuration of FATH (5 tags, N+1 ICL examples) without explaining how this configuration was selected or whether alternatives were tried." 321 }, 322 "multiple_comparison_correction": { 323 "applies": true, 324 "answer": false, 325 "justification": "The paper makes many comparisons across 5 defense methods, 6 attack types, 3 injection tasks, and 2 models, but no correction for multiple comparisons is applied. No statistical tests are used at all." 
326 }, 327 "self_comparison_bias_addressed": { 328 "applies": true, 329 "answer": false, 330 "justification": "The authors propose FATH and evaluate it against their own implementations of baseline defenses without acknowledging potential self-comparison bias." 331 }, 332 "compute_budget_vs_performance": { 333 "applies": true, 334 "answer": false, 335 "justification": "FATH uses significantly longer prompts with ICL examples and security policies compared to simpler baselines, but no compute cost comparison across methods is provided." 336 }, 337 "benchmark_construct_validity": { 338 "applies": true, 339 "answer": false, 340 "justification": "The paper introduces OpenPromptInjection+ but does not discuss whether this benchmark accurately represents real-world prompt injection scenarios. The Limitations section acknowledges that 'current benchmarks can not provide real tool usage scenarios' but does not formally analyze construct validity." 341 }, 342 "scaffold_confound_addressed": { 343 "applies": false, 344 "answer": false, 345 "justification": "FATH is a prompt-level defense, not a scaffold. No scaffolding comparison is involved." 346 } 347 }, 348 "data_leakage": { 349 "temporal_leakage_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "The paper does not discuss whether attack patterns in the benchmarks could have appeared in the training data of GPT-3.5 or Llama3, which could affect how models respond to injections." 353 }, 354 "feature_leakage_addressed": { 355 "applies": true, 356 "answer": false, 357 "justification": "Not discussed. The evaluation setup could potentially leak information about the expected response format through the extensive ICL examples." 358 }, 359 "non_independence_addressed": { 360 "applies": true, 361 "answer": false, 362 "justification": "The paper does not discuss whether the ICL demonstration examples and test examples share structural similarities that could inflate performance." 
363 }, 364 "leakage_detection_method": { 365 "applies": true, 366 "answer": false, 367 "justification": "No concrete leakage detection or prevention method is applied." 368 } 369 } 370 }, 371 "claims": [ 372 { 373 "claim": "FATH achieves near 0% ASR on GPT-3.5 across all attack methods and injection tasks on OpenPromptInjection+.", 374 "evidence": "Table 2 shows 0.00-0.02 ASR for GPT-3.5 under Threat Modeling 1 attacks and 0.00 ASR under adaptive attacks across URL, QA, and CLF injection tasks.", 375 "supported": "strong" 376 }, 377 { 378 "claim": "FATH achieves 0% ASR on both GPT-3.5 and Llama3 on the InjecAgent benchmark for Combined and Adaptive attacks.", 379 "evidence": "Table 3 shows 0.00 ASR for both models under both attack types, compared to 0.993 (Llama3) and 1.00 (GPT-3.5) without defense.", 380 "supported": "strong" 381 }, 382 { 383 "claim": "FATH achieves 0% ASR under optimization-based attacks on Llama3, while the no-defense setting has 70% ASR.", 384 "evidence": "Section 5.5 reports these results with an example of the optimized injection prompt and FATH's successful defense output.", 385 "supported": "moderate" 386 }, 387 { 388 "claim": "FATH surpasses all previous test-time defense methods in defense performance.", 389 "evidence": "Table 2 shows FATH has the lowest ASR across all attack methods for both models compared to Instructional Prevention, Sandwich Prevention, Isolation, and ICL Defense.", 390 "supported": "strong" 391 }, 392 { 393 "claim": "Both Authentication Tags and Security Policy are necessary components for FATH's effectiveness, especially against adaptive attacks.", 394 "evidence": "Table 4 ablation study on GPT-3.5 shows removing Security Policy increases adaptive attack ASR to 34-56%, and removing Authentication Tags increases it to 6-18%, compared to 0% for full FATH.", 395 "supported": "strong" 396 }, 397 { 398 "claim": "FATH defense introduces only a small decrease in generation quality.", 399 "evidence": "Table 2 shows Judge 
Score drops from 8.31 to 6.73 on Llama3 (~19% decrease) and from 7.94 to 6.91 on GPT-3.5 (~13% decrease). Characterizing a 1.0-1.6 point drop as 'small' is debatable.", 400 "supported": "moderate" 401 } 402 ], 403 "red_flags": [ 404 { 405 "flag": "No error bars or multiple runs", 406 "detail": "All results are reported as single-point estimates without confidence intervals, standard deviations, or information about how many runs produced each number. Given that LLM outputs are stochastic, this is a significant omission." 407 }, 408 { 409 "flag": "Small sample sizes without justification", 410 "detail": "Only 100 examples per injection task category are used for OpenPromptInjection+. With ASR values at or near 0%, the precision of these estimates is limited: the smallest observable non-zero ASR is 1% (1 success out of 100), so a reported 0.00 cannot rule out a true ASR of a few percent." 411 }, 412 { 413 "flag": "Inconsistent framing of Llama3 adaptive attack results", 414 "detail": "The paper emphasizes 'near 0% ASR' in framing but FATH allows 26-34% ASR on Llama3 under adaptive attacks (Table 2), which is substantially higher than 'near 0%' and only briefly acknowledged." 415 }, 416 { 417 "flag": "Judge Score penalty downplayed", 418 "detail": "The paper characterizes the Judge Score drop (8.31→6.73 on Llama3, 7.94→6.91 on GPT-3.5) as 'a small decrease,' but a 1.0-1.6 point decline on a 10-point scale represents meaningful degradation of response quality." 419 }, 420 { 421 "flag": "Only two models tested", 422 "detail": "Evaluation is limited to Llama3-8B-Instruct and GPT-3.5-turbo. The Limitations section notes FATH depends on instruction-following ability, but no larger or more capable models (GPT-4, Claude, Llama 70B) are tested." 423 }, 424 { 425 "flag": "Simulated rather than real tool usage evaluation", 426 "detail": "The InjecAgent evaluation simulates tool usage by directly providing external text information rather than testing in a real tool-calling environment, as acknowledged in the Limitations section." 
427 } 428 ], 429 "cited_papers": [ 430 { 431 "title": "Prompt injection attacks and defenses in LLM-integrated applications", 432 "authors": ["Yupei Liu", "Yuqi Jia", "Runpeng Geng", "Jinyuan Jia", "Neil Zhenqiang Gong"], 433 "year": 2023, 434 "arxiv_id": "2310.12815", 435 "relevance": "Proposes the OpenPromptInjection benchmark and several defense methods (Instructional Prevention, Sandwich Prevention, Isolation) used as baselines in this paper." 436 }, 437 { 438 "title": "Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection", 439 "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra", "Christoph Endres", "Thorsten Holz", "Mario Fritz"], 440 "year": 2023, 441 "relevance": "Foundational work on indirect prompt injection attacks against LLM-integrated applications, demonstrating practical exploitation scenarios." 442 }, 443 { 444 "title": "InjecAgent: Benchmarking indirect prompt injections in tool-integrated large language model agents", 445 "authors": ["Qiusi Zhan", "Zhixiang Liang", "Zifan Ying", "Daniel Kang"], 446 "year": 2024, 447 "arxiv_id": "2403.02691", 448 "relevance": "Provides the InjecAgent benchmark for evaluating prompt injection in tool-integrated LLM agents, used as a primary evaluation benchmark." 449 }, 450 { 451 "title": "StruQ: Defending against prompt injection with structured queries", 452 "authors": ["Sizhe Chen", "Julien Piet", "Chawin Sitawarin", "David Wagner"], 453 "year": 2024, 454 "arxiv_id": "2402.06363", 455 "relevance": "Training-time defense against prompt injection using structured queries, representing an alternative defense paradigm." 
456 }, 457 { 458 "title": "Benchmarking and defending against indirect prompt injection attacks on large language models", 459 "authors": ["Jingwei Yi", "Yueqi Xie", "Bin Zhu", "Keegan Hines", "Emre Kiciman", "Guangzhong Sun", "Xing Xie", "Fangzhao Wu"], 460 "year": 2023, 461 "arxiv_id": "2312.14197", 462 "relevance": "Proposes ICL Defense baseline and special token-based training-time defense against indirect prompt injection." 463 }, 464 { 465 "title": "Defending against indirect prompt injection attacks with spotlighting", 466 "authors": ["Keegan Hines", "Gary Lopez", "Matthew Hall", "Federico Zarfati", "Yonatan Zunger", "Emre Kiciman"], 467 "year": 2024, 468 "arxiv_id": "2403.14720", 469 "relevance": "Test-time defense using data spotlighting techniques to help LLMs distinguish instructions from external data." 470 }, 471 { 472 "title": "Automatic and universal prompt injection attacks against large language models", 473 "authors": ["Xiaogeng Liu", "Zhiyuan Yu", "Yizhe Zhang", "Ning Zhang", "Chaowei Xiao"], 474 "year": 2024, 475 "arxiv_id": "2403.04957", 476 "relevance": "Proposes the optimization-based prompt injection framework used as a worst-case attack in FATH evaluation." 477 }, 478 { 479 "title": "ReAct: Synergizing reasoning and acting in language models", 480 "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu", "Nan Du", "Izhak Shafran", "Karthik R Narasimhan", "Yuan Cao"], 481 "year": 2022, 482 "relevance": "Foundational agentic framework used in InjecAgent benchmark for tool-integrated LLM evaluation." 483 }, 484 { 485 "title": "A new era in LLM security: Exploring security concerns in real-world LLM-based systems", 486 "authors": ["Fangzhou Wu", "Ning Zhang", "Somesh Jha", "Patrick McDaniel", "Chaowei Xiao"], 487 "year": 2024, 488 "arxiv_id": "2402.18649", 489 "relevance": "Demonstrates practical security vulnerabilities in LLM systems including exploitation of code interpreter and web access capabilities." 
490 }, 491 { 492 "title": "Tensor Trust: Interpretable prompt injection attacks from an online game", 493 "authors": ["Sam Toyer", "Olivia Watkins", "Ethan Adrian Mendes", "Justin Svegliato", "Luke Bailey", "Tiffany Wang", "Isaac Ong", "Karim Elmaaroufi", "Pieter Abbeel", "Trevor Darrell"], 494 "year": 2023, 495 "relevance": "Crowdsourced prompt injection attacks and defenses providing real-world attack pattern data." 496 }, 497 { 498 "title": "Prompt injection attack against LLM-integrated applications", 499 "authors": ["Yi Liu", "Gelei Deng", "Yuekang Li", "Kailong Wang", "Tianwei Zhang", "Yepang Liu", "Haoyu Wang", "Yan Zheng", "Yang Liu"], 500 "year": 2023, 501 "arxiv_id": "2306.05499", 502 "relevance": "Early systematic study of prompt injection attacks providing attack templates (Naive, Escape Characters, Context Ignoring, Fake Completion) used in this paper." 503 }, 504 { 505 "title": "Mind2Web: Towards a generalist agent for the web", 506 "authors": ["Xiang Deng", "Yu Gu", "Boyuan Zheng", "Shijie Chen", "Sam Stevens", "Boshi Wang", "Huan Sun", "Yu Su"], 507 "year": 2024, 508 "relevance": "Web agent benchmark demonstrating LLM-integrated applications vulnerable to indirect prompt injection." 509 } 510 ], 511 "engagement_factors": { 512 "practical_relevance": { 513 "score": 2, 514 "justification": "FATH can be applied by developers to defend LLM-integrated applications at inference time without model fine-tuning, but requires significant per-application prompt engineering." 515 }, 516 "surprise_contrarian": { 517 "score": 1, 518 "justification": "The authentication-based framing is a novel angle but the core idea of using tags and ICL for defense is incremental over prior work." 519 }, 520 "fear_safety": { 521 "score": 2, 522 "justification": "Addresses prompt injection (ranked #1, LLM01, in the OWASP Top 10 for LLM Applications) and demonstrates both attacks and defenses, raising awareness of LLM security risks." 
523 }, 524 "drama_conflict": { 525 "score": 0, 526 "justification": "No controversy or dramatic claims; straightforward defense method comparison." 527 }, 528 "demo_ability": { 529 "score": 2, 530 "justification": "Code released on GitHub with prompt templates provided in appendices, but not packaged as a pip-installable tool or live demo." 531 }, 532 "brand_recognition": { 533 "score": 1, 534 "justification": "Authors from UW-Madison and NVIDIA; the models tested (GPT-3.5 and Llama3) are well-known but not flagship models." 535 } 536 } 537 }