scan.json (27260B)
1 { 2 "paper": { 3 "title": "AgentDojo: A Dynamic Environment to Evaluate Prompt Injection Attacks and Defenses for LLM Agents", 4 "authors": [ 5 "Edoardo Debenedetti", 6 "Jie Zhang", 7 "Mislav Balunovic", 8 "Luca Beurer-Kellner", 9 "Marc Fischer", 10 "Florian Tramèr" 11 ], 12 "year": 2024, 13 "venue": "NeurIPS 2024 Track on Datasets and Benchmarks", 14 "arxiv_id": "2406.13352" 15 }, 16 "checklist": { 17 "artifacts": { 18 "code_released": { 19 "applies": true, 20 "answer": true, 21 "justification": "Code is released under MIT license at https://github.com/ethz-spylab/agentdojo (Section 1 and Appendix E.1). A Zenodo DOI (10.5281/zenodo.12528188) is also provided." 22 }, 23 "data_released": { 24 "applies": true, 25 "answer": true, 26 "justification": "The benchmark tasks, tools, and environment state are released via the GitHub repository. Model outputs and conversations used to generate results are released as JSON files on Google Drive (Appendix E.3)." 27 }, 28 "environment_specified": { 29 "applies": true, 30 "answer": true, 31 "justification": "A requirements.txt file is explicitly mentioned in Appendix E.3: 'we additionally include a requirements.txt file that can be used to install the exact dependencies we used for the experimental results in the paper.'" 32 }, 33 "reproduction_instructions": { 34 "applies": true, 35 "answer": true, 36 "justification": "Appendix E.3 states the GitHub README provides 'extensive documentation on how to use our framework (including how to run the existing benchmark, create new tools, task, etc.)' and includes Jupyter Notebooks that reproduce all figures and tables." 37 } 38 }, 39 "statistical_methodology": { 40 "confidence_intervals_or_error_bars": { 41 "applies": true, 42 "answer": true, 43 "justification": "95% confidence intervals are reported for main results. 
The checklist confirms: 'We report 95% confidence intervals of our experiment by using statsmodels.stats.proportion.proportion_confint either in the plots, or in the tables in the appendix.' Tables 3, 4, and 5 all include ±CI values." 44 }, 45 "significance_tests": { 46 "applies": false, 47 "answer": false, 48 "justification": "This is a benchmark evaluation paper that reports performance metrics across systems; it does not make formal claims of statistical superiority requiring significance tests. Confidence intervals are reported but no formal significance testing framework is applied." 49 }, 50 "effect_sizes_reported": { 51 "applies": true, 52 "answer": true, 53 "justification": "Absolute percentage differences between conditions are reported throughout (e.g., targeted ASR drops from 57.69% to 6.84% with tool filter defense in Table 5), providing enough context to assess practical effect magnitude." 54 }, 55 "sample_size_justified": { 56 "applies": true, 57 "answer": false, 58 "justification": "The benchmark uses 97 user tasks and 629 security test cases, but no explicit power analysis or justification for why these numbers were chosen is provided. The tasks were designed to cover scenario diversity, not for statistical power." 59 }, 60 "variance_reported": { 61 "applies": true, 62 "answer": true, 63 "justification": "95% confidence intervals are reported for all major results in Tables 3, 4, and 5, providing spread information. Figure 6 captions also note that CI values are in the corresponding tables." 64 } 65 }, 66 "evaluation_design": { 67 "baselines_included": { 68 "applies": true, 69 "answer": true, 70 "justification": "Multiple attack baselines are compared: 'Ignore previous instructions', 'TODO', InjecAgent, and 'Important message' (Section 4.2, Figure 8, Table 4). Multiple defense baselines are also compared (Section 4.3, Figure 9, Table 5)." 
71 }, 72 "baselines_contemporary": { 73 "applies": true, 74 "answer": true, 75 "justification": "Baselines include contemporary prompt injection techniques from 2023-2024 literature (InjecAgent 2024, Spotlighting 2024, ProtectAI 2024). LLM evaluations include Claude 3.5 Sonnet, GPT-4o, Gemini 1.5 Pro - all state-of-the-art at time of publication." 76 }, 77 "ablation_study": { 78 "applies": true, 79 "answer": true, 80 "justification": "Section 4.2 presents ablations on attacker knowledge (Table 2) showing impact of knowing user name and model name on attack success rate. Section 4.3 ablates multiple defense strategies." 81 }, 82 "multiple_metrics": { 83 "applies": true, 84 "answer": true, 85 "justification": "Three metrics are explicitly defined (Section 3.4): Benign Utility, Utility Under Attack, and Targeted Attack Success Rate (ASR). Results are reported across all three." 86 }, 87 "human_evaluation": { 88 "applies": false, 89 "answer": false, 90 "justification": "The paper uses deterministic utility functions to evaluate agent task completion rather than human evaluation. The paper explicitly notes this as an advantage: 'our setting since we study attacks that explicitly aim to inject new instructions into a model. Thus, if such an attack were particularly successful, there is a chance that it would also hijack the evaluation model.'" 91 }, 92 "held_out_test_set": { 93 "applies": false, 94 "answer": false, 95 "justification": "This paper introduces a benchmark framework and evaluates pre-trained LLMs via API. No model training or tuning is performed on any portion of the data. There is no dev/test split because there is no development phase using the benchmark data. The concept of a held-out test set is structurally inapplicable." 
96 }, 97 "per_category_breakdown": { 98 "applies": true, 99 "answer": true, 100 "justification": "Results are broken down by task suite (Workspace, Slack, Travel, Banking) in Figure 7 showing attack success rates per injection task and suite. Figure 20 also shows utility by suite for different attacks." 101 }, 102 "failure_cases_discussed": { 103 "applies": true, 104 "answer": true, 105 "justification": "Specific failure cases are discussed: 'task 6 of our travel agent suite succeeds in 0% of cases' because the injection requires two unrelated actions. The Slack suite's 92% success rate is analyzed. Defense failure modes are discussed in Section 4.3." 106 }, 107 "negative_results_reported": { 108 "applies": true, 109 "answer": true, 110 "justification": "Multiple negative results: incorrect attacker knowledge weakens attacks significantly (wrong user guess: -22.6%, Table 2). Prompt injection detector has too many false positives and degrades utility (Section 4.3). Denial-of-service attacks are less effective than targeted attacks (Figure 20)." 111 } 112 }, 113 "claims_and_evidence": { 114 "abstract_claims_supported": { 115 "applies": true, 116 "answer": true, 117 "justification": "Abstract claims are well-supported: '97 realistic tasks', '629 security test cases' confirmed in Table 1. 'LLMs fail at many tasks' confirmed by benign utility <66% in Table 3. 'Attacks break some security properties but not all' confirmed by per-task ASR breakdowns in Figure 7." 118 }, 119 "causal_claims_justified": { 120 "applies": true, 121 "answer": true, 122 "justification": "Causal claims are made via ablations with controlled single-variable manipulations (e.g., varying only attacker knowledge in Table 2, varying only defense type in Table 5). These controlled comparisons are adequate for the causal claims made." 
123 }, 124 "generalization_bounded": { 125 "applies": true, 126 "answer": true, 127 "justification": "The paper is appropriately scoped: results are reported for specific models and the AgentDojo benchmark. The paper explicitly notes limitations like 'settings which AgentDojo does not cover yet' (Section 4.3). Claims are about AI agents in the tested environments, not general AI systems." 128 }, 129 "alternative_explanations_discussed": { 130 "applies": true, 131 "answer": true, 132 "justification": "The paper discusses alternative explanations for the inverse scaling finding ('This is a potentially unsurprising result, as models with low utility often fail at correctly executing the attacker's goal, even when the prompt injection succeeds'). The high Slack ASR is explained by attacker's large fraction of tool output control." 133 } 134 }, 135 "setup_transparency": { 136 "model_versions_specified": { 137 "applies": true, 138 "answer": false, 139 "justification": "The paper refers to most models by marketing names only: 'Claude 3 Sonnet', 'GPT-4o', 'Gemini 1.5 Pro', etc. Only one code figure (Figure 12) shows 'gpt-4o-2024-05-13'. The schema explicitly states 'marketing names like Gemini-2.5 or GPT-4o without a snapshot date or API version do NOT count as specified versions.' Nine out of ten models lack snapshot dates or API version identifiers in the paper text." 140 }, 141 "prompts_provided": { 142 "applies": true, 143 "answer": true, 144 "justification": "Full prompt text is provided in Appendix B: the default system prompt (Figure 14), the additional Claude Sonnet prompt (Figure 15), the Llama 3 70B prompt (Figure 16), defense prompts (Figures 17-18), and all four attack prompts (Figure 19) are reproduced verbatim." 145 }, 146 "hyperparameters_reported": { 147 "applies": true, 148 "answer": false, 149 "justification": "The paper does not report temperature, top-p, or other sampling hyperparameters for the LLM API calls. 
Model APIs are used 'following the respective documentation' but specific settings like temperature are not stated." 150 }, 151 "scaffolding_described": { 152 "applies": true, 153 "answer": true, 154 "justification": "The agentic scaffolding is described in detail: Section 3.2 explains the pipeline interface, Figure 11 shows the base component, Figure 12 shows the pipeline combining LLM with injection detector, and Section 3.3 explains how attacks integrate with the scaffold." 155 }, 156 "data_preprocessing_documented": { 157 "applies": true, 158 "answer": true, 159 "justification": "The benchmark data was 'generated manually or assisted by GPT-4o and Claude 3 Opus, by providing the models with the expected schema of the data and a few examples. For LLM-generated test data we manually inspected all outputs to ensure high quality.' (Section 3.1)" 160 } 161 }, 162 "limitations_and_scope": { 163 "limitations_section_present": { 164 "applies": true, 165 "answer": true, 166 "justification": "Section 5 (Conclusion) includes a dedicated discussion of limitations and future work, including five explicitly labeled avenues for improvement. Section 4.3 discusses specific failure modes of defenses." 167 }, 168 "threats_to_validity_specific": { 169 "applies": true, 170 "answer": true, 171 "justification": "Specific threats are identified: defense failure when tool planning must adapt to tool outputs; defense failure when needed tools also suffice for attacks (17% of test cases); multi-session attack scenarios not yet covered; limitations of automated task generation for scaling." 172 }, 173 "scope_boundaries_stated": { 174 "applies": true, 175 "answer": true, 176 "justification": "The paper explicitly states 'settings which AgentDojo does not cover yet' such as multi-session scenarios. It notes attacks are 'general-purpose and not designed specifically for any given tasks or security scenarios.' The paper acknowledges tasks are not from real deployments." 
177 } 178 }, 179 "data_integrity": { 180 "raw_data_available": { 181 "applies": true, 182 "answer": true, 183 "justification": "All model outputs and conversations are released as JSON files on Google Drive (Appendix E.3). The benchmark code and environment state data are on GitHub. Zenodo DOI 10.5281/zenodo.12528188 is provided." 184 }, 185 "data_collection_described": { 186 "applies": true, 187 "answer": true, 188 "justification": "Task creation is described: tasks were manually designed to cover diverse scenarios, with dummy data generated manually or via GPT-4o/Claude 3 Opus with manual inspection. The four environments and their content are described in detail (Section 3.1 and Table 1)." 189 }, 190 "recruitment_methods_described": { 191 "applies": false, 192 "answer": false, 193 "justification": "No human participants or crowd-sourced contributors are involved. The benchmark uses synthetically generated data and tasks designed by the paper authors. The data card confirms 'No real data. Only dummy data.'" 194 }, 195 "data_pipeline_documented": { 196 "applies": true, 197 "answer": true, 198 "justification": "The pipeline from task design to evaluation is documented: task design (Section 3.1), cross-product to obtain security test cases (Section 3), environment state management (Section 3.1). The deterministic utility functions for evaluation are shown in code figures." 199 } 200 }, 201 "conflicts_of_interest": { 202 "funding_disclosed": { 203 "applies": true, 204 "answer": true, 205 "justification": "Acknowledgments state: 'E.D. is supported by armasuisse Science and Technology' and 'J.Z. is funded by the Swiss National Science Foundation (SNSF) project grant 214838.' 
The data card also confirms: 'No institution provided explicit funding for the creation of this benchmark.'" 206 }, 207 "affiliations_disclosed": { 208 "applies": true, 209 "answer": true, 210 "justification": "Author affiliations are disclosed on the title page: ETH Zurich (all authors) and Invariant Labs (Balunovic, Beurer-Kellner, Fischer). Invariant Labs is a startup building AI security products." 211 }, 212 "funder_independent_of_outcome": { 213 "applies": true, 214 "answer": false, 215 "justification": "Mislav Balunovic, Luca Beurer-Kellner, and Marc Fischer are affiliated with Invariant Labs, a company that builds AI security tools (prompt injection defenses). They evaluate their own defense concepts in AgentDojo. armasuisse is a defense procurement agency — some interest in AI security results is plausible." 216 }, 217 "financial_interests_declared": { 218 "applies": true, 219 "answer": false, 220 "justification": "No competing interests statement or declaration of financial interests appears in the paper. Three authors are affiliated with Invariant Labs, a commercial entity in the AI security space, but this financial interest is not explicitly declared beyond the affiliation listing." 221 } 222 }, 223 "contamination": { 224 "training_cutoff_stated": { 225 "applies": false, 226 "answer": false, 227 "justification": "AgentDojo tests prompt injection attack/defense robustness in a dynamic tool-calling environment, not model knowledge on a static benchmark. The schema states applies=false for 'studies that test defenses/tools rather than model knowledge.' The tasks require multi-step stateful tool interaction where contamination is not a meaningful concern — knowing the task description would not help a model execute the correct tool calls in the correct order on the specific environment state." 228 }, 229 "train_test_overlap_discussed": { 230 "applies": false, 231 "answer": false, 232 "justification": "Same reasoning as training_cutoff_stated. 
This is a security benchmark testing defenses and attack robustness, not model knowledge. Train/test overlap is structurally irrelevant for evaluating whether a model follows injected instructions versus resisting them." 233 }, 234 "benchmark_contamination_addressed": { 235 "applies": false, 236 "answer": false, 237 "justification": "Same reasoning as the other contamination items. The benchmark was newly created in 2024 and tests dynamic security properties in stateful environments. Even theoretical contamination would not meaningfully affect the measured outcomes (attack success rates, defense effectiveness)." 238 } 239 }, 240 "human_studies": { 241 "pre_registered": { 242 "applies": false, 243 "answer": false, 244 "justification": "No human participants. The paper explicitly marks human subjects sections as N/A in the NeurIPS checklist." 245 }, 246 "irb_or_ethics_approval": { 247 "applies": false, 248 "answer": false, 249 "justification": "No human participants. Only synthetically generated data and automated LLM evaluations." 250 }, 251 "demographics_reported": { 252 "applies": false, 253 "answer": false, 254 "justification": "No human participants involved." 255 }, 256 "inclusion_exclusion_criteria": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human participants involved." 260 }, 261 "randomization_described": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants involved." 265 }, 266 "blinding_described": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants involved." 270 }, 271 "attrition_reported": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants involved." 
275 } 276 }, 277 "cost_and_practicality": { 278 "inference_cost_reported": { 279 "applies": true, 280 "answer": true, 281 "justification": "Appendix D explicitly states: 'We estimate that running the full suite of 629 security test cases on GPT-4o costs around US$35, and running the suite of 97 utility test cases costs US$4.'" 282 }, 283 "compute_budget_stated": { 284 "applies": true, 285 "answer": true, 286 "justification": "The total API cost is reported ($35 for the full security test suite, $4 for utility cases on GPT-4o). No GPU compute is required as only API-accessed models are evaluated." 287 } 288 } 289 }, 290 "claims": [ 291 { 292 "claim": "Current LLMs solve less than 66% of AgentDojo tasks in the absence of any attack.", 293 "evidence": "Table 3 (Appendix C) shows benign utility scores: Claude 3.5 Sonnet 78.22%, GPT-4o 69.00%, Claude 3 Opus 66.61%, with many models substantially lower. The 66% bound is stated in Section 1, although the three top models listed here all score above it.", 294 "supported": "strong" 295 }, 296 { 297 "claim": "More capable models tend to be easier to attack (inverse scaling law for prompt injection vulnerability).", 298 "evidence": "Figure 6a shows a positive correlation between benign utility and targeted ASR. Claude 3.5 Sonnet (highest utility) has 33.86% ASR, GPT-4o 47.69% ASR, while Command-R+ (low utility) has only 0.95% ASR (Table 3).", 299 "supported": "strong" 300 }, 301 { 302 "claim": "The 'Important message' prompt injection significantly outperforms prior attacks including InjecAgent and 'Ignore previous instructions'.", 303 "evidence": "Table 4 shows targeted ASR: TODO 3.66%, Ignore previous 5.41%, InjecAgent 5.72%, Important message 57.7% (all with 95% CI). 
Figure 8 visualizes this comparison.", 304 "supported": "strong" 305 }, 306 { 307 "claim": "Tool filter defense reduces targeted attack success rate to approximately 7%.", 308 "evidence": "Table 5 shows tool filter achieves 6.84% targeted ASR vs 57.69% with no defense, while maintaining 73.13% benign utility.", 309 "supported": "strong" 310 }, 311 { 312 "claim": "Correct attacker knowledge of user name and model name only marginally improves attack success rate.", 313 "evidence": "Table 2 shows baseline ASR of 45.8%; adding correct user and model names yields 47.7% (+1.9%). Incorrect guesses significantly weaken attacks (wrong user: 23.2%, wrong model: 23.7%).", 314 "supported": "strong" 315 }, 316 { 317 "claim": "Injections placed at the end of tool responses are most effective, with up to 70% ASR against GPT-4o.", 318 "evidence": "Figure 21a shows attack success rate as a function of injection position, with end-of-context injections reaching ~70% ASR. This replicates prior observations from [70].", 319 "supported": "moderate" 320 } 321 ], 322 "methodology_tags": [ 323 "benchmark-eval" 324 ], 325 "key_findings": "AgentDojo is a dynamic benchmark framework for evaluating prompt injection attacks and defenses in tool-calling LLM agents, comprising 97 user tasks and 629 security test cases across four environments. State-of-the-art LLMs solve at most about 78% of tasks even without attacks, and more capable models exhibit an inverse scaling vulnerability: they are easier to successfully attack. A simple tool-filter defense reduces targeted attack success from 58% to under 7% but fails when task planning cannot be done before observing untrusted data. 
The benchmark is released with full reproducibility artifacts, API cost estimates, and an extensible framework for adding new tasks, attacks, and defenses.", 326 "red_flags": [ 327 { 328 "flag": "Potential conflict of interest not declared", 329 "detail": "Three of six authors (Balunovic, Beurer-Kellner, Fischer) are affiliated with Invariant Labs, a commercial company building AI security tools. The paper evaluates prompt injection defenses that are closely related to Invariant Labs' commercial interests. No competing interests statement is present despite this affiliation." 330 }, 331 { 332 "flag": "LLM model versions partially underspecified", 333 "detail": "While the code uses specific snapshot versions (e.g., 'gpt-4o-2024-05-13'), the main paper text refers to models by marketing names ('GPT-4o', 'Claude 3.5 Sonnet', 'Gemini 1.5 Pro') without explicit snapshot dates for all models, making exact reproduction uncertain as model APIs evolve." 334 }, 335 { 336 "flag": "Benchmark contamination risk not addressed", 337 "detail": "The paper does not discuss training data cutoffs for any evaluated model or assess whether models could have trained on similar prompt injection scenarios. Though the tasks are newly created, the lack of any contamination discussion is a gap." 338 } 339 ], 340 "cited_papers": [ 341 { 342 "title": "InjecAgent: Benchmarking Indirect Prompt Injections in Tool-Integrated Large Language Model Agents", 343 "authors": [ 344 "Qiusi Zhan", 345 "Zhixiang Liang", 346 "Zifan Ying", 347 "Daniel Kang" 348 ], 349 "year": 2024, 350 "arxiv_id": "2403.02691", 351 "relevance": "Direct predecessor benchmark for prompt injection in tool-calling LLM agents; AgentDojo is explicitly designed to improve on InjecAgent's limitations." 
352 }, 353 { 354 "title": "AgentBench: Evaluating LLMs as Agents", 355 "authors": [ 356 "Xiao Liu" 357 ], 358 "year": 2023, 359 "arxiv_id": "2308.03688", 360 "relevance": "Multi-turn agent evaluation benchmark used as prior work context for AgentDojo's benchmark design." 361 }, 362 { 363 "title": "WebArena: A Realistic Web Environment for Building Autonomous Agents", 364 "authors": [ 365 "Shuyan Zhou", 366 "Frank F. Xu", 367 "Hao Zhu" 368 ], 369 "year": 2023, 370 "arxiv_id": "2307.13854", 371 "relevance": "Realistic multi-turn web agent benchmark providing context for AgentDojo's approach to realistic agent evaluation." 372 }, 373 { 374 "title": "Identifying the Risks of LM Agents with an LM-Emulated Sandbox", 375 "authors": [ 376 "Yangjun Ruan", 377 "Honghua Dong", 378 "Andrew Wang" 379 ], 380 "year": 2024, 381 "relevance": "ToolEmu benchmark for evaluating LLM agent robustness; AgentDojo is designed to address ToolEmu's limitations when evaluating prompt injection attacks." 382 }, 383 { 384 "title": "Benchmarking and Defending Against Indirect Prompt Injection Attacks on Large Language Models", 385 "authors": [ 386 "Jingwei Yi", 387 "Yueqi Xie", 388 "Bin Zhu" 389 ], 390 "year": 2023, 391 "arxiv_id": "2312.14197", 392 "relevance": "Prior work on indirect prompt injection benchmarking that AgentDojo builds upon and extends to tool-calling multi-turn agent scenarios." 393 }, 394 { 395 "title": "JailbreakBench: An Open Robustness Benchmark for Jailbreaking Large Language Models", 396 "authors": [ 397 "Patrick Chao", 398 "Edoardo Debenedetti", 399 "Alexander Robey" 400 ], 401 "year": 2024, 402 "arxiv_id": "2404.01318", 403 "relevance": "Standardized adversarial benchmark for LLMs, used as a methodological reference for AgentDojo's benchmark design philosophy." 
404 }, 405 { 406 "title": "HarmBench: A Standardized Evaluation Framework for Automated Red Teaming and Robust Refusal", 407 "authors": [ 408 "Mantas Mazeika", 409 "Long Phan", 410 "Xuwang Yin" 411 ], 412 "year": 2024, 413 "arxiv_id": "2402.04249", 414 "relevance": "Standardized red-teaming evaluation framework, contextualizing AgentDojo's approach to adversarial agent evaluation." 415 }, 416 { 417 "title": "Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection", 418 "authors": [ 419 "Kai Greshake", 420 "Sahar Abdelnabi", 421 "Shailesh Mishra", 422 "Christoph Endres", 423 "Thorsten Holz", 424 "Mario Fritz" 425 ], 426 "year": 2023, 427 "relevance": "Key paper establishing indirect prompt injection as a security threat for LLM-integrated applications, motivating AgentDojo's evaluation framework." 428 }, 429 { 430 "title": "The Instruction Hierarchy: Training LLMs to Prioritize Privileged Instructions", 431 "authors": [ 432 "Eric Wallace", 433 "Kai Xiao", 434 "Reimar Leike", 435 "Lilian Weng", 436 "Johannes Heidecke", 437 "Alex Beutel" 438 ], 439 "year": 2024, 440 "arxiv_id": "2404.13208", 441 "relevance": "Defense approach training LLMs to resist prompt injection by prioritizing privileged instructions, directly relevant to AgentDojo's defense evaluation." 442 }, 443 { 444 "title": "Evaluating Language-Model Agents on Realistic Autonomous Tasks", 445 "authors": [ 446 "Megan Kinniment", 447 "Lucas Jun Koba Sato", 448 "Haoxing Du" 449 ], 450 "year": 2023, 451 "arxiv_id": "2312.11671", 452 "relevance": "METR evaluation of LLM agents on realistic autonomous tasks, providing context for AgentDojo's approach to evaluating agentic capability." 
453 }, 454 { 455 "title": "ReAct: Synergizing Reasoning and Acting in Language Models", 456 "authors": [ 457 "Shunyu Yao", 458 "Jeffrey Zhao", 459 "Dian Yu" 460 ], 461 "year": 2022, 462 "arxiv_id": "2210.03629", 463 "relevance": "Agent architecture combining reasoning and tool use that underpins the tool-calling agent paradigm AgentDojo evaluates." 464 }, 465 { 466 "title": "Defending Against Indirect Prompt Injection Attacks With Spotlighting", 467 "authors": [ 468 "Keegan Hines", 469 "Gary Lopez", 470 "Matthew Hall" 471 ], 472 "year": 2024, 473 "arxiv_id": "2403.14720", 474 "relevance": "Data delimiting defense against prompt injection evaluated as a baseline in AgentDojo." 475 } 476 ] 477 }