scan.json (29090B)
1 { 2 "paper": { 3 "title": "RTBAS: Defending LLM Agents Against Prompt Injection and Privacy Leakage", 4 "authors": [ 5 "Peter Yong Zhong", 6 "Siyuan Chen", 7 "Ruiqi Wang", 8 "McKenna McCall", 9 "Ben L. Titzer", 10 "Heather Miller", 11 "Phillip B. Gibbons" 12 ], 13 "year": 2025, 14 "venue": "arXiv.org", 15 "arxiv_id": "2502.08966", 16 "doi": "10.48550/arXiv.2502.08966" 17 }, 18 "scan_version": 3, 19 "active_modules": ["experimental_rigor", "data_leakage"], 20 "methodology_tags": ["benchmark-eval"], 21 "key_findings": "RTBAS adapts information flow control to LLM agent systems, using dependency screening (LM-Judge or attention-based) to selectively propagate security labels and mask irrelevant history regions. On the AgentDojo benchmark, RTBAS achieves 100% integrity against prompt injection attacks with <2% utility degradation. On a synthesized privacy leakage benchmark (37 tasks), RTBAS achieves 8.1% FPR and 10.8% FNR with the LM-Judge screener, substantially outperforming both always-confirm and never-confirm baselines.", 22 "checklist": { 23 "artifacts": { 24 "code_released": { 25 "applies": true, 26 "answer": false, 27 "justification": "No repository URL, code archive, or link to released code is provided anywhere in the paper." 28 }, 29 "data_released": { 30 "applies": true, 31 "answer": false, 32 "justification": "AgentDojo is publicly available, but the authors' augmented labels and the synthesized privacy leakage benchmark (37 test cases across 3 domains) are not released." 33 }, 34 "environment_specified": { 35 "applies": true, 36 "answer": false, 37 "justification": "The paper mentions using GPT-4o, OPT-125m, and Phi-3-Mini-128K but provides no requirements.txt, Dockerfile, or detailed environment specification." 38 }, 39 "reproduction_instructions": { 40 "applies": true, 41 "answer": false, 42 "justification": "No step-by-step reproduction instructions, README, or scripts are provided." 
43 } 44 }, 45 "statistical_methodology": { 46 "confidence_intervals_or_error_bars": { 47 "applies": true, 48 "answer": false, 49 "justification": "All results in Figures 5, 6, 7 and Tables 2, 4 are reported as point estimates without confidence intervals or error bars." 50 }, 51 "significance_tests": { 52 "applies": true, 53 "answer": false, 54 "justification": "Comparative claims between RTBAS and baselines (e.g., 'outperforming state-of-the-art defenses') are based solely on comparing raw numbers with no statistical significance tests." 55 }, 56 "effect_sizes_reported": { 57 "applies": true, 58 "answer": true, 59 "justification": "Effect sizes are reported with baseline context throughout: '10% and 7.4% degradation in utility' (§8.1.2), '<2% degradation', FPR reduced from 29.7% (GPTs) to 8.1% (LM-Judge). Raw utility values and percentage differences are provided." 60 }, 61 "sample_size_justified": { 62 "applies": true, 63 "answer": false, 64 "justification": "79 user tasks for prompt injection (629 test cases) and 37 test cases for privacy leakage are used without any justification for why these sample sizes are sufficient." 65 }, 66 "variance_reported": { 67 "applies": true, 68 "answer": false, 69 "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. All results appear to be single-run numbers." 70 } 71 }, 72 "evaluation_design": { 73 "baselines_included": { 74 "applies": true, 75 "answer": true, 76 "justification": "Multiple baselines are compared: Tool Filter by AgentDojo (SOTA), Naive Tainting, Redact All for prompt injection; Confirm Never, Confirm Every Time (GPTs), and Oracle for privacy leakage (§8.1.1, §8.2)." 77 }, 78 "baselines_contemporary": { 79 "applies": true, 80 "answer": true, 81 "justification": "Tool Filter from AgentDojo (2024) is cited as existing SOTA. They also reference and dismiss PI Detector, Delimiting, and Prompt Sandwiching from recent work as strictly worse than Tool Filter." 
82 }, 83 "ablation_study": { 84 "applies": true, 85 "answer": true, 86 "justification": "Two screener variants (LM-Judge and Attention-Based) are compared against each other and against Naive Tainting and Redact All, which serve as ablated versions of the framework. §8.3.2 directly compares the two screener approaches." 87 }, 88 "multiple_metrics": { 89 "applies": true, 90 "answer": true, 91 "justification": "Prompt injection evaluation uses both Utility and Integrity metrics (§8.1.1). Privacy leakage uses FPR, FNR, and Utility (§8.2). Taint tracking accuracy is also reported (§8.3.1)." 92 }, 93 "human_evaluation": { 94 "applies": true, 95 "answer": false, 96 "justification": "All evaluation is automated through benchmark suites. No human evaluation of the system's defense outputs is included." 97 }, 98 "held_out_test_set": { 99 "applies": true, 100 "answer": true, 101 "justification": "For the attention-based classifier, §7.2 reports '85% train accuracy and 81% test accuracy', indicating a train/test split. AgentDojo provides fixed test cases not used for model development." 102 }, 103 "per_category_breakdown": { 104 "applies": true, 105 "answer": true, 106 "justification": "Results are broken down by task suite (banking, travel, workspace, slack) in Figures 5 and 6, and by domain (Amazon, flight booking, Venmo) in Figure 7." 107 }, 108 "failure_cases_discussed": { 109 "applies": true, 110 "answer": true, 111 "justification": "The Slack suite is discussed as an outlier with detailed explanation of why performance drops (§8.1.2). A workspace test case failure is explained where user-labeled high-integrity content contained injection. Over-tainting of flight booking history is noted (§8.2)." 112 }, 113 "negative_results_reported": { 114 "applies": true, 115 "answer": true, 116 "justification": "The Slack suite's poor performance (33% and 22% utility for Attention and LM-Judge) is reported and explained. 
The paper also reports an over-tainting failure case in the flight booking domain." 117 } 118 }, 119 "claims_and_evidence": { 120 "abstract_claims_supported": { 121 "applies": true, 122 "answer": true, 123 "justification": "Abstract claims of '100% of targeted attacks' prevented are supported by Figure 5 (with one acknowledged exception in workspace involving user-labeled content). '<2% loss of task utility' is supported by the weighted average results. 'Near-oracle performance' on privacy leaks is supported by Table 2 and Figure 7." 124 }, 125 "causal_claims_justified": { 126 "applies": true, 127 "answer": true, 128 "justification": "The paper makes causal claims that RTBAS 'prevents' attacks. The controlled benchmark evaluation design, where the defense mechanism is the only variable changed while model and tasks remain constant, provides adequate basis for these causal claims." 129 }, 130 "generalization_bounded": { 131 "applies": true, 132 "answer": false, 133 "justification": "The title claims 'Defending LLM Agents' broadly, but evaluation uses only GPT-4o as the backend LM and only 4 task suites from AgentDojo plus 37 synthesized tasks. No testing with other LMs, other attack types, or real-world deployments. The paper does not explicitly bound these generalization limits." 134 }, 135 "alternative_explanations_discussed": { 136 "applies": true, 137 "answer": false, 138 "justification": "The Discussion section (§9) addresses labeling burden and cost as limitations, but does not discuss alternative explanations for the observed results. For example, it does not consider whether performance gains could be due to AgentDojo's specific attack patterns or whether the synthesized benchmark's design inherently favors IFC-based approaches." 139 }, 140 "proxy_outcome_distinction": { 141 "applies": true, 142 "answer": true, 143 "justification": "The paper measures Utility (task completion) and Integrity (attack prevention) directly. 
These metrics are well-defined by AgentDojo and map directly to the claims being made. The proxy gap is explicitly addressed: Integrity is defined as 'whether the attacker succeeds in their attacks' (§8.1.1)." 144 } 145 }, 146 "setup_transparency": { 147 "model_versions_specified": { 148 "applies": true, 149 "answer": false, 150 "justification": "The paper specifies 'GPT-4o' without a snapshot date or API version. 'OPT-125m' and 'Phi-3-Mini-128K' are more specific but GPT-4o is the primary evaluation model and its version is not pinned." 151 }, 152 "prompts_provided": { 153 "applies": true, 154 "answer": false, 155 "justification": "The LM-Judge approach is described conceptually (region tagging with «REGION_N» markers, prompt sandwiching) but the actual prompt text used for the judge is not provided. Only the tagging format is shown." 156 }, 157 "hyperparameters_reported": { 158 "applies": true, 159 "answer": false, 160 "justification": "No temperature, top-p, max tokens, or sampling settings are reported for GPT-4o. For the LSTM classifier, they mention 'two-layer LSTM' but report no learning rate, batch size, or training hyperparameters." 161 }, 162 "scaffolding_described": { 163 "applies": true, 164 "answer": true, 165 "justification": "The RTBAS framework is described in extensive detail through Algorithms 1-4, with formal definitions of the screener, redactor, and runtime behavior. The full information flow pipeline from screening to redaction to tool call verification is documented." 166 }, 167 "data_preprocessing_documented": { 168 "applies": true, 169 "answer": true, 170 "justification": "Data labeling for AgentDojo is described in §8.1.1 with key labeling principles (low-integrity for external sources, high-integrity for side-effect tools). The privacy leakage benchmark creation is described in §8.2 with stated principles. Attention data collection from 3,424 pairs is described in §3.3." 
171 } 172 }, 173 "limitations_and_scope": { 174 "limitations_section_present": { 175 "applies": true, 176 "answer": true, 177 "justification": "Section 9 (Discussion) serves as a limitations section, discussing labeling burden ('an open problem in IFC') and computational cost ('currently resource-intensive'). The Slack suite failure analysis in §8.1.2 also functions as limitation discussion." 178 }, 179 "threats_to_validity_specific": { 180 "applies": true, 181 "answer": true, 182 "justification": "Specific threats discussed: (1) 'the security guarantees provided are only as good as the labels provided' (§8.1.2), (2) Slack tasks are 'inherently unsafe' due to dependence on untrusted web content, (3) labeling burden on developers (§9), (4) resource-intensive screener operation (§9)." 183 }, 184 "scope_boundaries_stated": { 185 "applies": true, 186 "answer": true, 187 "justification": "The paper states that security guarantees depend on label quality and policy correctness (§8.1.2). The attacker model (§5) explicitly bounds what attackers can and cannot do. The Discussion notes that labeling is 'an open problem in IFC' and cost is a limitation." 188 } 189 }, 190 "data_integrity": { 191 "raw_data_available": { 192 "applies": true, 193 "answer": false, 194 "justification": "No raw experimental data, logs, or detailed per-test-case results are released. Only aggregate results in figures and tables are provided." 195 }, 196 "data_collection_described": { 197 "applies": true, 198 "answer": true, 199 "justification": "AgentDojo benchmark is referenced with details in Table 1 (79 tasks, 4 suites, tools, messages per test case). Privacy leakage benchmark creation is described in §8.2 and Table 3. Attention data collection described in §3.3 with 3,424 pairs and 2,416 labeled data points." 200 }, 201 "recruitment_methods_described": { 202 "applies": false, 203 "answer": false, 204 "justification": "No human participants. 
Data sources are standard benchmarks (AgentDojo) and author-synthesized test cases." 205 }, 206 "data_pipeline_documented": { 207 "applies": true, 208 "answer": false, 209 "justification": "While data labeling principles are described, the full pipeline from raw benchmark data to final evaluation results has gaps. For the attention classifier, dataset collection from '40 well-labeled test cases' (§7.2) is mentioned but the split into train/test and exact dataset size are unclear." 210 } 211 }, 212 "conflicts_of_interest": { 213 "funding_disclosed": { 214 "applies": true, 215 "answer": true, 216 "justification": "Acknowledgments section states: 'This work supported in part by the Parallel Data Lab, the WebAssembly Research Center and Cylab at Carnegie Mellon University, and the National Science Foundation under grant 2211882.'" 217 }, 218 "affiliations_disclosed": { 219 "applies": true, 220 "answer": true, 221 "justification": "Author affiliations (Carnegie Mellon University, Two Sigma Investments) are clearly listed. The evaluated systems (GPT-4o, Claude) are from third parties (OpenAI, Anthropic), so no affiliation conflict exists." 222 }, 223 "funder_independent_of_outcome": { 224 "applies": true, 225 "answer": true, 226 "justification": "NSF and CMU research centers are the funders. Heather Miller's Two Sigma affiliation is disclosed but Two Sigma has no direct financial interest in the prompt injection defense results." 227 }, 228 "financial_interests_declared": { 229 "applies": true, 230 "answer": false, 231 "justification": "No competing interests statement or financial interests declaration is present in the paper." 232 } 233 }, 234 "contamination": { 235 "training_cutoff_stated": { 236 "applies": false, 237 "answer": false, 238 "justification": "This paper tests a defense mechanism (RTBAS) against prompt injection attacks, not a pre-trained model's knowledge on a benchmark. The evaluation measures defense effectiveness, not model capability." 
239 }, 240 "train_test_overlap_discussed": { 241 "applies": false, 242 "answer": false, 243 "justification": "The paper evaluates defense frameworks, not model knowledge. Contamination of the LM's training data is not relevant to whether the defense mechanism correctly detects and blocks attacks." 244 }, 245 "benchmark_contamination_addressed": { 246 "applies": false, 247 "answer": false, 248 "justification": "Same as above — the paper tests defenses rather than model knowledge, making benchmark contamination structurally inapplicable." 249 } 250 }, 251 "human_studies": { 252 "pre_registered": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants in this study. All evaluation is automated benchmark testing." 256 }, 257 "irb_or_ethics_approval": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants. §11 Ethics Considerations confirms 'No real-world user data or personally identifiable information (PII) was involved.'" 261 }, 262 "demographics_reported": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants." 266 }, 267 "inclusion_exclusion_criteria": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants." 271 }, 272 "randomization_described": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants." 276 }, 277 "blinding_described": { 278 "applies": false, 279 "answer": false, 280 "justification": "No human participants." 281 }, 282 "attrition_reported": { 283 "applies": false, 284 "answer": false, 285 "justification": "No human participants." 286 } 287 }, 288 "cost_and_practicality": { 289 "inference_cost_reported": { 290 "applies": true, 291 "answer": true, 292 "justification": "Table 4 reports average price ($), time (seconds), and token counts per test case for the banking suite. RTBAS (Attn) costs $0.028 vs $0.015 for vanilla, with runtime of 8.7s vs 4.4s." 
293 }, 294 "compute_budget_stated": { 295 "applies": true, 296 "answer": false, 297 "justification": "Table 4 provides per-test-case averages for the banking suite only. Total compute budget across all experiments (all 4 suites, privacy leakage benchmark, attention model training) is not stated." 298 } 299 }, 300 "experimental_rigor": { 301 "seed_sensitivity_reported": { 302 "applies": true, 303 "answer": false, 304 "justification": "No mention of multiple random seeds or seed sensitivity analysis. Results appear to be from single runs." 305 }, 306 "number_of_runs_stated": { 307 "applies": true, 308 "answer": false, 309 "justification": "The number of experimental runs is never stated. It is unclear whether results are from a single run or averaged across multiple runs." 310 }, 311 "hyperparameter_search_budget": { 312 "applies": true, 313 "answer": false, 314 "justification": "No hyperparameter search budget is reported for the LSTM classifier or any other component of the system." 315 }, 316 "best_config_selection_justified": { 317 "applies": true, 318 "answer": false, 319 "justification": "The paper presents results for LM-Judge and Attention-based screeners but does not explain how design choices (e.g., two-layer LSTM, specific attention features) were selected." 320 }, 321 "multiple_comparison_correction": { 322 "applies": true, 323 "answer": false, 324 "justification": "No statistical tests are performed at all, so multiple comparison correction is moot. However, multiple comparisons across 4 suites and multiple baselines are made without any correction." 325 }, 326 "self_comparison_bias_addressed": { 327 "applies": true, 328 "answer": false, 329 "justification": "The authors evaluate their own RTBAS system against baselines they re-implemented (Naive Tainting, Redact All) without acknowledging self-comparison bias." 
330 }, 331 "compute_budget_vs_performance": { 332 "applies": true, 333 "answer": true, 334 "justification": "Table 4 directly compares runtime cost (price, time, tokens) across methods alongside performance. §8.3.3 explicitly discusses the overhead introduced by the detectors, noting that RTBAS costs roughly 2x compared to vanilla." 335 }, 336 "benchmark_construct_validity": { 337 "applies": true, 338 "answer": false, 339 "justification": "The paper does not discuss whether AgentDojo adequately represents real-world prompt injection threats or whether the synthesized privacy benchmark captures the diversity of real privacy leakage scenarios." 340 }, 341 "scaffold_confound_addressed": { 342 "applies": false, 343 "answer": false, 344 "justification": "RTBAS is itself the scaffold/defense being tested. The same underlying model (GPT-4o) is used across all conditions, and the defense mechanism IS the variable under evaluation." 345 } 346 }, 347 "data_leakage": { 348 "temporal_leakage_addressed": { 349 "applies": true, 350 "answer": false, 351 "justification": "No discussion of whether GPT-4o's training data may include information about AgentDojo tasks or attack patterns." 352 }, 353 "feature_leakage_addressed": { 354 "applies": true, 355 "answer": false, 356 "justification": "No discussion of whether the evaluation setup leaks information that would not be available in real deployments." 357 }, 358 "non_independence_addressed": { 359 "applies": true, 360 "answer": false, 361 "justification": "No discussion of potential non-independence between test cases (e.g., same task suite sharing tools and system prompts)." 362 }, 363 "leakage_detection_method": { 364 "applies": true, 365 "answer": false, 366 "justification": "No concrete leakage detection or prevention methods are applied to the evaluation data." 
367 } 368 } 369 }, 370 "claims": [ 371 { 372 "claim": "RTBAS prevents 100% of attacks that violate the security policy on the AgentDojo benchmark.", 373 "evidence": "Figure 5 shows integrity scores of 1.00 for both RTBAS variants across banking, slack, and travel suites, and 0.99 for workspace. §8.1.2 acknowledges one workspace exception where user-labeled high-integrity content contained an injection.", 374 "supported": "strong" 375 }, 376 { 377 "claim": "RTBAS incurs less than 2% utility degradation under prompt injection attack compared to no defense.", 378 "evidence": "Figure 5(e) weighted average shows RTBAS (LM-Judge) at 0.53 utility and RTBAS (Attn) at 0.58 utility vs baseline 0.58 without defense. Under attack, the LM-Judge loses <1% and Attention-based loses 3% (§8.1.2).", 379 "supported": "moderate" 380 }, 381 { 382 "claim": "RTBAS achieves near-oracle performance on detecting privacy leaks, detecting and executing without user confirmation the same set of tool calls as the oracle for all but one task.", 383 "evidence": "Table 2 shows LM-Judge FPR of 8.1% and FNR of 10.8%. Figure 7 shows utility closely matching oracle. §8.2 states 'we are losing utility only in 1 out of 15 test cases.'", 384 "supported": "moderate" 385 }, 386 { 387 "claim": "Attention scores from small open-source LMs effectively capture dependency relationships in TBAS backed by large closed-source models.", 388 "evidence": "Figures 2 and 3 in §3.3 show attention score distributions separating dependent from non-dependent data using OPT-125m for TBAS backed by GPT-4o and Claude. 
74-86% of non-dependent attention mass is below 0.2.", 389 "supported": "moderate" 390 }, 391 { 392 "claim": "The selective propagation approach arrives at exact ground truth privacy labels more than 70% (LM-Judge) and 57% (Attention) of the time.", 393 "evidence": "§8.3.1 reports taint tracking accuracy: 70.0% for LM-Judge and 57.3% for Attention-based, compared to 22.3% for Redact All and 56.7% for GPTs.", 394 "supported": "moderate" 395 } 396 ], 397 "red_flags": [ 398 { 399 "flag": "Very small synthesized benchmark", 400 "detail": "The privacy leakage benchmark contains only 37 test cases across 3 domains, all manually created by the authors. This is too small to draw robust conclusions about real-world privacy leakage detection, and author-created benchmarks risk favoring the proposed approach." 401 }, 402 { 403 "flag": "No reproducibility artifacts", 404 "detail": "No code, synthesized benchmark data, augmented labels, or reproduction instructions are released. The system relies on multiple components (LM-Judge prompts, LSTM classifier, attention feature extraction) that cannot be recreated from the paper alone." 405 }, 406 { 407 "flag": "Single-run results without variance", 408 "detail": "All results appear to be from single experimental runs with no error bars, confidence intervals, or variance across seeds. LLM outputs are stochastic, so single-run results may not be stable." 409 }, 410 { 411 "flag": "Model version not pinned", 412 "detail": "GPT-4o is used without specifying an API version or snapshot date. Model behavior changes across versions, making results potentially unreproducible." 413 }, 414 { 415 "flag": "User confirmation modeled as rejection", 416 "detail": "In the prompt injection benchmark, the paper does not model user confirmations — all flagged calls are simply skipped. This means the reported utility may underestimate actual RTBAS utility if users would confirm legitimate-but-flagged calls." 
417 } 418 ], 419 "cited_papers": [ 420 { 421 "title": "AgentDojo: A dynamic environment to evaluate attacks and defenses for LLM agents", 422 "authors": ["Edoardo Debenedetti", "Jie Zhang", "Mislav Balunović", "Luca Beurer-Kellner", "Marc Fischer", "Florian Tramèr"], 423 "year": 2024, 424 "arxiv_id": "2406.13352", 425 "relevance": "Primary benchmark used for evaluating prompt injection attacks and defenses in LLM agent systems." 426 }, 427 { 428 "title": "Prompt injection attack against LLM-integrated applications", 429 "authors": ["Yi Liu", "Gelei Deng", "Yuekang Li", "Kailong Wang"], 430 "year": 2024, 431 "relevance": "Foundational work on prompt injection attacks against LLM-integrated applications." 432 }, 433 { 434 "title": "Formalizing and benchmarking prompt injection attacks and defenses", 435 "authors": ["Yupei Liu", "Yuqi Jia", "Runpeng Geng", "Jinyuan Jia", "Neil Zhenqiang Gong"], 436 "year": 2024, 437 "relevance": "Formalizes prompt injection attacks and defenses with benchmarking framework." 438 }, 439 { 440 "title": "StruQ: Defending against prompt injection with structured queries", 441 "authors": ["Sizhe Chen", "Julien Piet", "Chawin Sitawarin", "David Wagner"], 442 "year": 2024, 443 "relevance": "Prompt injection defense using structured queries and data/instruction separation." 444 }, 445 { 446 "title": "Jatmo: Prompt injection defense by task-specific finetuning", 447 "authors": ["Julien Piet", "Maha Alrashed", "Chawin Sitawarin", "Sizhe Chen"], 448 "year": 2024, 449 "relevance": "Defense approach using task-specific finetuning on non-instruction-tuned models." 450 }, 451 { 452 "title": "The instruction hierarchy: Training LLMs to prioritize privileged instructions", 453 "authors": ["Eric Wallace", "Kai Xiao", "Reimar Leike", "Lilian Weng"], 454 "year": 2024, 455 "relevance": "Pre-training approach to enforce instruction hierarchies for prompt injection resistance." 
456 }, 457 { 458 "title": "Permissive information-flow analysis for large language models", 459 "authors": ["Shoaib Ahmed Siddiqui", "Radhika Gaonkar", "Boris Köpf", "David Krueger"], 460 "year": 2024, 461 "relevance": "Most closely related work on information flow control for LLMs, which RTBAS improves upon by reducing exponential to constant overhead." 462 }, 463 { 464 "title": "InjecAgent: Benchmarking indirect prompt injections in tool-integrated large language model agents", 465 "authors": ["Qiusi Zhan", "Zhixiang Liang", "Zifan Ying", "Daniel Kang"], 466 "year": 2024, 467 "relevance": "Benchmark for indirect prompt injection attacks in tool-integrated LLM agents." 468 }, 469 { 470 "title": "ReAct: Synergizing reasoning and acting in language models", 471 "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu", "Nan Du"], 472 "year": 2023, 473 "relevance": "Foundational agent framework for tool-based LLM interaction patterns." 474 }, 475 { 476 "title": "Attention tracker: Detecting prompt injection attacks in LLMs", 477 "authors": ["Kuo-Han Hung", "Ching-Yun Ko", "Ambrish Rawat"], 478 "year": 2024, 479 "relevance": "Uses attention mechanisms for prompt injection detection, related to RTBAS's attention-based screener." 480 }, 481 { 482 "title": "Defending against indirect prompt injection attacks with spotlighting", 483 "authors": ["Keegan Hines", "Gary Lopez", "Matthew Hall", "Federico Zarfati"], 484 "year": 2024, 485 "relevance": "Defense against indirect prompt injection using input transformation techniques." 486 }, 487 { 488 "title": "Universal and transferable adversarial attacks on aligned language models", 489 "authors": ["Andy Zou", "Zifan Wang", "Nicholas Carlini", "Milad Nasr"], 490 "year": 2023, 491 "relevance": "Demonstrates universal adversarial attacks on aligned LMs, motivating defense research like RTBAS." 
492 } 493 ], 494 "engagement_factors": { 495 "practical_relevance": { 496 "score": 2, 497 "justification": "The RTBAS framework addresses a real problem (prompt injection in LLM agents) but no code is released, requiring practitioners to reimplement from the paper." 498 }, 499 "surprise_contrarian": { 500 "score": 1, 501 "justification": "Adapting information flow control to LLMs is a novel framing but does not strongly challenge conventional wisdom about prompt injection defenses." 502 }, 503 "fear_safety": { 504 "score": 2, 505 "justification": "Demonstrates concrete prompt injection and privacy leakage risks in LLM agent systems with financial and messaging examples." 506 }, 507 "drama_conflict": { 508 "score": 0, 509 "justification": "No controversy or adversarial framing; straightforward defense paper." 510 }, 511 "demo_ability": { 512 "score": 0, 513 "justification": "No code, demo, or tool is released." 514 }, 515 "brand_recognition": { 516 "score": 1, 517 "justification": "Carnegie Mellon is well-respected in security and systems research but not a household name like OpenAI or Google." 518 } 519 } 520 }