scan.json (31350B)
1 { 2 "paper": { 3 "title": "ReasAlign: Reasoning Enhanced Safety Alignment against Prompt Injection Attack", 4 "authors": [ 5 "Hao Li", 6 "Yankai Yang", 7 "G. Edward Suh", 8 "Ning Zhang", 9 "Chaowei Xiao" 10 ], 11 "year": 2026, 12 "venue": "arXiv.org", 13 "arxiv_id": "2601.10173", 14 "doi": "10.48550/arXiv.2601.10173" 15 }, 16 "scan_version": 3, 17 "active_modules": ["experimental_rigor", "data_leakage"], 18 "methodology_tags": ["benchmark-eval"], 19 "key_findings": "ReasAlign incorporates structured reasoning steps into LLM safety alignment to defend against indirect prompt injection attacks while preserving utility. On CyberSecEval2, ReasAlign achieves 94.6% utility and 3.6% ASR, far outperforming Meta SecAlign (56.4% utility, 74.4% ASR). Ablation studies show that the reasoning mechanism is the primary driver of security improvement, reducing ASR from 21.8% to 3.6% on CySE compared to direct-answer supervision. The approach generalizes across general knowledge, instruction-following, and agentic workflow benchmarks on both Llama-3.1-8B and Qwen2.5-14B.", 20 "checklist": { 21 "artifacts": { 22 "code_released": { 23 "applies": true, 24 "answer": true, 25 "justification": "The abstract states: 'Our code and experimental results could be found at https://github.com/leolee99/ReasAlign.'" 26 }, 27 "data_released": { 28 "applies": true, 29 "answer": true, 30 "justification": "All evaluation benchmarks (MMLU, MMLU-Pro, IFEval, BBH, AlpacaEval2, SEP, CyberSecEval2, InjecAgent, AgentDojo) are publicly available standard datasets. Training data sources (SQuADv2, TaskTracker, BeaverTails) are also public." 31 }, 32 "environment_specified": { 33 "applies": true, 34 "answer": false, 35 "justification": "Section 5.1 lists the base model (Llama-3.1-8B-Instruct) and training hyperparameters, but no library versions, requirements.txt, Dockerfile, or detailed environment specifications are provided." 
36 }, 37 "reproduction_instructions": { 38 "applies": true, 39 "answer": false, 40 "justification": "While code is released at a GitHub URL and implementation details are in Section 5.1, the paper contains no step-by-step reproduction instructions or 'Reproducing Results' section." 41 } 42 }, 43 "statistical_methodology": { 44 "confidence_intervals_or_error_bars": { 45 "applies": true, 46 "answer": false, 47 "justification": "All results in Figures 3, 4, and Tables 1-2 are reported as point estimates with no confidence intervals, error bars, or ± notation." 48 }, 49 "significance_tests": { 50 "applies": true, 51 "answer": false, 52 "justification": "No statistical significance tests are used anywhere. Claims like 'ReasAlign outperforms Meta SecAlign' are based solely on comparing raw numbers without any statistical testing." 53 }, 54 "effect_sizes_reported": { 55 "applies": true, 56 "answer": true, 57 "justification": "The paper consistently reports absolute differences with baseline context, e.g., 'reduces the ASR from 43.6% to just 3.6% on CySE' and 'utility gap between ReasAlign and Meta SecAlign widens significantly, reaching 38.2% on CySE.' Baseline values are provided alongside improvements." 58 }, 59 "sample_size_justified": { 60 "applies": true, 61 "answer": false, 62 "justification": "No justification is provided for sample sizes. The paper uses standard benchmarks without discussing whether the benchmark sizes are adequate for the claims made." 63 }, 64 "variance_reported": { 65 "applies": true, 66 "answer": false, 67 "justification": "No standard deviations, variance, or multi-run statistics are reported. Results appear to be from single runs with no spread measures." 68 } 69 }, 70 "evaluation_design": { 71 "baselines_included": { 72 "applies": true, 73 "answer": true, 74 "justification": "Section 5.1 describes three baselines: undefended Llama-3.1-8B-Instruct, SecAlign, and Meta-SecAlign-8B. All are compared across all benchmarks." 
75 }, 76 "baselines_contemporary": { 77 "applies": true, 78 "answer": true, 79 "justification": "Meta SecAlign (Chen et al., 2025) is identified as the state-of-the-art defense model. SecAlign (Chen et al., 2024b) is also recent. These represent the strongest known defenses at the time of writing." 80 }, 81 "ablation_study": { 82 "applies": true, 83 "answer": true, 84 "justification": "Section 5.5 ablates the reasoning module (with vs. without reasoning, Table 1). Section 5.6 ablates the node scaling parameter N from 1 to 5 (Figure 5)." 85 }, 86 "multiple_metrics": { 87 "applies": true, 88 "answer": true, 89 "justification": "Two primary metrics are used throughout: Utility (task completion/correctness) and Attack Success Rate (ASR). Both are reported for every benchmark." 90 }, 91 "human_evaluation": { 92 "applies": true, 93 "answer": false, 94 "justification": "All evaluation is automated. Utility is assessed by a GPT-4o-mini judge for instruction-following tasks, and by benchmark-defined metrics for general knowledge and agentic tasks. No human evaluation is performed." 95 }, 96 "held_out_test_set": { 97 "applies": true, 98 "answer": true, 99 "justification": "The model is trained on synthesized data from SQuADv2/TaskTracker/BeaverTails and evaluated on entirely separate benchmarks (MMLU, CySE, SEP, InjecAgent, AgentDojo), providing clear train/test separation." 100 }, 101 "per_category_breakdown": { 102 "applies": true, 103 "answer": true, 104 "justification": "Results are broken down across three evaluation dimensions (general knowledge: 4 benchmarks in Figure 3; instruction following: utility and security in Figure 4; agentic workflows: Table 2), with per-benchmark numbers throughout." 105 }, 106 "failure_cases_discussed": { 107 "applies": true, 108 "answer": false, 109 "justification": "The case studies (Figures 1, 6) show ReasAlign succeeding where Meta SecAlign fails. No cases where ReasAlign itself fails are shown or analyzed." 
110 }, 111 "negative_results_reported": { 112 "applies": true, 113 "answer": true, 114 "justification": "The paper reports slight utility degradation on general knowledge tasks (Figure 3), reduced no-attack utility on SEP (-0.9% vs. direct answer in Table 1), lower utility than base Qwen2.5-14B on AgentDojo (24.9% vs 27.4% in Table 2), and higher token overhead (Section 5.7, Figure 10)." 115 } 116 }, 117 "claims_and_evidence": { 118 "abstract_claims_supported": { 119 "applies": true, 120 "answer": true, 121 "justification": "Abstract claims of '94.6% utility and only 3.6% ASR' on CyberSecEval2 and Meta SecAlign's '56.4% utility and 74.4% ASR' are confirmed by Figure 4. The claim of 'best trade-off between security and utility' is supported across all evaluation dimensions." 122 }, 123 "causal_claims_justified": { 124 "applies": true, 125 "answer": true, 126 "justification": "The paper claims reasoning improves security. Section 5.5 provides a controlled ablation comparing the same training data with vs. without reasoning steps (Table 1), demonstrating a causal contribution of the reasoning mechanism." 127 }, 128 "generalization_bounded": { 129 "applies": true, 130 "answer": false, 131 "justification": "The abstract claims ReasAlign is 'a robust and practical defense against prompt injection attacks in real-world agentic systems,' but testing is limited to specific benchmarks with two models (Llama-3.1-8B and Qwen2.5-14B). No real-world deployment evaluation is performed." 132 }, 133 "alternative_explanations_discussed": { 134 "applies": true, 135 "answer": false, 136 "justification": "The paper does not discuss alternative explanations such as whether improvements stem from the specific training data composition rather than reasoning, whether GPT-4o-mini's reasoning patterns encode specific heuristics, or whether results would hold against adaptive attackers aware of the defense mechanism." 
137 }, 138 "proxy_outcome_distinction": { 139 "applies": true, 140 "answer": true, 141 "justification": "The paper measures utility (task completion correctness) and ASR (whether the model follows injected instructions), which directly correspond to what is claimed. The metrics match the granularity of the claims." 142 } 143 }, 144 "setup_transparency": { 145 "model_versions_specified": { 146 "applies": true, 147 "answer": false, 148 "justification": "While 'Llama-3.1-8B-Instruct' and 'Qwen2.5-14B-Instruct' are reasonably specific, 'GPT-4o-mini' is used as both the data generator (Section 4.1) and evaluation judge (Section 5.1) without any snapshot date or API version. GPT-4o-mini behavior changes across versions." 149 }, 150 "prompts_provided": { 151 "applies": true, 152 "answer": false, 153 "justification": "Figures 7 and 8 provide prompt templates for training data generation with placeholders ({user query}, {context}, etc.). However, the agent evaluation system prompt is only cited (Husain, 2024) but not reproduced, and the GPT-4o-mini judge prompt is not provided. A reader cannot reconstruct every prompt sent to the model." 154 }, 155 "hyperparameters_reported": { 156 "applies": true, 157 "answer": true, 158 "justification": "Section 5.1 reports: batch size 4, 3 epochs, Adam optimizer with weight decay, learning rate 2×10⁻⁵, max input length 8192 tokens, N=3 for test-time scaling." 159 }, 160 "scaffolding_described": { 161 "applies": true, 162 "answer": true, 163 "justification": "The structured reasoning pipeline (Section 4.1: problem analysis → reasoning → final answer) and test-time scaling mechanism (Section 4.2: beam search tree with judge model scoring nodes) are described in detail, including the judge training procedure with DPO." 
164 }, 165 "data_preprocessing_documented": { 166 "applies": true, 167 "answer": true, 168 "justification": "Section 4.1 documents the full data construction pipeline: user queries and context from SQuADv2, injection triggers from TaskTracker, injection instructions from BeaverTails, followed by structured reasoning sampling with GPT-4o-mini." 169 } 170 }, 171 "limitations_and_scope": { 172 "limitations_section_present": { 173 "applies": true, 174 "answer": true, 175 "justification": "A dedicated 'Limitations' section appears after the conclusion, discussing the overhead introduced by reasoning-based approaches." 176 }, 177 "threats_to_validity_specific": { 178 "applies": true, 179 "answer": false, 180 "justification": "The limitations section mentions only the overhead issue and a future direction to explore selective reasoning. It does not discuss specific threats such as: GPT-4o-mini serving as both data generator and evaluator, potential overfitting to SQuADv2-style patterns, or vulnerability to adaptive attacks." 181 }, 182 "scope_boundaries_stated": { 183 "applies": true, 184 "answer": false, 185 "justification": "The paper does not explicitly state what the results do NOT show. It claims broad applicability to 'real-world agentic systems' without bounding the scope to the specific benchmarks and models tested." 186 } 187 }, 188 "data_integrity": { 189 "raw_data_available": { 190 "applies": true, 191 "answer": false, 192 "justification": "The evaluation benchmarks are publicly available, but the model's raw outputs, synthesized training data, and judge model scores are not released. The Ethics Statement says artifacts 'will be made publicly available' (future tense)." 
193 }, 194 "data_collection_described": { 195 "applies": true, 196 "answer": true, 197 "justification": "Section 4.1 describes the data collection in detail: sources (SQuADv2 for queries/context, TaskTracker for triggers, BeaverTails for injection instructions), synthesis process, and structured reasoning sampling with GPT-4o-mini." 198 }, 199 "recruitment_methods_described": { 200 "applies": false, 201 "answer": false, 202 "justification": "No human participants. Data sources are standard publicly available benchmarks and datasets." 203 }, 204 "data_pipeline_documented": { 205 "applies": true, 206 "answer": true, 207 "justification": "Section 4.1 documents the pipeline from source data collection through injection sample synthesis, structured reasoning sampling, and safety alignment training. Section 4.2 describes the judge training pipeline with paired preference data." 208 } 209 }, 210 "conflicts_of_interest": { 211 "funding_disclosed": { 212 "applies": true, 213 "answer": false, 214 "justification": "No funding acknowledgment, grants, or sponsorship information appears anywhere in the paper." 215 }, 216 "affiliations_disclosed": { 217 "applies": true, 218 "answer": true, 219 "justification": "Author affiliations are listed: Washington University in St. Louis, University of Wisconsin–Madison, NVIDIA, and Johns Hopkins University. NVIDIA affiliation is disclosed for G. Edward Suh and Chaowei Xiao." 220 }, 221 "funder_independent_of_outcome": { 222 "applies": true, 223 "answer": false, 224 "justification": "Two authors (G. Edward Suh, Chaowei Xiao) are affiliated with NVIDIA, which has a commercial interest in LLM security and deployment. No independence statement is provided, and no funding source is disclosed to assess independence." 225 }, 226 "financial_interests_declared": { 227 "applies": true, 228 "answer": false, 229 "justification": "No competing interests or financial interests statement appears in the paper. 
NVIDIA affiliations could represent undisclosed financial interests." 230 } 231 }, 232 "contamination": { 233 "training_cutoff_stated": { 234 "applies": true, 235 "answer": false, 236 "justification": "The paper evaluates Llama-3.1-8B-Instruct on general knowledge benchmarks (MMLU, MMLU-Pro, BBH, IFEval) without stating the model's training data cutoff date." 237 }, 238 "train_test_overlap_discussed": { 239 "applies": true, 240 "answer": false, 241 "justification": "No discussion of whether MMLU, MMLU-Pro, BBH, or IFEval questions may have appeared in Llama-3.1's pre-training data." 242 }, 243 "benchmark_contamination_addressed": { 244 "applies": true, 245 "answer": false, 246 "justification": "MMLU (2021) and BBH were published well before Llama-3.1's training. No contamination analysis is performed despite this temporal overlap making contamination plausible." 247 } 248 }, 249 "human_studies": { 250 "pre_registered": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants in this study. All evaluation is automated." 254 }, 255 "irb_or_ethics_approval": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants. The Ethics Statement discusses responsible AI development but not human subjects review." 259 }, 260 "demographics_reported": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants in this study." 264 }, 265 "inclusion_exclusion_criteria": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants in this study." 269 }, 270 "randomization_described": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants in this study." 274 }, 275 "blinding_described": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants in this study." 279 }, 280 "attrition_reported": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants in this study." 
284 } 285 }, 286 "cost_and_practicality": { 287 "inference_cost_reported": { 288 "applies": true, 289 "answer": true, 290 "justification": "Section 5.7 analyzes token overhead. Figure 10 compares average token cost per sample between ReasAlign and Meta SecAlign across four benchmarks, with discussion of the tradeoff between cost and performance gains." 291 }, 292 "compute_budget_stated": { 293 "applies": true, 294 "answer": false, 295 "justification": "No total GPU hours, training time, or hardware specifications are provided. Training hyperparameters are listed but the total computational budget for training or the hardware used is not stated." 296 } 297 }, 298 "experimental_rigor": { 299 "seed_sensitivity_reported": { 300 "applies": true, 301 "answer": false, 302 "justification": "No multi-seed results are reported. All results appear to be from single runs with no analysis of seed sensitivity." 303 }, 304 "number_of_runs_stated": { 305 "applies": true, 306 "answer": false, 307 "justification": "The number of experimental runs is never stated. Results are presented without clarifying whether they represent single runs or averages." 308 }, 309 "hyperparameter_search_budget": { 310 "applies": true, 311 "answer": false, 312 "justification": "Hyperparameters are stated (Section 5.1) but no search budget, search method, or number of configurations tried is reported." 313 }, 314 "best_config_selection_justified": { 315 "applies": true, 316 "answer": true, 317 "justification": "The N=3 test-time scaling parameter is justified through the ablation study in Section 5.6 (Figure 5), which shows diminishing returns beyond N=3. The selection is made on a principled basis." 318 }, 319 "multiple_comparison_correction": { 320 "applies": true, 321 "answer": false, 322 "justification": "The paper makes numerous comparisons across 9+ benchmarks and multiple methods without performing any statistical tests, let alone applying multiple comparison corrections." 
323 }, 324 "self_comparison_bias_addressed": { 325 "applies": true, 326 "answer": false, 327 "justification": "The authors reproduce Meta SecAlign using its official code for the Qwen experiment, but do not acknowledge the general bias of evaluating their own system. No independent evaluation or acknowledgment of author-evaluation bias." 328 }, 329 "compute_budget_vs_performance": { 330 "applies": true, 331 "answer": true, 332 "justification": "Section 5.7 compares token overhead between ReasAlign and Meta SecAlign (Figure 10). Section 5.6 shows how performance varies with the number of scaling nodes N, directly relating compute cost to performance." 333 }, 334 "benchmark_construct_validity": { 335 "applies": true, 336 "answer": false, 337 "justification": "The paper uses multiple benchmarks but does not discuss whether they actually measure real-world prompt injection defense effectiveness. No analysis of construct validity for any benchmark." 338 }, 339 "scaffold_confound_addressed": { 340 "applies": true, 341 "answer": true, 342 "justification": "All methods are compared on the same base model (Llama-3.1-8B-Instruct). The test-time scaling component is separately ablated (N=1 in Section 5.7 isolates reasoning overhead from scaling). The Qwen2.5-14B experiment uses the same training framework. The scaffold variable is controlled." 343 } 344 }, 345 "data_leakage": { 346 "temporal_leakage_addressed": { 347 "applies": true, 348 "answer": false, 349 "justification": "No discussion of whether Llama-3.1's training data includes solutions to MMLU, BBH, or other benchmark problems that were publicly available before the model's training cutoff." 350 }, 351 "feature_leakage_addressed": { 352 "applies": true, 353 "answer": false, 354 "justification": "No discussion of whether the evaluation setup leaks information. For example, the structured input format (user query vs. external data separation) could provide leakage cues." 
355 }, 356 "non_independence_addressed": { 357 "applies": true, 358 "answer": false, 359 "justification": "While training data (SQuADv2-based) and evaluation benchmarks are from different sources providing implicit independence, this is never explicitly discussed or verified." 360 }, 361 "leakage_detection_method": { 362 "applies": true, 363 "answer": false, 364 "justification": "No concrete leakage detection or prevention methods (canary strings, membership inference, temporal splits, decontamination) are applied." 365 } 366 } 367 }, 368 "claims": [ 369 { 370 "claim": "ReasAlign achieves 94.6% utility and only 3.6% ASR on CyberSecEval2, far surpassing Meta SecAlign (56.4% utility, 74.4% ASR).", 371 "evidence": "Figure 4 shows CySE results under attack: ReasAlign 94.6% utility / 3.6% ASR vs. Meta SecAlign 56.4% utility / 7.3% ASR. The 74.4% ASR cited in the abstract for Meta SecAlign appears to reference the SEP benchmark under attack for the undefended Llama3 model, not Meta SecAlign on CySE. Meta SecAlign achieves 7.3% ASR on CySE per Figure 4.", 372 "supported": "moderate" 373 }, 374 { 375 "claim": "The reasoning mechanism is the primary driver of security improvement, reducing ASR from 21.8% to 3.6% on CySE.", 376 "evidence": "Table 1 (Section 5.5) compares direct-answer supervision vs. reasoning-enhanced ReasAlign on the same training data. ASR drops from 21.8% to 3.6% on CySE and from 66.9% to 1.1% on SEP with reasoning added.", 377 "supported": "strong" 378 }, 379 { 380 "claim": "ReasAlign maintains utility comparable to the undefended model while significantly improving security.", 381 "evidence": "Figure 3 shows general knowledge scores within 0.5% of the base model on average. Figure 4 shows no-attack utility of 96.8% (AlpacaEval2) and 98.0% (SEP) vs. 95.7% and 99.0% for base Llama3. 
Under attack, utility improves from 78.2% to 94.6% on CySE.", 382 "supported": "strong" 383 }, 384 { 385 "claim": "Test-time scaling improves both utility and security as the number of nodes increases from 1 to 5.", 386 "evidence": "Figure 5 (Section 5.6) on SEP shows under-attack utility improving from 91.6% (N=1) to 96.4% (N=5) and ASR decreasing from 4.6% to 0.9%. Growth rates slow after N=3.", 387 "supported": "strong" 388 }, 389 { 390 "claim": "ReasAlign generalizes to larger models, reducing ASR on AgentDojo from 14.5% to 2.4% and on InjecAgent from 24.5% to 2.7% with Qwen2.5-14B.", 391 "evidence": "Table 2 shows Qwen2.5-14B results: ReasAlign achieves 2.4% ASR on AgentDojo (vs. 14.5% undefended, 8.1% Meta SecAlign) and 2.7% on InjecAgent (vs. 24.6% undefended, 4.3% Meta SecAlign), with slight utility loss (24.9% vs. 27.4% on AgentDojo). Note that the undefended InjecAgent ASR cited here from Table 2 (24.6%) differs slightly from the 24.5% quoted in the claim; this should be verified against the paper.", 392 "supported": "strong" 393 }, 394 { 395 "claim": "ReasAlign achieves the best trade-off between security and utility across all evaluated tasks.", 396 "evidence": "Across 7 utility benchmarks and 4 security benchmarks, ReasAlign consistently achieves the lowest or near-lowest ASR while maintaining the highest utility under attack. There is some slight utility loss in no-attack settings (e.g., -0.9% on SEP in Table 1).", 397 "supported": "moderate" 398 } 399 ], 400 "red_flags": [ 401 { 402 "flag": "No error bars or variance across runs", 403 "detail": "All results are reported as single point estimates with no confidence intervals, standard deviations, or multi-seed experiments. Given that fine-tuning and test-time scaling involve stochastic elements, the stability of results is unknown." 404 }, 405 { 406 "flag": "GPT-4o-mini used as both data generator and evaluator", 407 "detail": "GPT-4o-mini generates the structured reasoning training data (Section 4.1) and also serves as the evaluation judge for utility assessment (Section 5.1). 
This circular dependency could inflate utility scores if the judge model favors patterns similar to those it generated." 408 }, 409 { 410 "flag": "Potential abstract ASR discrepancy", 411 "detail": "The abstract states Meta SecAlign has '74.4% ASR' on CyberSecEval2, but Figure 4 shows Meta SecAlign achieving 7.3% ASR on CySE. The 74.4% figure appears to come from the undefended Llama3 model on SEP. This should be verified against the source data." 412 }, 413 { 414 "flag": "No adaptive attack evaluation", 415 "detail": "All attacks are drawn from existing benchmarks. The paper does not evaluate against adaptive attackers who know ReasAlign's defense mechanism (e.g., crafting injections that mimic the structured reasoning format or that are designed to pass the reasoning check)." 416 }, 417 { 418 "flag": "NVIDIA affiliation without conflict disclosure", 419 "detail": "Two authors are affiliated with NVIDIA, which has commercial interests in LLM security and deployment. No competing interests statement or funding disclosure is provided." 420 } 421 ], 422 "cited_papers": [ 423 { 424 "title": "Struq: Defending against prompt injection with structured queries", 425 "authors": ["Sizhe Chen", "Julien Piet", "Chawin Sitawarin", "David A. Wagner"], 426 "year": 2024, 427 "arxiv_id": "2402.06363", 428 "relevance": "Foundational model-level prompt injection defense that introduces structured input separation, serving as a key baseline." 429 }, 430 { 431 "title": "SecAlign: Defending against prompt injection with preference optimization", 432 "authors": ["Sizhe Chen", "Arman Zharmagambetov", "Saeed Mahloujifar", "Kamalika Chaudhuri", "David Wagner", "Chuan Guo"], 433 "year": 2024, 434 "arxiv_id": "2410.05451", 435 "relevance": "Direct predecessor using DPO for prompt injection defense; compared as baseline throughout the paper." 
436 }, 437 { 438 "title": "Meta SecAlign: A secure foundation LLM against prompt injection attacks", 439 "authors": ["Sizhe Chen", "Arman Zharmagambetov", "David Wagner", "Chuan Guo"], 440 "year": 2025, 441 "arxiv_id": "2507.02735", 442 "relevance": "State-of-the-art model-level defense against prompt injection; primary baseline showing over-defense issues." 443 }, 444 { 445 "title": "CyberSecEval 2: A wide-ranging cybersecurity evaluation suite for large language models", 446 "authors": ["Manish Bhatt", "Sahana Chennabasappa", "Yue Li"], 447 "year": 2024, 448 "arxiv_id": "2404.13161", 449 "relevance": "Key evaluation benchmark for LLM security including prompt injection tasks in open-ended settings." 450 }, 451 { 452 "title": "InjecAgent: Benchmarking indirect prompt injections in tool-integrated large language model agents", 453 "authors": ["Qiusi Zhan", "Zhixiang Liang", "Zifan Ying", "Daniel Kang"], 454 "year": 2024, 455 "relevance": "Agentic prompt injection benchmark used to evaluate defense effectiveness in tool-integrated LLM agents." 456 }, 457 { 458 "title": "AgentDojo: A dynamic environment to evaluate prompt injection attacks and defenses for LLM agents", 459 "authors": ["Edoardo Debenedetti", "Jie Zhang", "Mislav Balunovic", "Luca Beurer-Kellner", "Marc Fischer", "Florian Tramèr"], 460 "year": 2024, 461 "relevance": "Dynamic agent security benchmark providing realistic evaluation of prompt injection attacks and defenses." 462 }, 463 { 464 "title": "Defeating prompt injections by design", 465 "authors": ["Edoardo Debenedetti", "Ilia Shumailov", "Tianqi Fan"], 466 "year": 2025, 467 "arxiv_id": "2503.18813", 468 "relevance": "CaMeL system-level defense using static control/data flow analysis to enforce security against prompt injection." 
469 }, 470 { 471 "title": "PIGuard: Prompt injection guardrail via mitigating overdefense for free", 472 "authors": ["Hao Li", "Xiaogeng Liu", "Ning Zhang", "Chaowei Xiao"], 473 "year": 2025, 474 "relevance": "External prompt injection guardrail that addresses the overdefense problem, closely related to ReasAlign's motivation." 475 }, 476 { 477 "title": "IsolateGPT: An execution isolation architecture for LLM-based agentic systems", 478 "authors": ["Yuhao Wu", "Franziska Roesner", "Tadayoshi Kohno", "Ning Zhang", "Umar Iqbal"], 479 "year": 2025, 480 "relevance": "System-level defense using execution environment isolation for LLM agents, representing an alternative defense paradigm." 481 }, 482 { 483 "title": "Progent: Programmable privilege control for LLM agents", 484 "authors": ["Tianneng Shi", "Jingxuan He", "Zhun Wang"], 485 "year": 2025, 486 "arxiv_id": "2504.11703", 487 "relevance": "Dynamic policy-based defense for LLM agents with programmable privilege control." 488 }, 489 { 490 "title": "DRIFT: Dynamic rule-based defense with injection isolation for securing LLM agents", 491 "authors": ["Hao Li", "Xiaogeng Liu", "Hung-Chun Chiu", "Dianqi Li", "Ning Zhang", "Chaowei Xiao"], 492 "year": 2025, 493 "arxiv_id": "2506.12104", 494 "relevance": "Dynamic rule-based system-level defense for LLM agents against prompt injection, from overlapping author group." 495 }, 496 { 497 "title": "Are you still on track!? Catching LLM task drift with activations", 498 "authors": ["Sahar Abdelnabi", "Aideen Fay", "Giovanni Cherubin", "Ahmed Salem", "Mario Fritz", "Andrew Paverd"], 499 "year": 2024, 500 "arxiv_id": "2406.00799", 501 "relevance": "TaskTracker dataset used as source of injection triggers in ReasAlign's training data construction." 
502 }, 503 { 504 "title": "Llama Guard: LLM-based input-output safeguard for human-AI conversations", 505 "authors": ["Hakan Inan", "Kartikeya Upasani", "Jianfeng Chi"], 506 "year": 2023, 507 "arxiv_id": "2312.06674", 508 "relevance": "LLM-based safety guardrail for AI conversations, representing the external detection-based defense approach." 509 } 510 ], 511 "engagement_factors": { 512 "practical_relevance": { 513 "score": 2, 514 "justification": "Practitioners building LLM agents could adopt ReasAlign, and code is released, but the method requires LoRA fine-tuning of base models, which limits immediate plug-and-play usability." 515 }, 516 "surprise_contrarian": { 517 "score": 1, 518 "justification": "The finding that reasoning helps defense is consistent with the broader trend of reasoning improving LLM performance; the over-defense critique of Meta SecAlign is somewhat novel but not deeply contrarian." 519 }, 520 "fear_safety": { 521 "score": 2, 522 "justification": "Directly addresses prompt injection vulnerabilities in LLM agents and demonstrates that state-of-the-art defenses (Meta SecAlign) have severe utility-security tradeoff issues." 523 }, 524 "drama_conflict": { 525 "score": 1, 526 "justification": "Implicitly challenges Meta's SecAlign defense as impractical due to over-defense, but frames it as incremental improvement rather than confrontation." 527 }, 528 "demo_ability": { 529 "score": 1, 530 "justification": "Code is released on GitHub but requires users to fine-tune a base LLM themselves; no live demo or pip-installable tool." 531 }, 532 "brand_recognition": { 533 "score": 1, 534 "justification": "NVIDIA co-authorship provides some recognition, but this is not a flagship NVIDIA product release. It uses open-source Llama and Qwen base models." 535 } 536 } 537 }