scan.json (33636B)
1 { 2 "paper": { 3 "title": "Proactive Hardening of LLM Defenses with HASTE", 4 "authors": [ 5 "Henry Chen", 6 "Victor Aranda", 7 "Samarth Keshari", 8 "Ryan Heartfield", 9 "Nicole Nichols" 10 ], 11 "year": 2026, 12 "venue": "NDSS Symposium 2026, LAST-X Workshop", 13 "arxiv_id": "2601.19051", 14 "doi": "10.48550/arXiv.2601.19051" 15 }, 16 "scan_version": 3, 17 "active_modules": ["experimental_rigor", "data_leakage"], 18 "methodology_tags": ["benchmark-eval"], 19 "key_findings": "HASTE's most aggressive hard-negative mining with semantic fuzzing (HM-Max+Sem) reduces baseline detector accuracy from 95.91% to 31.76%, demonstrating that iterative hard-negative generation is far more effective at producing evasive prompts than basic fuzzing alone. HASTE-retrained models recover to ~94% accuracy on held-out evaluation data, with hard-mining configurations achieving this in 50% fewer iteration loops than baseline strategies. Semantic fuzzing is the most effective individual technique, and combining all fuzzing types can paradoxically counteract semantic fuzzing's evasive effectiveness. Most improvement plateaus after the first or second iteration, suggesting diminishing returns from extended iteration loops.", 20 "checklist": { 21 "artifacts": { 22 "code_released": { 23 "applies": true, 24 "answer": false, 25 "justification": "No repository URL, code archive, or link to source code is provided anywhere in the paper. The generation prompts are explicitly withheld: 'These specific prompts are not provided to prevent their misuse' (Section III-B)." 26 }, 27 "data_released": { 28 "applies": true, 29 "answer": false, 30 "justification": "The seed dataset of ~4,500 adversarial prompts is described as from 'internal and public sources' and 'proprietary internal findings' (Section IV-B) but is not released. No download link or dataset archive is provided." 
31 }, 32 "environment_specified": { 33 "applies": true, 34 "answer": false, 35 "justification": "No requirements.txt, Dockerfile, conda environment, or library version specifications are provided. The paper names models used (GPT-4o, DeBERTa-v3, JailJudge, T5-base paraphraser) but provides no environment setup details." 36 }, 37 "reproduction_instructions": { 38 "applies": true, 39 "answer": false, 40 "justification": "No step-by-step reproduction instructions, README, or runnable scripts are provided. The framework architecture is described conceptually but cannot be reproduced without code, prompts, and data." 41 } 42 }, 43 "statistical_methodology": { 44 "confidence_intervals_or_error_bars": { 45 "applies": true, 46 "answer": true, 47 "justification": "Section V-A mentions 'overlapping 95% confidence intervals of the sampling estimates (e.g., It2: 0.6257–0.6794 vs. It5: 0.6417–0.6757)' to support the claim that post-iteration-2 fluctuations are not statistically meaningful." 48 }, 49 "significance_tests": { 50 "applies": true, 51 "answer": false, 52 "justification": "No formal significance tests (p-values, t-tests, etc.) are reported. Comparative claims like 'HM-Max-Sem is the most effective strategy' (Section V-B) are based on comparing point estimates without statistical testing." 53 }, 54 "effect_sizes_reported": { 55 "applies": true, 56 "answer": true, 57 "justification": "Effect sizes are reported with baseline context throughout, e.g., 'dropping classifier accuracy from 95.91% to 37.00%' (Section V-B), delta improvements in Tables V and VI (e.g., '+19.9' for Benign), and percentage drops from baseline in Table III." 58 }, 59 "sample_size_justified": { 60 "applies": true, 61 "answer": false, 62 "justification": "The choice of 500 prompts per run, ~4,500 seed prompts, 1,000 hold-out set, and 10 iterations is stated but never justified. No power analysis or rationale for these specific numbers is provided." 
63 }, 64 "variance_reported": { 65 "applies": true, 66 "answer": false, 67 "justification": "Tables II, III, and IV report single numbers per configuration per iteration with no standard deviation, variance across runs, or indication of how many runs produced each number. The confidence intervals in Section V-A appear to be single-sample binomial intervals, not cross-run variance." 68 } 69 }, 70 "evaluation_design": { 71 "baselines_included": { 72 "applies": true, 73 "answer": true, 74 "justification": "The Base configuration (no hard-mining, no fuzzing) serves as a clear baseline. Multiple configurations are systematically compared against it in Tables II, III, and IV." 75 }, 76 "baselines_contemporary": { 77 "applies": true, 78 "answer": true, 79 "justification": "The base detection model is ProtectAI/deberta-v3-base-prompt-injection (2023), a contemporary prompt-injection detector. The judge model JailJudge (2024) and target model GPT-4o are also recent." 80 }, 81 "ablation_study": { 82 "applies": true, 83 "answer": true, 84 "justification": "The experimental design is structured as a systematic ablation. Section IV-A notes: 'the experiment configurations implicitly implement module ablations by toggling specific parameters.' Fuzzing variants, hard-mining ratios, and combinations are independently toggled across 11 configurations (Table I)." 85 }, 86 "multiple_metrics": { 87 "applies": true, 88 "answer": true, 89 "justification": "Two distinct metrics are used: (i) iteration accuracy measuring baseline detector performance on generated prompts, and (ii) HASTE-optimized model accuracy (H accuracy) measuring retrained detector performance on the held-out evaluation set (Section IV-A)." 90 }, 91 "human_evaluation": { 92 "applies": true, 93 "answer": false, 94 "justification": "No human evaluation is performed. Evaluation relies entirely on automated metrics: the DeBERTa classifier for detection and JailJudge (an LLM) for prompt potency scoring. 
Neither the generated adversarial prompts nor the detection quality was reviewed by humans." 95 }, 96 "held_out_test_set": { 97 "applies": true, 98 "answer": true, 99 "justification": "Section III-A clearly states: 'an evaluation set is held out and remains untouched by any fuzzing, synthetic generation or hard negative mining.' The 1,000-prompt held-out set is used for Table IV H accuracy results, independent of in-loop training data." 100 }, 101 "per_category_breakdown": { 102 "applies": true, 103 "answer": true, 104 "justification": "Tables V and VI provide per-category accuracy breakdowns (Benign, Obfuscation, Objective Manipulation, Other, Role Play) for each configuration on the out-of-loop evaluation set." 105 }, 106 "failure_cases_discussed": { 107 "applies": true, 108 "answer": true, 109 "justification": "Section V-A discusses how combining all fuzzing types counteracts semantic fuzzing's effectiveness. SHAP analysis (Figure 9) examines token-level attributions for false-positive examples. Section V-B notes that HM-Bal shows proportional rather than synergistic gains." 110 }, 111 "negative_results_reported": { 112 "applies": true, 113 "answer": true, 114 "justification": "Several negative results are reported: Base configuration 'produces little to no impact on the maliciousness of generated samples' (Section V-A); combining all fuzzing types counteracts semantic fuzzing's evasive effectiveness; '10 iterations are not necessary' (Section V-B); benchmark sensitivity may be insufficient to capture robustness differences (Section V-C)." 115 } 116 }, 117 "claims_and_evidence": { 118 "abstract_claims_supported": { 119 "applies": true, 120 "answer": true, 121 "justification": "The abstract's claim of '~64% reduction in malicious prompt detection' is supported by Table III (HM-Max-Sem: 95.91% → 31.76%, a drop of 64.15 percentage points). The claim of 'significantly fewer iteration loops' is supported by Table IV showing HM configurations reach ~93% at M5 vs Base requiring M10."
122 }, 123 "causal_claims_justified": { 124 "applies": true, 125 "answer": true, 126 "justification": "Causal claims ('hard negative mining successfully evades baseline detectors,' 'HASTE optimizes detection efficacy') are supported by a controlled ablation design where configurations differ by one or two parameters (Table I), isolating the contribution of fuzzing and hard-mining." 127 }, 128 "generalization_bounded": { 129 "applies": true, 130 "answer": false, 131 "justification": "The title ('LLM Defenses') and abstract ('optimize LLM defense') frame results broadly, but experiments use only one detector (ProtectAI DeBERTa-v3), one target model (GPT-4o), one judge (JailJudge), and one seed dataset. The Future Work section acknowledges the need to 'evaluate multiple base classifier architectures' (Section VI), confirming the limitation without bounding it in claims." 132 }, 133 "alternative_explanations_discussed": { 134 "applies": true, 135 "answer": false, 136 "justification": "The paper offers one interpretation for each result without systematically considering alternatives. For example, the observation that combined fuzzing counteracts semantic fuzzing is noted as 'beyond the scope of this study' (Section V-A) rather than explored. No confound analysis or robustness checks against alternative explanations are provided." 137 }, 138 "proxy_outcome_distinction": { 139 "applies": true, 140 "answer": true, 141 "justification": "The paper measures detector accuracy directly and frames it as such — 'iteration accuracy' and 'H accuracy' are clearly defined as detection metrics (Section IV-A). The paper does not over-claim beyond what the metrics measure, though it uses broader framing in the title and abstract." 
142 } 143 }, 144 "setup_transparency": { 145 "model_versions_specified": { 146 "applies": true, 147 "answer": false, 148 "justification": "The paper specifies 'GPT-4o' as target and generator, 'JailJudge' as judge, 'ProtectAI/deberta-v3-base-prompt-injection' as detector, and 'humarin/chatgpt_paraphraser_on_T5_base' for semantic fuzzing. However, no API snapshot dates or version identifiers are provided for GPT-4o, which changes behavior across versions." 149 }, 150 "prompts_provided": { 151 "applies": true, 152 "answer": false, 153 "justification": "The paper explicitly withholds prompts: 'These specific prompts are not provided to prevent their misuse' (Section III-B). Only natural-language descriptions of what generation strategies do are given." 154 }, 155 "hyperparameters_reported": { 156 "applies": true, 157 "answer": false, 158 "justification": "Some parameters are stated — 5% syntactic mutation rate, maliciousness threshold of 2, 500 prompts per run, 80/20 train/test split. However, critical hyperparameters for GPT-4o generation (temperature, top-p) and DeBERTa fine-tuning (learning rate, epochs, batch size, optimizer) are not reported." 159 }, 160 "scaffolding_described": { 161 "applies": false, 162 "answer": false, 163 "justification": "The HASTE pipeline is a data generation and model training framework, not agentic scaffolding with tools, retry logic, or memory/context management. The LLMs are used as generation and evaluation components in a batch pipeline." 164 }, 165 "data_preprocessing_documented": { 166 "applies": true, 167 "answer": true, 168 "justification": "Section IV-B documents the pipeline: seed dataset partitioned into 3,500 for generation and 1,000 hold-out, prompts deduplicated and standardized, labeled with taxonomy metadata, balanced 50/50 with benign corpus (~40,000 benign examples), and split 80/20 for train/test with stratification." 
169 } 170 }, 171 "limitations_and_scope": { 172 "limitations_section_present": { 173 "applies": true, 174 "answer": false, 175 "justification": "There is no dedicated 'Limitations' or 'Threats to Validity' section. Section VI ('Future Work') discusses areas for improvement but frames them as expansion opportunities rather than honest discussion of what the current work does not show." 176 }, 177 "threats_to_validity_specific": { 178 "applies": true, 179 "answer": false, 180 "justification": "No specific threats to validity are discussed. The Future Work section mentions that 'current benchmarks may not be sensitive enough' (Section VI) but does not identify specific threats to the internal or external validity of the current experiments." 181 }, 182 "scope_boundaries_stated": { 183 "applies": true, 184 "answer": false, 185 "justification": "The paper does not explicitly state what the results do NOT show. The Future Work section implicitly acknowledges gaps (single detector architecture, single target model, fixed fuzzing parameters) but these are framed as expansion opportunities rather than explicit scope boundaries." 186 } 187 }, 188 "data_integrity": { 189 "raw_data_available": { 190 "applies": true, 191 "answer": false, 192 "justification": "No raw data (seed prompts, generated adversarial prompts, evaluation scores, training data) is made available. The seed dataset includes 'proprietary internal findings' (Section III-A) which precludes release." 193 }, 194 "data_collection_described": { 195 "applies": true, 196 "answer": true, 197 "justification": "Section III-A describes the seed dataset: '~4,500 prompts aggregated from internal and public sources' including 'real-world jailbreaks observed in the wild, public benchmark datasets, proprietary internal findings, and crowd-sourced CTF-style challenges.' Prompts were 'deduplicated and standardized' and labeled by attack taxonomy." 
198 }, 199 "recruitment_methods_described": { 200 "applies": false, 201 "answer": false, 202 "justification": "No human participants are involved in this study. The data consists of adversarial and benign prompts from existing datasets and internal sources." 203 }, 204 "data_pipeline_documented": { 205 "applies": true, 206 "answer": false, 207 "justification": "The pipeline stages are described conceptually with figures (Figures 1, 3-8), but exact counts at each filtering stage are not provided. For example, how many prompts passed the evaluation threshold (score ≥ 2), how many survived deduplication, and how many fuzzed variants were produced per iteration are not reported." 208 } 209 }, 210 "conflicts_of_interest": { 211 "funding_disclosed": { 212 "applies": true, 213 "answer": false, 214 "justification": "No funding disclosure or acknowledgments section is present. All five authors are employed by Palo Alto Networks, a major cybersecurity company, but no funding statement is included." 215 }, 216 "affiliations_disclosed": { 217 "applies": true, 218 "answer": true, 219 "justification": "All authors are listed as Palo Alto Networks employees with @paloaltonetworks.com email addresses in the paper header." 220 }, 221 "funder_independent_of_outcome": { 222 "applies": true, 223 "answer": false, 224 "justification": "Palo Alto Networks is a cybersecurity company that sells LLM security products. The company has a direct financial interest in demonstrating that proactive defense hardening frameworks like HASTE are effective. The funder (employer) is not independent of the outcome." 225 }, 226 "financial_interests_declared": { 227 "applies": true, 228 "answer": false, 229 "justification": "No competing interests statement, patent disclosure, or financial interests declaration is present in the paper." 
230 } 231 }, 232 "contamination": { 233 "training_cutoff_stated": { 234 "applies": false, 235 "answer": false, 236 "justification": "This paper tests prompt-injection detection models, not a pre-trained model's knowledge or capability on a benchmark. The evaluation is about whether a classifier can identify adversarial prompts, not whether an LLM has memorized benchmark answers." 237 }, 238 "train_test_overlap_discussed": { 239 "applies": false, 240 "answer": false, 241 "justification": "Not applicable: the paper evaluates defense detection systems rather than pre-trained model capability on benchmarks. Standard train/test separation is used for the classifier but benchmark contamination in the traditional sense does not apply." 242 }, 243 "benchmark_contamination_addressed": { 244 "applies": false, 245 "answer": false, 246 "justification": "Not applicable: the paper tests defense detection efficacy, not pre-trained model knowledge. Contamination of the LLM's training data with benchmark answers is not the relevant concern here." 247 } 248 }, 249 "human_studies": { 250 "pre_registered": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants are involved in this study." 254 }, 255 "irb_or_ethics_approval": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants are involved in this study." 259 }, 260 "demographics_reported": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants are involved in this study." 264 }, 265 "inclusion_exclusion_criteria": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants are involved in this study." 269 }, 270 "randomization_described": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants are involved in this study." 274 }, 275 "blinding_described": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants are involved in this study."
279 }, 280 "attrition_reported": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants are involved in this study." 284 } 285 }, 286 "cost_and_practicality": { 287 "inference_cost_reported": { 288 "applies": true, 289 "answer": false, 290 "justification": "No API costs, token counts, or latency figures are reported despite extensive use of GPT-4o for generation, evaluation, and labeling across 10 iterations of 11 configurations." 291 }, 292 "compute_budget_stated": { 293 "applies": true, 294 "answer": false, 295 "justification": "No GPU hours, total API spend, hardware specifications, or training time for DeBERTa fine-tuning are reported." 296 } 297 }, 298 "experimental_rigor": { 299 "seed_sensitivity_reported": { 300 "applies": true, 301 "answer": false, 302 "justification": "No multi-seed experiments are reported. Tables II, III, and IV show single numbers per configuration per iteration with no indication of variability across random seeds." 303 }, 304 "number_of_runs_stated": { 305 "applies": true, 306 "answer": false, 307 "justification": "The number of experimental runs producing each result is not stated. It is unclear whether each table entry represents a single run or an average across multiple runs." 308 }, 309 "hyperparameter_search_budget": { 310 "applies": true, 311 "answer": false, 312 "justification": "No hyperparameter search budget is reported for DeBERTa fine-tuning or for tuning the HASTE pipeline parameters. The evaluation threshold is set to 2 based on a single observation ('65% of generated samples scored 1') without systematic search." 313 }, 314 "best_config_selection_justified": { 315 "applies": true, 316 "answer": true, 317 "justification": "All 11 configurations are reported transparently in Tables II-IV rather than cherry-picking the best. The paper presents the full comparison grid, allowing readers to assess each configuration's relative merit." 
318 }, 319 "multiple_comparison_correction": { 320 "applies": true, 321 "answer": false, 322 "justification": "No formal statistical tests are used, so no multiple comparison correction is applied. With 11 configurations compared across 10+ iterations, the risk of spurious differences is high but unaddressed." 323 }, 324 "self_comparison_bias_addressed": { 325 "applies": true, 326 "answer": false, 327 "justification": "The authors evaluate their own HASTE framework against their own baseline configurations using their own seed dataset and pipeline. No acknowledgment of self-comparison bias or independent evaluation is provided." 328 }, 329 "compute_budget_vs_performance": { 330 "applies": true, 331 "answer": false, 332 "justification": "Different configurations require different amounts of compute (e.g., fuzzing adds processing, 10 iterations vs 5), but performance is not reported as a function of compute budget. The 50% reduction in iterations is framed as efficiency but not quantified in compute terms." 333 }, 334 "benchmark_construct_validity": { 335 "applies": true, 336 "answer": false, 337 "justification": "The Future Work section briefly notes that 'current benchmarks may not be sensitive enough to capture the deeper robustness learned from these harder samples' (Section VI), but no analysis of construct validity is performed. Whether iteration accuracy and H accuracy truly measure 'robustness' and 'evasiveness' is not examined." 338 }, 339 "scaffold_confound_addressed": { 340 "applies": false, 341 "answer": false, 342 "justification": "The HASTE pipeline configurations ARE the variable being studied, not a confound. The detection model is held constant (DeBERTa-v3) across all comparisons, and the pipeline differences are the intentional experimental manipulation." 
343 } 344 }, 345 "data_leakage": { 346 "temporal_leakage_addressed": { 347 "applies": true, 348 "answer": false, 349 "justification": "The paper does not discuss whether GPT-4o's training data might contain similar adversarial prompts to those in the seed set, which could affect generation quality. The held-out evaluation set addresses train/test leakage for the classifier but temporal leakage for the generative components is not discussed." 350 }, 351 "feature_leakage_addressed": { 352 "applies": true, 353 "answer": false, 354 "justification": "The paper does not discuss whether the evaluation setup provides information that would not be available in real deployment. For example, whether the LLM-as-a-Judge scores leak signal about prompt structure back into the pipeline is not analyzed." 355 }, 356 "non_independence_addressed": { 357 "applies": true, 358 "answer": false, 359 "justification": "The held-out evaluation set and in-loop training data are drawn from the same ~4,500 prompt seed dataset. Whether these two partitions are truly independent in distribution (e.g., same authors, same attack patterns) is not discussed." 360 }, 361 "leakage_detection_method": { 362 "applies": true, 363 "answer": false, 364 "justification": "No concrete leakage detection or prevention method (canary strings, membership inference, n-gram overlap, decontamination pipeline) is applied." 365 } 366 } 367 }, 368 "claims": [ 369 { 370 "claim": "Hard negative mining with semantic fuzzing reduces baseline detector accuracy from 95.91% to 31.76% (a drop of ~64 percentage points).", 371 "evidence": "Table III shows HM-Max+Sem iteration accuracy dropping from 95.91% (It0) to 31.76% (It10).
The abstract reports 'approximately 64%' reduction.", 372 "supported": "strong" 373 }, 374 { 375 "claim": "HASTE-optimized models achieve comparable final accuracy (~93-94%) with 50% fewer iteration loops than baseline strategies.", 376 "evidence": "Table IV shows HM configurations reaching ~93% H accuracy at M5, while the Base configuration only reaches 81.06% at M5 and needs M10 to reach 92.80%. Section V-C states 'at least a 50% reduction of the number of HASTE iteration loops.'", 377 "supported": "strong" 378 }, 379 { 380 "claim": "Semantic fuzzing is the most effective individual fuzzing method for producing evasive prompts.", 381 "evidence": "Table II shows Base+Sem reducing accuracy to 65.26% vs 71.62% (Syntactic), 70.86% (Format), and 70.37% (All). Section V-A concludes Base+Sem 'comparatively produced the largest percentage of prompts to successfully evade detection.'", 382 "supported": "strong" 383 }, 384 { 385 "claim": "Most adversarial generation improvement plateaus after the first or second iteration.", 386 "evidence": "Tables II and III show the largest accuracy drops at It1, with minimal further degradation through It10. Section V-A notes 'the peak improvement for producing evasive malicious prompts typically occurs in the second iteration' and Section V-B states 'the utility of all strategies plateaus almost immediately after the first iteration.'", 387 "supported": "strong" 388 }, 389 { 390 "claim": "Combining all fuzzing types (semantic + syntactic + format) counteracts the evasive effectiveness of semantic fuzzing alone.", 391 "evidence": "Table II: Base+All achieves 70.37% accuracy vs Base+Sem at 65.26%. Section V-A states 'the addition of syntactic and format fuzzing may counteract the evasive effectiveness of semantic fuzzing.' 
SHAP analysis (Figure 9) shows syntactic changes dominate token attributions.", 392 "supported": "moderate" 393 }, 394 { 395 "claim": "The HASTE framework provides a self-improving mechanism that makes detectors increasingly robust over time.", 396 "evidence": "Table IV shows H accuracy improving from 82.12% (M0) to 93-94% (M10) across configurations. However, 'self-improving' is somewhat overclaimed — the improvement plateaus quickly and the framework is tested on a single detector with a single seed dataset.", 397 "supported": "moderate" 398 } 399 ], 400 "red_flags": [ 401 { 402 "flag": "Company evaluating its own framework", 403 "detail": "All five authors are Palo Alto Networks employees evaluating a Palo Alto Networks security framework. No competing interests statement is provided, and the company has direct financial interest in demonstrating effective LLM defense hardening." 404 }, 405 { 406 "flag": "Prompts and code withheld — reproduction impossible", 407 "detail": "Generation prompts are explicitly withheld ('to prevent misuse'), no source code is released, and the seed dataset includes proprietary data. Independent reproduction of the experimental results is not possible." 408 }, 409 { 410 "flag": "Single detector architecture", 411 "detail": "All experiments use only ProtectAI DeBERTa-v3 as the base detector. The paper's broad claims about 'LLM Defenses' rest entirely on one model, undermining generalizability. The Future Work section acknowledges the need to 'evaluate multiple base classifier architectures.'" 412 }, 413 { 414 "flag": "No variance or multi-run reporting", 415 "detail": "All tables report single point estimates per configuration per iteration. Without variance across seeds or multiple runs, it is impossible to assess whether observed differences between configurations are reliable or within noise." 
416 }, 417 { 418 "flag": "No cost reporting despite heavy API usage", 419 "detail": "The pipeline uses GPT-4o extensively for generation, evaluation (target model), and labeling across 11 configurations × 10 iterations, but reports zero cost information. Practitioners cannot assess whether HASTE is economically viable." 420 } 421 ], 422 "cited_papers": [ 423 { 424 "title": "Universal and transferable adversarial attacks on aligned language models", 425 "authors": ["A. Zou", "Z. Wang", "N. Carlini", "M. Nasr", "J. Z. Kolter", "M. Fredrikson"], 426 "year": 2023, 427 "arxiv_id": "2307.15043", 428 "relevance": "Foundational work on automatically generated adversarial suffixes that reliably bypass LLM alignment, directly motivating the need for frameworks like HASTE." 429 }, 430 { 431 "title": "Attack and defense techniques in large language models: A survey and new perspectives", 432 "authors": ["Z. Liao", "K. Chen", "Y. Lin", "K. Li", "Y. Liu", "H. Chen", "X. Huang", "Y. Yu"], 433 "year": 2025, 434 "arxiv_id": "2505.00976", 435 "relevance": "Comprehensive survey of LLM attacks and defenses that categorizes adversarial prompts, optimization attacks, and application-level threats." 436 }, 437 { 438 "title": "Efficient Detection of Toxic Prompts in Large Language Models", 439 "authors": ["Y. Liu", "J. Yu", "H. Sun", "L. Shi", "G. Deng", "Y. Chen", "Y. Liu"], 440 "year": 2024, 441 "doi": "10.1145/3691620.3695018", 442 "relevance": "Introduces ToxicDetector, an embedding-based approach to toxic prompt detection that represents the progression from textual to hybrid detection models." 443 }, 444 { 445 "title": "Llama Guard: LLM-based input-output safeguard for human-AI conversations", 446 "authors": ["H. Inan", "K. Upasani", "J. Chi"], 447 "year": 2023, 448 "arxiv_id": "2312.06674", 449 "relevance": "Foundational LLM safety guardrail model that frames prompt safety as structured classification, one of the detection models the HASTE framework aims to improve upon." 
450 }, 451 { 452 "title": "NeMo Guardrails: A toolkit for controllable and safe LLM applications with programmable rails", 453 "authors": ["T. Rebedea", "R. Dinu", "M. Sreedhar", "C. Parisien", "J. Cohen"], 454 "year": 2023, 455 "arxiv_id": "2310.10501", 456 "relevance": "Programmable runtime guardrail framework implementing dialogue control flows and safe tool-use routing for LLM applications." 457 }, 458 { 459 "title": "JailbreakBench: An open robustness benchmark for jailbreaking large language models", 460 "authors": ["P. Chao", "E. Debenedetti", "A. Robey"], 461 "year": 2024, 462 "arxiv_id": "2404.01318", 463 "relevance": "Standardized benchmark for evaluating jailbreak effectiveness with consistent evaluation and cross-model comparison." 464 }, 465 { 466 "title": "HarmBench: A standardized evaluation framework for automated red teaming and robust refusal", 467 "authors": ["M. Mazeika", "L. Phan", "X. Yin", "A. Zou"], 468 "year": 2024, 469 "arxiv_id": "2402.04249", 470 "relevance": "Standardized red-teaming benchmark focusing on harmful behaviors, providing automated evaluation of LLM safety." 471 }, 472 { 473 "title": "A comprehensive survey in LLM(-agent) full stack safety: Data, training and deployment", 474 "authors": ["K. Wang"], 475 "year": 2025, 476 "arxiv_id": "2504.15585", 477 "relevance": "Comprehensive survey covering the full stack of LLM safety from data to deployment, relevant to understanding the defense landscape HASTE targets." 478 }, 479 { 480 "title": "JailGuard: A universal detection framework for LLM prompt-based attacks", 481 "authors": ["X. Zhang", "C. Zhang", "T. Li"], 482 "year": 2025, 483 "arxiv_id": "2312.10766", 484 "relevance": "Universal detection framework for prompt-based attacks that demonstrates detection models struggle with out-of-distribution attacks — a problem HASTE addresses." 485 }, 486 { 487 "title": "AID: Adaptive integration of detectors for safe AI with language models", 488 "authors": ["X. Wang", "E. Diao", "Q. 
Le", "J. Ding", "A. Anwar"], 489 "year": 2025, 490 "relevance": "Demonstrates that combining multiple detectors improves adaptability over single-model approaches, providing context for HASTE's single-detector evaluation limitation." 491 }, 492 { 493 "title": "Building guardrails for large language models", 494 "authors": ["Y. Dong", "R. Mu", "G. Jin"], 495 "year": 2024, 496 "arxiv_id": "2402.01822", 497 "relevance": "Describes multi-layered safety architectures for LLMs that HASTE's detection hardening component fits within." 498 }, 499 { 500 "title": "JailJudge: A comprehensive jailbreak judge benchmark with multi-agent enhanced explanation evaluation framework", 501 "authors": ["F. Liu", "Y. Feng", "Z. Xu"], 502 "year": 2024, 503 "arxiv_id": "2410.12855", 504 "relevance": "The LLM-as-a-judge model used in HASTE's evaluation stage for scoring prompt-response pair maliciousness." 505 } 506 ], 507 "engagement_factors": { 508 "practical_relevance": { 509 "score": 2, 510 "justification": "The HASTE concept is directly applicable for security teams hardening LLM defenses, but no code, prompts, or data are released for practitioners to actually use." 511 }, 512 "surprise_contrarian": { 513 "score": 1, 514 "justification": "Results largely confirm expected patterns (hard-negative mining helps, iterations plateau); the finding that combined fuzzing counteracts semantic fuzzing is mildly surprising." 515 }, 516 "fear_safety": { 517 "score": 2, 518 "justification": "Demonstrates that adversarial prompts can reduce detection model accuracy from 96% to 32%, highlighting the vulnerability of current prompt-injection detectors." 519 }, 520 "drama_conflict": { 521 "score": 0, 522 "justification": "No controversy, no challenge to specific products or companies, no dramatic narrative." 523 }, 524 "demo_ability": { 525 "score": 0, 526 "justification": "No code, demo, or tool released. Prompts explicitly withheld." 
527 }, 528 "brand_recognition": { 529 "score": 1, 530 "justification": "Palo Alto Networks is well-known in cybersecurity but not a top-tier AI research lab in public perception." 531 } 532 } 533 }