scan.json (29925B)
1 { 2 "paper": { 3 "title": "PromptScreen: Efficient Jailbreak Mitigation Using Semantic Linear Classification in a Multi-Staged Pipeline", 4 "authors": [ 5 "Akshaj Prashanth Rao", 6 "Advait Singh", 7 "Saumya Kumaar Saksena", 8 "Dhruv Kumar" 9 ], 10 "year": 2025, 11 "venue": "arXiv", 12 "arxiv_id": "2512.19011" 13 }, 14 "scan_version": 3, 15 "active_modules": ["experimental_rigor", "data_leakage"], 16 "methodology_tags": ["benchmark-eval"], 17 "key_findings": "PromptScreen proposes a multi-stage defense pipeline against LLM jailbreak and prompt injection attacks, with a lightweight TF-IDF + Linear SVM classifier as its core component. The SVM achieves 93.4% accuracy and 96.5% specificity on a held-out test set drawn from 30,937 labeled prompts, while the full pipeline (SVM + VectorDB + classifier cluster) achieves 0% ASR at 47s average latency versus 450s for ShieldGemma. An ablation study shows character n-gram (2,4) features outperform word-level features under adversarial perturbations, reaching 94.09% accuracy.", 18 "checklist": { 19 "artifacts": { 20 "code_released": { 21 "applies": true, 22 "answer": true, 23 "justification": "Section 7 provides a GitHub URL: https://github.com/dronefreak/PromptScreen. The paper states 'the source code is available' at this link." 24 }, 25 "data_released": { 26 "applies": true, 27 "answer": true, 28 "justification": "Section 3.2 states 'Both the corpus and associated preprocessing scripts are included in the open-source release of this work.' The adversarial data also references publicly available datasets (ADV-LLM, gentelbench-v1)." 29 }, 30 "environment_specified": { 31 "applies": true, 32 "answer": false, 33 "justification": "No requirements.txt, Dockerfile, conda environment file, or detailed dependency specifications are provided in the paper. Library versions are not listed." 
34 }, 35 "reproduction_instructions": { 36 "applies": true, 37 "answer": false, 38 "justification": "No step-by-step reproduction instructions are included in the paper. While code is released, the paper contains no 'Reproducing Results' section or specific commands to run." 39 } 40 }, 41 "statistical_methodology": { 42 "confidence_intervals_or_error_bars": { 43 "applies": true, 44 "answer": false, 45 "justification": "Tables 2, 3, and 4 report only point estimates (e.g., 93.40% accuracy) with no confidence intervals, error bars, or ± notation." 46 }, 47 "significance_tests": { 48 "applies": true, 49 "answer": false, 50 "justification": "The paper claims the SVM 'outperforms' ShieldGemma and the classifier cluster based solely on comparing raw numbers in Table 2. No statistical significance tests (p-values, t-tests, etc.) are reported for any comparison." 51 }, 52 "effect_sizes_reported": { 53 "applies": true, 54 "answer": true, 55 "justification": "The paper reports improvements with baseline context: accuracy from 35.1% (ShieldGemma) to 93.4% (LSVM), latency from ~450s to 47s, and block rate from 50.62% to 96.50% across defense subsets in Tables 2 and 3." 56 }, 57 "sample_size_justified": { 58 "applies": true, 59 "answer": false, 60 "justification": "No justification is given for why 30,937 prompts were chosen, why the test set is ~2,000 prompts, or whether this is sufficient for the claims made. No power analysis is discussed." 61 }, 62 "variance_reported": { 63 "applies": true, 64 "answer": false, 65 "justification": "No standard deviation, variance across runs, or spread measures are reported anywhere. All results appear to be from single runs." 66 } 67 }, 68 "evaluation_design": { 69 "baselines_included": { 70 "applies": true, 71 "answer": true, 72 "justification": "Table 2 compares the LSVM against ShieldGemma-2B and a classifier cluster. Table 3 compares multiple defense subset configurations. Table 4 compares 8 SVM ablation configurations." 
73 }, 74 "baselines_contemporary": { 75 "applies": true, 76 "answer": true, 77 "justification": "ShieldGemma (Google, 2024) is a recent production-grade defense. The classifier cluster uses contemporary models (textdetox/xlmr-large-toxicity-classifier-v2, jackhhao/jailbreak-classifier)." 78 }, 79 "ablation_study": { 80 "applies": true, 81 "answer": true, 82 "justification": "Section 6 presents a systematic ablation study over 8 SVM feature configurations (word n-grams, character n-grams, hybrid), isolating the contribution of each representation type. Table 3 also ablates pipeline components." 83 }, 84 "multiple_metrics": { 85 "applies": true, 86 "answer": true, 87 "justification": "The paper reports accuracy, precision, sensitivity (recall), specificity, negative predictive value, Attack Success Rate, and Time-to-Classify across multiple tables." 88 }, 89 "human_evaluation": { 90 "applies": true, 91 "answer": false, 92 "justification": "No human evaluation of defense outputs is performed. All evaluation is automated using classification metrics and an LLM-as-a-judge for ASR verification." 93 }, 94 "held_out_test_set": { 95 "applies": true, 96 "answer": true, 97 "justification": "Section 3.2 explicitly states: 'A held-out test set of more than 2,000 prompts is reserved exclusively for evaluation of defense configurations and Attack Success Rate. No prompt appears in more than one split.'" 98 }, 99 "per_category_breakdown": { 100 "applies": true, 101 "answer": true, 102 "justification": "Table 2 provides per-defense breakdowns. Table 3 shows per-configuration defense effectiveness. Table 4 provides per-SVM-configuration breakdowns across all metrics. Table 1 shows dataset composition by category." 
103 }, 104 "failure_cases_discussed": { 105 "applies": true, 106 "answer": false, 107 "justification": "The paper does not discuss specific examples of misclassified prompts, analyze why certain attacks bypass particular defense stages, or provide qualitative error analysis of the main system." 108 }, 109 "negative_results_reported": { 110 "applies": true, 111 "answer": true, 112 "justification": "Appendix A.1 explicitly reports 'defenses that were implemented but did not meet our robustness or usability requirements,' including the Heuristic Vector Analyzer and Polymorphic Prompt Assembly, with explanations of why they failed." 113 } 114 }, 115 "claims_and_evidence": { 116 "abstract_claims_supported": { 117 "applies": true, 118 "answer": true, 119 "justification": "Abstract claims of 93.4% accuracy, 96.5% specificity, 10× lower latency than ShieldGemma, and improvement from 35.1% to 93.4% are all supported by Tables 2 and 3." 120 }, 121 "causal_claims_justified": { 122 "applies": true, 123 "answer": true, 124 "justification": "Causal claims (e.g., adding SVM to the pipeline reduces ASR) are supported by the ablation design: Table 3 shows defense subsets with and without the SVM, and Section 6 systematically varies one feature representation at a time while holding all else constant." 125 }, 126 "generalization_bounded": { 127 "applies": true, 128 "answer": false, 129 "justification": "The abstract claims 'staged, resource-efficient defenses can robustly secure modern LLM-driven applications' and the conclusion claims 'scalable, resource-efficient protection against prompt-based adversarial behavior,' but results are from one target model (gpt-oss:20b), one curated dataset, English only, and single-turn only. Limitations section acknowledges some bounds but the title and abstract make broader claims." 
130 }, 131 "alternative_explanations_discussed": { 132 "applies": true, 133 "answer": false, 134 "justification": "The paper does not discuss alternative explanations for the results, such as whether the dataset characteristics favor TF-IDF+SVM approaches, whether the attack styles are particularly amenable to surface-pattern detection, or whether performance would hold on attacks drawn from different distributions." 135 }, 136 "proxy_outcome_distinction": { 137 "applies": true, 138 "answer": true, 139 "justification": "The paper measures classification accuracy and ASR on a labeled prompt corpus, which directly corresponds to the claimed defense effectiveness. The measurements match the granularity of the claims without proxy gaps." 140 } 141 }, 142 "setup_transparency": { 143 "model_versions_specified": { 144 "applies": true, 145 "answer": false, 146 "justification": "The target LLM is 'gpt-oss:20b' which is not a widely known model and lacks a snapshot/version date. The attack evaluator is described only as 'Gemini' without any version specification. ShieldGemma-2B is identified by size but not version/snapshot." 147 }, 148 "prompts_provided": { 149 "applies": true, 150 "answer": false, 151 "justification": "No actual prompt text is provided for the LLM-as-a-judge attack evaluator, the ShieldGemma safety guidelines, or any other prompt-based component. The paper describes what prompts do conceptually but does not reproduce the actual text." 152 }, 153 "hyperparameters_reported": { 154 "applies": true, 155 "answer": false, 156 "justification": "No hyperparameters are reported: no SVM C parameter, no TF-IDF min/max_df settings, no VectorDB similarity threshold τ or k values, no temperature or sampling parameters for any LLM, no learning rates for any trained component." 157 }, 158 "scaffolding_described": { 159 "applies": false, 160 "answer": false, 161 "justification": "The paper does not use agentic scaffolding. 
The multi-stage defense pipeline is the system being evaluated, not an agent scaffold." 162 }, 163 "data_preprocessing_documented": { 164 "applies": true, 165 "answer": true, 166 "justification": "Section 3.3.1 provides a detailed 7-step text preprocessing pipeline: lowercasing, emoji-to-text conversion, punctuation stripping, tokenization, alphabetic filtering, stopword removal, and POS-aware WordNet lemmatization." 167 } 168 }, 169 "limitations_and_scope": { 170 "limitations_section_present": { 171 "applies": true, 172 "answer": true, 173 "justification": "Section 8 'Limitations' contains three specific subsections addressing multilingual constraints, multi-turn awareness, and generalization to zero-day attacks." 174 }, 175 "threats_to_validity_specific": { 176 "applies": true, 177 "answer": true, 178 "justification": "The limitations are specific to this study: 'preprocessing pipeline is optimized for English-language prompts,' 'pipeline primarily evaluates prompts as isolated, single-turn inputs,' and 'performance is fundamentally bounded by the diversity of its training corpus.'" 179 }, 180 "scope_boundaries_stated": { 181 "applies": true, 182 "answer": true, 183 "justification": "Section 8 explicitly states what was not tested: multilingual attacks, multi-turn attacks, and truly novel zero-day attack strategies. These are specific scope boundaries." 
184 } 185 }, 186 "data_integrity": { 187 "raw_data_available": { 188 "applies": true, 189 "answer": true, 190 "justification": "Section 3.2 states 'Both the corpus and associated preprocessing scripts are included in the open-source release of this work to facilitate continued benchmarking and community evaluation.'" 191 }, 192 "data_collection_described": { 193 "applies": true, 194 "answer": true, 195 "justification": "Section 3.2 describes data sources in detail: manually curated jailbreaks from prior literature, automated adversarial prompts from ADV-LLM, prompt-injection examples from gentelbench-v1, and benign prompts from public instruction-following datasets." 196 }, 197 "recruitment_methods_described": { 198 "applies": false, 199 "answer": false, 200 "justification": "No human participants. Data sources are public datasets and prior academic literature." 201 }, 202 "data_pipeline_documented": { 203 "applies": true, 204 "answer": false, 205 "justification": "While the paper describes data sources and final composition (Table 1: 30,937 total), there is no documentation of filtering stages — how many prompts were initially collected from each source, how many were removed by 'rule-based validation and manual verification,' or what the inclusion/exclusion criteria were at each stage." 206 } 207 }, 208 "conflicts_of_interest": { 209 "funding_disclosed": { 210 "applies": true, 211 "answer": false, 212 "justification": "No funding sources are mentioned. Section 9 'Acknowledgement' only acknowledges ChatGPT use for grammar improvement." 213 }, 214 "affiliations_disclosed": { 215 "applies": true, 216 "answer": true, 217 "justification": "Author affiliations are clearly listed: BITS Pilani and Trustwise. Email addresses include institutional identifiers." 218 }, 219 "funder_independent_of_outcome": { 220 "applies": true, 221 "answer": false, 222 "justification": "Cannot assess funder independence because funding is not disclosed. 
One author is affiliated with 'Trustwise,' which appears to be a commercial entity potentially in the AI trust/safety space, raising questions about financial interest in the outcome." 223 }, 224 "financial_interests_declared": { 225 "applies": true, 226 "answer": false, 227 "justification": "No competing interests or financial disclosure statement is present. The Trustwise affiliation is listed but the nature of any financial relationship is not addressed." 228 } 229 }, 230 "contamination": { 231 "training_cutoff_stated": { 232 "applies": false, 233 "answer": false, 234 "justification": "The paper evaluates a defense system (SVM + pipeline), not a pre-trained model's capability on a knowledge benchmark. The LLM (gpt-oss:20b) is the downstream target, not the system under evaluation." 235 }, 236 "train_test_overlap_discussed": { 237 "applies": false, 238 "answer": false, 239 "justification": "The paper tests defense mechanisms against adversarial prompts, not a pre-trained model's knowledge on a benchmark. Contamination in the traditional sense is not applicable." 240 }, 241 "benchmark_contamination_addressed": { 242 "applies": false, 243 "answer": false, 244 "justification": "The paper evaluates a defense pipeline, not a pre-trained model's benchmark performance. The SVM is trained on the paper's own curated dataset, not a pre-existing benchmark." 245 } 246 }, 247 "human_studies": { 248 "pre_registered": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants in this study. All evaluation is on automated prompt classification." 252 }, 253 "irb_or_ethics_approval": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants. The study evaluates automated defense mechanisms on a prompt corpus." 257 }, 258 "demographics_reported": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in the study." 
262 }, 263 "inclusion_exclusion_criteria": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in the study." 267 }, 268 "randomization_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in the study." 272 }, 273 "blinding_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in the study." 277 }, 278 "attrition_reported": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants in the study." 282 } 283 }, 284 "cost_and_practicality": { 285 "inference_cost_reported": { 286 "applies": true, 287 "answer": true, 288 "justification": "Table 3 reports average Time-to-Classify (TTC) for each defense configuration: 47.24s for the SVM-based pipeline, 30.13s for VectorDB+Classifier, and 450.35s for the ShieldGemma configuration." 289 }, 290 "compute_budget_stated": { 291 "applies": true, 292 "answer": false, 293 "justification": "No total computational budget is stated — no GPU hours, training time for the SVM or embedding models, hardware specifications, or total API spend for LLM evaluation." 294 } 295 }, 296 "experimental_rigor": { 297 "seed_sensitivity_reported": { 298 "applies": true, 299 "answer": false, 300 "justification": "No mention of multiple random seeds or seed sensitivity analysis. All results appear to be from single runs." 301 }, 302 "number_of_runs_stated": { 303 "applies": true, 304 "answer": false, 305 "justification": "The number of experimental runs is never stated. Results in Tables 2-4 are presented without indicating whether they are from single or multiple runs." 306 }, 307 "hyperparameter_search_budget": { 308 "applies": true, 309 "answer": false, 310 "justification": "No hyperparameter search budget is reported for the SVM, TF-IDF vectorizer, VectorDB threshold, or any other component. Section 6 ablates feature representations but not hyperparameters." 
311 }, 312 "best_config_selection_justified": { 313 "applies": true, 314 "answer": false, 315 "justification": "The paper presents the 'baseline' SVM configuration in Table 2 and the char n-gram (2,4) as best in Table 4, but does not explain how or why the baseline configuration was selected, or whether selection was done on validation or test data." 316 }, 317 "multiple_comparison_correction": { 318 "applies": true, 319 "answer": false, 320 "justification": "Eight SVM configurations are compared in Table 4 and three defense subsets in Table 3, but no statistical tests are performed at all, let alone corrections for multiple comparisons." 321 }, 322 "self_comparison_bias_addressed": { 323 "applies": true, 324 "answer": false, 325 "justification": "The authors do not acknowledge the bias of evaluating their own system. Their SVM is compared against their own implementation/deployment of ShieldGemma and other baselines without discussing whether the baselines were optimally configured." 326 }, 327 "compute_budget_vs_performance": { 328 "applies": true, 329 "answer": false, 330 "justification": "While TTC is reported alongside accuracy, performance is not systematically analyzed as a function of compute. The SVM uses far less compute than ShieldGemma but this asymmetry is not formally addressed." 331 }, 332 "benchmark_construct_validity": { 333 "applies": true, 334 "answer": false, 335 "justification": "The paper does not discuss whether its curated 30,937-prompt corpus validly represents the distribution of real-world jailbreak attacks, or whether accuracy on this corpus translates to real-world defense effectiveness." 336 }, 337 "scaffold_confound_addressed": { 338 "applies": false, 339 "answer": false, 340 "justification": "The paper evaluates defense pipeline configurations, not model comparisons across different scaffolds. The pipeline IS the system under test." 
341 } 342 }, 343 "data_leakage": { 344 "temporal_leakage_addressed": { 345 "applies": true, 346 "answer": false, 347 "justification": "No discussion of temporal relationships between training and test data, or whether the SVM's training corpus temporally overlaps with the held-out test set in ways that could inflate performance." 348 }, 349 "feature_leakage_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "No discussion of whether the evaluation setup leaks information (e.g., whether test prompts share templates, sources, or structural patterns with training prompts that enable surface-level matching rather than genuine generalization)." 353 }, 354 "non_independence_addressed": { 355 "applies": true, 356 "answer": false, 357 "justification": "The paper states 'No prompt appears in more than one split,' but does not address whether prompts from the same source (e.g., ADV-LLM) in train and test splits share structural templates or are near-duplicates, which would inflate generalization estimates." 358 }, 359 "leakage_detection_method": { 360 "applies": true, 361 "answer": false, 362 "justification": "No concrete leakage detection or prevention method is applied (no n-gram overlap analysis, deduplication, or temporal splits)." 363 } 364 } 365 }, 366 "claims": [ 367 { 368 "claim": "The Text Processing + Semantic LSVM achieves 93.4% accuracy and 96.5% specificity on held-out data.", 369 "evidence": "Table 2 reports accuracy 0.9340 and specificity 0.9650 for the LSVM module on the held-out test set (Section 5.1).", 370 "supported": "moderate" 371 }, 372 { 373 "claim": "The full pipeline (SVM + VectorDB + Classifier) achieves 0% ASR across 1,456 evaluated adversarial attempts: 1,405 are blocked and none of the remaining 51 succeeds.", 374 "evidence": "Table 3 shows 0 successful attacks out of 51 attempted (1,405 blocked) for the {SVM, VectorDB, Classifier} configuration. 
Verified using Gemini as LLM-as-a-judge.", 375 "supported": "moderate" 376 }, 377 { 378 "claim": "The SVM-based configuration yields over 10× lower latency than ShieldGemma while improving accuracy from 35.1% to 93.4%.", 379 "evidence": "Table 3 shows TTC of 47.24s for SVM config vs 450.35s for the ShieldGemma config (a ratio of about 9.5×, i.e. close to but not over 10×). Table 2 shows accuracy 0.3513 for ShieldGemma vs 0.9340 for LSVM.", 380 "supported": "moderate" 381 }, 382 { 383 "claim": "Character n-gram (2,4) TF-IDF features yield the highest SVM accuracy at 94.09% under adversarial perturbations.", 384 "evidence": "Table 4 (Section 6) shows char n-gram (2,4) achieving 0.9409 accuracy, the highest among 8 tested configurations on an augmented dataset with leetspeak, homoglyphs, and whitespace perturbations.", 385 "supported": "moderate" 386 }, 387 { 388 "claim": "Lightweight classical ML defenses can robustly secure modern LLM-driven applications against prompt injection and jailbreak attacks.", 389 "evidence": "Supported by results on one target model (gpt-oss:20b) and one curated dataset. Conclusion (Section 7) generalizes to 'modern LLM-powered agents' broadly.", 390 "supported": "weak" 391 } 392 ], 393 "red_flags": [ 394 { 395 "flag": "Potentially unfair baseline comparison", 396 "detail": "ShieldGemma shows only 35.1% accuracy and 4.6% specificity, meaning it flags nearly all inputs (including benign) as malicious. This suggests ShieldGemma may be misconfigured, used with an inappropriate threshold, or not designed for this binary classification task. The paper does not investigate or explain why a production-grade defense performs so poorly." 397 }, 398 { 399 "flag": "No statistical rigor in comparisons", 400 "detail": "All claims of superiority are based on comparing raw point estimates from what appear to be single runs. No confidence intervals, significance tests, or multi-run variance are reported for any experiment." 
401 }, 402 { 403 "flag": "Zero ASR claim on limited test set", 404 "detail": "Claiming 0% ASR sounds definitive but is based on only 51 prompts that passed the defense stack (from ~1,456 tested). This is a property of the specific test corpus, not a guarantee of robustness. The phrasing 'eliminated all attacks' overstates what the evidence shows." 405 }, 406 { 407 "flag": "Underspecified target model", 408 "detail": "The target LLM is identified only by the tag 'gpt-oss:20b' (presumably the Ollama tag for OpenAI's open-weight gpt-oss-20b), with no snapshot or version date given. This makes it difficult to contextualize the ASR results or assess whether the model's own safety measures contributed to the 0% ASR." 409 }, 410 { 411 "flag": "Undisclosed potential conflict of interest", 412 "detail": "Author Dhruv Kumar is affiliated with 'Trustwise,' which appears to be a commercial entity in AI safety/trust. No competing interests statement is provided, and no funding disclosure explains the relationship." 413 }, 414 { 415 "flag": "Hyperparameters and configuration details missing", 416 "detail": "Critical parameters — SVM regularization (C), TF-IDF vocabulary settings, VectorDB similarity threshold τ, number of neighbors k, and all LLM temperature/sampling settings — are never reported, undermining reproducibility." 417 } 418 ], 419 "cited_papers": [ 420 { 421 "title": "Prompt Injection Attacks and Defenses in Vision-Language Models", 422 "authors": ["B. Greshake", "T. Lode", "R. Greshake"], 423 "year": 2024, 424 "relevance": "Foundational work on prompt injection as an attack vector against LLM-based systems." 425 }, 426 { 427 "title": "Jailbreaking ChatGPT via Prompt Engineering: An Empirical Study", 428 "authors": ["Y. Liu", "S. Li", "T. Wang", "P. Zhang", "S. Yan"], 429 "year": 2024, 430 "arxiv_id": "2305.13860", 431 "relevance": "Empirical categorization of jailbreak strategies including role-playing and privilege escalation against ChatGPT." 
432 }, 433 { 434 "title": "Greedy Coordinate Gradient-Based Search for Universal Adversarial Attacks", 435 "authors": ["A. Zou", "Z. Li", "D. Zhou", "Y. Wu", "J. Gu", "Y. Wang"], 436 "year": 2023, 437 "arxiv_id": "2307.15043", 438 "relevance": "Introduced universal adversarial suffix attacks (GCG) that bypass safety constraints across multiple LLMs." 439 }, 440 { 441 "title": "Iterative Self-Tuning LLMs for Enhanced Jailbreaking Capabilities", 442 "authors": ["K. Sun", "H. Yang", "Q. Liu"], 443 "year": 2024, 444 "arxiv_id": "2410.18469", 445 "relevance": "Demonstrates automated iterative refinement of adversarial tokens for near-perfect jailbreak success rates against closed-source models." 446 }, 447 { 448 "title": "Tree of Attacks: Jailbreaking Black-Box LLMs Automatically", 449 "authors": ["S. Mehrotra", "S. Dathathri", "A. Garg"], 450 "year": 2023, 451 "arxiv_id": "2312.02119", 452 "relevance": "Automated black-box jailbreak method using Tree-of-Thought-style exploration, expanding the attack space without model internals." 453 }, 454 { 455 "title": "Emoji Attack: A Method for Misleading Judge LLMs in Safety Risk Detection", 456 "authors": ["Y. Wei", "J. Chen", "Z. Li"], 457 "year": 2024, 458 "arxiv_id": "2411.01077", 459 "relevance": "Demonstrates token-manipulation attacks using emoji insertion to mislead safety classifiers." 460 }, 461 { 462 "title": "Multi-Turn Jailbreaking Large Language Models via Attention Shifting", 463 "authors": ["X. Du", "F. Mo", "M. Wen", "T. Gu", "H. Zheng", "H. Jin", "J. Shi"], 464 "year": 2025, 465 "relevance": "Shows multi-turn attacks exploiting dialogue history and attention dynamics significantly outperform single-shot jailbreaks." 466 }, 467 { 468 "title": "Intention Analysis Makes LLMs a Good Jailbreak Defender", 469 "authors": ["Y. Zhang", "X. Wang", "M. 
Liu"], 470 "year": 2024, 471 "arxiv_id": "2401.06561", 472 "relevance": "Pre-generation intent analysis defense that models user objectives against safety policies to reduce ASR." 473 }, 474 { 475 "title": "Attention Tracker: Detecting Prompt Injection Attacks in LLMs", 476 "authors": ["S. Hung", "H. Chien", "Y. Lee"], 477 "year": 2025, 478 "arxiv_id": "2411.00348", 479 "relevance": "Model-internal defense using anomalous attention distributions to detect prompt injection without external classifiers." 480 }, 481 { 482 "title": "From LLMs to MLLMs to Agents: A Survey of Emerging Security Challenges", 483 "authors": ["Y. Mao", "T. Huang", "Z. Liu"], 484 "year": 2025, 485 "arxiv_id": "2506.15170", 486 "relevance": "Survey of security challenges in agentic LLM deployments spanning the full LLM-to-agent spectrum." 487 }, 488 { 489 "title": "LLM Security: Vulnerabilities, Attacks, Defenses, and Countermeasures", 490 "authors": ["C. Aguilera-Martínez", "F. Berzal"], 491 "year": 2025, 492 "arxiv_id": "2505.01177", 493 "relevance": "Comprehensive survey of LLM security covering vulnerabilities, attack types, and defense mechanisms." 494 }, 495 { 496 "title": "Embedding-based classifiers can detect prompt injection attacks", 497 "authors": ["M. A. Ayub", "S. Majumdar"], 498 "year": 2024, 499 "arxiv_id": "2410.22284", 500 "relevance": "Demonstrates embedding-based classification as a defense against prompt injection, closely related to the SVM approach in this paper." 501 } 502 ], 503 "engagement_factors": { 504 "practical_relevance": { 505 "score": 2, 506 "justification": "Defense framework with released code that could be integrated into LLM application pipelines, though requires adaptation for production use." 507 }, 508 "surprise_contrarian": { 509 "score": 1, 510 "justification": "Shows a classical ML approach (TF-IDF + SVM) can outperform a neural LLM-based moderator, mildly surprising but not unprecedented." 
511 }, 512 "fear_safety": { 513 "score": 2, 514 "justification": "Directly addresses LLM jailbreak and prompt injection security concerns relevant to deployed systems." 515 }, 516 "drama_conflict": { 517 "score": 0, 518 "justification": "No controversy, no challenge to specific companies or claims beyond standard benchmarking." 519 }, 520 "demo_ability": { 521 "score": 2, 522 "justification": "Code released on GitHub with dataset; could be tried by practitioners, though no live demo or pip package." 523 }, 524 "brand_recognition": { 525 "score": 0, 526 "justification": "Authors from BITS Pilani and Trustwise — not a well-known AI lab or widely recognized brand." 527 } 528 } 529 }