scan.json (31724B)
1 { 2 "paper": { 3 "title": "PromptScreen: Efficient Jailbreak Mitigation Using Semantic Linear Classification in a Multi-Staged Pipeline", 4 "authors": [ 5 "Akshaj Prashanth Rao", 6 "Advait Singh", 7 "Saumya Kumaar Saksena", 8 "Dhruv Kumar" 9 ], 10 "year": 2025, 11 "venue": "arXiv preprint", 12 "arxiv_id": "2512.19011", 13 "doi": "10.48550/arXiv.2512.19011" 14 }, 15 "scan_version": 3, 16 "active_modules": ["experimental_rigor", "data_leakage"], 17 "methodology_tags": ["benchmark-eval"], 18 "key_findings": "PromptScreen proposes a multi-stage defense pipeline against LLM jailbreak and prompt injection attacks, centered on a lightweight Linear SVM classifier operating on TF-IDF features. The SVM module achieves 93.4% accuracy and 96.5% specificity on a held-out test set of 2,000+ prompts, substantially outperforming ShieldGemma (35.1% accuracy) while being 10× faster. The full pipeline combining SVM, VectorDB, and classifier cluster achieves 0% Attack Success Rate on 1,456 adversarial prompts. An ablation study shows character n-gram features (94.09%) outperform word-level features (90.27%) on an augmented adversarial dataset.", 19 "checklist": { 20 "artifacts": { 21 "code_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "GitHub repository URL provided in Section 7: https://github.com/dronefreak/PromptScreen. The paper states 'the source code is available at' this URL." 25 }, 26 "data_released": { 27 "applies": true, 28 "answer": true, 29 "justification": "Section 3.2 states: 'Both the corpus and associated preprocessing scripts are included in the open-source release of this work to facilitate continued benchmarking and community evaluation.' The dataset of 30,937 labeled prompts is described as released." 30 }, 31 "environment_specified": { 32 "applies": true, 33 "answer": false, 34 "justification": "No requirements.txt, Dockerfile, conda environment file, or detailed environment specification is mentioned in the paper. 
Library versions are not listed." 35 }, 36 "reproduction_instructions": { 37 "applies": true, 38 "answer": false, 39 "justification": "No step-by-step reproduction instructions, README with commands, or 'Reproducing Results' section is included in the paper. The algorithms are described in pseudocode but no executable reproduction guide is provided." 40 } 41 }, 42 "statistical_methodology": { 43 "confidence_intervals_or_error_bars": { 44 "applies": true, 45 "answer": false, 46 "justification": "All results in Tables 2, 3, and 4 are reported as point estimates only (e.g., '93.40% accuracy'). No confidence intervals, error bars, or ± notation appears anywhere." 47 }, 48 "significance_tests": { 49 "applies": true, 50 "answer": false, 51 "justification": "The paper claims the SVM 'outperforms' ShieldGemma and other baselines based solely on comparing raw metric values. No statistical significance tests (p-values, t-tests, bootstrap tests, etc.) are used." 52 }, 53 "effect_sizes_reported": { 54 "applies": true, 55 "answer": true, 56 "justification": "The paper reports improvements with baseline context: 'improves overall accuracy from 35.1% to 93.4%' and 'reducing average time-to-completion from ≈450s to 47s, yielding over 10× lower latency.' Tables 2 and 3 provide sufficient context to assess magnitude." 57 }, 58 "sample_size_justified": { 59 "applies": true, 60 "answer": false, 61 "justification": "No justification given for the corpus size of 30,937 prompts or the test set of 2,000+ prompts. No power analysis or discussion of whether the sample size is adequate for the claims being made." 62 }, 63 "variance_reported": { 64 "applies": true, 65 "answer": false, 66 "justification": "All results appear to be from single runs. No standard deviations, variance across seeds, or multi-run spread measures are reported in any table." 
67 } 68 }, 69 "evaluation_design": { 70 "baselines_included": { 71 "applies": true, 72 "answer": true, 73 "justification": "Table 2 compares three defense approaches: ShieldGemma, the classifier cluster, and the LSVM. Table 3 compares three pipeline configurations with different defense subsets." 74 }, 75 "baselines_contemporary": { 76 "applies": true, 77 "answer": true, 78 "justification": "ShieldGemma (Google, 2024) is a recent production-grade defense. The classifier cluster uses contemporary HuggingFace models (textdetox/xlmr-large-toxicity-classifier-v2, jackhhao/jailbreak-classifier)." 79 }, 80 "ablation_study": { 81 "applies": true, 82 "answer": true, 83 "justification": "Section 6 presents a systematic SVM ablation study across 8 feature configurations (word n-grams, char n-grams, hybrid). Table 3 also ablates pipeline configurations by including/excluding defense stages." 84 }, 85 "multiple_metrics": { 86 "applies": true, 87 "answer": true, 88 "justification": "Tables 2 and 4 report precision, sensitivity, specificity, negative predictive value, and accuracy. Table 3 adds block rate, ASR, and average TTC. Well over two metrics are used." 89 }, 90 "human_evaluation": { 91 "applies": true, 92 "answer": false, 93 "justification": "All evaluation is automated. Classification performance uses standard metrics; ASR is judged by an LLM-as-a-judge and verified by Gemini. No human evaluation of the system's outputs or classifications is performed." 94 }, 95 "held_out_test_set": { 96 "applies": true, 97 "answer": true, 98 "justification": "Section 3.2 states: 'A held-out test set of more than 2,000 prompts is reserved exclusively for evaluation of defense configurations and Attack Success Rate. No prompt appears in more than one split.'" 99 }, 100 "per_category_breakdown": { 101 "applies": true, 102 "answer": true, 103 "justification": "Table 2 breaks down performance per individual defense module. Table 3 shows results per pipeline configuration. 
Table 4 shows per-SVM-configuration results. Table 1 shows dataset composition by class (jailbreak, benign, prompt-injection)." 104 }, 105 "failure_cases_discussed": { 106 "applies": true, 107 "answer": true, 108 "justification": "Appendix A.1 explicitly discusses 'defenses that were implemented but did not meet our robustness or usability requirements,' including the heuristic vector analyzer (false positive issues) and PPA (insufficient protection). Section 8 discusses failure scenarios." 109 }, 110 "negative_results_reported": { 111 "applies": true, 112 "answer": true, 113 "justification": "Appendix A.1 reports two failed defense strategies: the heuristic vector analyzer ('prone to false positives on benign prompts') and Polymorphic Prompt Assembly ('protection was not sufficient against the strongest attacks')." 114 } 115 }, 116 "claims_and_evidence": { 117 "abstract_claims_supported": { 118 "applies": true, 119 "answer": true, 120 "justification": "Abstract claims of 93.4% accuracy, 96.5% specificity, and 10× lower latency than ShieldGemma are directly supported by Tables 2 and 3. The '35.1% to 93.4%' improvement and '≈450s to 47s' TTC claims match the tabulated results." 121 }, 122 "causal_claims_justified": { 123 "applies": true, 124 "answer": true, 125 "justification": "The paper makes causal claims ('LSVM improves accuracy,' 'this module achieves...'). The ablation study (Section 6) and pipeline configuration comparisons (Table 3) provide controlled manipulation of components, supporting these claims through single-variable ablation design." 126 }, 127 "generalization_bounded": { 128 "applies": true, 129 "answer": false, 130 "justification": "The abstract claims 'scalable, resource-efficient protection against prompt-based adversarial behavior' broadly, but the system is tested only on English prompts with specific attack types from known datasets. 
While Section 8 acknowledges multilingual limitations, the title and conclusion make sweeping claims about 'modern LLM-driven applications' that exceed the tested scope." 131 }, 132 "alternative_explanations_discussed": { 133 "applies": true, 134 "answer": false, 135 "justification": "The paper does not discuss alternative explanations for its results. For instance: could ShieldGemma's poor 35.1% accuracy be due to misconfiguration? Could the dataset composition favor keyword-based detection? Could the 0% ASR reflect the attack corpus rather than the defense quality? None of these alternatives are considered." 136 }, 137 "proxy_outcome_distinction": { 138 "applies": true, 139 "answer": true, 140 "justification": "The paper directly measures classification accuracy, ASR, and TTC — the same quantities it claims to evaluate. There is no proxy gap: the paper claims to measure defense accuracy and latency, and that is what it measures." 141 } 142 }, 143 "setup_transparency": { 144 "model_versions_specified": { 145 "applies": true, 146 "answer": true, 147 "justification": "Specific model identifiers are provided: ShieldGemma-2B, gpt-oss:20b (target LLM), textdetox/xlmr-large-toxicity-classifier-v2, jackhhao/jailbreak-classifier. These are sufficiently specific model identifiers to reproduce the setup." 148 }, 149 "prompts_provided": { 150 "applies": true, 151 "answer": false, 152 "justification": "ShieldGemma is described as consuming 'the prompt alongside structured safety guidelines' but these guidelines are not shown. The LLM-as-a-judge Attack Evaluator uses 'predefined criteria' that are not provided. No actual prompt text is given for any LLM-based component." 153 }, 154 "hyperparameters_reported": { 155 "applies": true, 156 "answer": false, 157 "justification": "No hyperparameters are reported for the Linear SVM (C value, regularization), TF-IDF vectorizer (max_features, min/max df), VectorDB similarity threshold τ, or any model component. 
The paper describes the architecture but not its parameterization." 158 }, 159 "scaffolding_described": { 160 "applies": false, 161 "answer": false, 162 "justification": "No agentic scaffolding is used. The system is a sequential classification pipeline, not an agent with tool use, memory, or iterative reasoning." 163 }, 164 "data_preprocessing_documented": { 165 "applies": true, 166 "answer": true, 167 "justification": "Section 3.3.1 documents the text preprocessing pipeline in detail: lowercasing, emoji-to-text conversion, punctuation removal, tokenization, alphabetic filtering, stopword removal, and POS-aware WordNet lemmatization. Dataset construction is documented in Section 3.2." 168 } 169 }, 170 "limitations_and_scope": { 171 "limitations_section_present": { 172 "applies": true, 173 "answer": true, 174 "justification": "Section 8 ('Limitations') is a dedicated limitations section with three specific subsections: Multilingual Constraints, Contextual and Multi-turn Awareness, and Generalization to Zero-Day Attacks." 175 }, 176 "threats_to_validity_specific": { 177 "applies": true, 178 "answer": true, 179 "justification": "The limitations are specific to this study: (1) preprocessing pipeline is optimized for English only, (2) evaluates single-turn prompts and may miss multi-turn distributed attacks, (3) SVM performance bounded by training corpus diversity for zero-day attacks. These are concrete, study-specific threats." 180 }, 181 "scope_boundaries_stated": { 182 "applies": true, 183 "answer": true, 184 "justification": "Section 8 explicitly states the system 'does not address or evaluate protection against multilingual jailbreak attempts,' is limited to 'isolated, single-turn inputs,' and performance is 'bounded by the diversity of its training corpus' for novel attack strategies." 
185 } 186 }, 187 "data_integrity": { 188 "raw_data_available": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section 3.2 states: 'Both the corpus and associated preprocessing scripts are included in the open-source release.' All dataset instances are 'serialized in structured JSON with explicit source metadata and taxonomy labels.'" 192 }, 193 "data_collection_described": { 194 "applies": true, 195 "answer": true, 196 "justification": "Section 3.2 describes data sources in detail: manually curated jailbreaks from prior literature, automated adversarial prompts from ADV-LLM, prompt-injection examples from gentelbench-v1, and benign prompts from public instruction-following datasets. Labeling taxonomy with three classes is defined." 197 }, 198 "recruitment_methods_described": { 199 "applies": true, 200 "answer": true, 201 "justification": "Data sources are described: adversarial prompts from prior academic literature (Liu et al., Sun et al., Pathade), ADV-LLM automated outputs, gentelbench-v1 dataset, and public instruction-following datasets for benign prompts. Each source's purpose and origin is stated in Section 3.2." 202 }, 203 "data_pipeline_documented": { 204 "applies": true, 205 "answer": false, 206 "justification": "While data sources and final composition (Table 1: 30,937 total) are described, the pipeline from source collection to final dataset lacks detail. How many examples came from each source is not stated. The filtering process ('manually reviewed to remove ambiguous cases') gives no counts of how many were removed or what criteria defined 'ambiguous.'" 207 } 208 }, 209 "conflicts_of_interest": { 210 "funding_disclosed": { 211 "applies": true, 212 "answer": false, 213 "justification": "No funding sources are disclosed. The Acknowledgement section (Section 9) only mentions the use of ChatGPT for writing assistance, with no mention of grants, sponsors, or funding agencies." 
214 }, 215 "affiliations_disclosed": { 216 "applies": true, 217 "answer": true, 218 "justification": "Author affiliations are clearly stated: BITS Pilani and Trustwise. Dhruv Kumar has a dual affiliation with both institutions." 219 }, 220 "funder_independent_of_outcome": { 221 "applies": true, 222 "answer": false, 223 "justification": "Cannot assess funder independence since funding is not disclosed. One author (Dhruv Kumar) is affiliated with Trustwise, a company in the AI trust/safety space that could benefit commercially from demonstrating lightweight defense effectiveness." 224 }, 225 "financial_interests_declared": { 226 "applies": true, 227 "answer": false, 228 "justification": "No competing interests or financial interests statement appears in the paper. The Trustwise affiliation raises potential commercial interest in the defense pipeline's success, but this is not discussed." 229 } 230 }, 231 "contamination": { 232 "training_cutoff_stated": { 233 "applies": false, 234 "answer": false, 235 "justification": "The paper tests defense mechanisms (SVM, classifier cluster, etc.) against adversarial prompts, not a pre-trained model's capability on a benchmark. The target LLM (gpt-oss:20b) is used only to measure ASR (whether attacks pass through defenses), not to evaluate model knowledge." 236 }, 237 "train_test_overlap_discussed": { 238 "applies": false, 239 "answer": false, 240 "justification": "The paper evaluates defense tools rather than a pre-trained model's capability on a benchmark. Contamination in the traditional sense (model has seen test data during pre-training) does not apply." 241 }, 242 "benchmark_contamination_addressed": { 243 "applies": false, 244 "answer": false, 245 "justification": "The paper tests defense mechanisms, not model capability on benchmarks. The SVM is trained by the authors on their own dataset, not a pre-trained model being evaluated on a potentially contaminated benchmark." 
246 } 247 }, 248 "human_studies": { 249 "pre_registered": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study. All evaluation is automated on a curated prompt dataset." 253 }, 254 "irb_or_ethics_approval": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants. The study evaluates defense mechanisms on text prompts without involving human subjects." 258 }, 259 "demographics_reported": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in the study." 263 }, 264 "inclusion_exclusion_criteria": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in the study." 268 }, 269 "randomization_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in the study." 273 }, 274 "blinding_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in the study." 278 }, 279 "attrition_reported": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants in the study." 283 } 284 }, 285 "cost_and_practicality": { 286 "inference_cost_reported": { 287 "applies": true, 288 "answer": true, 289 "justification": "Time-to-completion (TTC) is reported in Table 3: {VectorDB, Classifier} at 30.13s, {YARA, Cluster, VectorDB, ShieldGemma} at 450.35s, and {SVM, VectorDB, Classifier} at 47.24s. Section 5.3 discusses computational efficiency as a primary evaluation dimension." 290 }, 291 "compute_budget_stated": { 292 "applies": true, 293 "answer": false, 294 "justification": "No total computational budget is stated. Hardware specifications, GPU/CPU used for experiments, total training time for the SVM, or total evaluation time across the test set are not reported."
295 } 296 }, 297 "experimental_rigor": { 298 "seed_sensitivity_reported": { 299 "applies": true, 300 "answer": false, 301 "justification": "No mention of random seeds or seed sensitivity analysis. The SVM training and all evaluations appear to be single-run results with no assessment of seed-dependent variance." 302 }, 303 "number_of_runs_stated": { 304 "applies": true, 305 "answer": false, 306 "justification": "The number of experimental runs is never stated. Results appear to come from single runs but this is not explicitly confirmed." 307 }, 308 "hyperparameter_search_budget": { 309 "applies": true, 310 "answer": false, 311 "justification": "No hyperparameter search budget is reported. The SVM ablation (Section 6) tests different feature configurations but the number of configurations tried, search method, or computational cost of the search is not disclosed." 312 }, 313 "best_config_selection_justified": { 314 "applies": true, 315 "answer": false, 316 "justification": "The main results (Table 2) use the LSVM module but it is unclear which specific TF-IDF configuration was used — the ablation (Table 4) shows char n-gram (2,4) as best at 94.09%, but the main results report 93.4%. The selection of the final configuration is not justified." 317 }, 318 "multiple_comparison_correction": { 319 "applies": true, 320 "answer": false, 321 "justification": "Multiple comparisons are made across defense configurations (Table 2, 3) and SVM variants (Table 4) without any correction for multiple comparisons. No statistical tests are performed at all, let alone corrected ones." 322 }, 323 "self_comparison_bias_addressed": { 324 "applies": true, 325 "answer": false, 326 "justification": "The authors train, configure, and evaluate their own SVM system against baselines without acknowledging author-evaluation bias. No independent evaluation is performed, and no discussion of the bias inherent in self-evaluation appears." 
327 }, 328 "compute_budget_vs_performance": { 329 "applies": true, 330 "answer": true, 331 "justification": "Table 3 reports both defense effectiveness (block rate, ASR) and TTC per configuration, enabling performance-vs-compute comparison. Section 5.3 explicitly discusses the latency trade-off between lightweight (SVM, 47s) and heavyweight (ShieldGemma, 450s) defenses." 332 }, 333 "benchmark_construct_validity": { 334 "applies": true, 335 "answer": false, 336 "justification": "The paper does not discuss whether its curated corpus of 30,937 prompts is representative of real-world attack distributions, whether the attack difficulty matches production threats, or whether the benchmark actually measures what is claimed about 'robust defense.'" 337 }, 338 "scaffold_confound_addressed": { 339 "applies": false, 340 "answer": false, 341 "justification": "No agentic scaffolding is involved. The defense pipeline is a sequential classifier, and comparisons are between defense modules, not between models in different scaffolds." 342 } 343 }, 344 "data_leakage": { 345 "temporal_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether adversarial prompts in the training set were generated or collected before or after the test prompts, or whether temporal ordering could introduce leakage in the SVM's training." 349 }, 350 "feature_leakage_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of whether the evaluation setup provides information not available in real deployment. The VectorDB stores known attack signatures that could overlap structurally with test attacks from the same source campaigns." 
354 }, 355 "non_independence_addressed": { 356 "applies": true, 357 "answer": false, 358 "justification": "The paper states 'No prompt appears in more than one split' but does not address whether train and test prompts from the same source (e.g., ADV-LLM) share structural patterns or were generated by the same attack algorithms, which would compromise independence." 359 }, 360 "leakage_detection_method": { 361 "applies": true, 362 "answer": false, 363 "justification": "No concrete leakage detection or prevention method is described. Only basic non-overlap between splits is ensured, without n-gram analysis, membership inference, or decontamination pipelines." 364 } 365 } 366 }, 367 "claims": [ 368 { 369 "claim": "The SVM-based semantic filter achieves 93.4% accuracy and 96.5% specificity on held-out data.", 370 "evidence": "Table 2 reports accuracy of 0.9340 and specificity of 0.9650 for the 'Text processing + Semantic LSVM' configuration evaluated on the held-out test set.", 371 "supported": "strong" 372 }, 373 { 374 "claim": "The SVM-based configuration improves overall accuracy from 35.1% to 93.4% while reducing average TTC from ~450s to 47s, yielding over 10× lower latency than ShieldGemma.", 375 "evidence": "Tables 2 and 3 show ShieldGemma at 35.13% accuracy / 450.35s TTC versus the SVM configuration at 93.4% accuracy / 47.24s TTC. The comparison is between different system types (LLM-based moderator vs. classical ML classifier).", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "The full pipeline {SVM, VectorDB, Classifier} achieves 0% Attack Success Rate across all evaluated adversarial attempts.", 380 "evidence": "Table 3 shows 0 successful attacks out of 51 attempted (1,405 of 1,456 blocked), yielding 0.00% ASR. 
Results independently verified using Gemini.", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "Character n-gram features outperform word-level features for adversarial prompt detection under perturbation.", 385 "evidence": "Table 4 shows char n-gram (2,4) achieving 94.09% accuracy vs. baseline word unigram at 90.27% on the augmented (perturbed) dataset. The pattern is consistent across char n-gram variants.", 386 "supported": "strong" 387 }, 388 { 389 "claim": "Lightweight semantic classifiers can outperform heavier LLM-based moderators in accuracy-focused evaluation settings.", 390 "evidence": "Table 2 shows LSVM at 93.4% accuracy vs ShieldGemma at 35.1%. However, ShieldGemma's anomalously low accuracy may indicate suboptimal configuration rather than an inherent limitation of LLM-based approaches.", 391 "supported": "weak" 392 } 393 ], 394 "red_flags": [ 395 { 396 "flag": "Anomalously poor baseline performance", 397 "detail": "ShieldGemma achieves only 35.1% accuracy, well below random chance for binary classification. This suggests possible misconfiguration, suboptimal prompt design, or a mismatch between the evaluation setup and ShieldGemma's intended use case. The paper does not investigate or explain this anomalously poor performance." 398 }, 399 { 400 "flag": "No error bars or repeated runs", 401 "detail": "All results in Tables 2, 3, and 4 are single-run point estimates with no uncertainty quantification. For an SVM that depends on train/test split composition, this is a significant omission." 402 }, 403 { 404 "flag": "0% ASR may be artificially clean", 405 "detail": "Achieving exactly 0% ASR on 1,456 adversarial prompts, with only 51 prompts reaching the target LLM, is claimed without uncertainty bounds. The low number of prompts actually reaching the LLM (51) means the true ASR may fluctuate significantly with different test samples." 
406 }, 407 { 408 "flag": "Undisclosed conflict of interest", 409 "detail": "Co-author Dhruv Kumar is affiliated with Trustwise, a company in the AI trust/safety space. A commercially viable lightweight defense pipeline directly serves Trustwise's business interests, yet no competing interests statement is provided." 410 }, 411 { 412 "flag": "Train-test independence not verified", 413 "detail": "Adversarial prompts from the same source (e.g., ADV-LLM) may share structural patterns between train and test splits. The paper ensures no exact duplicates but does not verify independence of attack patterns across splits, risking inflated accuracy estimates." 414 } 415 ], 416 "cited_papers": [ 417 { 418 "title": "Prompt Injection Attacks and Defenses in Vision-Language Models", 419 "authors": ["B. Greshake", "T. Lode", "R. Greshake"], 420 "year": 2024, 421 "relevance": "Foundational work on prompt injection attacks and defenses extending to vision-language models, directly relevant to LLM security research." 422 }, 423 { 424 "title": "Jailbreaking ChatGPT via Prompt Engineering: An Empirical Study", 425 "authors": ["Y. Liu", "S. Li", "T. Wang", "P. Zhang", "S. Yan"], 426 "year": 2024, 427 "arxiv_id": "2305.13860", 428 "relevance": "Empirical study categorizing jailbreak strategies including role-playing and privilege escalation, directly relevant to LLM safety evaluation." 429 }, 430 { 431 "title": "Greedy Coordinate Gradient-Based Search for Universal Adversarial Attacks", 432 "authors": ["A. Zou", "Z. Li", "D. Zhou", "Y. Wu", "J. Gu", "Y. Wang"], 433 "year": 2023, 434 "arxiv_id": "2307.15043", 435 "relevance": "Introduces GCG universal adversarial suffix attack, a foundational attack method against LLM safety alignment." 436 }, 437 { 438 "title": "Iterative Self-Tuning LLMs for Enhanced Jailbreaking Capabilities", 439 "authors": ["K. Sun", "H. Yang", "Q. 
Liu"], 440 "year": 2024, 441 "arxiv_id": "2410.18469", 442 "relevance": "Demonstrates automated iterative jailbreak refinement achieving near-perfect success rates, relevant to adversarial attack research on LLMs." 443 }, 444 { 445 "title": "From LLMs to MLLMs to Agents: A Survey of Emerging Security Challenges", 446 "authors": ["Y. Mao", "T. Huang", "Z. Liu"], 447 "year": 2025, 448 "arxiv_id": "2506.15170", 449 "relevance": "Survey of security challenges in the progression from LLMs to multimodal models to agentic systems." 450 }, 451 { 452 "title": "Intention Analysis Makes LLMs a Good Jailbreak Defender", 453 "authors": ["Y. Zhang", "X. Wang", "M. Liu"], 454 "year": 2024, 455 "arxiv_id": "2401.06561", 456 "relevance": "Pre-generation intent analysis for jailbreak defense, directly relevant to LLM safety and defense methodology." 457 }, 458 { 459 "title": "Attention Tracker: Detecting Prompt Injection Attacks in LLMs", 460 "authors": ["S. Hung", "H. Chien", "Y. Lee"], 461 "year": 2025, 462 "arxiv_id": "2411.00348", 463 "relevance": "Uses model-internal attention signals to detect prompt injection, representing an alternative detection approach to external classifiers." 464 }, 465 { 466 "title": "Tree of Attacks: Jailbreaking Black-Box LLMs Automatically", 467 "authors": ["S. Mehrotra", "S. Dathathri", "A. Garg"], 468 "year": 2023, 469 "arxiv_id": "2312.02119", 470 "relevance": "Automated black-box jailbreak method using Tree-of-Thought exploration, relevant to LLM safety and adversarial attack research." 471 }, 472 { 473 "title": "Multi-Turn Jailbreaking Large Language Models via Attention Shifting", 474 "authors": ["X. Du", "F. Mo", "M. Wen", "T. Gu", "H. Zheng", "H. Jin", "J. Shi"], 475 "year": 2025, 476 "relevance": "Demonstrates multi-turn jailbreak attacks exploiting dialogue history, relevant to agentic LLM security challenges." 477 }, 478 { 479 "title": "Emoji Attack: A Method for Misleading Judge LLMs in Safety Risk Detection", 480 "authors": ["Y. 
Wei", "J. Chen", "Z. Li"], 481 "year": 2024, 482 "arxiv_id": "2411.01077", 483 "relevance": "Demonstrates token-manipulation attacks (emoji insertion) that mislead safety classifiers, relevant to LLM safety evaluation." 484 }, 485 { 486 "title": "Embedding-based classifiers can detect prompt injection attacks", 487 "authors": ["M. A. Ayub", "S. Majumdar"], 488 "year": 2024, 489 "arxiv_id": "2410.22284", 490 "relevance": "Prior work on using embedding-based classification for prompt injection detection, a directly comparable approach to PromptScreen." 491 }, 492 { 493 "title": "ShieldGemma: A Generative AI Safety Classifier", 494 "authors": ["Google"], 495 "year": 2024, 496 "relevance": "Production-grade LLM safety classifier used as a primary baseline in this paper's evaluation." 497 } 498 ], 499 "engagement_factors": { 500 "practical_relevance": { 501 "score": 2, 502 "justification": "Proposes a deployable defense pipeline with released code, useful for practitioners securing LLM applications, though not a plug-and-play solution." 503 }, 504 "surprise_contrarian": { 505 "score": 1, 506 "justification": "The finding that a simple SVM outperforms an LLM-based moderator is somewhat surprising but aligns with known advantages of classical ML for specific classification tasks." 507 }, 508 "fear_safety": { 509 "score": 2, 510 "justification": "Directly addresses LLM jailbreak and prompt injection attacks, a growing security concern for AI-powered applications." 511 }, 512 "drama_conflict": { 513 "score": 1, 514 "justification": "Implicit criticism of ShieldGemma's performance (35.1% accuracy) could generate mild discussion, but no dramatic controversy." 515 }, 516 "demo_ability": { 517 "score": 2, 518 "justification": "Code and dataset released on GitHub, allowing practitioners to test and replicate the defense pipeline." 
519 }, 520 "brand_recognition": { 521 "score": 0, 522 "justification": "Authors are from BITS Pilani and Trustwise, neither of which is widely recognized in the AI safety community." 523 } 524 } 525 }