ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v4.json (34161B)


      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "PromptScreen: Efficient Jailbreak Mitigation Using Semantic Linear Classification in a Multi-Staged Pipeline",
      6     "authors": [
      7       "Akshaj Prashanth Rao",
      8       "Advait Singh",
      9       "Saumya Kumaar Saksena",
     10       "Dhruv Kumar"
     11     ],
     12     "year": 2025,
     13     "venue": "Unknown",
     14     "arxiv_id": "2512.19011",
     15     "doi": "10.48550/arXiv.2512.19011"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "Abstract claims of 93.4% accuracy, 96.5% specificity, and 10× lower latency than ShieldGemma are directly supported by Tables 2 and 3. The '35.1% to 93.4%' improvement and '≈450s to 47s' TTC claims match the tabulated results.",
     23         "source": "opus"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The paper makes causal claims ('LSVM improves accuracy,' 'this module achieves...'). The ablation study (Section 6) and pipeline configuration comparisons (Table 3) provide controlled manipulation of components, supporting these claims through single-variable ablation design.",
     29         "source": "opus"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The abstract claims 'scalable, resource-efficient protection against prompt-based adversarial behavior' broadly, but the system is tested only on English prompts with specific attack types from known datasets. While Section 8 acknowledges multilingual limitations, the title and conclusion make sweeping claims about 'modern LLM-driven applications' that exceed the tested scope.",
     35         "source": "opus"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper does not discuss alternative explanations for its results. For instance: could ShieldGemma's poor 35.1% accuracy be due to misconfiguration? Could the dataset composition favor keyword-based detection? Could the 0% ASR reflect the attack corpus rather than the defense quality? None of these alternatives are considered.",
     41         "source": "opus"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper directly measures classification accuracy, ASR, and TTC — the same quantities it claims to evaluate. There is no proxy gap: the paper claims to measure defense accuracy and latency, and that is what it measures.",
     47         "source": "opus"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Section 8 ('Limitations') is a dedicated limitations section with three specific subsections: Multilingual Constraints, Contextual and Multi-turn Awareness, and Generalization to Zero-Day Attacks.",
     55         "source": "opus"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "The limitations are specific to this study: (1) preprocessing pipeline is optimized for English only, (2) evaluates single-turn prompts and may miss multi-turn distributed attacks, (3) SVM performance bounded by training corpus diversity for zero-day attacks. These are concrete, study-specific threats.",
     61         "source": "opus"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Section 8 explicitly states the system 'does not address or evaluate protection against multilingual jailbreak attempts,' is limited to 'isolated, single-turn inputs,' and performance is 'bounded by the diversity of its training corpus' for novel attack strategies.",
     67         "source": "opus"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding sources are disclosed. The Acknowledgement section (Section 9) only mentions the use of ChatGPT for writing assistance, with no mention of grants, sponsors, or funding agencies.",
     75         "source": "opus"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Author affiliations are clearly stated: BITS Pilani and Trustwise. Dhruv Kumar has a dual affiliation with both institutions.",
     81         "source": "opus"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "Cannot assess funder independence since funding is not disclosed. One author (Dhruv Kumar) is affiliated with Trustwise, a company in the AI trust/safety space that could benefit commercially from demonstrating lightweight defense effectiveness.",
     87         "source": "opus"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests or financial interests statement appears in the paper. The Trustwise affiliation raises potential commercial interest in the defense pipeline's success, but this is not discussed.",
     93         "source": "opus"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Key terms are defined: 'jailbreak' (inputs coercing model to violate safety), 'prompt-injection' (inputs targeting application logic), 'ASR' (attack success rate fraction). Taxonomy is explicit in dataset section.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Three contributions are explicitly stated: (1) efficient LSVM semantic filter, (2) multi-stage configurable pipeline, (3) 30,000+ labeled prompt dataset. Each is clearly framed as a contribution.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section 2 'Related Work' discusses four prior areas (detection/mitigation, token manipulation, prompt optimization, benchmarks) and explicitly positions this work: 'In contrast, our work treats defense as configurable, multi-stage pipeline.' Shows differentiation.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": true,
    123           "justification": "GitHub repository URL provided in Section 7: https://github.com/dronefreak/PromptScreen. The paper states 'the source code is available at' this URL.",
    124           "source": "opus"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "Section 3.2 states: 'Both the corpus and associated preprocessing scripts are included in the open-source release of this work to facilitate continued benchmarking and community evaluation.' The dataset of 30,937 labeled prompts is described as released.",
    130           "source": "opus"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "No requirements.txt, Dockerfile, conda environment file, or detailed environment specification is mentioned in the paper. Library versions are not listed.",
    136           "source": "opus"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "No step-by-step reproduction instructions, README with commands, or 'Reproducing Results' section is included in the paper. The algorithms are described in pseudocode but no executable reproduction guide is provided.",
    142           "source": "opus"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "All results in Tables 2, 3, and 4 are reported as point estimates only (e.g., '93.40% accuracy'). No confidence intervals, error bars, or ± notation appears anywhere.",
    150           "source": "opus"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "The paper claims the SVM 'outperforms' ShieldGemma and other baselines based solely on comparing raw metric values. No statistical significance tests (p-values, t-tests, bootstrap tests, etc.) are used.",
    156           "source": "opus"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "The paper reports improvements with baseline context: 'improves overall accuracy from 35.1% to 93.4%' and 'reducing average time-to-completion from ≈450s to 47s, yielding over 10× lower latency.' Tables 2 and 3 provide sufficient context to assess magnitude.",
    162           "source": "opus"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "No justification given for the corpus size of 30,937 prompts or the test set of 2,000+ prompts. No power analysis or discussion of whether the sample size is adequate for the claims being made.",
    168           "source": "opus"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "All results appear to be from single runs. No standard deviations, variance across seeds, or multi-run spread measures are reported in any table.",
    174           "source": "opus"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "Table 2 compares three defense approaches: ShieldGemma, the classifier cluster, and the LSVM. Table 3 compares three pipeline configurations with different defense subsets.",
    182           "source": "opus"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "ShieldGemma (Google, 2024) is a recent production-grade defense. The classifier cluster uses contemporary HuggingFace models (textdetox/xlmr-large-toxicity-classifier-v2, jackhhao/jailbreak-classifier).",
    188           "source": "opus"
    189         },
    190         "ablation_study": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "Section 6 presents a systematic SVM ablation study across 8 feature configurations (word n-grams, char n-grams, hybrid). Table 3 also ablates pipeline configurations by including/excluding defense stages.",
    194           "source": "opus"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "Tables 2 and 4 report precision, sensitivity, specificity, negative predictive value, and accuracy. Table 3 adds block rate, ASR, and average TTC. Well over two metrics are used.",
    200           "source": "opus"
    201         },
    202         "human_evaluation": {
    203           "applies": true,
    204           "answer": false,
    205           "justification": "All evaluation is automated. Classification performance uses standard metrics; ASR is judged by an LLM-as-a-judge and verified by Gemini. No human evaluation of the system's outputs or classifications is performed.",
    206           "source": "opus"
    207         },
    208         "held_out_test_set": {
    209           "applies": true,
    210           "answer": true,
    211           "justification": "Section 3.2 states: 'A held-out test set of more than 2,000 prompts is reserved exclusively for evaluation of defense configurations and Attack Success Rate. No prompt appears in more than one split.'",
    212           "source": "opus"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Table 2 breaks down performance per individual defense module. Table 3 shows results per pipeline configuration. Table 4 shows per-SVM-configuration results. Table 1 shows dataset composition by class (jailbreak, benign, prompt-injection).",
    218           "source": "opus"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "Appendix A.1 explicitly discusses 'defenses that were implemented but did not meet our robustness or usability requirements,' including the heuristic vector analyzer (false positive issues) and PPA (insufficient protection). Section 8 discusses failure scenarios.",
    224           "source": "opus"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "Appendix A.1 reports two failed defense strategies: the heuristic vector analyzer ('prone to false positives on benign prompts') and Polymorphic Prompt Assembly ('protection was not sufficient against the strongest attacks').",
    230           "source": "opus"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": true,
    237           "justification": "Specific model identifiers are provided: ShieldGemma-2B, gpt-oss:20b (target LLM), textdetox/xlmr-large-toxicity-classifier-v2, jackhhao/jailbreak-classifier. These are sufficiently specific model identifiers to reproduce the setup.",
    238           "source": "opus"
    239         },
    240         "prompts_provided": {
    241           "applies": true,
    242           "answer": false,
    243           "justification": "ShieldGemma is described as consuming 'the prompt alongside structured safety guidelines' but these guidelines are not shown. The LLM-as-a-judge Attack Evaluator uses 'predefined criteria' that are not provided. No actual prompt text is given for any LLM-based component.",
    244           "source": "opus"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": true,
    248           "answer": false,
    249           "justification": "No hyperparameters are reported for the Linear SVM (C value, regularization), TF-IDF vectorizer (max_features, min/max df), VectorDB similarity threshold τ, or any model component. The paper describes the architecture but not its parameterization.",
    250           "source": "opus"
    251         },
    252         "scaffolding_described": {
    253           "applies": false,
    254           "answer": false,
    255           "justification": "No agentic scaffolding is used. The system is a sequential classification pipeline, not an agent with tool use, memory, or iterative reasoning.",
    256           "source": "opus"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "Section 3.3.1 documents the text preprocessing pipeline in detail: lowercasing, emoji-to-text conversion, punctuation removal, tokenization, alphabetic filtering, stopword removal, and POS-aware WordNet lemmatization. Dataset construction is documented in Section 3.2.",
    262           "source": "opus"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": true,
    269           "justification": "Section 3.2 states: 'Both the corpus and associated preprocessing scripts are included in the open-source release.' All dataset instances are 'serialized in structured JSON with explicit source metadata and taxonomy labels.'",
    270           "source": "opus"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "Section 3.2 describes data sources in detail: manually curated jailbreaks from prior literature, automated adversarial prompts from ADV-LLM, prompt-injection examples from gentelbench-v1, and benign prompts from public instruction-following datasets. Labeling taxonomy with three classes is defined.",
    276           "source": "opus"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": true,
    280           "answer": true,
    281           "justification": "Data sources are described: adversarial prompts from prior academic literature (Liu et al., Sun et al., Pathade), ADV-LLM automated outputs, gentelbench-v1 dataset, and public instruction-following datasets for benign prompts. Each source's purpose and origin is stated in Section 3.2.",
    282           "source": "opus"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": false,
    287           "justification": "While data sources and final composition (Table 1: 30,937 total) are described, the pipeline from source collection to final dataset lacks detail. How many examples came from each source is not stated. The filtering process ('manually reviewed to remove ambiguous cases') gives no counts of how many were removed or what criteria defined 'ambiguous.'",
    288           "source": "opus"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": false,
    294           "answer": false,
    295           "justification": "The paper tests defense mechanisms (SVM, classifier cluster, etc.) against adversarial prompts, not a pre-trained model's capability on a benchmark. The target LLM (gpt-oss:20b) is used only to measure ASR (whether attacks pass through defenses), not to evaluate model knowledge.",
    296           "source": "opus"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": false,
    300           "answer": false,
    301           "justification": "The paper evaluates defense tools rather than a pre-trained model's capability on a benchmark. Contamination in the traditional sense (model has seen test data during pre-training) does not apply.",
    302           "source": "opus"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": false,
    306           "answer": false,
    307           "justification": "The paper tests defense mechanisms, not model capability on benchmarks. The SVM is trained by the authors on their own dataset, not a pre-trained model being evaluated on a potentially contaminated benchmark.",
    308           "source": "opus"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": false,
    314           "answer": false,
    315           "justification": "No human participants in this study. All evaluation is automated on a curated prompt dataset.",
    316           "source": "opus"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": false,
    320           "answer": false,
    321           "justification": "No human participants. The study evaluates defense mechanisms on text prompts without involving human subjects.",
    322           "source": "opus"
    323         },
    324         "demographics_reported": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human participants in the study.",
    328           "source": "opus"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human participants in the study.",
    334           "source": "opus"
    335         },
    336         "randomization_described": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human participants in the study.",
    340           "source": "opus"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human participants in the study.",
    346           "source": "opus"
    347         },
    348         "attrition_reported": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human participants in the study.",
    352           "source": "opus"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": true,
    358           "answer": true,
    359           "justification": "Time-to-completion (TTC) is reported in Table 3: {VectorDB, Classifier} at 30.13s, {YARA, Cluster, VectorDB, ShieldGemma} at 450.35s, and {SVM, VectorDB, Classifier} at 47.24s. Section 5.3 discusses computational efficiency as a primary evaluation dimension.",
    360           "source": "opus"
    361         },
    362         "compute_budget_stated": {
    363           "applies": true,
    364           "answer": false,
    365           "justification": "No total computational budget is stated. Hardware specifications, GPU/CPU used for experiments, total training time for the SVM, or total evaluation time across the test set are not reported.",
    366           "source": "opus"
    367         }
    368       },
    369       "experimental_rigor": {
    370         "seed_sensitivity_reported": {
    371           "applies": true,
    372           "answer": false,
    373           "justification": "No mention of random seeds or seed sensitivity analysis. The SVM training and all evaluations appear to be single-run results with no assessment of seed-dependent variance.",
    374           "source": "opus"
    375         },
    376         "number_of_runs_stated": {
    377           "applies": true,
    378           "answer": false,
    379           "justification": "The number of experimental runs is never stated. Results appear to come from single runs but this is not explicitly confirmed.",
    380           "source": "opus"
    381         },
    382         "hyperparameter_search_budget": {
    383           "applies": true,
    384           "answer": false,
    385           "justification": "No hyperparameter search budget is reported. The SVM ablation (Section 6) tests different feature configurations but the number of configurations tried, search method, or computational cost of the search is not disclosed.",
    386           "source": "opus"
    387         },
    388         "best_config_selection_justified": {
    389           "applies": true,
    390           "answer": false,
    391           "justification": "The main results (Table 2) use the LSVM module but it is unclear which specific TF-IDF configuration was used — the ablation (Table 4) shows char n-gram (2,4) as best at 94.09%, but the main results report 93.4%. The selection of the final configuration is not justified.",
    392           "source": "opus"
    393         },
    394         "multiple_comparison_correction": {
    395           "applies": true,
    396           "answer": false,
    397           "justification": "Multiple comparisons are made across defense configurations (Table 2, 3) and SVM variants (Table 4) without any correction for multiple comparisons. No statistical tests are performed at all, let alone corrected ones.",
    398           "source": "opus"
    399         },
    400         "self_comparison_bias_addressed": {
    401           "applies": true,
    402           "answer": false,
    403           "justification": "The authors train, configure, and evaluate their own SVM system against baselines without acknowledging author-evaluation bias. No independent evaluation is performed, and no discussion of the bias inherent in self-evaluation appears.",
    404           "source": "opus"
    405         },
    406         "compute_budget_vs_performance": {
    407           "applies": true,
    408           "answer": true,
    409           "justification": "Table 3 reports both defense effectiveness (block rate, ASR) and TTC per configuration, enabling performance-vs-compute comparison. Section 5.3 explicitly discusses the latency trade-off between lightweight (SVM, 47s) and heavyweight (ShieldGemma, 450s) defenses.",
    410           "source": "opus"
    411         },
    412         "benchmark_construct_validity": {
    413           "applies": true,
    414           "answer": false,
    415           "justification": "The paper does not discuss whether its curated corpus of 30,937 prompts is representative of real-world attack distributions, whether the attack difficulty matches production threats, or whether the benchmark actually measures what is claimed about 'robust defense.'",
    416           "source": "opus"
    417         },
    418         "scaffold_confound_addressed": {
    419           "applies": false,
    420           "answer": false,
    421           "justification": "No agentic scaffolding is involved. The defense pipeline is a sequential classifier, and comparisons are between defense modules, not between models in different scaffolds.",
    422           "source": "opus"
    423         }
    424       },
    425       "data_leakage": {
    426         "temporal_leakage_addressed": {
    427           "applies": true,
    428           "answer": false,
    429           "justification": "No discussion of whether adversarial prompts in the training set were generated or collected before or after the test prompts, or whether temporal ordering could introduce leakage in the SVM's training.",
    430           "source": "opus"
    431         },
    432         "feature_leakage_addressed": {
    433           "applies": true,
    434           "answer": false,
    435           "justification": "No discussion of whether the evaluation setup provides information not available in real deployment. The VectorDB stores known attack signatures that could overlap structurally with test attacks from the same source campaigns.",
    436           "source": "opus"
    437         },
    438         "non_independence_addressed": {
    439           "applies": true,
    440           "answer": false,
    441           "justification": "The paper states 'No prompt appears in more than one split' but does not address whether train and test prompts from the same source (e.g., ADV-LLM) share structural patterns or were generated by the same attack algorithms, which would compromise independence.",
    442           "source": "opus"
    443         },
    444         "leakage_detection_method": {
    445           "applies": true,
    446           "answer": false,
    447           "justification": "No concrete leakage detection or prevention method is described. Only basic non-overlap between splits is ensured, without n-gram analysis, membership inference, or decontamination pipelines.",
    448           "source": "opus"
    449         }
    450       }
    451     }
    452   },
    453   "claims": [
    454     {
    455       "claim": "Semantic LSVM classifier achieves 93.4% accuracy and 96.5% specificity on jailbreak/prompt-injection detection",
    456       "evidence": "Table 2 reports Text processing + Semantic LSVM with accuracy 0.9340, specificity 0.9650 on held-out test set",
    457       "supported": "strong"
    458     },
    459     {
    460       "claim": "LSVM-based configuration improves over ShieldGemma baseline by 58 percentage points in accuracy (35.1% → 93.4%)",
    461       "evidence": "Table 2 shows ShieldGemma accuracy 0.3513 vs LSVM 0.9340 on same test set",
    462       "supported": "strong"
    463     },
    464     {
    465       "claim": "Full multi-stage pipeline achieves 0% Attack Success Rate (blocks all 1,456 adversarial test prompts)",
    466       "evidence": "Table 3, {SVM, VectorDB, Classifier} configuration shows 0.00% ASR with 1405 blocked prompts, 51 attempted, 0 successful",
    467       "supported": "strong"
    468     },
    469     {
    470       "claim": "Pipeline reduces inference latency 10× compared to ShieldGemma (47s vs 450s)",
    471       "evidence": "Table 3 shows avg TTC of 47.24s for {SVM, VectorDB, Classifier} vs 450.3459s for {YARA, Cluster, VectorDB, ShieldGemma}",
    472       "supported": "strong"
    473     },
    474     {
    475       "claim": "Character n-gram features outperform word-level features in SVM ablation study",
    476       "evidence": "Table 4: char n-gram (2,4) achieves 94.09% accuracy vs baseline word unigram at 90.27%",
    477       "supported": "strong"
    478     },
    479     {
    480       "claim": "System effectively handles emoji, Unicode homoglyph, and whitespace obfuscation attacks",
    481       "evidence": "Ablation study (Section 6) augmented dataset with leetspeak, homoglyphs, whitespace perturbations and tested robustness; char n-grams designed to capture these",
    482       "supported": "moderate"
    483     },
    484     {
    485       "claim": "Staged architecture with early-stage semantic filter enables high accuracy with minimal latency overhead",
    486       "evidence": "SVM alone achieves 0.47s avg TTC (Table 3); full pipeline 47.24s. Design discussion (Section 3.1) explains early rejection reduces downstream processing",
    487       "supported": "moderate"
    488     },
    489     {
    490       "claim": "Simple Linear SVM outperforms complex model-based approaches for prompt-level security",
    491       "evidence": "LSVM (Table 2: 93.4% acc) outperforms ShieldGemma (35.1%) and Classifier cluster (65.0%); however, comparison not controlled for model capacity or training data differences",
    492       "supported": "moderate"
    493     }
    494   ],
    495   "methodology_tags": [
    496     "benchmark-eval",
    497     "systems-evaluation"
    498   ],
    499   "key_findings": "PromptScreen combines classical machine learning (Linear SVM with TF-IDF features) and modular defenses into a multi-stage pipeline for detecting prompt injection and jailbreak attacks. The semantic LSVM component achieves 93.4% accuracy and 96.5% specificity, substantially outperforming the model-based ShieldGemma baseline (35.1% accuracy) while reducing inference latency from ~450s to 47s. When combined with complementary defenses (vector similarity matching, classifier ensembles), the full system achieves 0% Attack Success Rate on the evaluation set of 30,000+ labeled prompts while maintaining negligible false-positive rates on benign inputs. The results suggest that efficient, interpretable defenses can be more effective than heavyweight LLM-based moderators for real-time security at scale.",
    500   "red_flags": [
    501     {
    502       "flag": "No confidence intervals or variance reporting",
    503       "detail": "All results (Tables 2-3) report single-point values without CIs, std devs, or ranges. No multiple runs or cross-validation reported, limiting statistical rigor."
    504     },
    505     {
    506       "flag": "No statistical significance tests",
    507       "detail": "Large differences between methods (e.g., LSVM 93.4% vs ShieldGemma 35.1%) not tested for significance. Unclear if differences could arise by chance."
    508     },
    509     {
    510       "flag": "Limited system ablation",
    511       "detail": "Feature-level ablation is provided (Section 6) and Table 3 compares a few pipeline configurations, but there is no systematic stage-by-stage ablation. The contribution of each pipeline stage (SVM vs VectorDB vs Classifier cluster) cannot be cleanly isolated."
    512     },
    513     {
    514       "flag": "LLM-as-a-judge prompts not disclosed",
    515       "detail": "ASR evaluation uses unspecified LLM prompts and safety guidelines. Reproducibility of Attack Evaluator verdicts is limited without exact prompt text."
    516     },
    517     {
    518       "flag": "Hyperparameters not reported",
    519       "detail": "SVM hyperparameters (C, kernel, tolerance) absent. VectorDB similarity threshold τ not specified. Reproducibility compromised."
    520     },
    521     {
    522       "flag": "Single target model tested",
    523       "detail": "Evaluated against only one target model, 'gpt-oss:20b'. Generalization to other models (GPT-4, Claude, Llama) is unknown."
    524     },
    525     {
    526       "flag": "Dataset sourced from known attacks only",
    527       "detail": "Training/test data aggregated from academic literature and documented exploits, so it may not represent the novel zero-day attack space; Section 8 acknowledges this limitation."
    528     },
    529     {
    530       "flag": "Acknowledged English-only preprocessing",
    531       "detail": "System does not support multilingual jailbreaks (Section 8, Limitation 1). Limits applicability to non-English deployments."
    532     },
    533     {
    534       "flag": "Acknowledged single-turn limitation",
    535       "detail": "Pipeline evaluates isolated prompts, not multi-turn dialogue. Real adversaries exploit conversation history (Section 8, Limitation 2). Results may not translate to agentic systems."
    536     },
    537     {
    538       "flag": "Potential dataset overfitting",
    539       "detail": "VectorDB trained on same 28,000 prompts used for SVM training. Though train/test separated, both models learned on same distribution. Robustness to truly novel attacks uncertain."
    540     }
    541   ],
    542   "cited_papers": [
    543     {
    544       "title": "Jailbreaking ChatGPT via Prompt Engineering: An Empirical Study",
    545       "relevance": "Empirical taxonomy of jailbreak strategies; dataset source for adversarial prompts"
    546     },
    547     {
    548       "title": "Greedy Coordinate Gradient-Based Search for Universal Adversarial Attacks",
    549       "relevance": "Adversarial suffix optimization techniques; attack vectors evaluated in defense pipeline"
    550     },
    551     {
    552       "title": "Prompt Injection Attacks and Defenses in Vision-Language Models",
    553       "relevance": "Defines prompt injection vs jailbreaking distinction; related defense architecture patterns"
    554     },
    555     {
    556       "title": "Emoji Attack: A Method for Misleading Judge LLMs in Safety Risk Detection",
    557       "relevance": "Token-level obfuscation attacks; motivates character-level feature engineering in SVM"
    558     },
    559     {
    560       "title": "Tree of Attacks: Jailbreaking Black-Box LLMs Automatically",
    561       "relevance": "Automated black-box jailbreak search methods; adversarial corpus source"
    562     },
    563     {
    564       "title": "Intention Analysis Makes LLMs a Good Jailbreak Defender",
    565       "relevance": "Pre-generation intent analysis approach; compared baseline moderation strategy"
    566     },
    567     {
    568       "title": "Attention Tracker: Detecting Prompt Injection Attacks in LLMs",
    569       "relevance": "Model-internal defense signals; alternative detection mechanism not pursued here"
    570     },
    571     {
    572       "title": "From LLMs to MLLMs to Agents: A Survey of Emerging Security Challenges",
    573       "relevance": "Agentic attack surface expansion; motivates need for efficient early-stage defenses"
    574     }
    575   ],
    576   "engagement_factors": {
    577     "practical_relevance": {
    578       "score": 2,
    579       "justification": "Proposes a deployable defense pipeline with released code, useful for practitioners securing LLM applications, though not a plug-and-play solution."
    580     },
    581     "surprise_contrarian": {
    582       "score": 1,
    583       "justification": "The finding that a simple SVM outperforms an LLM-based moderator is somewhat surprising but aligns with known advantages of classical ML for specific classification tasks."
    584     },
    585     "fear_safety": {
    586       "score": 2,
    587       "justification": "Directly addresses LLM jailbreak and prompt injection attacks, a growing security concern for AI-powered applications."
    588     },
    589     "drama_conflict": {
    590       "score": 1,
    591       "justification": "Implicit criticism of ShieldGemma's performance (35.1% accuracy) could generate mild discussion, but no dramatic controversy."
    592     },
    593     "demo_ability": {
    594       "score": 2,
    595       "justification": "Code and dataset released on GitHub, allowing practitioners to test and replicate the defense pipeline."
    596     },
    597     "brand_recognition": {
    598       "score": 0,
    599       "justification": "Authors are from BITS Pilani and Trustwise, neither of which is widely recognized in the AI safety community."
    600     }
    601   },
    602   "hn_data": {
    603     "threads": [],
    604     "top_points": 0,
    605     "total_points": 0,
    606     "total_comments": 0
    607   }
    608 }

Impressum · Datenschutz