ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (34937B)


      1 {
      2   "paper": {
      3     "title": "Sentra-Guard: A Multilingual Human-AI Framework for Real-Time Defense Against Adversarial LLM Jailbreaks",
      4     "authors": [
      5       "Md. Mehedi Hasan",
      6       "Ziaur Rahman",
      7       "Rafid Mostafiz",
      8       "Md. Abir Hossain"
      9     ],
     10     "year": 2025,
     11     "venue": "arXiv",
     12     "arxiv_id": "2510.22628",
     13     "doi": "10.48550/arXiv.2510.22628"
     14   },
     15   "scan_version": 3,
     16   "active_modules": ["experimental_rigor", "data_leakage"],
     17   "methodology_tags": ["benchmark-eval"],
     18   "key_findings": "Sentra-Guard proposes a hybrid jailbreak detection system combining SBERT-FAISS retrieval, fine-tuned DistilBERT classification, zero-shot NLI (BART-MNLI), and human-in-the-loop feedback. The paper claims 99.98% accuracy and 0.004% ASR on HarmBench-28K, with 47ms average latency. However, the paper has severe integrity issues: numerous references are grossly misattributed (cited content does not match the actual referenced papers), results are internally inconsistent (abstract vs body numbers differ), and the dataset size is contradictory (D2 stated as excluded from training, then reported as included).",
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "Section IV.E (Ethical Considerations) states 'we will release a redacted version of Sentra-Guard's source code' — this is a promise of future release, not an actual release. No repository URL is provided anywhere in the paper."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The primary dataset D1 (HarmBench-28K) is described as an 'open-access adversarial benchmark.' Cross-dataset evaluation uses JailbreakV-28K, JBB-Behaviors, and JailbreakTracer, which are also public. However, the auxiliary dataset D2 is described only as 'publicly available red-teaming repositories' without specific identification."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "Section IV.A mentions PyTorch, HuggingFace Transformers, Tesla T4 GPU (8GB), Apple M1 CPU, and specific model names (distilbert-base-uncased, facebook/bart-large-mnli), but no version numbers for any library, no requirements.txt, no Dockerfile, and no reproducible environment specification."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No step-by-step reproduction instructions are provided. The paper describes the general pipeline (Algorithm 1) but gives no scripts, commands, or README for replicating results."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "All results in Tables III-VI are point estimates only. No confidence intervals, error bars, or uncertainty measures are reported for any metric (accuracy, F1, ASR, latency)."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper claims Sentra-Guard outperforms baselines (Static Keyword Filter, Zero-Shot Classifier, Ensemble Filter) but provides no statistical significance tests. Comparisons rest solely on point estimates."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Absolute performance differences are reported with baseline context: ASR of 0.004% vs LlamaGuard-2 (1.3%) and OpenAI Moderation (3.7%). Table III shows accuracy of 99.98% vs 92.83% (ensemble), 88.76% (zero-shot), and 75.02% (keyword filter). Latency is compared (47ms vs 598ms). The magnitude of differences is clear."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No power analysis or justification for the dataset size. The paper uses ~28K samples (HarmBench-28K) but does not justify whether this is sufficient for the claims made. Also, Section III.C.2 claims training on '56,000 samples from D1 and D2' which contradicts the earlier statement that D2 was reserved for evaluation only."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No variance, standard deviation, or spread measure is reported for any result. No mention of multiple experimental runs. All results appear to be from single runs."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Table III compares against Static Keyword Filter, Zero-Shot Classifier, and Ensemble Moderation Pipeline. Table VI compares against JailbreakTracer, LLM-Sentry, and JBShield from the literature. However, Table VI explicitly notes results are 'from original publications and not from a unified dataset.'"
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Table VI includes JailbreakTracer (2025), LLM-Sentry (2024), and JBShield (2025), which are recent systems. However, the Table III baselines (Static Keyword Filter, generic Zero-Shot, Ensemble Filter) are generic and unspecified rather than named contemporary systems."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "The system has four major components (SBERT-FAISS retrieval, fine-tuned classifier, zero-shot NLI, HITL feedback) but no ablation study is presented showing the individual contribution of each component. The paper claims HITL 'improved recall by 4.2%' but this is not from a controlled ablation."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Tables III-VI report Accuracy, Precision, Recall, F1 Score, AUC, ASR (Attack Success Rate), FPR (False Positive Rate), and Average Latency."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No human evaluation of the system's detection outputs. The HITL component is described as part of the system architecture for labeling ambiguous cases, not as an evaluation methodology. All evaluation is fully automated against labeled datasets."
     94       },
     95       "held_out_test_set": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section III.B states the dataset was 'split into training (70%), validation (15%), and testing (15%) using stratified sampling.' Cross-dataset evaluation (Table V) uses entirely separate benchmarks (JailbreakV-28K, JBB-Behaviors, JailbreakTracer)."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Table IV provides per-language and per-LLM breakdown of detection performance across 5 languages and 4 models. Table V shows per-dataset results across 4 external benchmarks. Table II categorizes different attack types."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Section IV.D discusses the single false negative: 'a Unicode homoglyph variant.' Section IV.D (Performance Visualization) notes 'seven false positives emerged, all stemming from benign prompts with scientific or technical terminology (e.g., bomb calorimeter).'"
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": false,
    113         "justification": "No negative results reported. Every experiment and configuration shows strong positive performance. No mention of approaches that were tried and failed, or configurations that degraded performance."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The abstract claims '99.96% detection rate' but the confusion matrix analysis in Section IV.D states '99.996% detection rate.' The abstract claims 'F1 = 1.00' but Table III shows F1 = 99.98%. The paper also claims the system 'outperforms leading baselines such as LlamaGuard-2 (1.3%) and OpenAI Moderation (3.7%)' but Table VI acknowledges these comparisons are 'not from a unified dataset.' Numbers are internally inconsistent."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper makes causal claims ('HITL feedback further improved recall by 4.2% and lowered false positives by 11%', 'The hybrid architecture plays a central role') without ablation studies or controlled experiments to justify them. No component-level analysis isolates the causal contribution of individual modules."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper claims the system works 'across over 100 languages' (abstract, Section III.B) but only tests 5 languages (English, French, Spanish, Arabic, Hindi). The title claims 'Multilingual' defense and the conclusion claims 'a new state-of-the-art in adversarial LLM defense' — vastly exceeding what was tested."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "No alternative explanations for the results are discussed. The paper does not consider confounds such as: whether the dataset is too easy for any classifier, whether the near-perfect results indicate overfitting to the specific dataset distribution, or whether simpler approaches could achieve similar results on this particular benchmark."
    136       },
    137       "proxy_outcome_distinction": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper measures prompt classification accuracy on static datasets and frames it as 'real-time defense against adversarial LLM jailbreaks.' No discussion of the gap between classifying curated benchmark prompts and defending against adaptive real-world adversaries who modify their attacks in response to the defense system."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "The paper specifies 'distilbert-base-uncased' and 'facebook/bart-large-mnli' for its own components, but the LLMs tested against (GPT-4o, Claude 3 Opus, Gemini Flash, Mistral 7B) have no version/snapshot dates. Per schema, marketing names without snapshot dates do not count as specified versions."
    148       },
    149       "prompts_provided": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "The zero-shot classification module uses candidate labels '{harmful, safe}' for NLI entailment, but the exact input formatting and any prompt templates for the RAG comparator are not provided. The reader cannot reconstruct the exact inputs sent to each inference module."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Section IV.A reports: batch size 8, learning rate 2×10⁻⁵, 3 epochs, linear decay schedule, fixed token length 64. The SBERT embedding model and FAISS indexing parameters are mentioned but top-k retrieval count is not specified."
    158       },
    159       "scaffolding_described": {
    160         "applies": false,
    161         "answer": false,
    162         "justification": "No agentic scaffolding is used. Sentra-Guard is a classification pipeline (embedding + classification + NLI + fusion), not an agentic system with tool use, memory, or iterative reasoning."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section III.A-B describes: duplicate removal, filtering system-role instructions, discarding metadata, binary labeling (0/1), tokenization with distilbert-base-uncased, padding/truncation to 64 tokens, stratified 70/15/15 split. Neural machine translation for non-English prompts is also documented."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No dedicated limitations section exists. Section V ('Conclusion and Future Work') mentions future directions (generation-time monitoring, multimodal defenses) but does not discuss limitations of the current work. The discussion section (Section IV) analyzes performance but does not acknowledge weaknesses."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "No threats to validity are discussed anywhere in the paper. No consideration of internal validity (e.g., dataset representativeness), external validity (e.g., generalization to real adversaries), or construct validity (e.g., whether benchmark detection equals real-world defense)."
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "The paper makes unbounded claims: 'establishes a new state-of-the-art in adversarial LLM defense' (abstract/conclusion), 'over 100 languages' (tested only 5), 'enterprise-ready performance.' No explicit statements about what the results do NOT show or what settings were NOT tested."
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The primary evaluation dataset D1 (HarmBench-28K) is described as an 'open-access adversarial benchmark.' Cross-dataset benchmarks (JBB-Behaviors, JailbreakTracer) are also public. However, the specific train/val/test splits used and any processed versions are not released."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": false,
    196         "justification": "D1 is identified as HarmBench-28K with a description of its coverage areas (misinformation, cyberattacks, etc.). However, D2 is described only as 'a smaller set of adversarial prompts from publicly available red-teaming repositories' without naming the specific sources. Section III.C.2 claims 'trained on over 56,000 samples from D1 and D2' which contradicts Section III.A's statement that D2 was 'not included in training.'"
    197       },
    198       "recruitment_methods_described": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "No human participants. Data comes from public adversarial prompt benchmarks (standard benchmark data sources)."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "The preprocessing steps are described (dedup, filter, label, split), but there are unexplained inconsistencies: D1 is 'HarmBench-28K' (~28K samples) yet Section III.C.2 claims training on '56,000 samples from D1 and D2' while D2 was stated to be excluded from training. The confusion matrix shows 24,145 harmful prompts in the test set, but 15% of 28K is only ~4,200. These numbers do not add up."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "The Acknowledgement section thanks open-source communities and libraries but does not mention any funding source, grant, or financial support for the research."
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "Author names are listed but institutional affiliations are not visible in the paper text. No affiliation information accompanies the author list in the provided content."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": false,
    222         "answer": false,
    223         "justification": "No funding is disclosed. Appears to be unfunded academic work."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests or financial disclosure statement is included in the paper."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "This paper tests a custom defense tool (classifier pipeline), not a pre-trained model's capability on benchmarks. The system fine-tunes its own classifier on labeled data rather than evaluating pre-trained model knowledge."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "Paper tests a defense tool rather than evaluating pre-trained model knowledge on benchmarks. Train/test splitting is handled through stratified sampling of D1, which is a standard practice for training custom classifiers."
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "Paper tests a defense tool rather than evaluating pre-trained model knowledge. Contamination in the schema's sense (model training data including benchmark) does not apply to a custom-trained classifier."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study. The HITL component is described architecturally but no human subjects study was conducted."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants. The ethics section (Section IV.E) discusses data ethics and responsible AI but does not involve human subjects."
    258       },
    259       "demographics_reported": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       },
    279       "attrition_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in this study."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "Average inference latency of 47ms is reported (Section IV.A, Table III). Per-language latency ranges from 42ms to 56ms (Table IV). Comparison with baseline latencies (385ms zero-shot, 598ms ensemble) is also provided."
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": true,
    294         "justification": "Section IV.A states 'Training the classifier required about two hours' on a 'Tesla T4 GPU (8 GB), with auxiliary computations on an Apple M1 CPU.' This provides the hardware and training duration."
    295       }
    296     },
    297     "experimental_rigor": {
    298       "seed_sensitivity_reported": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "No mention of multiple random seeds or seed sensitivity analysis. All results appear to be from a single training/evaluation run."
    302       },
    303       "number_of_runs_stated": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "The number of experimental runs is never stated. It is unclear whether results come from a single run or are averaged across multiple runs."
    307       },
    308       "hyperparameter_search_budget": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "No mention of hyperparameter search. The learning rate (2×10⁻⁵), batch size (8), and epochs (3) appear chosen without reported justification or search process."
    312       },
    313       "best_config_selection_justified": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "No explanation of how the final configuration was selected. The paper presents one configuration with no discussion of alternatives tried."
    317       },
    318       "multiple_comparison_correction": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The paper makes multiple comparisons across baselines (Tables III, VI), languages (Table IV), and datasets (Table V) without any correction for multiple comparisons. No statistical tests are performed at all."
    322       },
    323       "self_comparison_bias_addressed": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "The authors evaluate their own system against baselines without acknowledging self-comparison bias. Table III baselines (Static Keyword Filter, Zero-Shot, Ensemble) appear to be the authors' own implementations, with no discussion of whether these faithfully represent the baseline approaches."
    327       },
    328       "compute_budget_vs_performance": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "While latency is compared across systems (47ms vs 598ms), performance is not reported as a function of compute budget. The hybrid system likely uses substantially more parameters/compute than the keyword filter, but this is not controlled for."
    332       },
    333       "benchmark_construct_validity": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "No discussion of whether HarmBench-28K actually measures real-world jailbreak defense capability. No analysis of the gap between detecting curated benchmark prompts and defending against adaptive adversaries in production."
    337       },
    338       "scaffold_confound_addressed": {
    339         "applies": false,
    340         "answer": false,
    341         "justification": "No scaffolding is involved. Sentra-Guard is a classification pipeline, not an agentic system with scaffolding."
    342       }
    343     },
    344     "data_leakage": {
    345       "temporal_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of temporal leakage. The paper does not address whether the pre-trained models used (DistilBERT, SBERT, BART-MNLI) were trained on data that includes adversarial prompt examples similar to those in the test set."
    349       },
    350       "feature_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of feature leakage. The FAISS knowledge base contains both harmful and benign exemplars that could overlap structurally with test examples, but this is not discussed."
    354       },
    355       "non_independence_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No discussion of whether train and test examples in HarmBench-28K share structural similarities, come from the same sources, or contain near-duplicates. The stratified split does not ensure independence."
    359       },
    360       "leakage_detection_method": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No leakage detection or prevention method is described beyond the basic train/val/test split."
    364       }
    365     }
    366   },
    367   "claims": [
    368     {
    369       "claim": "Sentra-Guard achieves 99.98% accuracy with AUC = 1.00, F1 = 1.00, and ASR of 0.004% on HarmBench-28K",
    370       "evidence": "Table III reports 99.98% accuracy, 100% precision, 99.97% recall, 99.98% F1. Figure 3 shows AUC = 1.00. Confusion matrix (Fig. 5) shows 1 false negative out of 24,145 harmful prompts. However, the abstract states '99.96% detection rate' and 'F1 = 1.00' which conflict with Table III (F1 = 99.98%) and the confusion matrix (99.996% detection rate).",
    371       "supported": "weak"
    372     },
    373     {
    374       "claim": "Sentra-Guard outperforms LlamaGuard-2 (1.3% ASR) and OpenAI Moderation (3.7% ASR)",
    375       "evidence": "Table VI and Table I compare ASR. However, Table VI explicitly notes: 'Results are drawn from original publications and not from a unified dataset.' The comparison is across different benchmarks, making it methodologically invalid as a direct comparison.",
    376       "supported": "weak"
    377     },
    378     {
    379       "claim": "The system achieves multilingual detection with >96% detection rate across 5 languages",
    380       "evidence": "Table IV shows detection rates ranging from 94.3% (Mistral 7B, Arabic) to 99.5% (Claude 3 Opus, English) across English, French, Spanish, Arabic, and Hindi on 4 LLM platforms. Note that the reported 94.3% minimum falls below the claimed >96% threshold.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "HITL feedback improved recall by 4.2% and reduced false positives by 11% after 500 new prompts",
    385       "evidence": "Section IV.D states this claim but provides no detailed methodology — no before/after comparison table, no description of the 500 prompts, and no controlled experiment isolating HITL from other factors.",
    386       "supported": "weak"
    387     },
    388     {
    389       "claim": "Average inference latency of 47ms enables real-time deployment",
    390       "evidence": "Section IV.A and Table III report 47ms average latency. Table IV shows per-language latency from 42-56ms. Compared to 385ms (zero-shot) and 598ms (ensemble) baselines.",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "Cross-dataset generalization above 99.8% accuracy on JailbreakV-28K, JBB-Behaviors, and JailbreakTracer",
    395       "evidence": "Table V shows accuracy 98.88-99.96% across 4 external test sets. Note that the 98.88% lower bound falls below the claimed 99.8% threshold. The cross-dataset evaluation methodology is also not described in detail — it is unclear how the system was applied to these external datasets or whether any adaptation was performed.",
    396       "supported": "moderate"
    397     }
    398   ],
    399   "red_flags": [
    400     {
    401       "flag": "Grossly misattributed references",
    402       "detail": "Multiple references in the paper do not match the cited content. [10] Woszczyk et al. is about 'dementia obfuscation in transcribed speech' but cited for HITL methods. [14] Li W. et al. is about 'phishing website detection' but cited for 'lexical obfuscation attacks using synthetic datasets.' [15] Liu et al. is about 'transformer models in advancing blockchain technology' but cited for 'ensemble transformer framework achieving 92% detection accuracy.' [16] Musial et al. is about 'earth observation retrieval' but cited for 'hybrid retrieval-classifier system with FAISS.' [17] Askari et al. is about 'few-shot image classification' but cited for 'fine-tuned transformers for semantic decoding.' [20] Durmus et al. is about 'subjective global opinions in language models' but cited for 'prompt classifiers + safety layers with 92.5% accuracy.' This pattern of systematic misattribution is a hallmark of AI-generated text and raises fundamental integrity concerns about the entire paper."
    403     },
    404     {
    405       "flag": "Results too good to be true",
    406       "detail": "AUC = 1.00 (perfect), F1 ≈ 1.00, and a 99.996% detection rate on a complex adversarial classification task with 28K+ samples are extraordinarily suspicious. Only 1 false negative and 7 false positives would be remarkable even for trivial classification tasks. The near-perfect metrics across all evaluation dimensions suggest potential issues with the evaluation methodology, dataset difficulty, or data integrity."
    407     },
    408     {
    409       "flag": "Internally inconsistent numbers",
    410       "detail": "The abstract states '99.96% detection rate' but the confusion matrix yields 99.996%. The abstract says 'F1 = 1.00' but Table III shows F1 = 99.98%. Table III lists False Positives as 'Low (~83%)' which is nonsensical — 83% false positives is not 'Low,' and the body text states FPR is 0.03%. The text states '99.97recall' (missing percentage sign and formatting). These inconsistencies suggest the paper was not carefully reviewed by the authors."
    411     },
    412     {
    413       "flag": "Contradictory dataset claims",
    414       "detail": "Section III.A states D2 was 'not included in training; instead, they were reserved for inference-time evaluation.' Yet Section III.C.2 states 'The classifier has been trained on over 56,000 samples from D1 and D2.' D1 is HarmBench-28K (~28K samples), so 56K would require D2 in training. Additionally, the confusion matrix shows 24,145 harmful test prompts, but 15% of 28K is only ~4,200 — the numbers do not reconcile."
    415     },
    416     {
    417       "flag": "Unfair cross-study comparison",
    418       "detail": "Table VI compares Sentra-Guard against JailbreakTracer, LLM-Sentry, and JBShield, but explicitly acknowledges 'Results are drawn from original publications and not from a unified dataset.' This makes the ASR comparison (0.004% vs 8.1%, 10%, <59%) methodologically invalid. Different datasets have different difficulty levels."
    419     },
    420     {
    421       "flag": "No ablation study despite multi-component system",
    422       "detail": "The system combines 4 major components (SBERT-FAISS retrieval, fine-tuned classifier, zero-shot NLI, HITL feedback) but presents no ablation study. It is impossible to determine which components contribute to performance and whether all are necessary. The claimed HITL improvement (4.2% recall, 11% FP reduction) is stated without controlled experimental support."
    423     },
    424     {
    425       "flag": "Fabricated benchmark name",
    426       "detail": "The paper references 'HarmBench-28K' as its primary dataset (D1), and 'HarmBench v2.3' (attributed to [28] Yan et al., who actually wrote about adversarial metaphors). The real HarmBench (Mazeika et al., 2024) contains ~510 behaviors, not 28K. 'HarmBench-28K' does not appear to be a real benchmark, or it is a renamed/modified version without proper attribution."
    427     },
    428     {
    429       "flag": "No statistical rigor whatsoever",
    430       "detail": "The paper reports no error bars, no confidence intervals, no significance tests, no multiple-run analysis, no seed sensitivity. For a paper claiming 'new state-of-the-art' with near-perfect metrics, the complete absence of statistical rigor makes the results unverifiable."
    431     }
    432   ],
    433   "cited_papers": [
    434     {
    435       "title": "Privacy in large language models: Attacks, defenses and future directions",
    436       "authors": ["H. Li", "Y. Chen", "J. Luo"],
    437       "year": 2023,
    438       "arxiv_id": "2310.10383",
    439       "relevance": "Survey of privacy attacks and defenses for LLMs, directly relevant to LLM safety research."
    440     },
    441     {
    442       "title": "Jailbreaking and mitigation of vulnerabilities in large language models",
    443       "authors": ["B. Peng", "K. Chen", "Q. Niu"],
    444       "year": 2024,
    445       "arxiv_id": "2410.15236",
    446       "relevance": "Comprehensive study of jailbreak vulnerabilities and mitigations in LLMs."
    447     },
    448     {
    449       "title": "Attack and defense techniques in large language models: A survey and new perspectives",
    450       "authors": ["Z. Liao", "K. Chen", "Y. Lin"],
    451       "year": 2025,
    452       "arxiv_id": "2505.00976",
    453       "relevance": "Survey of attack and defense techniques for LLMs, covering the adversarial threat landscape."
    454     },
    455     {
    456       "title": "JailGuard: A universal detection framework for prompt-based attacks on LLM systems",
    457       "authors": ["X. Zhang", "C. Zhang", "T. Li"],
    458       "year": 2025,
    459       "relevance": "Universal jailbreak detection framework for LLMs, directly comparable defense system."
    460     },
    461     {
    462       "title": "AutoDefense: Multi-agent LLM defense against jailbreak attacks",
    463       "authors": ["Y. Zeng", "Y. Wu", "X. Zhang"],
    464       "year": 2024,
    465       "arxiv_id": "2403.04783",
    466       "relevance": "Multi-agent approach to LLM jailbreak defense using chain-of-thought and LlamaGuard."
    467     },
    468     {
    469       "title": "Llama Guard: LLM-based input-output safeguard for human-AI conversations",
    470       "authors": ["H. Inan", "K. Upasani", "J. Chi"],
    471       "year": 2023,
    472       "arxiv_id": "2312.06674",
    473       "relevance": "Foundational LLM safety guardrail system from Meta, used as baseline in LLM defense evaluations."
    474     },
    475     {
    476       "title": "Do anything now: Characterizing and evaluating in-the-wild jailbreak prompts on large language models",
    477       "authors": ["X. Shen", "Z. Chen", "M. Backes"],
    478       "year": 2024,
    479       "relevance": "Empirical characterization of real-world jailbreak prompts targeting LLMs, including the DAN taxonomy."
    480     },
    481     {
    482       "title": "Training language models to follow instructions with human feedback",
    483       "authors": ["L. Ouyang", "J. Wu", "X. Jiang"],
    484       "year": 2022,
    485       "relevance": "Foundational RLHF paper for instruction-following LLMs, establishes the alignment approach that jailbreaks attempt to bypass."
    486     },
    487     {
    488       "title": "A cross-language investigation into jailbreak attacks in large language models",
    489       "authors": ["J. Li", "Y. Liu", "C. Liu"],
    490       "year": 2024,
    491       "arxiv_id": "2401.16765",
    492       "relevance": "Cross-lingual jailbreak attack analysis demonstrating multilingual vulnerability of LLMs."
    493     },
    494     {
    495       "title": "JailbreakTracer: Explainable detection of jailbreaking prompts in LLMs using synthetic data generation",
    496       "authors": ["M. F. A. Sayeedi", "M. B. Hossain"],
    497       "year": 2025,
    498       "relevance": "Explainable jailbreak detection system using synthetic data, directly comparable defense approach."
    499     },
    500     {
    501       "title": "LLM-Sentry: A model-agnostic human-in-the-loop framework for securing large language models",
    502       "authors": ["S. Irtiza", "K. A. Akbar", "A. Yasmeen"],
    503       "year": 2024,
    504       "relevance": "Model-agnostic HITL defense framework for LLMs, directly comparable to Sentra-Guard's approach."
    505     },
    506     {
    507       "title": "JBShield: Defending large language models from jailbreak attacks through activated concept analysis and manipulation",
    508       "authors": ["S. Zhang", "Y. Zhai", "K. Guo"],
    509       "year": 2025,
    510       "arxiv_id": "2502.07557",
    511       "relevance": "Concept-based jailbreak defense for LLMs using activation analysis."
    512     }
    513   ],
    514   "engagement_factors": {
    515     "practical_relevance": {
    516       "score": 2,
    517       "justification": "Jailbreak detection is a real deployment need, but no code is released and the system's integrity is questionable given the reference fabrication issues."
    518     },
    519     "surprise_contrarian": {
    520       "score": 0,
    521       "justification": "Confirms the expected pattern that hybrid classifiers can detect adversarial prompts; no contrarian findings."
    522     },
    523     "fear_safety": {
    524       "score": 2,
    525       "justification": "Addresses AI safety concerns by demonstrating jailbreak attacks across major LLMs (GPT-4o, Claude, Gemini, Mistral); the Table II attack examples are attention-grabbing."
    526     },
    527     "drama_conflict": {
    528       "score": 0,
    529       "justification": "No controversy, no challenge to established players; straightforward system proposal."
    530     },
    531     "demo_ability": {
    532       "score": 0,
    533       "justification": "No code released, no demo, no pip-installable tool. Only a promise of future release."
    534     },
    535     "brand_recognition": {
    536       "score": 0,
    537       "justification": "Unknown authors with no visible institutional affiliation or major lab connection."
    538     }
    539   }
    540 }

Impressum · Datenschutz