scan.json (27409B)
1 { 2 "paper": { 3 "title": "Method of Counteracting Manipulative Queries to Large Language Models", 4 "authors": ["Yehor Kovalchuk", "Mykhailo Kolomytsev"], 5 "year": 2025, 6 "venue": "Intelligent Data Analysis Methods in Cybersecurity (KPI)", 7 "doi": "10.20535/tacs.2664-29132025.3.345389" 8 }, 9 "scan_version": 3, 10 "active_modules": ["experimental_rigor", "data_leakage"], 11 "methodology_tags": ["benchmark-eval"], 12 "key_findings": "The paper proposes a Multi-Head DistilBERT classifier with four specialized detection heads (malicious intent, instruction override, persona adoption, high-risk action) for detecting prompt injection and jailbreaking attacks. Using a hybrid data generation strategy with GPT-5 as a teacher model for knowledge distillation, the system achieves 0.99 Recall on a real-world holdout set, significantly outperforming keyword matching (0.80) and TF-IDF + Logistic Regression (0.95) baselines. The DistilBERT model maintains F1 of 0.98 on obfuscated prompts where keyword-based filters collapse to 0.27.", 13 "checklist": { 14 "artifacts": { 15 "code_released": { 16 "applies": true, 17 "answer": false, 18 "justification": "No repository URL, code archive, or supplementary materials link is provided anywhere in the paper." 19 }, 20 "data_released": { 21 "applies": true, 22 "answer": false, 23 "justification": "The paper references public datasets (deepset/prompt-injections, jackhhao/jailbreak-classification) but does not release its own 3,000 synthetic samples or the GPT-5 teacher-labeled data, which are the primary dataset contributions." 24 }, 25 "environment_specified": { 26 "applies": true, 27 "answer": false, 28 "justification": "No requirements.txt, Dockerfile, or library versions are provided. The paper mentions DistilBERT and PyTorch (via references) but gives no version information or environment setup details." 
29 }, 30 "reproduction_instructions": { 31 "applies": true, 32 "answer": false, 33 "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The architectural description in Section 1 is conceptual only." 34 } 35 }, 36 "statistical_methodology": { 37 "confidence_intervals_or_error_bars": { 38 "applies": true, 39 "answer": false, 40 "justification": "All results (F1, Recall, Precision) are reported as point estimates with no confidence intervals or error bars. Figures 3 and 4 show bar charts without uncertainty." 41 }, 42 "significance_tests": { 43 "applies": true, 44 "answer": false, 45 "justification": "The paper describes the proposed system as 'significantly outperforming' baselines, but no statistical significance tests (t-tests, bootstrap, etc.) are performed. The claim is based solely on comparing raw numbers." 46 }, 47 "effect_sizes_reported": { 48 "applies": true, 49 "answer": true, 50 "justification": "Absolute performance numbers with baseline context are provided: keyword F1 drops from ~0.95 (Easy) to 0.27 (Medium), while proposed method maintains 0.98. Recall: keyword 0.80, TF-IDF 0.95, proposed 0.99. Sufficient context to assess magnitude." 51 }, 52 "sample_size_justified": { 53 "applies": true, 54 "answer": false, 55 "justification": "The choice of 3,000 synthetic samples is stated without justification. No power analysis or rationale for why this sample size is sufficient for the claims made." 56 }, 57 "variance_reported": { 58 "applies": true, 59 "answer": false, 60 "justification": "No variance, standard deviation, or spread measures are reported. Results appear to be from single experimental runs with no information about stability across seeds or trials." 61 } 62 }, 63 "evaluation_design": { 64 "baselines_included": { 65 "applies": true, 66 "answer": true, 67 "justification": "Two baselines are included: Keyword Matching (RegEx with 200+ blacklisted terms) and TF-IDF + Logistic Regression, described in Section 2.1."
68 }, 69 "baselines_contemporary": { 70 "applies": true, 71 "answer": false, 72 "justification": "Keyword matching and TF-IDF + Logistic Regression are not contemporary baselines for prompt injection detection in 2025. More relevant baselines would include other fine-tuned transformer classifiers, dedicated prompt injection detectors (e.g., Lakera Guard, Rebuff), or at minimum a standard BERT classifier." 73 }, 74 "ablation_study": { 75 "applies": true, 76 "answer": false, 77 "justification": "The system has multiple components (4 detection heads, Paranoid Mode ensemble logic, knowledge distillation) but no ablation study is performed to measure the contribution of individual heads or the Paranoid Mode." 78 }, 79 "multiple_metrics": { 80 "applies": true, 81 "answer": true, 82 "justification": "Multiple metrics are reported: Accuracy, F1-Score, Recall, and Precision (Sections 2.2 and 2.3)." 83 }, 84 "human_evaluation": { 85 "applies": true, 86 "answer": false, 87 "justification": "No human evaluation is performed. All evaluation is automated against labeled datasets. No manual inspection of whether blocked prompts are correctly classified." 88 }, 89 "held_out_test_set": { 90 "applies": true, 91 "answer": true, 92 "justification": "A 20% holdout set from the deepset/prompt-injections dataset is used for real-world evaluation, separate from the synthetic training data (Section 2.1)." 93 }, 94 "per_category_breakdown": { 95 "applies": true, 96 "answer": true, 97 "justification": "Results are broken down by dataset difficulty: Synthetic Easy, Synthetic Medium, Synthetic Hard, and Real-World Holdout (Figures 3 and 4)." 98 }, 99 "failure_cases_discussed": { 100 "applies": true, 101 "answer": false, 102 "justification": "No failure cases or error analysis is provided. The paper does not discuss what types of attacks the system misses or what causes false positives." 
103 }, 104 "negative_results_reported": { 105 "applies": true, 106 "answer": false, 107 "justification": "No negative results are reported. Every comparison shows the proposed method outperforming baselines. The slight precision reduction (0.89 vs 0.94) is framed as a 'deliberate trade-off' rather than a negative result." 108 } 109 }, 110 "claims_and_evidence": { 111 "abstract_claims_supported": { 112 "applies": true, 113 "answer": true, 114 "justification": "The abstract claims Recall of 0.99 and outperformance of TF-IDF and keyword baselines, both supported by Figures 3 and 4 in the results section." 115 }, 116 "causal_claims_justified": { 117 "applies": true, 118 "answer": false, 119 "justification": "The paper attributes the improved detection to the multi-head architecture and knowledge distillation ('the proposed Multi-Head architecture successfully resolved these ambiguities') but provides no ablation to isolate these contributions. A single-head DistilBERT baseline trained on the same data would be needed to justify these causal claims." 120 }, 121 "generalization_bounded": { 122 "applies": true, 123 "answer": false, 124 "justification": "The paper claims suitability for 'real-time protection of SIEM and SOAR systems' and 'deployment on edge devices' but tests only on synthetic data and one public NLP dataset. No evaluation in actual SIEM/SOAR contexts or on edge hardware is provided. The title claims generality ('Counteracting Manipulative Queries to Large Language Models') beyond what was tested." 125 }, 126 "alternative_explanations_discussed": { 127 "applies": true, 128 "answer": false, 129 "justification": "No alternative explanations are considered. The paper does not discuss whether DistilBERT's sub-word tokenization alone (without multi-head design) could explain the obfuscation robustness, or whether the synthetic training data simply matches the test distribution." 
130 }, 131 "proxy_outcome_distinction": { 132 "applies": true, 133 "answer": false, 134 "justification": "The paper measures classification metrics on labeled datasets but claims real-world protection effectiveness for SIEM/SOAR systems. The gap between academic dataset performance and deployed system effectiveness in adversarial environments is not acknowledged." 135 } 136 }, 137 "setup_transparency": { 138 "model_versions_specified": { 139 "applies": true, 140 "answer": false, 141 "justification": "The paper says 'DistilBERT' and 'GPT-5' without version identifiers, checkpoint dates, or specific model IDs. No API version or snapshot date for GPT-5." 142 }, 143 "prompts_provided": { 144 "applies": true, 145 "answer": false, 146 "justification": "The paper mentions 'a custom system prompt containing the definitions of specific attack patterns' used for GPT-5 teacher labeling but does not provide the actual prompt text. Only natural language descriptions are given." 147 }, 148 "hyperparameters_reported": { 149 "applies": true, 150 "answer": false, 151 "justification": "Only λ=0.5 (loss weighting) and 0.8 (Paranoid Mode threshold) are reported. Critical training hyperparameters (learning rate, batch size, epochs, optimizer, warm-up schedule) are missing." 152 }, 153 "scaffolding_described": { 154 "applies": false, 155 "answer": false, 156 "justification": "No agentic scaffolding is used. The system is a classification middleware, not an agentic workflow." 157 }, 158 "data_preprocessing_documented": { 159 "applies": true, 160 "answer": false, 161 "justification": "The synthetic data generation process is described conceptually (three difficulty levels, obfuscation techniques) but specific preprocessing of the public datasets (filtering, deduplication, class balancing, train/test splits beyond the 20% holdout) is not documented. No counts at each pipeline stage." 
162 } 163 }, 164 "limitations_and_scope": { 165 "limitations_section_present": { 166 "applies": true, 167 "answer": false, 168 "justification": "There is no dedicated limitations section. The Conclusions section contains no limitations discussion." 169 }, 170 "threats_to_validity_specific": { 171 "applies": true, 172 "answer": false, 173 "justification": "No threats to validity are discussed anywhere in the paper." 174 }, 175 "scope_boundaries_stated": { 176 "applies": true, 177 "answer": false, 178 "justification": "No explicit scope boundaries are stated. The paper does not state what attack types, languages, or deployment contexts are excluded from its claims." 179 } 180 }, 181 "data_integrity": { 182 "raw_data_available": { 183 "applies": true, 184 "answer": false, 185 "justification": "No raw data is made available. The 3,000 synthetic samples and GPT-5 teacher labels are not released for independent verification." 186 }, 187 "data_collection_described": { 188 "applies": true, 189 "answer": true, 190 "justification": "Section 1.2 describes the data collection: synthetic generation across three difficulty levels with specific obfuscation techniques (token splitting, leetspeak, noise injection), and use of named public datasets (deepset/prompt-injections, jackhhao/jailbreak-classification) processed through GPT-5 teacher." 191 }, 192 "recruitment_methods_described": { 193 "applies": false, 194 "answer": false, 195 "justification": "No human participants. Data sources are standard public datasets plus synthetic generation." 196 }, 197 "data_pipeline_documented": { 198 "applies": true, 199 "answer": false, 200 "justification": "The pipeline is described at a high level (generate → label → train) with a figure (Figure 2), but specific counts at each stage, filtering criteria, and how the final training set was composed from the different sources are not documented." 
201 } 202 }, 203 "conflicts_of_interest": { 204 "funding_disclosed": { 205 "applies": true, 206 "answer": false, 207 "justification": "No funding information is mentioned anywhere in the paper." 208 }, 209 "affiliations_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "Author affiliations are clearly stated: National Technical University of Ukraine 'Igor Sikorsky Kyiv Polytechnic Institute', Institute of Physics and Technology." 213 }, 214 "funder_independent_of_outcome": { 215 "applies": false, 216 "answer": false, 217 "justification": "No funding disclosed. Appears to be unfunded university research." 218 }, 219 "financial_interests_declared": { 220 "applies": true, 221 "answer": false, 222 "justification": "No competing interests or financial disclosure statement is present in the paper." 223 } 224 }, 225 "contamination": { 226 "training_cutoff_stated": { 227 "applies": false, 228 "answer": false, 229 "justification": "The paper tests a defense system (prompt injection classifier), not a pre-trained model's capability on a benchmark. Contamination of pre-training data is not the relevant concern." 230 }, 231 "train_test_overlap_discussed": { 232 "applies": false, 233 "answer": false, 234 "justification": "Same as above — the paper evaluates a trained classifier, not a pre-trained model's knowledge." 235 }, 236 "benchmark_contamination_addressed": { 237 "applies": false, 238 "answer": false, 239 "justification": "Same as above — contamination in the traditional sense (benchmark in pre-training data) is not applicable to this defense evaluation." 240 } 241 }, 242 "human_studies": { 243 "pre_registered": { 244 "applies": false, 245 "answer": false, 246 "justification": "No human participants in this study." 247 }, 248 "irb_or_ethics_approval": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants in this study." 
252 }, 253 "demographics_reported": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in this study." 257 }, 258 "inclusion_exclusion_criteria": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in this study." 262 }, 263 "randomization_described": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in this study." 267 }, 268 "blinding_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in this study." 272 }, 273 "attrition_reported": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in this study." 277 } 278 }, 279 "cost_and_practicality": { 280 "inference_cost_reported": { 281 "applies": true, 282 "answer": false, 283 "justification": "The paper claims 'low computational latency suitable for deployment on edge devices' but provides no actual latency measurements, throughput numbers, or inference cost data." 284 }, 285 "compute_budget_stated": { 286 "applies": true, 287 "answer": false, 288 "justification": "No training time, GPU hours, or computational budget is reported. The cost of GPT-5 teacher labeling (API calls for 3,000+ samples) is also unstated." 289 } 290 }, 291 "experimental_rigor": { 292 "seed_sensitivity_reported": { 293 "applies": true, 294 "answer": false, 295 "justification": "No mention of random seeds or results across multiple seeds. All results appear to be from a single run." 296 }, 297 "number_of_runs_stated": { 298 "applies": true, 299 "answer": false, 300 "justification": "The number of experimental runs is never stated. Results are presented as single values without any indication of repetition." 
301 }, 302 "hyperparameter_search_budget": { 303 "applies": true, 304 "answer": false, 305 "justification": "λ=0.5 is described as 'empirically selected' and threshold 0.8 from 'ROC curve analysis' but no search budget, number of configurations tried, or search methodology is reported." 306 }, 307 "best_config_selection_justified": { 308 "applies": true, 309 "answer": false, 310 "justification": "The selection of λ=0.5 and threshold 0.8 is mentioned but the selection process is not described beyond 'empirically selected' and 'ROC curve analysis of the validation set.' No details on how many configurations were evaluated." 311 }, 312 "multiple_comparison_correction": { 313 "applies": false, 314 "answer": false, 315 "justification": "Only a few pairwise comparisons (3 methods × 4 datasets) are made, and no statistical tests are performed in the first place." 316 }, 317 "self_comparison_bias_addressed": { 318 "applies": true, 319 "answer": false, 320 "justification": "The authors implemented all baselines and their own system without acknowledging the bias of evaluating their own implementation. No independent evaluation or discussion of this bias." 321 }, 322 "compute_budget_vs_performance": { 323 "applies": true, 324 "answer": false, 325 "justification": "No comparison of computational cost between methods. DistilBERT is far more expensive than keyword matching or TF-IDF, but this compute difference is not discussed or controlled for." 326 }, 327 "benchmark_construct_validity": { 328 "applies": true, 329 "answer": false, 330 "justification": "No discussion of whether the synthetic datasets or deepset/prompt-injections holdout actually represent real-world prompt injection attacks in SIEM/SOAR contexts. The gap between academic datasets and production threat landscape is not addressed." 331 }, 332 "scaffold_confound_addressed": { 333 "applies": false, 334 "answer": false, 335 "justification": "No scaffolding is involved; this is a standalone classifier evaluation." 
336 } 337 }, 338 "data_leakage": { 339 "temporal_leakage_addressed": { 340 "applies": true, 341 "answer": false, 342 "justification": "No discussion of whether DistilBERT's pre-training data could contain examples from the deepset/prompt-injections test set, or whether the synthetic training data temporally overlaps with test patterns." 343 }, 344 "feature_leakage_addressed": { 345 "applies": true, 346 "answer": false, 347 "justification": "No discussion of whether the synthetic data generation process introduces patterns that leak into the test set (e.g., if synthetic and real-world attacks share systematic features due to the generation process)." 348 }, 349 "non_independence_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "No discussion of whether training and test examples are independent. The GPT-5 teacher labels both synthetic training data and the real-world datasets, which were not checked for overlap or near-duplicates." 353 }, 354 "leakage_detection_method": { 355 "applies": true, 356 "answer": false, 357 "justification": "No leakage detection or prevention methods are applied. No deduplication, n-gram overlap analysis, or temporal splitting is mentioned." 358 } 359 } 360 }, 361 "claims": [ 362 { 363 "claim": "The proposed Multi-Head DistilBERT system achieves a Recall of 0.99 on real-world attack datasets.", 364 "evidence": "Figure 4 shows Recall comparison on real-world holdout: proposed method 0.99, TF-IDF 0.95, keyword 0.80. Section 2.3 discusses results.", 365 "supported": "moderate" 366 }, 367 { 368 "claim": "The proposed system maintains F1 of 0.98 on obfuscated prompts where keyword baselines collapse to 0.27.", 369 "evidence": "Figure 3 shows F1-Scores across datasets.
On Synthetic Medium (obfuscation), keyword baseline achieves 0.27 while proposed method achieves 0.98.", 370 "supported": "moderate" 371 }, 372 { 373 "claim": "The multi-head architecture successfully resolves semantic ambiguities that defeat statistical approaches.", 374 "evidence": "On Synthetic Hard dataset, TF-IDF achieves F1 of 0.60 while proposed method achieves 0.87 (Section 2.3). However, no ablation isolates the multi-head design as the cause.", 375 "supported": "weak" 376 }, 377 { 378 "claim": "The solution operates with low computational latency suitable for deployment on edge devices.", 379 "evidence": "Stated in the abstract and conclusions but no latency measurements, throughput benchmarks, or edge device testing is provided.", 380 "supported": "unsupported" 381 }, 382 { 383 "claim": "Knowledge distillation from GPT-5 enables the compact student model to learn complex attack patterns.", 384 "evidence": "The knowledge distillation pipeline is described in Section 1.2, but no comparison between models trained with and without teacher labels is provided to demonstrate the contribution of distillation.", 385 "supported": "weak" 386 } 387 ], 388 "red_flags": [ 389 { 390 "flag": "Weak and outdated baselines", 391 "detail": "Keyword matching and TF-IDF + Logistic Regression are not competitive baselines for prompt injection detection in 2025. More relevant comparisons would include fine-tuned BERT/RoBERTa classifiers or dedicated prompt injection detection tools, making the claimed improvements appear larger than they may be against state-of-the-art." 392 }, 393 { 394 "flag": "No error bars or multiple runs", 395 "detail": "All results are single-point estimates with no confidence intervals, standard deviations, or indication of result stability. Given the small dataset (3,000 synthetic samples + holdout), variance across random seeds could significantly affect reported numbers." 
396 }, 397 { 398 "flag": "Claims significantly outrun evidence", 399 "detail": "The paper claims suitability for SIEM/SOAR real-time protection and edge device deployment without providing any latency measurements, throughput tests, or deployment evaluations. These are engineering claims with zero supporting data." 400 }, 401 { 402 "flag": "Self-evaluated synthetic data", 403 "detail": "The majority of evaluation is on synthetic data the authors themselves generated. The generator's design may inadvertently create patterns that the model is specifically trained to detect, inflating apparent performance." 404 }, 405 { 406 "flag": "No ablation study", 407 "detail": "The multi-head architecture, Paranoid Mode, and knowledge distillation are all claimed as contributions, but no ablation study isolates the effect of any individual component. A single-head DistilBERT with the same training data would be the minimum necessary control." 408 }, 409 { 410 "flag": "No limitations discussion", 411 "detail": "The paper contains no limitations section, no threats to validity, and no scope boundaries. For a security system claiming production readiness, this is a significant omission — there is no discussion of adversarial robustness to adaptive attackers who know the defense exists." 412 } 413 ], 414 "cited_papers": [ 415 { 416 "title": "Universal and Transferable Adversarial Attacks on Aligned Language Models", 417 "authors": ["A. Zou", "Z. Wang", "J. Z. Kolter", "M. Fredrikson"], 418 "year": 2023, 419 "arxiv_id": "2307.15043", 420 "relevance": "Foundational work on adversarial attacks against LLM safety alignment, directly relevant to the attack vectors this paper aims to defend against." 421 }, 422 { 423 "title": "Not what you've signed up for: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection", 424 "authors": ["K. 
Greshake"], 425 "year": 2023, 426 "relevance": "Demonstrates indirect prompt injection attacks on RAG systems, the threat model this paper's defense targets." 427 }, 428 { 429 "title": "Constitutional AI: Harmlessness from AI Feedback", 430 "authors": ["Y. Bai"], 431 "year": 2022, 432 "arxiv_id": "2212.08073", 433 "relevance": "Proposes an alternative AI safety alignment approach (RLHF alternative) that this paper positions as insufficient against prompt injection." 434 }, 435 { 436 "title": "Jailbroken: How Does LLM Safety Training Fail?", 437 "authors": ["A. Wei", "N. Haghtalab", "J. Steinhardt"], 438 "year": 2024, 439 "relevance": "Analyzes failure modes of LLM safety training including jailbreaking techniques, directly relevant to the attack taxonomy in this paper." 440 }, 441 { 442 "title": "DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter", 443 "authors": ["V. Sanh", "L. Debut", "J. Chaumond", "T. Wolf"], 444 "year": 2019, 445 "arxiv_id": "1910.01108", 446 "relevance": "The base model architecture used in this paper's Multi-Head classifier for prompt injection detection." 447 }, 448 { 449 "title": "Extracting Training Data from Large Language Models", 450 "authors": ["N. Carlini"], 451 "year": 2021, 452 "relevance": "Demonstrates LLM security vulnerabilities through training data extraction, part of the broader LLM security landscape this paper addresses." 453 }, 454 { 455 "title": "Prompt Injection attack against LLM-integrated Applications", 456 "authors": ["Y. Liu", "G. Deng", "Z. Xu"], 457 "year": 2023, 458 "arxiv_id": "2306.05499", 459 "relevance": "Directly studies prompt injection attacks against LLM-integrated applications, the core threat this paper's method aims to counteract." 460 }, 461 { 462 "title": "Distilling the Knowledge in a Neural Network", 463 "authors": ["G. Hinton", "O. Vinyals", "J. 
Dean"], 464 "year": 2015, 465 "arxiv_id": "1503.02531", 466 "relevance": "Foundational knowledge distillation technique used in this paper's hybrid data labeling strategy (GPT-5 teacher → DistilBERT student)." 467 }, 468 { 469 "title": "OWASP Top 10 for Large Language Model Applications", 470 "authors": ["OWASP Foundation"], 471 "year": 2023, 472 "relevance": "Industry standard LLM security framework that categorizes prompt injection as a top vulnerability, providing the threat taxonomy context for this work." 473 } 474 ], 475 "engagement_factors": { 476 "practical_relevance": { 477 "score": 2, 478 "justification": "Middleware for prompt injection detection is practically useful for teams deploying LLMs in security contexts, but no code or tool is released." 479 }, 480 "surprise_contrarian": { 481 "score": 0, 482 "justification": "Confirms the expected finding that semantic analysis outperforms keyword filtering — no surprising or contrarian claims." 483 }, 484 "fear_safety": { 485 "score": 2, 486 "justification": "Addresses prompt injection and jailbreaking in critical infrastructure (SIEM/SOAR), a real and growing concern in LLM security." 487 }, 488 "drama_conflict": { 489 "score": 0, 490 "justification": "No controversy or conflict — straightforward defense method proposal." 491 }, 492 "demo_ability": { 493 "score": 0, 494 "justification": "No code release, no demo, no pip-installable tool — cannot be tried." 495 }, 496 "brand_recognition": { 497 "score": 0, 498 "justification": "From a Ukrainian university lab with no brand recognition in the AI/ML community." 499 } 500 } 501 }