scan.json (27886B)
1 { 2 "paper": { 3 "title": "Prompt Injection Detection in LLM Integrated Applications", 4 "authors": [ 5 "Qianlong Lan", 6 "Anuj Kaul", 7 "Shaun Jones" 8 ], 9 "year": 2025, 10 "venue": "International Journal of Network Dynamics and Intelligence", 11 "doi": "10.53941/ijndi.2025.100013" 12 }, 13 "scan_version": 3, 14 "active_modules": ["experimental_rigor", "data_leakage"], 15 "methodology_tags": ["benchmark-eval", "case-study"], 16 "key_findings": "The paper proposes a multi-faceted prompt injection detection system combining banned terms, vector embeddings, and a fine-tuned BERT model. Evaluation on ~250k validation records across 4 use cases shows BERT achieves 0.91-0.99 accuracy, with the general use case at 0.99 but the AI assistant use case at only 0.91. However, only the BERT component is evaluated; the banned terms and vector embedding components are described conceptually without empirical validation.", 17 "checklist": { 18 "artifacts": { 19 "code_released": { 20 "applies": true, 21 "answer": false, 22 "justification": "Reference [56] links to a GitHub repo (lanqianlong/IJNDI) but this is cited as the source for manual annotation data, not the detection system code. No repository URL for the detection system itself is provided." 23 }, 24 "data_released": { 25 "applies": true, 26 "answer": false, 27 "justification": "The paper uses open-source datasets [44-55] which are publicly available, but the internal eBay product use case data is not released. The Data Availability Statement says 'available from the corresponding author upon reasonable request,' which does not constitute release." 28 }, 29 "environment_specified": { 30 "applies": true, 31 "answer": false, 32 "justification": "No environment specifications, requirements.txt, Docker setup, or dependency versions are provided. The paper does not specify which BERT variant was used, what libraries were employed, or any technical setup details." 33 }, 34 "reproduction_instructions": { 35 "applies": true, 36 "answer": false, 37 "justification": "No reproduction instructions are provided. The paper describes the system architecture conceptually but provides no steps for reproducing the experiments." 38 } 39 }, 40 "statistical_methodology": { 41 "confidence_intervals_or_error_bars": { 42 "applies": true, 43 "answer": false, 44 "justification": "Only point estimates are reported (e.g., accuracy of 0.99, 0.98, 0.91). No confidence intervals or error bars are provided for any metric." 45 }, 46 "significance_tests": { 47 "applies": true, 48 "answer": false, 49 "justification": "No statistical significance tests are used. Performance differences across use cases (e.g., 0.99 vs 0.91) are presented as raw numbers without any testing." 50 }, 51 "effect_sizes_reported": { 52 "applies": true, 53 "answer": false, 54 "justification": "No effect sizes are reported. Results are presented as absolute accuracy, precision, recall, and F1 values without any baseline context or effect size measures." 55 }, 56 "sample_size_justified": { 57 "applies": true, 58 "answer": false, 59 "justification": "No justification is given for why ~4 million training data points or ~250k validation records were chosen. The validation set sizes per use case (221k, 4.1k, 1.1k, 13.4k) are stated without justification." 60 }, 61 "variance_reported": { 62 "applies": true, 63 "answer": false, 64 "justification": "No variance, standard deviation, or spread measures are reported. Results appear to be from single runs with no indication of result stability across multiple experiments." 65 } 66 }, 67 "evaluation_design": { 68 "baselines_included": { 69 "applies": true, 70 "answer": false, 71 "justification": "No baseline comparisons are included. The BERT model is evaluated in isolation without comparison to any other prompt injection detection method, rule-based system, or alternative ML model." 72 }, 73 "baselines_contemporary": { 74 "applies": true, 75 "answer": false, 76 "justification": "No baselines are included at all, so they cannot be assessed for contemporariness. The paper does not compare against any existing prompt injection detection approaches." 77 }, 78 "ablation_study": { 79 "applies": true, 80 "answer": false, 81 "justification": "The system has three components (banned terms, vector embeddings, BERT model) but no ablation study is performed. Only the BERT component is evaluated; the contribution of each component to the overall system is not measured." 82 }, 83 "multiple_metrics": { 84 "applies": true, 85 "answer": true, 86 "justification": "The paper reports accuracy, F1 score, precision, and recall (Figure 4), as well as the full confusion matrix (TN, FP, FN, TP) in Table 5." 87 }, 88 "human_evaluation": { 89 "applies": true, 90 "answer": false, 91 "justification": "No human evaluation of the detection system's outputs is performed. All evaluation is automated using labeled datasets." 92 }, 93 "held_out_test_set": { 94 "applies": true, 95 "answer": true, 96 "justification": "Section 3.4 states the model was 'benchmarked against an unseen validation dataset comprising around 250k records,' indicating a held-out evaluation set." 97 }, 98 "per_category_breakdown": { 99 "applies": true, 100 "answer": true, 101 "justification": "Results are broken down across four use cases (General, Product Query, AI Assistant, Question Answering) in Table 4 and Figure 4, showing per-category performance." 102 }, 103 "failure_cases_discussed": { 104 "applies": true, 105 "answer": false, 106 "justification": "The paper notes the AI Assistant use case achieved only 0.91 accuracy 'due to lack of diverse data' but provides no specific failure examples, error analysis, or discussion of what types of inputs the system misclassifies." 107 }, 108 "negative_results_reported": { 109 "applies": true, 110 "answer": true, 111 "justification": "The AI Assistant use case is reported at 0.91 accuracy (vs 0.98-0.99 for others), and Section 4.1 acknowledges that 'a high false positive rate could detrimentally affect usability.' The Product Query use case shows precision of only ~0.41." 112 } 113 }, 114 "claims_and_evidence": { 115 "abstract_claims_supported": { 116 "applies": true, 117 "answer": false, 118 "justification": "The abstract claims the system 'identify and mitigate prompt injections effectively' and aims to 'neutralize prompt injections in real-time.' However, only the BERT component is evaluated. The banned terms and vector embedding components are described conceptually but never tested, so the claim about the combined system is unsupported." 119 }, 120 "causal_claims_justified": { 121 "applies": false, 122 "answer": false, 123 "justification": "The paper does not make explicit causal claims. It reports classification performance metrics without claiming that specific components cause specific improvements." 124 }, 125 "generalization_bounded": { 126 "applies": true, 127 "answer": false, 128 "justification": "The title claims 'Prompt Injection Detection in LLM Integrated Applications' broadly, but evaluation is limited to text classification with a single BERT model on specific datasets. No bounds are stated regarding which LLMs, application types, or attack categories the approach generalizes to." 129 }, 130 "alternative_explanations_discussed": { 131 "applies": true, 132 "answer": false, 133 "justification": "No alternative explanations for the results are discussed. For example, the high accuracy on the General dataset could be due to class imbalance (Table 5 shows TN=0, FP=0, meaning no negative examples), but this is not acknowledged." 134 }, 135 "proxy_outcome_distinction": { 136 "applies": true, 137 "answer": false, 138 "justification": "The paper measures binary classification accuracy on labeled datasets but frames this as 'neutralizing prompt injections in real-time' and 'safeguarding integrity and security.' The gap between detecting known patterns in labeled data and preventing real-world prompt injections in production systems is not acknowledged." 139 } 140 }, 141 "setup_transparency": { 142 "model_versions_specified": { 143 "applies": true, 144 "answer": false, 145 "justification": "The paper refers to 'BERT model' and 'pretrained BERT model' throughout without specifying which variant (e.g., bert-base-uncased, bert-large, DistilBERT), version, or checkpoint was used." 146 }, 147 "prompts_provided": { 148 "applies": false, 149 "answer": false, 150 "justification": "The core approach is BERT fine-tuning for classification, not prompt-based. The system does not use prompting as its detection mechanism." 151 }, 152 "hyperparameters_reported": { 153 "applies": true, 154 "answer": false, 155 "justification": "No hyperparameters are reported for the BERT fine-tuning process. Learning rate, batch size, number of epochs, optimizer, and other training settings are entirely absent." 156 }, 157 "scaffolding_described": { 158 "applies": false, 159 "answer": false, 160 "justification": "No agentic scaffolding is used. The system is a classification pipeline, not an agentic system." 161 }, 162 "data_preprocessing_documented": { 163 "applies": true, 164 "answer": false, 165 "justification": "The paper lists data sources [44-56] and total sizes (~4M training, ~250k validation) but does not document how data from 12+ sources was combined, deduplicated, cleaned, or split into training and validation sets." 166 } 167 }, 168 "limitations_and_scope": { 169 "limitations_section_present": { 170 "applies": true, 171 "answer": false, 172 "justification": "Section 4 ('Discussions') covers challenges and future work but does not serve as a limitations section for the current study. The discussion is about general challenges (balancing security/usability, evolving threats) rather than limitations of the presented evaluation." 173 }, 174 "threats_to_validity_specific": { 175 "applies": true, 176 "answer": false, 177 "justification": "No specific threats to validity are discussed. Section 4 describes generic challenges in the domain but does not address specific threats to this study's validity, such as dataset bias, class imbalance, or limited evaluation scope." 178 }, 179 "scope_boundaries_stated": { 180 "applies": true, 181 "answer": false, 182 "justification": "No explicit scope boundaries are stated. The paper does not specify what the results do NOT show or what settings are excluded from their claims." 183 } 184 }, 185 "data_integrity": { 186 "raw_data_available": { 187 "applies": true, 188 "answer": false, 189 "justification": "Raw data is not available. The open-source datasets are individually accessible, but the combined training/validation data and internal eBay product data are not released. The Data Availability Statement offers data 'upon reasonable request.'" 190 }, 191 "data_collection_described": { 192 "applies": true, 193 "answer": false, 194 "justification": "Data sources are listed by reference ([44-56]) and total size (~4M) is stated, but the actual collection procedure — how data was gathered from each source, what criteria were used for inclusion, and how sources were combined — is not described." 195 }, 196 "recruitment_methods_described": { 197 "applies": false, 198 "answer": false, 199 "justification": "No human participants. Data sources are standard public datasets and internal product data." 200 }, 201 "data_pipeline_documented": { 202 "applies": true, 203 "answer": false, 204 "justification": "The pipeline from raw data sources to final evaluation is undocumented. How ~4M records were processed, labeled, and split into training and validation sets is not described. Table 5 totals do not match Table 4 dataset sizes (e.g., Question Answering: 13.4k in Table 4 but 11,446 in Table 5)." 205 } 206 }, 207 "conflicts_of_interest": { 208 "funding_disclosed": { 209 "applies": true, 210 "answer": true, 211 "justification": "The paper explicitly states 'This research received no external funding.'" 212 }, 213 "affiliations_disclosed": { 214 "applies": true, 215 "answer": true, 216 "justification": "All three authors are identified as being from 'Global Information Security, eBay Inc. San Jose, CA, USA.'" 217 }, 218 "funder_independent_of_outcome": { 219 "applies": true, 220 "answer": false, 221 "justification": "While no external funding is disclosed, all authors work at eBay's security team, and the system is designed for eBay's products. eBay has a direct commercial interest in the detection system working well." 222 }, 223 "financial_interests_declared": { 224 "applies": true, 225 "answer": true, 226 "justification": "The paper includes 'Conflicts of Interest: The authors declare no conflict of interest.' A competing interests statement is present, though it does not acknowledge the inherent interest eBay has in the system's effectiveness." 227 } 228 }, 229 "contamination": { 230 "training_cutoff_stated": { 231 "applies": false, 232 "answer": false, 233 "justification": "The paper tests a defense system (prompt injection detection via fine-tuned BERT), not a pre-trained model's capability on a benchmark. Contamination in the pre-training sense is not applicable." 234 }, 235 "train_test_overlap_discussed": { 236 "applies": false, 237 "answer": false, 238 "justification": "Same as above — the paper evaluates a fine-tuned detection model, not a pre-trained LLM's benchmark performance." 239 }, 240 "benchmark_contamination_addressed": { 241 "applies": false, 242 "answer": false, 243 "justification": "Same as above — contamination of pre-training data is not applicable to this fine-tuned classification task." 244 } 245 }, 246 "human_studies": { 247 "pre_registered": { 248 "applies": false, 249 "answer": false, 250 "justification": "No human participants in this study." 251 }, 252 "irb_or_ethics_approval": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants in this study." 256 }, 257 "demographics_reported": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants in this study." 261 }, 262 "inclusion_exclusion_criteria": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants in this study." 266 }, 267 "randomization_described": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants in this study." 271 }, 272 "blinding_described": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants in this study." 276 }, 277 "attrition_reported": { 278 "applies": false, 279 "answer": false, 280 "justification": "No human participants in this study." 281 } 282 }, 283 "cost_and_practicality": { 284 "inference_cost_reported": { 285 "applies": true, 286 "answer": false, 287 "justification": "The paper claims 'latency is within milliseconds' but provides no specific measurements of their own system's inference time. Table 2 shows Pinecone vector database latency benchmarks from a third party, not their own system's performance." 288 }, 289 "compute_budget_stated": { 290 "applies": true, 291 "answer": false, 292 "justification": "No compute budget is stated. Training time, GPU hours, hardware used, or any computational resource information for fine-tuning the BERT model is absent." 293 } 294 }, 295 "experimental_rigor": { 296 "seed_sensitivity_reported": { 297 "applies": true, 298 "answer": false, 299 "justification": "No mention of multiple random seeds. Results appear to be from a single training run." 300 }, 301 "number_of_runs_stated": { 302 "applies": true, 303 "answer": false, 304 "justification": "The number of experimental runs is not stated. Results are presented without indicating how many runs produced them." 305 }, 306 "hyperparameter_search_budget": { 307 "applies": true, 308 "answer": false, 309 "justification": "No hyperparameter search is described. The paper does not mention how the BERT model's hyperparameters were selected." 310 }, 311 "best_config_selection_justified": { 312 "applies": true, 313 "answer": false, 314 "justification": "No discussion of how the final model configuration was selected. Only final results are shown without explaining the selection process." 315 }, 316 "multiple_comparison_correction": { 317 "applies": false, 318 "answer": false, 319 "justification": "No statistical tests are performed, so multiple comparison correction is not applicable." 320 }, 321 "self_comparison_bias_addressed": { 322 "applies": true, 323 "answer": false, 324 "justification": "The authors evaluate their own system exclusively without acknowledging the bias inherent in self-evaluation. No independent evaluation or baseline comparison is provided." 325 }, 326 "compute_budget_vs_performance": { 327 "applies": false, 328 "answer": false, 329 "justification": "Only one model configuration is tested, so compute-performance tradeoff analysis is not applicable." 330 }, 331 "benchmark_construct_validity": { 332 "applies": true, 333 "answer": false, 334 "justification": "The paper does not discuss whether the datasets used actually measure real-world prompt injection detection ability. The General validation set has TN=0 and FP=0 (Table 5), meaning it contains no negative examples, yet 0.99 accuracy is claimed without questioning construct validity." 335 }, 336 "scaffold_confound_addressed": { 337 "applies": false, 338 "answer": false, 339 "justification": "No scaffolding is involved. The system is a classification pipeline." 340 } 341 }, 342 "data_leakage": { 343 "temporal_leakage_addressed": { 344 "applies": true, 345 "answer": false, 346 "justification": "No discussion of whether training and validation data overlap temporally. Data sources span 2022-2024 with no temporal split analysis." 347 }, 348 "feature_leakage_addressed": { 349 "applies": true, 350 "answer": false, 351 "justification": "No discussion of whether input features could leak label information. Given 12+ data sources combined, potential feature leakage between overlapping datasets is not addressed." 352 }, 353 "non_independence_addressed": { 354 "applies": true, 355 "answer": false, 356 "justification": "No discussion of train/test independence. Multiple data sources from HuggingFace may contain overlapping or near-duplicate examples, and this is not verified." 357 }, 358 "leakage_detection_method": { 359 "applies": true, 360 "answer": false, 361 "justification": "No leakage detection or prevention method is described. No deduplication, overlap analysis, or temporal splits are mentioned." 362 } 363 } 364 }, 365 "claims": [ 366 { 367 "claim": "The fine-tuned BERT model achieves accuracy exceeding 0.98 on prompt injection detection across use cases.", 368 "evidence": "Table 4 shows accuracy of 0.99 (General, 221k), 0.99 (Product Query, 4.1k), 0.91 (AI Assistant, 1.1k), and 0.98 (Question Answering, 13.4k). Section 3.4.", 369 "supported": "weak" 370 }, 371 { 372 "claim": "The multi-faceted detection system combining banned terms, vector embeddings, and BERT can neutralize prompt injections in real-time.", 373 "evidence": "Only the BERT component is empirically evaluated (Section 3.4). Banned terms and vector embeddings are described conceptually (Sections 3.1, 3.2) but not tested. Latency claim is vague ('within milliseconds') with no system-level measurement.", 374 "supported": "unsupported" 375 }, 376 { 377 "claim": "The BERT model shows high recall and identifies most prompt injections across all use cases.", 378 "evidence": "Table 5 and Figure 4 show recall rates. General: 0.999, Product Query: 1.0, AI Assistant: 0.984, Question Answering: 0.997. However, the General dataset has TN=0, meaning no negative examples are present.", 379 "supported": "weak" 380 }, 381 { 382 "claim": "The detection system is suitable for production deployment in data-intensive applications.", 383 "evidence": "Section 4.1 discusses balancing security and usability conceptually. No production deployment metrics, A/B tests, or real-world performance data are provided.", 384 "supported": "unsupported" 385 } 386 ], 387 "red_flags": [ 388 { 389 "flag": "Confusion matrix anomalies and class imbalance", 390 "detail": "Table 5 shows the General validation set has TN=0 and FP=0, meaning it contains zero negative (non-injection) examples. The 0.99 accuracy is therefore just recall on an all-positive dataset. This severely undermines the accuracy claims." 391 }, 392 { 393 "flag": "Dataset size discrepancies between tables", 394 "detail": "Table 4 lists Question Answering at 13.4k records but Table 5 totals (5555+168+17+5706) sum to 11,446, a discrepancy of ~2,000 records with no explanation." 395 }, 396 { 397 "flag": "No baselines or comparisons", 398 "detail": "The BERT model is evaluated in isolation without comparison to any baseline method (rule-based, other ML models, existing prompt injection detectors). This makes it impossible to assess whether the approach offers value over simpler alternatives." 399 }, 400 { 401 "flag": "Two-thirds of the system is unevaluated", 402 "detail": "The paper proposes a three-component system (banned terms, vector embeddings, BERT) but only evaluates the BERT component. The overall system effectiveness claimed in the abstract is therefore unsupported." 403 }, 404 { 405 "flag": "Company evaluating its own product", 406 "detail": "All authors are from eBay's security team, building and evaluating a system designed for eBay's internal use. No independent evaluation is provided, and the conflicts of interest statement does not acknowledge this inherent bias." 407 }, 408 { 409 "flag": "Missing training details", 410 "detail": "The BERT variant, hyperparameters, training procedure, data splitting methodology, and preprocessing steps are entirely absent. The paper is effectively unreproducible." 411 } 412 ], 413 "cited_papers": [ 414 { 415 "title": "Prompt injection attack against LLM-integrated applications", 416 "authors": ["Yi Liu", "Gelei Deng", "Yuekang Li"], 417 "year": 2023, 418 "arxiv_id": "2306.05499", 419 "relevance": "Foundational work on prompt injection attacks against LLM-integrated applications, directly relevant to the threat model this paper addresses." 420 }, 421 { 422 "title": "From prompt injections to SQL injection attacks: How protected is your LLM-integrated web application?", 423 "authors": ["Rodrigo Pedro", "Daniel Castro", "Paulo Carreira"], 424 "year": 2023, 425 "arxiv_id": "2308.01990", 426 "relevance": "Explores the connection between prompt injection and traditional injection attacks in LLM-integrated web applications." 427 }, 428 { 429 "title": "Assessing prompt injection risks in 200+ custom GPTs", 430 "authors": ["Jiahao Yu", "Yuhang Wu", "Dong Shu"], 431 "year": 2023, 432 "arxiv_id": "2311.11538", 433 "relevance": "Large-scale assessment of prompt injection vulnerabilities across custom GPT applications." 434 }, 435 { 436 "title": "Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection", 437 "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra"], 438 "year": 2023, 439 "doi": "10.1145/3605764.3623985", 440 "relevance": "Demonstrates indirect prompt injection attacks against real-world LLM-integrated applications." 441 }, 442 { 443 "title": "More than you've asked for: A comprehensive analysis of novel prompt injection threats to application-integrated large language models", 444 "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra"], 445 "year": 2023, 446 "arxiv_id": "2302.12173", 447 "relevance": "Comprehensive taxonomy of prompt injection threats to LLM-integrated applications." 448 }, 449 { 450 "title": "Automatic and universal prompt injection attacks against large language models", 451 "authors": ["Xiaogeng Liu", "Zhiyuan Yu", "Yizhe Zhang"], 452 "year": 2024, 453 "arxiv_id": "2403.04957", 454 "relevance": "Proposes automated methods for generating universal prompt injection attacks against LLMs." 455 }, 456 { 457 "title": "A new era in LLM security: Exploring security concerns in real-world LLM-based systems", 458 "authors": ["Fangzhou Wu", "Ning Zhang", "Somesh Jha"], 459 "year": 2024, 460 "arxiv_id": "2402.18649", 461 "relevance": "Surveys security concerns in real-world LLM-based systems including data extraction and prompt injection." 462 }, 463 { 464 "title": "Universal and transferable adversarial attacks on aligned language models", 465 "authors": ["Andy Zou", "Zifan Wang", "Nicholas Carlini"], 466 "year": 2023, 467 "arxiv_id": "2307.15043", 468 "relevance": "Demonstrates universal adversarial attacks on aligned LLMs, relevant to understanding attack vectors." 469 }, 470 { 471 "title": "Jailbroken: How does LLM safety training fail?", 472 "authors": ["Alexander Wei", "Nika Haghtalab", "Jacob Steinhardt"], 473 "year": 2023, 474 "relevance": "Analyzes failure modes of LLM safety training, including techniques that bypass alignment." 475 }, 476 { 477 "title": "Demystifying RCE vulnerabilities in LLM-integrated apps", 478 "authors": ["Tong Liu", "Zizhuang Deng", "Guozhu Meng"], 479 "year": 2023, 480 "arxiv_id": "2309.02926", 481 "relevance": "Examines remote code execution vulnerabilities in LLM-integrated applications." 482 }, 483 { 484 "title": "Large language models for code: Security hardening and adversarial testing", 485 "authors": ["Jingxuan He", "Martin Vechev"], 486 "year": 2023, 487 "doi": "10.1145/3576915.3623175", 488 "relevance": "Addresses security hardening of LLMs for code generation, relevant to LLM security and adversarial robustness." 489 } 490 ], 491 "engagement_factors": { 492 "practical_relevance": { 493 "score": 2, 494 "justification": "Presents a production-oriented architecture for prompt injection detection with specific deployment patterns, but no released tool or code to use directly." 495 }, 496 "surprise_contrarian": { 497 "score": 0, 498 "justification": "Uses standard approaches (banned terms + embeddings + BERT) with no surprising findings or challenges to conventional wisdom." 499 }, 500 "fear_safety": { 501 "score": 1, 502 "justification": "Addresses prompt injection as a security concern but focuses on defense rather than demonstrating novel attacks or raising new risks." 503 }, 504 "drama_conflict": { 505 "score": 0, 506 "justification": "No controversy or conflict; straightforward presentation of a detection system." 507 }, 508 "demo_ability": { 509 "score": 0, 510 "justification": "No public code, demo, or tool released that someone could try." 511 }, 512 "brand_recognition": { 513 "score": 1, 514 "justification": "eBay is a recognizable company but not a leading AI research lab." 515 } 516 } 517 }