scan.json (19395B)
1 { 2 "paper": { 3 "title": "Comprehensive Analysis of Machine Learning and Deep Learning models on Prompt Injection Classification using Natural Language Processing techniques", 4 "authors": ["Bhavvya Jain", "Pranav Pawar", "Dhruv Gada", "Tanish Patwa", "Pratik Kanani", "Deepali Patil", "Lakshmi Kurup"], 5 "year": 2025, 6 "venue": "International Research Journal of Multidisciplinary Technovation", 7 "doi": "10.54392/irjmt2523" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": false, 14 "justification": "No repository URL, code archive, or link to source code is provided anywhere in the paper." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The dataset is publicly available from Hugging Face (deepset prompt injection dataset), as stated in Section 3.1." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided. Libraries like scikit-learn, nltk, and BERT are mentioned but no versions are given." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "No step-by-step reproduction instructions, README, or runnable scripts are provided." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "Only point estimates of accuracy are reported (e.g., 94.74% for RNN). No confidence intervals or error bars are provided." 37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper claims RNN outperforms other models but provides no statistical significance tests to support these comparisons." 42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": false, 46 "justification": "Only raw accuracy numbers are reported. No effect sizes (Cohen's d, etc.) 
or contextual improvement measures beyond raw differences are given." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "The dataset size is not stated explicitly in the paper text (only that it was 'sampled out from a larger dataset'), and no justification for the sample size is provided." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "No variance, standard deviation, or results across multiple runs are reported. All results appear to be from single runs." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "Multiple ML models (LR, SVM, Decision Tree, RF, AdaBoost, XGBoost, KNN, Gradient Boosting) and DL models (CNN, RNN, LSTM, Bi-LSTM) are compared against each other." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": false, 68 "justification": "The models used (Logistic Regression, SVM, basic RNN, LSTM) are all classical approaches. No contemporary prompt injection detection methods or transformer-based classifiers are compared against." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": false, 73 "justification": "No ablation study is performed. The comparison of vectorization techniques is not an ablation — it compares independent approaches rather than removing components from a single system." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": false, 78 "justification": "Only accuracy is reported as the evaluation metric. No F1, precision, recall, or AUC metrics are provided despite this being a binary classification task where class balance matters." 79 }, 80 "human_evaluation": { 81 "applies": false, 82 "answer": false, 83 "justification": "Human evaluation is not relevant to this binary classification benchmark evaluation." 
84 }, 85 "held_out_test_set": { 86 "applies": true, 87 "answer": true, 88 "justification": "The dataset was split using train_test_split from scikit-learn (Section 3.4), and results are reported on the test set." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Results are broken down by vectorization technique (TF-IDF, Word2Vec, BoW) and by model in Tables 2 and 3." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": false, 98 "justification": "No failure cases, error analysis, or qualitative examples of misclassified prompts are discussed." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "Word2Vec's lower performance is reported and discussed (Section 4.1), noting its 'static embeddings may not fully capture the context.' KNN with BoW achieving only 54.14% is also reported." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "The abstract claims RNN achieved 94.74% detection rate, which matches Table 3. The claim that deep learning architectures capturing sequential dependencies are effective is supported by the results." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": false, 115 "justification": "The paper makes causal claims such as 'RNN's ability to capture sequential dependency helped to detect the pattern' (Section 4.2) without controlled experiments isolating this mechanism." 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": false, 120 "justification": "The paper claims to 'enhance the security, integrity, and trustworthiness of AI-driven technologies' in the abstract, but only tests on a single Hugging Face dataset. No discussion of generalization boundaries." 
121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": false, 125 "justification": "No alternative explanations for the results are discussed. For example, dataset characteristics (keyword-heavy malicious prompts) that could explain why simple BoW matches deep learning are not explored." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": true, 132 "justification": "The BERT model is specified as 'bert-base-uncased' (Section 3.5.5/Table 3). The ML models are standard scikit-learn implementations." 133 }, 134 "prompts_provided": { 135 "applies": false, 136 "answer": false, 137 "justification": "This paper does not use prompting — it trains ML/DL classifiers on text data." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": true, 142 "justification": "Hyperparameters are reported: 10 epochs, batch size 32, 0.2 validation split, Adam optimizer with learning rate 0.001, Word2Vec dimensionality 100, window size 5, min frequency 1 (Sections 3.4.2, 3.5)." 143 }, 144 "scaffolding_described": { 145 "applies": false, 146 "answer": false, 147 "justification": "No agentic scaffolding is used in this work." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": true, 152 "justification": "Preprocessing steps are documented in Section 3.3: removal of special characters via regex, lowercasing, stopword removal, with a worked example." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": false, 159 "justification": "There is no limitations or threats-to-validity section. The conclusion briefly mentions future work but does not discuss limitations." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": false, 164 "justification": "No threats to validity are discussed anywhere in the paper." 
165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": false, 169 "justification": "No explicit scope boundaries are stated. The paper does not discuss what the results do NOT show or what settings were not tested." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": true, 176 "justification": "The dataset is publicly available on Hugging Face (deepset prompt injection dataset), enabling independent verification." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": false, 181 "justification": "The paper only states the dataset was 'obtained from Hugging Face datasets' and 'developed by deepset' and 'sampled out from a larger dataset' (Section 3.1). No details on how deepset created it or the sampling process." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants; the data source is a standard public benchmark dataset." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": true, 191 "justification": "The data pipeline from raw text through preprocessing (special character removal, denoising) to vectorization to model training is documented in Sections 3.2-3.5 with a flowchart (Figure 3)." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": false, 198 "justification": "No funding source or acknowledgments section is present in the paper." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "All authors are listed as affiliated with Dwarkadas J. Sanghvi College of Engineering, Mumbai, India." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": false, 207 "answer": false, 208 "justification": "No funding is disclosed; appears to be unfunded academic work from a single university." 
209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests or financial interests statement is present in the paper." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": false, 219 "answer": false, 220 "justification": "This paper trains ML/DL classifiers from scratch on a dataset; it does not evaluate a pre-trained model's capability on a benchmark." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": false, 224 "answer": false, 225 "justification": "Not applicable — the paper trains and tests its own classifiers on a labelled dataset; no pre-trained model benchmark evaluation is performed." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": false, 229 "answer": false, 230 "justification": "Not applicable — the paper trains and tests its own classifiers on a labelled dataset; no pre-trained model benchmark evaluation is performed." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants in this study." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants in this study." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 
268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": false, 274 "justification": "No inference cost, latency, or per-example computation time is reported despite using BERT embeddings and multiple model architectures." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "No hardware specifications, GPU hours, or total computation budget is stated." 280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "RNN model achieved the highest overall accuracy of 94.74% for prompt injection detection, outperforming all other ML and DL models.", 286 "evidence": "Table 3 shows RNN accuracy of 0.9474 with BERT embeddings, compared to CNN (0.9398), LSTM (0.9248), and Bi-LSTM (0.9245). Table 2 shows best ML accuracy was 93.99%.", 287 "supported": "weak" 288 }, 289 { 290 "claim": "Deep learning architectures that capture sequential dependencies are highly effective in identifying prompt injection threats.", 291 "evidence": "RNN (94.74%) outperformed CNN (93.98%), but LSTM (92.48%) and Bi-LSTM (92.45%) performed worse than CNN, contradicting the sequential dependency narrative. 
Results from Table 3.", 292 "supported": "weak" 293 }, 294 { 295 "claim": "TF-IDF and BoW vectorization techniques achieved comparable highest accuracies of 93.99% with Random Forest and Logistic Regression respectively.", 296 "evidence": "Table 2 shows Random Forest with TF-IDF at 0.9399 and Logistic Regression with BoW at 0.9399.", 297 "supported": "moderate" 298 }, 299 { 300 "claim": "Word2Vec showed lower overall performance compared to TF-IDF and BoW for prompt injection classification.", 301 "evidence": "Table 2 shows Word2Vec's best accuracy was 84.96% (Gradient Boosting) vs 93.99% for TF-IDF and BoW.", 302 "supported": "moderate" 303 } 304 ], 305 "methodology_tags": ["benchmark-eval"], 306 "key_findings": "The paper evaluates 8 ML models with 3 vectorization techniques (TF-IDF, Word2Vec, BoW) and 4 DL models with BERT embeddings for prompt injection classification. RNN with BERT embeddings achieved the highest accuracy at 94.74%, while simple approaches like Logistic Regression with BoW and Random Forest with TF-IDF both achieved 93.99%. The marginal improvement of deep learning over traditional ML (0.75 percentage points) is not statistically validated, and only accuracy is reported as a metric.", 307 "red_flags": [ 308 { 309 "flag": "Single metric evaluation", 310 "detail": "Only accuracy is reported for a binary classification task. No precision, recall, F1, or AUC-ROC are provided, which is problematic since class distribution is not clearly specified and accuracy alone can be misleading for imbalanced datasets." 311 }, 312 { 313 "flag": "No statistical validation of claims", 314 "detail": "The RNN's 94.74% vs CNN's 93.98% (0.76pp difference) is presented as a meaningful finding, but no significance tests or confidence intervals are provided. Single-run results without variance make it impossible to determine if this difference is real." 
315 }, 316 { 317 "flag": "No limitations section", 318 "detail": "The paper has no limitations, threats to validity, or discussion of what the results do not show. Dataset size, single-dataset evaluation, and lack of real-world validation are not discussed." 319 }, 320 { 321 "flag": "Claims outrun evidence", 322 "detail": "The abstract claims the study 'enhances the security, integrity, and trustworthiness of AI-driven technologies, ensuring their safe use across diverse applications' based on a single-dataset classification experiment with no real-world deployment or validation." 323 }, 324 { 325 "flag": "Contradictory narrative", 326 "detail": "The paper emphasizes sequential dependency modeling as key, but LSTM and Bi-LSTM (designed for long-range sequential dependencies) performed worse than both RNN and CNN, contradicting this interpretation." 327 } 328 ], 329 "cited_papers": [ 330 { 331 "title": "Prompt Injection attack against LLM Integrated Applications", 332 "authors": ["Y. Liu", "G. Deng", "Y. Li", "K. Wang", "Z. Wang", "X. Wang", "T. Zhang", "Y. Liu", "H. Wang", "Y. Zheng", "Y. Liu"], 333 "year": 2023, 334 "arxiv_id": "2306.05499", 335 "relevance": "Foundational work on prompt injection attacks against LLM-integrated applications using the HOUYI toolkit." 336 }, 337 { 338 "title": "Not what you've signed up for: Compromising real-world llm-integrated applications with indirect prompt injection", 339 "authors": ["K. Greshake", "S. Abdelnabi", "S. Mishra", "C. Endres", "T. Holz", "M. Fritz"], 340 "year": 2023, 341 "relevance": "Key study on indirect prompt injection attacks and LLM vulnerabilities in real-world applications." 342 }, 343 { 344 "title": "Benchmarking and defending against indirect prompt injection attacks on large language models", 345 "authors": ["J. Yi", "Y. Xie", "B. Zhu", "E. Kiciman", "G. Sun", "X. Xie", "F. 
Wu"], 346 "year": 2023, 347 "arxiv_id": "2312.14197", 348 "relevance": "Benchmark and defense evaluation for indirect prompt injection attacks on LLMs." 349 }, 350 { 351 "title": "Ignore previous prompt: Attack techniques for language models", 352 "authors": ["F. Perez", "I. Ribeiro"], 353 "year": 2022, 354 "arxiv_id": "2211.09527", 355 "relevance": "Early systematic study of prompt injection attack techniques against language models." 356 }, 357 { 358 "title": "Injecagent: Benchmarking indirect prompt injections in tool-integrated large language model agents", 359 "authors": ["Q. Zhan", "Z. Liang", "Z. Ying", "D. Kang"], 360 "year": 2024, 361 "arxiv_id": "2403.02691", 362 "relevance": "Benchmark for evaluating LLM agent susceptibility to indirect prompt injection attacks." 363 }, 364 { 365 "title": "Optimization-based Prompt Injection Attack to LLM-as-a-Judge", 366 "authors": ["J. Shi", "Z. Yuan", "Y. Liu", "Y. Huang", "P. Zhou", "L. Sun", "N. Z. Gong"], 367 "year": 2024, 368 "relevance": "Advanced prompt injection attack technique targeting LLM-as-a-Judge evaluation systems." 369 }, 370 { 371 "title": "Guardian: A multi-tiered defense architecture for thwarting prompt injection attacks on llms", 372 "authors": ["P. Rai", "S. Sood", "V. K. Madisetti", "A. Bahga"], 373 "year": 2024, 374 "relevance": "Multi-layered defense architecture for prompt injection attack prevention on LLMs." 375 }, 376 { 377 "title": "Strengthening LLM Trust Boundaries: A Survey of Prompt Injection Attacks", 378 "authors": ["S.S. Kumar", "M.L. Cummings", "A. Stimpson"], 379 "year": 2024, 380 "relevance": "Survey categorizing prompt injection attacks by prompt types, trust boundaries, and required expertise." 381 }, 382 { 383 "title": "A new era in llm security: Exploring security concerns in real-world llm-based systems", 384 "authors": ["F. Wu", "N. Zhang", "S. Jha", "P. McDaniel", "C. 
Xiao"], 385 "year": 2024, 386 "arxiv_id": "2402.18649", 387 "relevance": "Comprehensive exploration of security concerns in deployed LLM-based systems." 388 } 389 ] 390 }