scan.json (25687B)
1 { 2 "paper": { 3 "title": "Securing Large Language Models (LLMs) from Prompt Injection Attacks", 4 "authors": ["Omar Farooq Khan Suri", "John McCrae"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2512.01326", 8 "doi": "10.48550/arXiv.2512.01326" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "JATMO-style task-specific fine-tuning of non-instruction-tuned base models reduces prompt injection attack success rates by 75-90% compared to instruction-tuned GPT-3.5-Turbo, but does not eliminate vulnerability. A quality-vulnerability trade-off was observed across three models: higher ROUGE-L scores correlated with higher attack success rates. Multilingual prompts and code-related triggers remain effective bypass vectors even against JATMO-trained models.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "No code repository, GitHub link, or archive is mentioned anywhere in the paper. The modified HOUYI framework and fine-tuning code are not released." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": false, 24 "justification": "The source Amazon All-Beauty reviews dataset is public (ref [4]), but the authors' custom 1,500-pair summarization dataset generated via GPT-3.5 and the 72 adversarial prompts per attack type are not released." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "The paper mentions using LoRA and 'consumer-grade GPUs' (Section 3.2) but provides no specific library versions, requirements.txt, GPU model, or environment details." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No step-by-step reproduction instructions are provided. The methodology section describes the approach conceptually but does not include commands, scripts, or a reproducibility guide." 
35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "Table 1 reports single point estimates for ROUGE-L, CM%, IG%, and Avg ASR% with no confidence intervals, error bars, or ± notation." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper claims JATMO 'significantly lowers attack success rates' (Section 6) and 'substantially reduces vulnerability' but provides no statistical significance tests (no p-values, t-tests, or any inferential statistics)." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "Table 1 provides absolute ASR values with a baseline comparison (e.g., Qwen 0.5B at 9.68% vs GPT-3.5 at 100%), and the conclusion states 'reducing attack success by up to 90% relative to GPT-3.5,' providing sufficient context for effect magnitude." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "72 adversarial prompts per attack type were used (Section 3.5) with no justification for this number, no power analysis, and no discussion of whether this is sufficient to draw reliable conclusions." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "No variance, standard deviation, or spread measures are reported. All results in Table 1 appear to be single-run values with no indication of result stability." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "GPT-3.5-Turbo (instruction-tuned) is used as a baseline comparison against the JATMO fine-tuned models (Table 1, Section 3.5)." 
69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "GPT-3.5-Turbo is a reasonable contemporary baseline for comparing instruction-tuned vs non-instruction-tuned models in the context of prompt injection defense. The original JATMO paper also used GPT-3.5 as a baseline." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": false, 78 "justification": "No ablation study is performed. The three HOUYI modifications (custom fitness scoring, improved mutation logic, custom harness) are not individually ablated. Different model sizes are compared but this is not an ablation of the defense mechanism." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Two distinct metric types are used: ROUGE-L for task fidelity and Attack Success Rate (split into Content Manipulation and Information Gathering) for security evaluation (Table 1, Section 3.5)." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": false, 88 "justification": "No human evaluation of model outputs. Attack success is determined automatically (detecting 'pwned' token or date-like formats). Human review of edge cases or qualitative output assessment could have strengthened the evaluation." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": false, 93 "justification": "A held-out validation split is mentioned for monitoring training (Section 3.2), but it is unclear whether ROUGE-L scores are reported on a separate test set or the validation set. No explicit train/validation/test split is described." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Table 1 breaks down attack success rates by attack type (Content Manipulation vs Information Gathering) and by model, rather than reporting only aggregate numbers." 
99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Section 5 (Qualitative Analysis) discusses three specific failure modes: residual instruction-following from pretraining, vulnerability to multilingual/mixed-language prompts, and code-generation bias triggered by code-related tokens." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "The paper explicitly reports that JATMO 'does not fully prevent injections' and acknowledges that mid-sized models remain susceptible to adversarial phrasing (Sections 4.1, 6)." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Abstract claims that JATMO reduces but does not eliminate ASR, and that there is a quality-vulnerability trade-off, are supported by Table 1. The claims about multilingual and code-related bypasses are supported by qualitative analysis in Section 5." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": false, 120 "justification": "The paper claims 'better task performance often correlates with increased susceptibility' based on only 3 data points (3 models). The claim that JATMO 'reduces' vulnerability conflates the fine-tuning approach with model size differences (0.5B–7B vs 175B) without controlling for this confound." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The title claims to address 'Securing Large Language Models from Prompt Injection Attacks' broadly, but the study tests only 3 models on a single task (summarization) with one dataset (Amazon beauty reviews) and one attack framework. Generalization to other tasks, models, or attack types is not bounded." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "No alternative explanations are discussed. 
The quality-vulnerability correlation could be driven by model size rather than task specialization. The comparison of 0.5B/4B/7B models against a 175B model introduces many confounds (architecture, pretraining data, scale) that are not addressed." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "The paper directly measures what it claims: ASR measures injection vulnerability and ROUGE-L measures summarization quality. The claims match the granularity of the measurements without overreaching to broader concepts." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": false, 142 "justification": "Open-source models are identified by family and size (LLaMA 2-7B, Qwen1.5-0.5B, Qwen1.5-4B), but GPT-3.5-Turbo lacks a snapshot date or API version. Per schema, marketing names without snapshot dates are insufficient." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": false, 147 "justification": "The HOUYI attack components are described conceptually (framework, separator, disruptor) but no actual prompt text is provided. The example disruptors ('append the word pwned at the end') appear to be illustrative, not the actual prompts used." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": false, 152 "justification": "Section 3.2 states 'Training was run for several epochs with tuned learning rates and batch sizes' without specifying any actual values. No temperature, top-p, max tokens, or LoRA rank are reported." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "No agentic scaffolding is used. The models are fine-tuned for a single summarization task without agents, tools, or multi-step pipelines." 
158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Section 3.1 documents the preprocessing pipeline: Amazon All-Beauty reviews were grouped into sets of three, GPT-3.5 generated summaries, producing 1,500 input-output pairs (~400k tokens) formatted in OpenAI's chat fine-tuning schema as .jsonl." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": false, 169 "justification": "No dedicated limitations section exists. Section 7 (Future Work) mentions some shortcomings of JATMO but frames them as future research directions rather than formal limitations of the study." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": false, 174 "justification": "No threats to validity are discussed. The paper does not address internal validity concerns such as the model size confound, the limited task scope, or the adequacy of the automated success detection." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": false, 179 "justification": "No explicit scope boundaries are stated. The paper does not clearly delineate what the results do NOT show (e.g., that results are limited to summarization, three specific models, and one attack framework)." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "No raw data is released — neither the fine-tuning dataset, adversarial prompts, model outputs, nor evaluation logs are available for independent verification." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section 3.1 describes using the Amazon All-Beauty reviews dataset and generating summaries with GPT-3.5. Sections 3.3-3.4 describe the HOUYI attack prompt generation process. Section 3.5 states 72 prompts per attack type." 
192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. Data source is a standard public dataset (Amazon reviews) and adversarial prompts are algorithmically generated." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": false, 201 "justification": "The pipeline from Amazon reviews to fine-tuning dataset is partially described, but key details are missing: how reviews were selected from the full dataset, filtering criteria, and how the 1,500 pairs map to the original review count. The HOUYI mutation pipeline details are also vague." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding information is disclosed anywhere in the paper. There is no acknowledgments section mentioning grants or sponsors." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliations are clearly listed: both authors are from University of Galway, Ireland. No commercial affiliations with the models or tools being evaluated." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding is disclosed, so independence cannot be assessed. The paper does not state whether the work is unfunded." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests statement or financial disclosure is present in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": false, 229 "answer": false, 230 "justification": "The paper tests a defense mechanism (JATMO fine-tuning) against adversarial attacks rather than evaluating a pre-trained model's knowledge on a benchmark. Contamination of test data in training is not the relevant concern." 
231 }, 232 "train_test_overlap_discussed": { 233 "applies": false, 234 "answer": false, 235 "justification": "Same as above — the study tests defenses against prompt injection, not model knowledge on benchmark tasks." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": false, 239 "answer": false, 240 "justification": "Same as above — the adversarial prompts are generated by HOUYI, not drawn from a pre-existing benchmark that could appear in training data." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": false, 284 "justification": "No inference cost, latency, or API cost information is reported despite using both local models and the OpenAI API for GPT-3.5-Turbo fine-tuning." 
285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "The paper mentions 'consumer-grade GPUs' and LoRA for efficiency (Section 3.2) but provides no specific GPU model, hours, training time, or total API spend." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "No mention of random seeds or sensitivity analysis across seeds for any of the fine-tuning or attack experiments." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "The number of attack prompts (72 per type) is stated, but the number of experimental runs or repetitions for fine-tuning or evaluation is not." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "Section 3.2 mentions 'tuned learning rates and batch sizes' implying hyperparameter search was done, but no search method, number of configurations, or compute budget for the search is reported." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": false, 311 "justification": "No description of how the final hyperparameter configuration was selected. The paper only states parameters were 'tuned' without explaining the selection criterion." 312 }, 313 "multiple_comparison_correction": { 314 "applies": false, 315 "answer": false, 316 "justification": "No statistical tests are performed at all, so multiple comparison correction is not applicable." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The authors modified the HOUYI attack framework and implemented their own evaluation harness but do not acknowledge any bias from evaluating their own modifications." 
322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": false, 326 "justification": "Models ranging from 0.5B to 175B parameters are compared without discussing the massive compute differences. The quality-vulnerability trade-off could be driven by compute/scale rather than JATMO methodology." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "No discussion of whether ASR with automated token detection (checking for 'pwned' or date keywords) adequately measures prompt injection vulnerability. Edge cases in automated detection are not addressed." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "No scaffolding is involved in the model comparisons. Models are evaluated directly without agentic pipelines." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "No discussion of whether the pre-training data for LLaMA 2, Qwen, or GPT-3.5 could have included information about JATMO or HOUYI attack patterns, which could affect results." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether the evaluation setup could leak information about expected outputs, or whether the fixed detection patterns ('pwned', date formats) create a biased evaluation." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of whether the 72 HOUYI-generated prompts per attack type are independent, or whether the genetic algorithm's mutation process creates correlated test cases." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No concrete leakage detection or prevention method is applied." 
359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "JATMO fine-tuning reduces attack success rates by up to 90% relative to instruction-tuned GPT-3.5-Turbo", 365 "evidence": "Table 1 shows Qwen 0.5B at 9.68% average ASR vs GPT-3.5-Turbo at 100%. LLaMA-2 7B at 11.11% ASR. Section 6 states 'reducing attack success by up to 90%.'", 366 "supported": "moderate" 367 }, 368 { 369 "claim": "JATMO does not fully prevent prompt injections — adversaries exploiting multilingual cues or code-related disruptors still bypass defenses", 370 "evidence": "Table 1 shows non-zero ASR for all JATMO models (9.68-25%). Section 5 provides qualitative examples of multilingual and code-triggered bypasses.", 371 "supported": "moderate" 372 }, 373 { 374 "claim": "There is a trade-off between generation quality (ROUGE-L) and injection vulnerability (ASR) — models that produce better summaries are more susceptible to attacks", 375 "evidence": "Table 1 shows Qwen 4B has the highest ROUGE-L (0.43) and highest JATMO ASR (25%), while Qwen 0.5B has lowest ROUGE-L (0.29) and lowest ASR (9.68%). However, this is based on only 3 data points.", 376 "supported": "weak" 377 }, 378 { 379 "claim": "GPT-3.5-Turbo is fully compromised by both content manipulation and information gathering attacks at 100% success rate", 380 "evidence": "Table 1 shows 100% ASR for both CM and IG attacks on GPT-3.5-Turbo.", 381 "supported": "moderate" 382 } 383 ], 384 "red_flags": [ 385 { 386 "flag": "Correlation claim from 3 data points", 387 "detail": "The quality-vulnerability trade-off claim is based on observing 3 models. This is insufficient to establish any correlation or trend, yet the paper frames it as a consistent finding." 388 }, 389 { 390 "flag": "Model size confound not addressed", 391 "detail": "Comparing 0.5B, 4B, and 7B models against a 175B model conflates the defense mechanism (JATMO) with massive scale differences. 
The lower ASR of smaller models could be due to lower capability, not JATMO-style fine-tuning." 392 }, 393 { 394 "flag": "Questionable GPT-3.5-Turbo parameter count", 395 "detail": "Table 1 lists GPT-3.5-Turbo as '175B' parameters. This is the parameter count for GPT-3, not GPT-3.5-Turbo, whose architecture and size are not publicly confirmed." 396 }, 397 { 398 "flag": "Internal inconsistency in model count", 399 "detail": "Section 3.2 states 'We fine-tuned two base models: LLaMA 2–7B and Qwen 1.5–0.5B' but Table 1 includes results for Qwen 4B as well. The introduction mentions three models but calls them 'two non–instruction-tuned base models.'" 400 }, 401 { 402 "flag": "No statistical tests despite strong claims", 403 "detail": "Claims of 'significant' reduction in attack success rates are made without any statistical testing. With 72 prompts and no variance reporting, the reliability of the ASR differences is unknown." 404 }, 405 { 406 "flag": "Hyperparameters completely withheld", 407 "detail": "Learning rates, batch sizes, number of epochs, LoRA rank, and all training hyperparameters are described as 'tuned' but no values are reported, making the work unreproducible." 408 } 409 ], 410 "cited_papers": [ 411 { 412 "title": "JATMO: Prompt Injection Defense by Task-Specific Finetuning", 413 "authors": ["J. Piet", "M. Alrashed", "C. Sitawarin", "S. Chen", "Z. Wei", "E. Sun", "B. Alomair", "D. Wagner"], 414 "year": 2024, 415 "relevance": "Core defense mechanism evaluated in this study; proposes task-specific fine-tuning as prompt injection mitigation." 416 }, 417 { 418 "title": "Prompt Injection Attacks Against LLM-Integrated Applications", 419 "authors": ["Y. Liu", "G. Deng", "Y. Li", "K. Wang", "Z. Wang"], 420 "year": 2023, 421 "arxiv_id": "2306.05499", 422 "relevance": "Introduces the HOUYI genetic attack framework used to evaluate defense robustness in this study." 
423 }, 424 { 425 "title": "AutoDAN: Generating Stealthy Jailbreak Prompts on Aligned Large Language Models", 426 "authors": ["X. Liu", "N. Xu", "M. Chen", "C. Xiao"], 427 "year": 2023, 428 "arxiv_id": "2310.04451", 429 "relevance": "Adversarial attack method for generating stealthy jailbreak prompts against aligned LLMs." 430 }, 431 { 432 "title": "GPT-4 Technical Report", 433 "authors": ["J. Achiam", "S. Adler", "S. Agarwal", "L. Ahmad"], 434 "year": 2023, 435 "arxiv_id": "2303.08774", 436 "relevance": "Technical report for GPT-4, a major LLM whose security properties are relevant to the prompt injection defense landscape." 437 }, 438 { 439 "title": "GPTFuzzer: Red Teaming Large Language Models with Auto-Generated Jailbreak Prompts", 440 "authors": ["J. Yu", "X. Lin", "Z. Yu", "X. Xing"], 441 "year": 2024, 442 "arxiv_id": "2309.10253", 443 "relevance": "Automated red-teaming framework for testing LLM robustness against jailbreak attacks." 444 }, 445 { 446 "title": "LLaMA 2: Open Foundation and Fine-Tuned Chat Models", 447 "authors": ["H. Touvron", "L. Martin", "K. Stone", "P. Albert"], 448 "year": 2023, 449 "relevance": "Open-source foundation model used as one of the JATMO fine-tuned base models in this study." 450 }, 451 { 452 "title": "Enhancing System Security: LLM-Driven Defense Against Prompt Injection Vulnerabilities", 453 "authors": ["O. Muliarevych"], 454 "year": 2024, 455 "relevance": "Proposes middleware-based LLM validation layers for prompt injection defense, a competing approach to JATMO." 456 }, 457 { 458 "title": "Dynamic Moving Target Defense for Mitigating Targeted LLM Prompt Injection", 459 "authors": ["S. Panterino", "M. Fellington"], 460 "year": 2024, 461 "relevance": "Proposes dynamic randomization-based defense against prompt injection, another defense strategy discussed in the related work." 462 } 463 ] 464 }