scan.json (27771B)
1 { 2 "paper": { 3 "title": "Early Approaches to Adversarial Fine-Tuning for Prompt Injection Defense: A 2022 Study of GPT-3 and Contemporary Models", 4 "authors": [ 5 "Gustavo Sandoval", 6 "Denys Fenchenko", 7 "Junyao Chen" 8 ], 9 "year": 2025, 10 "venue": "arXiv", 11 "arxiv_id": "2509.14271", 12 "doi": "10.48550/arXiv.2509.14271" 13 }, 14 "scan_version": 2, 15 "active_modules": [ 16 "experimental_rigor", 17 "data_leakage" 18 ], 19 "methodology_tags": [ 20 "benchmark-eval" 21 ], 22 "key_findings": "Adversarial fine-tuning using structured <userInput> delimiters reduced prompt injection attack success rates from ~26-31% to near 0% for smaller GPT-3 models (Ada, Babbage, Curie). A positive correlation was found between model size and vulnerability to prompt injection, with Davinci (175B) being roughly four times as vulnerable as Ada (2.7B). The defense was not tested on the largest model (Davinci) due to cost, and a reinforcement learning variant failed due to compute constraints. The authors retrospectively acknowledge that fine-tuning-based defenses show poor generalization to novel attack patterns.", 23 "checklist": { 24 "artifacts": { 25 "code_released": { 26 "applies": true, 27 "answer": true, 28 "justification": "GitHub repository is explicitly linked: https://github.com/GusSand/PromptInject. The paper references specific notebooks (dataset_construct.ipynb, original_openai.ipynb, openai_fine-tuned.ipynb, gpt-2_experiments.ipynb, non_openai_models.ipynb, Reinforcement_Learning-fine-tuning.ipynb)." 
29 }, 30 "data_released": { 31 "applies": true, 32 "answer": true, 33 "justification": "The adversarial dataset construction is in the repo, fine-tuning datasets are from public Kaggle sources (Dave 2021, Kushare 2021, Shahane 2021, Vonteru 2019), and the paper states 'the reader can also find the logs from all attacks on the original language models and their fine-tuned versions in the results directory.'" 34 }, 35 "environment_specified": { 36 "applies": true, 37 "answer": false, 38 "justification": "No requirements.txt, Dockerfile, or detailed environment specification mentioned in the paper. Only implicit references to Google Colab and OpenAI API." 39 }, 40 "reproduction_instructions": { 41 "applies": true, 42 "answer": false, 43 "justification": "While specific notebooks are named, there are no step-by-step reproduction instructions. A researcher would need to reverse-engineer the workflow from notebook names and the methodology description." 44 } 45 }, 46 "statistical_methodology": { 47 "confidence_intervals_or_error_bars": { 48 "applies": true, 49 "answer": false, 50 "justification": "Tables 1 and 2 report only point estimates (e.g., '26%', '31%', '0%'). No confidence intervals, error bars, or uncertainty measures are reported for any result." 51 }, 52 "significance_tests": { 53 "applies": true, 54 "answer": false, 55 "justification": "The paper claims fine-tuning reduces attacks and larger models are more vulnerable, comparing multiple models, but no statistical significance tests are used. Comparisons are based solely on comparing raw percentages." 56 }, 57 "effect_sizes_reported": { 58 "applies": true, 59 "answer": true, 60 "justification": "Tables 1 and 2 show attack success rates before and after fine-tuning with baseline context (e.g., Goal Hijacking on Ada: 26% before → 0% after; Babbage: 31% → 0%), allowing readers to assess effect magnitude." 
61 }, 62 "sample_size_justified": { 63 "applies": true, 64 "answer": false, 65 "justification": "They test 1,260 attack variations (35 prompts × 10 attack strings × some model parameters) but provide no justification for why this number is sufficient, no power analysis, and no discussion of whether the sample adequately covers the attack space." 66 }, 67 "variance_reported": { 68 "applies": true, 69 "answer": false, 70 "justification": "No variance, standard deviation, or spread measures reported. Results appear to be single-run point estimates with no indication of result stability across repeated runs." 71 } 72 }, 73 "evaluation_design": { 74 "baselines_included": { 75 "applies": true, 76 "answer": true, 77 "justification": "The undefended models serve as baselines (before fine-tuning). Table 1 shows before/after comparisons for Ada, Babbage, and Curie. Table 2 shows attack rates on additional models without fine-tuning." 78 }, 79 "baselines_contemporary": { 80 "applies": true, 81 "answer": true, 82 "justification": "For the 2022 timeframe, GPT-3 variants (Ada, Babbage, Curie, Davinci), GPT-2, OPT-350M, and T-5 were contemporary models. The paper uses the PromptInject framework from Perez and Ribeiro (2022)." 83 }, 84 "ablation_study": { 85 "applies": true, 86 "answer": false, 87 "justification": "The adversarial fine-tuning approach has multiple components (structured delimiters, adversarial examples, fine-tuning), but no ablation isolates their individual contributions. For example, no test of delimiters without adversarial examples, or adversarial examples without delimiters." 88 }, 89 "multiple_metrics": { 90 "applies": true, 91 "answer": true, 92 "justification": "Two distinct attack categories are measured separately: goal hijacking success rate and prompt leaking success rate (Tables 1 and 2). Levenshtein distance-based similarity scoring is used with a threshold." 
93 }, 94 "human_evaluation": { 95 "applies": true, 96 "answer": false, 97 "justification": "Evaluation is entirely automated via Levenshtein distance similarity scores with a threshold. No human evaluation of attack success or defense quality is reported. Human evaluation would be relevant to assess borderline cases where the automated metric may misjudge attack success." 98 }, 99 "held_out_test_set": { 100 "applies": true, 101 "answer": false, 102 "justification": "No explicit separation between the fine-tuning data and the test attack data is described. The paper does not state whether the 1,260 attack variations used for testing overlap with the adversarial examples used in fine-tuning." 103 }, 104 "per_category_breakdown": { 105 "applies": true, 106 "answer": true, 107 "justification": "Results are broken down by model (Ada, Babbage, Curie, Davinci, GPT-2, OPT, T-5), by attack type (goal hijacking vs. prompt leaking), and by before/after fine-tuning (Tables 1 and 2)." 108 }, 109 "failure_cases_discussed": { 110 "applies": true, 111 "answer": false, 112 "justification": "No qualitative analysis of where attacks succeeded, what types of prompts were most effective, or specific examples of successful vs. failed attacks. The Babbage/Curie anomaly is noted but not analyzed in depth." 113 }, 114 "negative_results_reported": { 115 "applies": true, 116 "answer": true, 117 "justification": "Multiple negative results reported: RL fine-tuning approach crashed on Google Colab Pro (couldn't complete training), Davinci could not be fine-tuned due to financial cost, and the 'Discovered Limitations' section acknowledges fine-tuning fragility and poor generalization." 
118 } 119 }, 120 "claims_and_evidence": { 121 "abstract_claims_supported": { 122 "applies": true, 123 "answer": true, 124 "justification": "Abstract claims match results: '31% of the time' is supported by Table 1 (Babbage goal hijacking = 31%), 'reduced to near zero' is supported by Table 1 (all post-fine-tuning rates 0% except one 2.86% case), 'more flexible models exhibit greater vulnerability' is supported by Figure 5 and Tables 1-2." 125 }, 126 "causal_claims_justified": { 127 "applies": true, 128 "answer": true, 129 "justification": "The causal claim that adversarial fine-tuning reduces attack success is supported by a direct intervention design: same models tested before and after fine-tuning on the same attack set. This controlled comparison adequately supports the causal inference, though the lack of a held-out test set weakens it." 130 }, 131 "generalization_bounded": { 132 "applies": true, 133 "answer": true, 134 "justification": "The paper explicitly bounds its claims to the tested models and era: 'specific models tested are now superseded,' 'conducted in 2022,' 'Note: This methodology was developed for the 2022 model landscape.' The 'Discovered Limitations' section further acknowledges generalization gaps." 135 }, 136 "alternative_explanations_discussed": { 137 "applies": true, 138 "answer": true, 139 "justification": "The 'Discovered Limitations' section discusses alternative explanations: fine-tuning may reduce rather than enhance safety, modern attacks bypass training-based defenses, the approach has poor generalization. The Babbage > Curie vulnerability anomaly is noted as an exception to the size-vulnerability trend." 140 }, 141 "proxy_outcome_distinction": { 142 "applies": true, 143 "answer": false, 144 "justification": "The paper uses Levenshtein distance similarity above a threshold as a proxy for 'attack success' but does not discuss the gap between this automated metric and actual vulnerability. 
Levenshtein distance may miss semantically successful attacks with low string similarity, or flag benign outputs with high similarity. The threshold value is not justified." 145 } 146 }, 147 "setup_transparency": { 148 "model_versions_specified": { 149 "applies": true, 150 "answer": true, 151 "justification": "Specific model versions are listed: text-davinci-003, text-curie-001, text-babbage-001, text-ada-001 (with API-specific version suffixes). GPT-2 1.5B, OPT 350M, and T-5 small 60M are also specified with sizes." 152 }, 153 "prompts_provided": { 154 "applies": true, 155 "answer": true, 156 "justification": "Actual prompt text is provided. Figure 4 shows a complete prompt-completion pair with the <userInput> delimiter approach. Figure 2 shows attack examples. The paper includes examples like 'Correct this to standard English: <userInput>...'." 157 }, 158 "hyperparameters_reported": { 159 "applies": true, 160 "answer": false, 161 "justification": "Temperature is mentioned as a parameter that 'define the confidence level with which the model is making its predictions' but specific values are not reported. No table or section lists the actual hyperparameter settings used (temperature, top-p, max tokens, etc.)." 162 }, 163 "scaffolding_described": { 164 "applies": false, 165 "answer": false, 166 "justification": "No agentic scaffolding is used. The approach is direct prompt-based testing and fine-tuning via APIs." 167 }, 168 "data_preprocessing_documented": { 169 "applies": true, 170 "answer": true, 171 "justification": "The dataset construction process is documented: 35 base prompts, 5 attack variations per category (goal hijacking and prompt leaking), 1,260 total variations. Fine-tuning datasets sourced from Kaggle, augmented with <userInput> tags, and formatted into JSONL for OpenAI's API." 
172 } 173 }, 174 "limitations_and_scope": { 175 "limitations_section_present": { 176 "applies": true, 177 "answer": true, 178 "justification": "Dedicated 'Discovered Limitations' and 'Contemporary Relevance and Limitations' sections with substantive discussion of fine-tuning fragility, attack evolution, and generalization gaps." 179 }, 180 "threats_to_validity_specific": { 181 "applies": true, 182 "answer": true, 183 "justification": "Specific threats discussed: '2024 studies show that fine-tuning can inadvertently reduce safety alignment, even on benign datasets,' 'adversarial fine-tuning shows poor generalization to novel attack patterns,' and 'modern attacks like many-shot jailbreaking and indirect injection bypass training-based defenses.' These are specific to this study's approach." 184 }, 185 "scope_boundaries_stated": { 186 "applies": true, 187 "answer": true, 188 "justification": "Explicit scope boundaries: 'specific models tested are now superseded,' the work is 'a starting point for more sophisticated defense mechanisms, rather than a complete solution,' 'this methodology was developed for the 2022 model landscape,' and they list specific things they did NOT test (Davinci fine-tuning, modern architectures)." 189 } 190 }, 191 "data_integrity": { 192 "raw_data_available": { 193 "applies": true, 194 "answer": true, 195 "justification": "The paper states 'the reader can also find the logs from all attacks on the original language models and their fine-tuned versions in the results directory' in the GitHub repository." 196 }, 197 "data_collection_described": { 198 "applies": true, 199 "answer": true, 200 "justification": "Data collection is described: adversarial dataset built from 35 base prompts × attack variations using the PromptInject framework; fine-tuning datasets from specific Kaggle sources (Dave 2021 for translation, Kushare 2021 for grammar, Shahane 2021 for sentiment, Vonteru 2019 for summarization)." 
201 }, 202 "recruitment_methods_described": { 203 "applies": false, 204 "answer": false, 205 "justification": "No human participants. Data sources are standard public datasets from Kaggle and procedurally generated adversarial prompts." 206 }, 207 "data_pipeline_documented": { 208 "applies": true, 209 "answer": true, 210 "justification": "The pipeline is documented: base prompts → attack string injection → 1,260 attack variations → model testing via API → Levenshtein distance scoring → threshold-based success determination. Fine-tuning pipeline: Kaggle datasets → tag augmentation → JSONL formatting → OpenAI fine-tuning API." 211 } 212 }, 213 "conflicts_of_interest": { 214 "funding_disclosed": { 215 "applies": true, 216 "answer": false, 217 "justification": "No funding information disclosed. While the authors appear to be NYU students (based on @nyu.edu emails), no funding source or acknowledgment of unfunded status is stated." 218 }, 219 "affiliations_disclosed": { 220 "applies": true, 221 "answer": true, 222 "justification": "Author affiliations are provided via NYU email addresses ({gs157, df1911, jc9723}@nyu.edu). They are not evaluating a product from their own employer." 223 }, 224 "funder_independent_of_outcome": { 225 "applies": false, 226 "answer": false, 227 "justification": "This appears to be unfunded student work at NYU (based on @nyu.edu emails and no acknowledgments section). No funder to assess independence of." 228 }, 229 "financial_interests_declared": { 230 "applies": true, 231 "answer": false, 232 "justification": "No competing interests or financial interest declaration statement is present in the paper." 233 } 234 }, 235 "contamination": { 236 "training_cutoff_stated": { 237 "applies": false, 238 "answer": false, 239 "justification": "This paper tests defenses against adversarial prompt injection attacks rather than evaluating model knowledge on a benchmark. 
The attacks are procedurally generated at test time, so training data contamination of benchmark answers is not the relevant concern." 240 }, 241 "train_test_overlap_discussed": { 242 "applies": false, 243 "answer": false, 244 "justification": "The paper tests adversarial defenses with procedurally generated attack prompts, not pre-trained model knowledge on fixed benchmarks. Standard contamination concerns do not apply." 245 }, 246 "benchmark_contamination_addressed": { 247 "applies": false, 248 "answer": false, 249 "justification": "No fixed benchmark is used that could have been in training data. The adversarial attacks are constructed specifically for this study." 250 } 251 }, 252 "human_studies": { 253 "pre_registered": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in the study." 257 }, 258 "irb_or_ethics_approval": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in the study." 262 }, 263 "demographics_reported": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in the study." 267 }, 268 "inclusion_exclusion_criteria": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in the study." 272 }, 273 "randomization_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in the study." 277 }, 278 "blinding_described": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants in the study." 282 }, 283 "attrition_reported": { 284 "applies": false, 285 "answer": false, 286 "justification": "No human participants in the study." 287 } 288 }, 289 "cost_and_practicality": { 290 "inference_cost_reported": { 291 "applies": true, 292 "answer": false, 293 "justification": "No API costs, token counts, or wall-clock times reported. 
The paper mentions Davinci fine-tuning was too expensive and RL training exceeded Colab Pro RAM, but no actual cost figures are given." 294 }, 295 "compute_budget_stated": { 296 "applies": true, 297 "answer": false, 298 "justification": "No total compute budget stated. Google Colab Pro is mentioned as insufficient for RL training, but no GPU hours, API spend, or hardware specifications are quantified." 299 } 300 }, 301 "experimental_rigor": { 302 "seed_sensitivity_reported": { 303 "applies": true, 304 "answer": false, 305 "justification": "No mention of multiple random seeds or seed sensitivity analysis. Results appear to be single-run measurements." 306 }, 307 "number_of_runs_stated": { 308 "applies": true, 309 "answer": false, 310 "justification": "The number of experimental runs is not stated. The 1,260 variations are the test dataset size, not repeated runs." 311 }, 312 "hyperparameter_search_budget": { 313 "applies": true, 314 "answer": false, 315 "justification": "Temperature and other parameters are mentioned as configurable but no search budget, number of configurations tried, or search method is described." 316 }, 317 "best_config_selection_justified": { 318 "applies": true, 319 "answer": false, 320 "justification": "No discussion of how the final configuration was selected or whether the reported results represent a best-case selection from multiple attempts." 321 }, 322 "multiple_comparison_correction": { 323 "applies": false, 324 "answer": false, 325 "justification": "No statistical tests are performed at all, so there are no p-values to correct for multiple comparisons." 326 }, 327 "self_comparison_bias_addressed": { 328 "applies": true, 329 "answer": false, 330 "justification": "The authors evaluate their own adversarial fine-tuning defense against undefended models without acknowledging self-evaluation bias. No independent evaluation or re-implementation by others is mentioned." 
331 }, 332 "compute_budget_vs_performance": { 333 "applies": true, 334 "answer": false, 335 "justification": "Models of vastly different sizes are compared (60M T-5 to 175B Davinci) without normalizing for compute cost. The fine-tuning was only applied to smaller models due to cost, but no compute-performance tradeoff analysis is provided." 336 }, 337 "benchmark_construct_validity": { 338 "applies": true, 339 "answer": false, 340 "justification": "Attack success is measured via Levenshtein distance similarity threshold, but the paper does not discuss whether this metric adequately captures real-world prompt injection vulnerability. The threshold value is not justified, and there is no analysis of false positives/negatives in the automated scoring." 341 }, 342 "scaffold_confound_addressed": { 343 "applies": false, 344 "answer": false, 345 "justification": "No scaffolding is involved in the study. Models are tested directly via API calls." 346 } 347 }, 348 "data_leakage": { 349 "temporal_leakage_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "No discussion of whether the tested models may have seen similar prompt injection patterns during pre-training, despite PromptInject framework content potentially being in training data." 353 }, 354 "feature_leakage_addressed": { 355 "applies": true, 356 "answer": false, 357 "justification": "No discussion of whether the evaluation setup leaks information about expected behavior through the prompt structure." 358 }, 359 "non_independence_addressed": { 360 "applies": true, 361 "answer": false, 362 "justification": "No discussion of whether the 1,260 attack variations are independent or structurally correlated (e.g., variations from the same base prompt may not be independent tests)." 363 }, 364 "leakage_detection_method": { 365 "applies": true, 366 "answer": false, 367 "justification": "No leakage detection or prevention method is described." 
368 } 369 } 370 }, 371 "claims": [ 372 { 373 "claim": "Without adversarial fine-tuning, prompt injection attacks succeeded 31% of the time on GPT-3 series models (Babbage goal hijacking)", 374 "evidence": "Table 1 shows Babbage goal hijacking at 31%, Ada at 26%, Curie at 18% before fine-tuning. Table 2 shows Davinci at 24.28%.", 375 "supported": "strong" 376 }, 377 { 378 "claim": "Adversarial fine-tuning reduced attack success rates to near zero for Ada, Babbage, and Curie models", 379 "evidence": "Table 1 shows all goal hijacking rates drop to 0% post-fine-tuning. Prompt leaking remains at 2.86% for Ada only, all others 0%.", 380 "supported": "strong" 381 }, 382 { 383 "claim": "There is a positive correlation between model size and vulnerability to prompt injection attacks", 384 "evidence": "Figure 5 shows increasing attack success rate with model size. Davinci (175B) at ~24% vs Ada (2.7B) at ~14%. However, Babbage (6.7B) appears more vulnerable than Curie (13B), which the authors note as an anomaly.", 385 "supported": "moderate" 386 }, 387 { 388 "claim": "The structured delimiter approach (<userInput> tags) anticipated modern instruction hierarchy systems", 389 "evidence": "The paper draws parallels to OpenAI's instruction hierarchy (Wallace et al. 2024) and Anthropic's Constitutional AI, but provides no evidence of direct influence. 
This is retrospective framing added in 2025.", 390 "supported": "weak" 391 }, 392 { 393 "claim": "The same prompt injection vulnerabilities exist in non-GPT-3 models (GPT-2, OPT, T-5)", 394 "evidence": "Table 2 shows goal hijacking success rates for GPT-2 (7.85%), OPT (45.71%), and T-5 (8.57%), confirming vulnerability across architectures.", 395 "supported": "strong" 396 } 397 ], 398 "red_flags": [ 399 { 400 "flag": "Retrospective overclaiming of influence", 401 "detail": "The paper was conducted in 2022 but published in 2025 with extensive retrospective additions claiming influence on OpenAI's instruction hierarchy, Anthropic's Constitutional AI, and other modern defenses. No evidence of direct influence is provided. These sections appear to inflate the paper's significance." 402 }, 403 { 404 "flag": "No statistical rigor", 405 "detail": "All results are point estimates without confidence intervals, error bars, significance tests, or multi-run variance. Comparisons between models and between before/after fine-tuning lack any statistical foundation." 406 }, 407 { 408 "flag": "Incomplete evaluation on the most important model", 409 "detail": "Davinci (175B), the most important and most vulnerable GPT-3 model, was not tested with the adversarial fine-tuning defense due to cost. The defense is only validated on the three smallest models, leaving the key question unanswered." 410 }, 411 { 412 "flag": "Unclear train-test separation", 413 "detail": "The paper does not explain whether the adversarial examples used in fine-tuning overlap with the attack prompts used in evaluation. If the model was fine-tuned on the same attacks it was tested on, the near-zero results would be trivially expected." 414 }, 415 { 416 "flag": "Automated metric not validated", 417 "detail": "Attack success is determined by Levenshtein distance similarity above an unspecified threshold. This automated metric is not validated against human judgment of whether attacks actually succeeded." 
418 } 419 ], 420 "cited_papers": [ 421 { 422 "title": "Language Models are Few-Shot Learners", 423 "authors": ["Tom Brown"], 424 "year": 2020, 425 "relevance": "Foundational GPT-3 paper establishing the large language model paradigm that prompt injection attacks target." 426 }, 427 { 428 "title": "Ignore Previous Prompt: Attack Techniques For Language Models", 429 "authors": ["Fábio Perez", "Ian Ribeiro"], 430 "year": 2022, 431 "relevance": "Defines the PromptInject framework for goal hijacking and prompt leaking attacks, which this paper directly builds upon." 432 }, 433 { 434 "title": "Language Models are Unsupervised Multitask Learners", 435 "authors": ["Alec Radford", "Jeffrey Wu", "Rewon Child", "David Luan", "Dario Amodei", "Ilya Sutskever"], 436 "year": 2019, 437 "relevance": "GPT-2 paper establishing the decoder-only transformer architecture evaluated in this study." 438 }, 439 { 440 "title": "The Instruction Hierarchy: Training LLMs to Prioritize Privileged Instructions", 441 "authors": ["Eric Wallace"], 442 "year": 2024, 443 "arxiv_id": "2404.13208", 444 "relevance": "Modern prompt injection defense that formalizes the delimiter-based separation approach explored in this paper." 445 }, 446 { 447 "title": "SecAlign: Defending Against Prompt Injection with Preference Optimization", 448 "authors": ["Sizhe Chen"], 449 "year": 2024, 450 "arxiv_id": "2410.05451", 451 "relevance": "Addresses generalization limitations of adversarial fine-tuning defenses through preference optimization." 452 }, 453 { 454 "title": "Constitutional Classifiers: Defending against universal jailbreaks", 455 "authors": ["Anthropic"], 456 "year": 2024, 457 "relevance": "Extends adversarial training concepts with preference learning for LLM safety alignment." 
458 }, 459 { 460 "title": "Many-shot jailbreaking", 461 "authors": ["Anthropic"], 462 "year": 2024, 463 "relevance": "Documents modern prompt injection attack that bypasses training-based defenses, illustrating limitations of the fine-tuning approach." 464 }, 465 { 466 "title": "Generating Textual Adversarial Examples for Deep Learning Models: A Survey", 467 "authors": ["Wei Emma Zhang", "Quan Z. Sheng", "Ahoud Alhazmi"], 468 "year": 2019, 469 "relevance": "Survey of adversarial attacks on NLP models providing background on defense strategies including adversarial training." 470 }, 471 { 472 "title": "Pre-train, Prompt, and Predict: A Systematic Survey of Prompting Methods in Natural Language Processing", 473 "authors": ["Pengfei Liu", "Weizhe Yuan", "Jinlan Fu", "Zhengbao Jiang", "Hiroaki Hayashi", "Graham Neubig"], 474 "year": 2021, 475 "relevance": "Comprehensive survey of prompting methods providing the prompt engineering framework that prompt injection attacks exploit." 476 } 477 ] 478 }