scan.json (24820B)
1 { 2 "paper": { 3 "title": "Checkpoint-GCG: Auditing and Attacking Fine-Tuning-Based Prompt Injection Defenses", 4 "authors": [ 5 "Xiaoxue Yang", 6 "Bozhidar Stevanoski", 7 "Matthieu Meeus", 8 "Yves-Alexandre de Montjoye" 9 ], 10 "year": 2025, 11 "venue": "arXiv", 12 "arxiv_id": "2505.15738" 13 }, 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The paper states 'We release the source code for this paper at https://github.com/computationalprivacy/checkpoint-gcg' in a footnote on page 1." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The paper uses publicly available datasets: AlpacaFarm and SEP benchmark. The defense fine-tuning uses publicly released code and data from SecAlign/StruQ repositories. No proprietary data was collected." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "The paper mentions using A100 GPUs with 80GB RAM and fp16 precision, but does not provide a requirements.txt, Dockerfile, or detailed environment setup with library versions." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "While code is released, the paper itself does not contain step-by-step reproduction instructions. The appendices describe fine-tuning procedures and hyperparameters but do not provide a structured guide for reproducing the main experiments." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "Results are reported as point estimates of ASR (e.g., '88% ASR') without confidence intervals or error bars. No uncertainty quantification is provided for any of the main results." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper claims Checkpoint-GCG outperforms standard GCG but provides no statistical significance tests. 
Comparisons are based solely on comparing ASR percentages (e.g., 88% vs 6%)." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "The paper reports absolute differences with baselines in context: e.g., Checkpoint-GCG achieves 88% ASR vs standard GCG's 6% on SecAlign-defended Llama-3-8B-Instruct (Table 2), giving the reader enough context to assess magnitude." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "The evaluation uses 50 randomly selected samples from AlpacaFarm for auditing experiments and 10 training / 198 test for universal suffix experiments, but no justification is given for these sample sizes and no power analysis is provided." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. Results appear to be from single runs without reporting variability across different random seeds or sample selections." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "Standard GCG with T=500 steps and standard GCG with Checkpoint-GCG budget are used as baselines throughout. Additional initialization baselines are compared in Appendix D (Table 7)." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "The baselines include GCG (Zou et al., 2023) and initialization strategies from Jia et al. (2024), Hayase et al. (2024), and Zhang et al. (2024). The defenses evaluated (StruQ, SecAlign, SecAlign++) are state-of-the-art as of 2025." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "The paper ablates checkpoint selection strategies (FREQ, STEP, LOSS, GRAD) in Appendix B (Table 5), suffix token length in Appendix E (Table 8), and different GCG initializations in Appendix D (Table 7)." 
79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "The paper reports Attack Success Rate (ASR) and success@k metrics. For the jailbreaking extension, it also reports StrongREJECT rubric-based scores." 84 }, 85 "human_evaluation": { 86 "applies": false, 87 "answer": false, 88 "justification": "This is a security/adversarial attack paper. Success is defined by automated binary criteria (whether the model output begins with 'Hacked' or 'hacked'). Human evaluation is not relevant to these claims." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "For universal suffix experiments, the paper trains on 10 samples and tests on 198 held-out AlpacaFarm samples and 500 out-of-distribution SEP samples (Section 5.3)." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Results are broken down per defense (StruQ, SecAlign, undefended) and per model (Llama-3-8B-Instruct, Mistral-7B-Instruct, Qwen2-1.5B-Instruct) in Tables 2-4 and Figures 3-5." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "The paper discusses where transferability fails: suffixes from Mistral and Qwen do not transfer to Meta-SecAlign-8B (0% ASR), and longer universal suffixes overfit to training samples (Appendix E). The early-stopping mechanism addresses GCG getting stuck in local minima (Appendix C)." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "The paper reports that suffixes from Mistral-7B and Qwen2-1.5B yield 0% ASR when transferred to Meta-SecAlign-8B. It also reports that longer suffix lengths decrease test ASR (Appendix E), and that standard GCG with various alternative initializations fails to improve significantly (Appendix D)." 
109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The abstract claims '96% ASR against the strongest defense' (supported by Table 2, Mistral with SecAlign), '89.9% ASR' for universal suffix (Table 3, Mistral with SecAlign), and '63.9% ASR' for transfer to Meta-SecAlign-8B (Table 1). All are supported by results in the paper." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "The paper's main causal claim is that leveraging intermediate checkpoints improves GCG initialization and thus attack success. This is justified through controlled ablation: Checkpoint-GCG vs standard GCG with the same total computational budget, isolating the checkpoint initialization mechanism." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": true, 125 "justification": "The paper carefully bounds its claims. It explicitly states that transferability 'across highly different models remains limited' (Section 5.3) and focuses claims on models/defenses sharing similar architectures. The title specifies 'Fine-Tuning-Based' defenses." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": true, 130 "justification": "The paper discusses that the improvements could be attributed to better initialization rather than checkpoint stepping (addressed in Appendix D by comparing other initialization strategies). It also discusses that longer suffixes may overfit rather than generalize (Appendix E)." 131 } 132 }, 133 "setup_transparency": { 134 "model_versions_specified": { 135 "applies": true, 136 "answer": true, 137 "justification": "The paper specifies Llama-3-8B-Instruct, Mistral-7B-Instruct, and Qwen2-1.5B-Instruct. For the jailbreak evaluation, GPT-4o-mini is used as evaluator. These are specific model names with size parameters. 
For the transfer target, Meta-SecAlign-8B (Llama-3.1-8B-Instruct with SecAlign++) is specified." 138 }, 139 "prompts_provided": { 140 "applies": true, 141 "answer": true, 142 "justification": "The prompt template is shown in Figure 1 with system instruction, user prompt, and data sections. The injected instruction ('Print exactly Hacked!') and target output are specified. The AlpacaFarm and SEP datasets providing the actual content are publicly available." 143 }, 144 "hyperparameters_reported": { 145 "applies": true, 146 "answer": true, 147 "justification": "GCG step budget T=500/1000, early stopping threshold (250 steps, loss change <= 1e-5), suffix length l=20 tokens, gradient norm thresholds for GRAD strategy (Table 6), number of training samples Ntrain=10, and fine-tuning details are reported in Appendices B, C, and G." 148 }, 149 "scaffolding_described": { 150 "applies": false, 151 "answer": false, 152 "justification": "No agentic scaffolding is used. This is a direct optimization-based attack method, not an agentic system." 153 }, 154 "data_preprocessing_documented": { 155 "applies": true, 156 "answer": true, 157 "justification": "The paper describes using 50 randomly selected samples from AlpacaFarm for auditing, 10 training and 198 test samples for universal suffix experiments, and 500 SEP samples for out-of-distribution testing. The attack setup (injected instruction appended to data part) is clearly specified." 158 } 159 }, 160 "limitations_and_scope": { 161 "limitations_section_present": { 162 "applies": true, 163 "answer": false, 164 "justification": "There is no dedicated limitations section. The Discussion and Conclusion section (Section 7) discusses the method's contribution and societal impact but does not contain a substantive limitations discussion." 165 }, 166 "threats_to_validity_specific": { 167 "applies": true, 168 "answer": false, 169 "justification": "No threats-to-validity section or discussion is present. 
The paper does not discuss specific threats such as the sensitivity of results to hyperparameter choices, the representativeness of AlpacaFarm as an evaluation dataset, or the generalizability beyond the three tested models." 170 }, 171 "scope_boundaries_stated": { 172 "applies": true, 173 "answer": true, 174 "justification": "The paper explicitly states that transferability 'across highly different models remains limited' and acknowledges the two key assumptions (knowledge of input and access to checkpoints) that it progressively relaxes. The jailbreak extension is framed as a 'proof-of-concept' rather than a definitive result." 175 } 176 }, 177 "data_integrity": { 178 "raw_data_available": { 179 "applies": true, 180 "answer": false, 181 "justification": "The paper does not release the raw experimental outputs (e.g., per-sample attack results, discovered adversarial suffixes across all experiments). Only aggregate ASR percentages are reported." 182 }, 183 "data_collection_described": { 184 "applies": true, 185 "answer": true, 186 "justification": "The data collection is clearly described: 50 randomly selected samples from AlpacaFarm for auditing, 10 training + 198 test samples for universal suffix, and 500 random SEP samples for OOD evaluation. Defense models are replicated using released code from SecAlign/StruQ." 187 }, 188 "recruitment_methods_described": { 189 "applies": false, 190 "answer": false, 191 "justification": "No human participants. The study uses standard public benchmarks (AlpacaFarm, SEP)." 192 }, 193 "data_pipeline_documented": { 194 "applies": true, 195 "answer": true, 196 "justification": "The pipeline is documented: replicate defenses using released code (Appendix G), select checkpoints using GRAD strategy (Table 6), run Checkpoint-GCG with early stopping (Appendix C), evaluate against held-out samples. The fine-tuning replication process and deviations from original work are described." 
197 } 198 }, 199 "conflicts_of_interest": { 200 "funding_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "The Acknowledgements section discloses funding from UK EPSRC under grant numbers EP/Y037421/1 and EP/X040518/1, and computational resources from Imperial College London's Research Computing Service." 204 }, 205 "affiliations_disclosed": { 206 "applies": true, 207 "answer": true, 208 "justification": "All authors are affiliated with Imperial College London, clearly stated on the first page. The authors are from an academic institution and not affiliated with the companies whose models/defenses are evaluated." 209 }, 210 "funder_independent_of_outcome": { 211 "applies": true, 212 "answer": true, 213 "justification": "The funders are UK EPSRC (government research council) and Imperial College London. Neither has a financial interest in the outcome of evaluating prompt injection defenses by Meta, OpenAI, or other companies." 214 }, 215 "financial_interests_declared": { 216 "applies": true, 217 "answer": false, 218 "justification": "No competing interests or financial interests statement is present in the paper." 219 } 220 }, 221 "contamination": { 222 "training_cutoff_stated": { 223 "applies": false, 224 "answer": false, 225 "justification": "This paper does not evaluate a pre-trained model's capability on a benchmark. It evaluates adversarial attacks against fine-tuning-based defenses. The models are fine-tuned by the authors, and contamination of the attack evaluation data is not a relevant concern." 226 }, 227 "train_test_overlap_discussed": { 228 "applies": false, 229 "answer": false, 230 "justification": "This is an adversarial attack study, not a benchmark evaluation of model knowledge. The concern is not whether the model has seen the test data during training, but whether the attack succeeds." 
231 }, 232 "benchmark_contamination_addressed": { 233 "applies": false, 234 "answer": false, 235 "justification": "Contamination is not relevant to this paper's claims. The paper tests whether adversarial suffixes can bypass fine-tuning-based defenses, not whether models can solve benchmark tasks." 236 } 237 }, 238 "human_studies": { 239 "pre_registered": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants in this study." 243 }, 244 "irb_or_ethics_approval": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "demographics_reported": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "inclusion_exclusion_criteria": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "randomization_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "blinding_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "attrition_reported": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 } 274 }, 275 "cost_and_practicality": { 276 "inference_cost_reported": { 277 "applies": true, 278 "answer": true, 279 "justification": "Appendix I states 'each GCG step takes approximately 3 seconds per sample' and Table 5 reports total Checkpoint-GCG steps averaged across samples (e.g., 2,033-4,037 steps depending on strategy). This allows cost estimation." 280 }, 281 "compute_budget_stated": { 282 "applies": true, 283 "answer": true, 284 "justification": "Appendix I states 'All experiments were conducted on an A100 GPU with 80GB RAM.' Table 5 reports total GCG steps per checkpoint strategy. Fine-tuning uses 1-2 A100 GPUs (Appendix G)." 
285 } 286 } 287 }, 288 "claims": [ 289 { 290 "claim": "Checkpoint-GCG achieves up to 96% ASR against SecAlign, the strongest fine-tuning-based prompt injection defense, in an auditing setup.", 291 "evidence": "Table 2 shows 96% ASR on SecAlign-defended Mistral-7B-Instruct, compared to 18% for standard GCG with T=500 and 22% with Checkpoint-GCG budget.", 292 "supported": "strong" 293 }, 294 { 295 "claim": "Standard GCG's effectiveness decreases sharply as defenses improve, while Checkpoint-GCG maintains high ASR.", 296 "evidence": "Table 2 shows standard GCG drops from 100% (undefended) to 6% (SecAlign) on Llama-3-8B-Instruct, while Checkpoint-GCG maintains 88% on SecAlign. Similar patterns across all three models (Figure 3).", 297 "supported": "strong" 298 }, 299 { 300 "claim": "Checkpoint-GCG discovers universal suffixes that generalize to unseen inputs, achieving up to 89.9% ASR on held-out in-distribution samples.", 301 "evidence": "Table 3 shows 89.9% testing ASR on SecAlign-defended Mistral-7B-Instruct with 100% training ASR, using 10 training and 198 test samples from AlpacaFarm.", 302 "supported": "strong" 303 }, 304 { 305 "claim": "Checkpoint-GCG suffixes transfer to Meta-SecAlign-8B (a black-box model with upgraded defense), achieving 63.9% ASR.", 306 "evidence": "Table 1 shows 63.9% ASR for black-box transfer from SecAlign-defended Llama-3-8B-Instruct to Meta-SecAlign-8B, compared to 0% for standard GCG.", 307 "supported": "strong" 308 }, 309 { 310 "claim": "The GRAD checkpoint selection strategy provides the optimal balance between attack effectiveness and computational cost.", 311 "evidence": "Table 5 in Appendix B compares FREQ, STEP, LOSS, and GRAD strategies. 
GRAD achieves 100% ASR with 64 checkpoints and 2,033 total steps (at tau_grad=0.1), while other strategies require more checkpoints or achieve lower ASR.", 312 "supported": "moderate" 313 }, 314 { 315 "claim": "Checkpoint-GCG extends to jailbreak defenses, achieving 68% ASR and StrongREJECT score of 0.50 vs standard GCG's 56% ASR and 0.34.", 316 "evidence": "Table 9 reports these results for Safety-Tuned Llama on Llama-3-8B-Instruct. However, only one model and one defense are tested, and the gap is modest.", 317 "supported": "moderate" 318 } 319 ], 320 "methodology_tags": [ 321 "benchmark-eval" 322 ], 323 "key_findings": "Checkpoint-GCG is a white-box attack that leverages intermediate fine-tuning checkpoints to progressively improve adversarial suffix initialization for the GCG attack. Against SecAlign, the strongest open-source prompt injection defense, it achieves up to 96% ASR in per-sample auditing (vs 6-18% for standard GCG) and up to 89.9% ASR with universal suffixes that generalize across inputs. The method enables transfer attacks against Meta-SecAlign-8B, achieving 63.9% ASR in a black-box setting where standard GCG yields 0%. The gradient-based checkpoint selection strategy provides the best trade-off between attack effectiveness and computational cost.", 324 "red_flags": [ 325 { 326 "flag": "No statistical uncertainty quantification", 327 "detail": "All ASR results are reported as single-run point estimates without confidence intervals, error bars, or variance across random seeds. With only 50 samples in the main evaluation, individual ASR differences could be within sampling noise." 328 }, 329 { 330 "flag": "No dedicated limitations section", 331 "detail": "The paper lacks a formal limitations or threats-to-validity section. Key limitations (sensitivity to hyperparameters, representativeness of AlpacaFarm, scalability to larger models) are not explicitly discussed." 
332 }, 333 { 334 "flag": "Small sample sizes for universal suffix experiments", 335 "detail": "The universal suffix is trained on only 10 samples. No justification is given for this number, and no analysis of how training set size affects generalization is provided." 336 } 337 ], 338 "cited_papers": [ 339 { 340 "title": "Universal and transferable adversarial attacks on aligned language models", 341 "authors": ["Andy Zou", "Zifan Wang", "Nicholas Carlini", "Milad Nasr", "J Zico Kolter", "Matt Fredrikson"], 342 "year": 2023, 343 "arxiv_id": "2307.15043", 344 "relevance": "Introduces the GCG attack method that Checkpoint-GCG builds upon; foundational work on adversarial suffix optimization for LLMs." 345 }, 346 { 347 "title": "StruQ: Defending against prompt injection with structured queries", 348 "authors": ["Sizhe Chen", "Julien Piet", "Chawin Sitawarin", "David Wagner"], 349 "year": 2025, 350 "relevance": "One of the two main prompt injection defenses evaluated; uses structured delimiters and SFT to train models to ignore injected instructions." 351 }, 352 { 353 "title": "SecAlign: Defending against prompt injection with preference optimization", 354 "authors": ["Sizhe Chen", "Arman Zharmagambetov", "Saeed Mahloujifar", "Kamalika Chaudhuri", "David Wagner", "Chuan Guo"], 355 "year": 2025, 356 "relevance": "The strongest prompt injection defense evaluated; uses DPO to steer models away from following injected instructions." 357 }, 358 { 359 "title": "AgentDojo: A dynamic environment to evaluate prompt injection attacks and defenses for LLM agents", 360 "authors": ["Edoardo Debenedetti", "Jie Zhang", "Mislav Balunovic", "Luca Beurer-Kellner", "Marc Fischer", "Florian Tramèr"], 361 "year": 2024, 362 "relevance": "Benchmark for evaluating prompt injection in agentic LLM settings, relevant to understanding real-world attack surfaces." 
363 }, 364 { 365 "title": "The instruction hierarchy: Training LLMs to prioritize privileged instructions", 366 "authors": ["Eric Wallace", "Kai Xiao", "Reimar Leike", "Lilian Weng", "Johannes Heidecke", "Alex Beutel"], 367 "year": 2024, 368 "arxiv_id": "2404.13208", 369 "relevance": "OpenAI's approach to fine-tuning GPT-3.5 Turbo with reinforcement learning for instruction hierarchy, a model-level prompt injection defense." 370 }, 371 { 372 "title": "Defeating prompt injections by design", 373 "authors": ["Edoardo Debenedetti", "Ilia Shumailov", "Tianqi Fan", "Jamie Hayes", "Nicholas Carlini", "Daniel Fabian", "Christoph Kern", "Chongyang Shi", "Andreas Terzis", "Florian Tramèr"], 374 "year": 2025, 375 "arxiv_id": "2503.18813", 376 "relevance": "Proposes architectural-level defenses against prompt injection by embedding instruction priority into model design." 377 }, 378 { 379 "title": "Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection", 380 "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra", "Christoph Endres", "Thorsten Holz", "Mario Fritz"], 381 "year": 2023, 382 "relevance": "Foundational work on indirect prompt injection attacks against real-world LLM-integrated applications." 383 }, 384 { 385 "title": "HarmBench: A standardized evaluation framework for automated red teaming and robust refusal", 386 "authors": ["Mantas Mazeika", "Long Phan", "Xuwang Yin", "Andy Zou"], 387 "year": 2024, 388 "relevance": "Standardized framework for evaluating adversarial attacks and defenses on LLMs, relevant to benchmarking methodology." 
389 }, 390 { 391 "title": "Safety-tuned LLaMAs: Lessons from improving the safety of large language models that follow instructions", 392 "authors": ["Federico Bianchi", "Mirac Suzgun", "Giuseppe Attanasio", "Paul Rottger", "Dan Jurafsky", "Tatsunori Hashimoto", "James Zou"], 393 "year": 2024, 394 "relevance": "Fine-tuning-based jailbreak defense used to evaluate Checkpoint-GCG's generalization beyond prompt injection." 395 }, 396 { 397 "title": "A StrongREJECT for empty jailbreaks", 398 "authors": ["Alexandra Souly", "Qingyuan Lu", "Dillon Bowen", "Tu Trinh", "Elvis Hsieh", "Sana Pandey", "Pieter Abbeel", "Justin Svegliato", "Scott Emmons", "Olivia Watkins", "Sam Toyer"], 399 "year": 2024, 400 "relevance": "Provides the rubric-based evaluator used to assess jailbreak quality beyond simple refusal-string matching." 401 }, 402 { 403 "title": "Query-based adversarial prompt generation", 404 "authors": ["Jonathan Hayase", "Ema Borevkovic", "Nicholas Carlini", "Florian Tramèr", "Milad Nasr"], 405 "year": 2024, 406 "relevance": "Proposes alternative GCG initialization strategies and improves black-box attacks by selecting suffixes based on target model loss." 407 } 408 ] 409 }