scan.json (23795B)
1 { 2 "paper": { 3 "title": "Persistent Backdoor Attacks under Continual Fine-Tuning of LLMs", 4 "authors": ["Jing Cui", "Yufei Han", "Jianbin Jiao", "Junge Zhang"], 5 "year": 2025, 6 "venue": "arXiv (AAAI 2026 copyright)", 7 "arxiv_id": "2512.14741", 8 "doi": "10.48550/arXiv.2512.14741" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval", "theoretical"], 13 "key_findings": "P-Trojan achieves 99-100% backdoor persistence across Qwen2.5 and LLaMA3 models after multi-stage post-deployment fine-tuning, while baselines (BadNet, BadNet-CE, BadEdit) suffer 50-100% drops. The method aligns poisoned and clean task gradients on token embeddings so that clean fine-tuning inadvertently reinforces the backdoor. Knowledge-preserving fine-tuning strategies (data replay, parameter freezing) amplify rather than mitigate backdoor persistence.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "No code repository URL or link to source code is provided anywhere in the paper." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The paper uses publicly available datasets: SST-2, MBPP, and GSM8K. No proprietary data was collected." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "The appendix mentions '5 NVIDIA RTX 4090 GPUs' and 'LLaMA Factory framework' but provides no requirements.txt, library versions, or environment setup details." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No step-by-step reproduction instructions or scripts are provided. Algorithm 1 describes the method but not how to run the experiments." 
35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "All results in Tables 2-6 are point estimates with no confidence intervals or error bars." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper claims P-Trojan outperforms baselines but provides no statistical significance tests. Comparisons are based solely on raw numbers." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "The paper reports absolute ASR and ACC values with baselines, allowing readers to compute effect sizes. E.g., 'P-Trojan achieves 2 to 4 times higher attack success after model finetuning' with specific numbers in Table 3." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "No justification for why 3 models, 3 datasets, or specific dataset sizes (5000/467 samples) were chosen." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "No variance, standard deviation, or multi-run results are reported. All experiments appear to be single-run." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "Three baselines are compared: BadNet, BadNet-CE, and BadEdit. Each represents a different class of backdoor attack." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "BadEdit (Li et al. 2024b) and Sleeper Agents (Hubinger et al. 2024) are recent. BadNet (2017) is classic but serves as a naive baseline, which is appropriate." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "Multiple ablations: fine-tuning order reversal (Table 5), target task variation (Table 6), knowledge-preserving strategies (Table 4), and in-domain fine-tuning. Table 1 isolates the gradient alignment component." 
79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Three metrics: Attack Success Rate (ASR), Clean Accuracy (ACC), and Persistence (Persis). Results on multiple downstream tasks also reported." 84 }, 85 "human_evaluation": { 86 "applies": false, 87 "answer": false, 88 "justification": "Human evaluation is not relevant for measuring backdoor attack success rates on automated tasks." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": false, 93 "justification": "The paper does not discuss train/test splits for evaluation. It is unclear whether ASR and ACC are measured on held-out test sets separate from training data." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Results broken down by model (3 models), fine-tuning stage (cleanup vs cross-task), fine-tuning strategy (full update, replay, FREEZE), and target task (SST-2, GSM8K)." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Table 4 shows the full-model-update setting where P-Trojan drops to 67% ASR. The defense section shows BadActs achieves 99% true positive rate but with 10% FPR." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "Table 4 reports the vanilla full-model-update case where P-Trojan's ASR drops to 67% and ACC drops to 80.83%. The defense evaluation shows the attack is detectable by BadActs." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The abstract claims 'over 99% persistence while preserving clean-task accuracy,' which is supported by Table 3 showing 99-100% persistence across models." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "The paper makes causal claims about gradient alignment causing persistence. 
Table 1 provides a controlled comparison (same setup, different triggers) and the ablation studies isolate individual variables. Theoretical analysis (Theorem 1, Corollary 1) provides formal justification." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The title says 'Persistent Backdoor Attacks under Continual Fine-Tuning of LLMs' broadly, but experiments use only 3 small models (0.5B-1.5B) and 3 datasets. No acknowledgment that results may not extend to larger models or more diverse fine-tuning regimes." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "The paper does not discuss alternative explanations for why P-Trojan persists. For example, could the effect be due to the specific trigger length, poisoning ratio, or model scale rather than gradient alignment per se?" 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "The paper's claims match its measurements directly: ASR measures attack success, ACC measures clean accuracy, Persistence measures survival ratio. No proxy gap exists." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": true, 142 "justification": "Specific model versions stated: Qwen2.5-0.5B, Qwen2.5-1.5B, LLaMA3.2-1B. These are precise model identifiers." 143 }, 144 "prompts_provided": { 145 "applies": false, 146 "answer": false, 147 "justification": "The paper does not use prompting for evaluation. The backdoor injection and fine-tuning are done via SFT on datasets, not prompt-based interaction." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": true, 152 "justification": "Appendix reports: 3 epochs, 5000/5000/467 training samples for SST-2/GSM8K/MBPP, 2000 poison samples (40% ratio), trigger lengths of 3/10/15 tokens, ~1 GPU-hour per stage. LLaMA Factory framework used." 
153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "No agentic scaffolding is used. The method is a standard SFT-based training pipeline." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": false, 162 "justification": "The paper does not describe how training data was preprocessed. For SST-2, it's unclear how prompts were formatted. The poison dataset construction (appending trigger tokens) is described at a high level but details of prompt templates are missing." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": false, 169 "justification": "There is no dedicated limitations section. The conclusion mentions the need for 'persistence-aware evaluation protocols and stronger defenses' but does not discuss the study's own limitations." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": false, 174 "justification": "No threats to validity are discussed. The paper does not address potential issues such as the limited model scale, narrow task selection, or the 40% poisoning ratio being unrealistically high." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": false, 179 "justification": "No explicit scope boundaries are stated. The paper does not clarify what settings, model sizes, or attack scenarios are not covered." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "No raw experimental data (model outputs, per-example results) is made available for independent verification." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Data sources are clearly identified: SST-2 (Socher et al. 2013), MBPP (Austin et al. 2021), GSM8K (Cobbe et al. 2021). Sample sizes stated in appendix." 
192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. All data comes from standard public benchmarks." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": false, 201 "justification": "The pipeline from raw datasets to final results is not fully documented. How the 2000 poison samples were selected from SST-2, how trigger tokens are inserted into prompts, and exact evaluation procedures are underspecified." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding information or acknowledgments section is present in the paper." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliations are clearly listed: University of Chinese Academy of Sciences, INRIA, Institute of Automation CAS." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding is disclosed, so independence cannot be assessed." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial interests statement is provided." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": false, 229 "answer": false, 230 "justification": "The paper does not evaluate a pre-trained model's knowledge on benchmarks. It fine-tunes models on datasets and measures attack success — the concern is whether the backdoor persists, not whether the model has memorized benchmark answers." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": false, 234 "answer": false, 235 "justification": "Same as above — contamination in the benchmark-knowledge sense is not relevant. The paper tests backdoor persistence, not model capability." 
236 }, 237 "benchmark_contamination_addressed": { 238 "applies": false, 239 "answer": false, 240 "justification": "Same as above — benchmark contamination is not a concern for measuring backdoor attack success rates." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": false, 284 "justification": "No inference cost or per-example cost is reported. The trigger optimization cost (GCG iterations) is not quantified." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": true, 289 "justification": "Appendix states: '5 NVIDIA RTX 4090 GPUs (24GB memory each)' and 'approximately 1 GPU-hours' per fine-tuning stage." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "No multi-seed results reported. All experiments appear to be single-run with no seed variation analysis." 
297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "The number of experimental runs is never stated. Results are presented as single values without indicating how many runs produced them." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "Trigger token lengths (3, 10, 15) are stated but no hyperparameter search budget is reported. The GCG optimization sampling budget is mentioned in Algorithm 1 but not quantified." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": false, 311 "justification": "Trigger lengths vary by model (3, 10, 15 tokens) with no justification for these specific choices or how they were selected." 312 }, 313 "multiple_comparison_correction": { 314 "applies": false, 315 "answer": false, 316 "justification": "No statistical tests are performed, so multiple comparison correction is not applicable." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The authors implement all baselines themselves (BadNet, BadNet-CE) and do not acknowledge the bias of comparing their own method against their own baseline implementations." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": false, 326 "justification": "P-Trojan requires an additional gradient-alignment optimization stage (GCG) that baselines do not. This compute overhead is not compared or discussed relative to the persistence gains." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "The paper does not discuss whether ASR on SST-2/MBPP/GSM8K adequately represents real-world backdoor threat severity. The ecological validity of the two-stage fine-tuning protocol is not examined." 
332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "No scaffolding is involved in this work." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "No discussion of whether the base models (Qwen2.5, LLaMA3.2) may have seen SST-2, MBPP, or GSM8K during pre-training, which could affect clean accuracy baselines." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether evaluation setup leaks information. For example, the cleanup fine-tuning uses the same SST-2 task as the backdoor target — this is acknowledged as worst-case for the attacker but not discussed as a potential confound for measuring real-world threat." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of independence between training and evaluation data splits." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No leakage detection or prevention method is used." 
359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "P-Trojan achieves over 99% backdoor persistence across multiple models and fine-tuning settings while preserving clean-task accuracy.", 365 "evidence": "Table 3 shows 99-100% persistence for P-Trojan across Qwen2.5-0.5B, Qwen2.5-1.5B, and LLaMA3.2-1B after both cleanup and cross-task fine-tuning, with clean accuracy within 1-6% of unbackdoored models.", 366 "supported": "strong" 367 }, 368 { 369 "claim": "Existing backdoor methods (BadNet, BadNet-CE, BadEdit) suffer 50-100% effectiveness drops after multiple rounds of fine-tuning.", 370 "evidence": "Table 3 shows BadNet persistence drops to 0-10% in larger models after cross-task fine-tuning; BadNet-CE drops to 15-29%; BadEdit maintains persistence but with low initial ASR (48-55%).", 371 "supported": "strong" 372 }, 373 { 374 "claim": "Gradient alignment between clean and poisoned objectives is the key mechanism enabling persistence.", 375 "evidence": "Table 1 shows cosine similarity of 0.60 for P-Trojan vs 0.20 for BadNet, correlating with 100% vs 70% final ASR. Theorem 1 and Corollary 1 provide theoretical bounds.", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "Knowledge-preserving fine-tuning strategies (data replay, FREEZE) amplify backdoor persistence.", 380 "evidence": "Table 4 shows full-update drops ASR to 67%, but data replay restores it to 100% and FREEZE maintains 100%, on Qwen2.5-1.5B.", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "P-Trojan's effectiveness is invariant to fine-tuning order and target task choice.", 385 "evidence": "Table 5 shows reversed fine-tuning order yields 98% ASR (vs 100% original). Table 6 shows 100% persistence on both SST-2 and GSM8K as target tasks.", 386 "supported": "moderate" 387 } 388 ], 389 "red_flags": [ 390 { 391 "flag": "No variance or multi-run results", 392 "detail": "All results are single-run point estimates. 
For a paper claiming '99-100% persistence,' the absence of variance across runs is a significant omission — a single unlucky seed could change the picture." 393 }, 394 { 395 "flag": "Very high poisoning ratio", 396 "detail": "40% poisoning ratio (2000 poison samples in 5000 total) is extremely high and may not reflect realistic attack scenarios where an attacker must be stealthy." 397 }, 398 { 399 "flag": "Small model scale only", 400 "detail": "All experiments use models ≤1.5B parameters. The paper's broad claims about 'LLMs' are not validated on models of the scale typically deployed (7B+)." 401 }, 402 { 403 "flag": "No limitations section", 404 "detail": "The paper contains no discussion of limitations, threats to validity, or scope boundaries despite significant constraints in experimental design." 405 }, 406 { 407 "flag": "Results appear too clean", 408 "detail": "P-Trojan achieves 99-100% persistence in nearly every setting tested; the reported exceptions are the vanilla full-model-update case in Table 4 (ASR drops to 67%) and a minor dip under reversed fine-tuning order in Table 5 (98% ASR). This near-uniform performance across diverse settings without any variance reporting is suspicious." 409 }, 410 { 411 "flag": "Self-implemented baselines", 412 "detail": "Authors implement all baselines themselves without using official code or acknowledging potential implementation bias." 413 } 414 ], 415 "cited_papers": [ 416 { 417 "title": "Sleeper agents: Training deceptive llms that persist through safety training", 418 "authors": ["Evan Hubinger", "Carson Denison", "Jesse Mu"], 419 "year": 2024, 420 "arxiv_id": "2401.05566", 421 "relevance": "Directly relevant to AI safety — studies backdoor persistence through safety training in LLMs." 422 }, 423 { 424 "title": "Badedit: Backdooring large language models by model editing", 425 "authors": ["Yanzhou Li", "Tianlin Li", "Kangjie Chen"], 426 "year": 2024, 427 "arxiv_id": "2403.13355", 428 "relevance": "Weight-editing approach to LLM backdoors, used as baseline in this paper." 
429 }, 430 { 431 "title": "Universal and transferable adversarial attacks on aligned language models", 432 "authors": ["Andy Zou", "Zifan Wang", "Nicholas Carlini"], 433 "year": 2023, 434 "arxiv_id": "2307.15043", 435 "relevance": "GCG attack method used for trigger optimization in P-Trojan; foundational work on adversarial attacks against aligned LLMs." 436 }, 437 { 438 "title": "Instructions as backdoors: Backdoor vulnerabilities of instruction tuning for large language models", 439 "authors": ["Jiashu Xu", "Mingyu Derek Ma", "Fei Wang"], 440 "year": 2023, 441 "arxiv_id": "2305.14710", 442 "relevance": "Studies instruction-tuning backdoor vulnerabilities in LLMs." 443 }, 444 { 445 "title": "Universal jailbreak backdoors from poisoned human feedback", 446 "authors": ["Javier Rando", "Florian Tramèr"], 447 "year": 2023, 448 "arxiv_id": "2311.14455", 449 "relevance": "Studies backdoor attacks via poisoned RLHF, relevant to LLM safety and alignment." 450 }, 451 { 452 "title": "BadActs: A universal backdoor defense in the activation space", 453 "authors": ["Biao Yi", "Sishuo Chen", "Yiming Li"], 454 "year": 2024, 455 "arxiv_id": "2405.11227", 456 "relevance": "Activation-based backdoor detection method evaluated as defense in this paper." 457 }, 458 { 459 "title": "Badprompt: Backdoor attacks on continuous prompts", 460 "authors": ["Xiangrui Cai", "Haidong Xu", "Sihan Xu"], 461 "year": 2022, 462 "relevance": "NeurIPS 2022 paper on backdoor attacks against continuous prompt tuning in LLMs." 463 }, 464 { 465 "title": "Program synthesis with large language models", 466 "authors": ["Jacob Austin", "Augustus Odena", "Maxwell Nye"], 467 "year": 2021, 468 "arxiv_id": "2108.07732", 469 "relevance": "MBPP benchmark used for code generation evaluation in this paper's cross-task fine-tuning." 
470 }, 471 { 472 "title": "Backdooring instruction-tuned large language models with virtual prompt injection", 473 "authors": ["Jun Yan", "Vikas Yadav", "Shiyang Li"], 474 "year": 2023, 475 "arxiv_id": "2307.16888", 476 "relevance": "Virtual prompt injection attacks on instruction-tuned LLMs." 477 } 478 ] 479 }