ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (23795B)


      1 {
      2   "paper": {
      3     "title": "Persistent Backdoor Attacks under Continual Fine-Tuning of LLMs",
      4     "authors": ["Jing Cui", "Yufei Han", "Jianbin Jiao", "Junge Zhang"],
      5     "year": 2025,
      6     "venue": "arXiv (AAAI 2026 copyright)",
      7     "arxiv_id": "2512.14741",
      8     "doi": "10.48550/arXiv.2512.14741"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval", "theoretical"],
     13   "key_findings": "P-Trojan achieves 99-100% backdoor persistence across Qwen2.5 and LLaMA3.2 models after multi-stage post-deployment fine-tuning, while baselines (BadNet, BadNet-CE, BadEdit) suffer 50-100% drops. The method aligns poisoned and clean task gradients on token embeddings so that clean fine-tuning inadvertently reinforces the backdoor. Knowledge-preserving fine-tuning strategies (data replay, parameter freezing) amplify rather than mitigate backdoor persistence.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No code repository URL or link to source code is provided anywhere in the paper."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper uses publicly available datasets: SST-2, MBPP, and GSM8K. No proprietary data was collected."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The appendix mentions '5 NVIDIA RTX 4090 GPUs' and 'LLaMA Factory framework' but provides no requirements.txt, library versions, or environment setup details."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions or scripts are provided. Algorithm 1 describes the method but not how to run the experiments."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "All results in Tables 2-6 are point estimates with no confidence intervals or error bars."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper claims P-Trojan outperforms baselines but provides no statistical significance tests. Comparisons are based solely on raw numbers."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper reports absolute ASR and ACC values with baselines, allowing readers to compute effect sizes. E.g., 'P-Trojan achieves 2 to 4 times higher attack success after model finetuning' with specific numbers in Table 3."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No justification for why 3 models, 3 datasets, or specific dataset sizes (5000/467 samples) were chosen."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No variance, standard deviation, or multi-run results are reported. All experiments appear to be single-run."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Three baselines are compared: BadNet, BadNet-CE, and BadEdit. Each represents a different class of backdoor attack."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "BadEdit (Li et al. 2024b) and Sleeper Agents (Hubinger et al. 2024) are recent. BadNet (2017) is classic but serves as a naive baseline, which is appropriate."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Multiple ablations: fine-tuning order reversal (Table 5), target task variation (Table 6), knowledge-preserving strategies (Table 4), and in-domain fine-tuning. Table 1 isolates the gradient alignment component."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Three metrics: Attack Success Rate (ASR), Clean Accuracy (ACC), and Persistence (Persis). Results on multiple downstream tasks also reported."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Human evaluation is not relevant for measuring backdoor attack success rates on automated tasks."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "The paper does not discuss train/test splits for evaluation. It is unclear whether ASR and ACC are measured on held-out test sets separate from training data."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results broken down by model (3 models), fine-tuning stage (cleanup vs cross-task), fine-tuning strategy (full update, replay, FREEZE), and target task (SST-2, GSM8K)."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Table 4 shows the full-model-update setting where P-Trojan drops to 67% ASR. The defense section shows BadActs achieves 99% true positive rate but with 10% FPR."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Table 4 reports the vanilla full-model-update case where P-Trojan's ASR drops to 67% and ACC drops to 80.83%. The defense evaluation shows the attack is detectable by BadActs."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims 'over 99% persistence while preserving clean-task accuracy,' which is supported by Table 3 showing 99-100% persistence across models."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper makes causal claims about gradient alignment causing persistence. Table 1 provides a controlled comparison (same setup, different triggers) and the ablation studies isolate individual variables. Theoretical analysis (Theorem 1, Corollary 1) provides formal justification."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The title says 'Persistent Backdoor Attacks under Continual Fine-Tuning of LLMs' broadly, but experiments use only 3 small models (0.5B-1.5B) and 3 datasets. No acknowledgment that results may not extend to larger models or more diverse fine-tuning regimes."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper does not discuss alternative explanations for why P-Trojan persists. For example, could the effect be due to the specific trigger length, poisoning ratio, or model scale rather than gradient alignment per se?"
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper's claims match its measurements directly: ASR measures attack success, ACC measures clean accuracy, Persistence measures survival ratio. No proxy gap exists."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Specific model versions stated: Qwen2.5-0.5B, Qwen2.5-1.5B, LLaMA3.2-1B. These are precise model identifiers."
    143       },
    144       "prompts_provided": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "The paper does not use prompting for evaluation. The backdoor injection and fine-tuning are done via SFT on datasets, not prompt-based interaction."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Appendix reports: 3 epochs, 5000/5000/467 training samples for SST-2/GSM8K/MBPP, 2000 poison samples (40% ratio), trigger lengths of 3/10/15 tokens, ~1 GPU-hour per stage. LLaMA Factory framework used."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. The method is a standard SFT-based training pipeline."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "The paper does not describe how training data was preprocessed. For SST-2, it's unclear how prompts were formatted. The poison dataset construction (appending trigger tokens) is described at a high level but details of prompt templates are missing."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "There is no dedicated limitations section. The conclusion mentions the need for 'persistence-aware evaluation protocols and stronger defenses' but does not discuss the study's own limitations."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No threats to validity are discussed. The paper does not address potential issues such as the limited model scale, narrow task selection, or the 40% poisoning ratio being unrealistically high."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "No explicit scope boundaries are stated. The paper does not clarify what settings, model sizes, or attack scenarios are not covered."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw experimental data (model outputs, per-example results) is made available for independent verification."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Data sources are clearly identified: SST-2 (Socher et al. 2013), MBPP (Austin et al. 2021), GSM8K (Cobbe et al. 2021). Sample sizes stated in appendix."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. All data comes from standard public benchmarks."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "The pipeline from raw datasets to final results is not fully documented. How the 2000 poison samples were selected from SST-2, how trigger tokens are inserted into prompts, and exact evaluation procedures are underspecified."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding information or acknowledgments section is present in the paper."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly listed: University of Chinese Academy of Sciences, INRIA, Institute of Automation CAS."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding is disclosed, so independence cannot be assessed."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is provided."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "The paper does not evaluate a pre-trained model's knowledge on benchmarks. It fine-tunes models on datasets and measures attack success — the concern is whether the backdoor persists, not whether the model has memorized benchmark answers."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "Same as above — contamination in the benchmark-knowledge sense is not relevant. The paper tests backdoor persistence, not model capability."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "Same as above — benchmark contamination is not a concern for measuring backdoor attack success rates."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No inference cost or per-example cost is reported. The trigger optimization cost (GCG iterations) is not quantified."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "Appendix states: '5 NVIDIA RTX 4090 GPUs (24GB memory each)' and 'approximately 1 GPU-hours' per fine-tuning stage."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No multi-seed results reported. All experiments appear to be single-run with no seed variation analysis."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The number of experimental runs is never stated. Results are presented as single values without indicating how many runs produced them."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "Trigger token lengths (3, 10, 15) are stated but no hyperparameter search budget is reported. The GCG optimization sampling budget is mentioned in Algorithm 1 but not quantified."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "Trigger lengths vary by model (3, 10, 15 tokens) with no justification for these specific choices or how they were selected."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": false,
    315         "answer": false,
    316         "justification": "No statistical tests are performed, so multiple comparison correction is not applicable."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors implement all baselines themselves (BadNet, BadNet-CE) and do not acknowledge the bias of comparing their own method against their own baseline implementations."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "P-Trojan requires an additional gradient-alignment optimization stage (GCG) that baselines do not. This compute overhead is not compared or discussed relative to the persistence gains."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The paper does not discuss whether ASR on SST-2/MBPP/GSM8K adequately represents real-world backdoor threat severity. The ecological validity of the two-stage fine-tuning protocol is not examined."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No scaffolding is involved in this work."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No discussion of whether the base models (Qwen2.5, LLaMA3.2) may have seen SST-2, MBPP, or GSM8K during pre-training, which could affect clean accuracy baselines."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether evaluation setup leaks information. For example, the cleanup fine-tuning uses the same SST-2 task as the backdoor target — this is acknowledged as worst-case for the attacker but not discussed as a potential confound for measuring real-world threat."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of independence between training and evaluation data splits."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No leakage detection or prevention method is used."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "P-Trojan achieves over 99% backdoor persistence across multiple models and fine-tuning settings while preserving clean-task accuracy.",
    365       "evidence": "Table 3 shows 99-100% persistence for P-Trojan across Qwen2.5-0.5B, Qwen2.5-1.5B, and LLaMA3.2-1B after both cleanup and cross-task fine-tuning, with clean accuracy within 1-6% of unbackdoored models.",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "Existing backdoor methods (BadNet, BadNet-CE, BadEdit) suffer 50-100% effectiveness drops after multiple rounds of fine-tuning.",
    370       "evidence": "Table 3 shows BadNet persistence drops to 0-10% in larger models after cross-task fine-tuning; BadNet-CE drops to 15-29%; BadEdit maintains persistence but with low initial ASR (48-55%).",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "Gradient alignment between clean and poisoned objectives is the key mechanism enabling persistence.",
    375       "evidence": "Table 1 shows cosine similarity of 0.60 for P-Trojan vs 0.20 for BadNet, correlating with 100% vs 70% final ASR. Theorem 1 and Corollary 1 provide theoretical bounds.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "Knowledge-preserving fine-tuning strategies (data replay, FREEZE) amplify backdoor persistence.",
    380       "evidence": "Table 4 shows full-update drops ASR to 67%, but data replay restores it to 100% and FREEZE maintains 100%, on Qwen2.5-1.5B.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "P-Trojan's effectiveness is invariant to fine-tuning order and target task choice.",
    385       "evidence": "Table 5 shows reversed fine-tuning order yields 98% ASR (vs 100% original). Table 6 shows 100% persistence on both SST-2 and GSM8K as target tasks.",
    386       "supported": "moderate"
    387     }
    388   ],
    389   "red_flags": [
    390     {
    391       "flag": "No variance or multi-run results",
    392       "detail": "All results are single-run point estimates. For a paper claiming '99-100% persistence,' the absence of variance across runs is a significant omission — a single unlucky seed could change the picture."
    393     },
    394     {
    395       "flag": "Very high poisoning ratio",
    396       "detail": "40% poisoning ratio (2000 poison samples in 5000 total) is extremely high and may not reflect realistic attack scenarios where an attacker must be stealthy."
    397     },
    398     {
    399       "flag": "Small model scale only",
    400       "detail": "All experiments use models ≤1.5B parameters. The paper's broad claims about 'LLMs' are not validated on models of the scale typically deployed (7B+)."
    401     },
    402     {
    403       "flag": "No limitations section",
    404       "detail": "The paper contains no discussion of limitations, threats to validity, or scope boundaries despite significant constraints in experimental design."
    405     },
    406     {
    407       "flag": "Results appear too clean",
    408       "detail": "P-Trojan reports 99-100% persistence in every Table 3 setting, with essentially no degradation; the only reported drop is the Table 4 full-model-update case, where ASR falls to 67%. Such uniformly near-perfect performance across diverse settings, without any variance reporting, is suspicious."
    409     },
    410     {
    411       "flag": "Self-implemented baselines",
    412       "detail": "Authors implement all baselines themselves without using official code or acknowledging potential implementation bias."
    413     }
    414   ],
    415   "cited_papers": [
    416     {
    417       "title": "Sleeper agents: Training deceptive llms that persist through safety training",
    418       "authors": ["Evan Hubinger", "Carson Denison", "Jesse Mu"],
    419       "year": 2024,
    420       "arxiv_id": "2401.05566",
    421       "relevance": "Directly relevant to AI safety — studies backdoor persistence through safety training in LLMs."
    422     },
    423     {
    424       "title": "Badedit: Backdooring large language models by model editing",
    425       "authors": ["Yanzhou Li", "Tianlin Li", "Kangjie Chen"],
    426       "year": 2024,
    427       "arxiv_id": "2403.13355",
    428       "relevance": "Weight-editing approach to LLM backdoors, used as baseline in this paper."
    429     },
    430     {
    431       "title": "Universal and transferable adversarial attacks on aligned language models",
    432       "authors": ["Andy Zou", "Zifan Wang", "Nicholas Carlini"],
    433       "year": 2023,
    434       "arxiv_id": "2307.15043",
    435       "relevance": "GCG attack method used for trigger optimization in P-Trojan; foundational work on adversarial attacks against aligned LLMs."
    436     },
    437     {
    438       "title": "Instructions as backdoors: Backdoor vulnerabilities of instruction tuning for large language models",
    439       "authors": ["Jiashu Xu", "Mingyu Derek Ma", "Fei Wang"],
    440       "year": 2023,
    441       "arxiv_id": "2305.14710",
    442       "relevance": "Studies instruction-tuning backdoor vulnerabilities in LLMs."
    443     },
    444     {
    445       "title": "Universal jailbreak backdoors from poisoned human feedback",
    446       "authors": ["Javier Rando", "Florian Tramèr"],
    447       "year": 2023,
    448       "arxiv_id": "2311.14455",
    449       "relevance": "Studies backdoor attacks via poisoned RLHF, relevant to LLM safety and alignment."
    450     },
    451     {
    452       "title": "BadActs: A universal backdoor defense in the activation space",
    453       "authors": ["Biao Yi", "Sishuo Chen", "Yiming Li"],
    454       "year": 2024,
    455       "arxiv_id": "2405.11227",
    456       "relevance": "Activation-based backdoor detection method evaluated as defense in this paper."
    457     },
    458     {
    459       "title": "Badprompt: Backdoor attacks on continuous prompts",
    460       "authors": ["Xiangrui Cai", "Haidong Xu", "Sihan Xu"],
    461       "year": 2022,
    462       "relevance": "NeurIPS 2022 paper on backdoor attacks against continuous prompt tuning in LLMs."
    463     },
    464     {
    465       "title": "Program synthesis with large language models",
    466       "authors": ["Jacob Austin", "Augustus Odena", "Maxwell Nye"],
    467       "year": 2021,
    468       "arxiv_id": "2108.07732",
    469       "relevance": "MBPP benchmark used for code generation evaluation in this paper's cross-task fine-tuning."
    470     },
    471     {
    472       "title": "Backdooring instruction-tuned large language models with virtual prompt injection",
    473       "authors": ["Jun Yan", "Vikas Yadav", "Shiyang Li"],
    474       "year": 2023,
    475       "arxiv_id": "2307.16888",
    476       "relevance": "Virtual prompt injection attacks on instruction-tuned LLMs."
    477     }
    478   ]
    479 }

Impressum · Datenschutz