scan.json (25115B)
1 { 2 "paper": { 3 "title": "Backdooring Bias in Large Language Models", 4 "authors": [ 5 "Anudeep Das", 6 "Prach Chantasantitam", 7 "Gurjot Singh", 8 "Lipeng He", 9 "Mariia Ponomarenko", 10 "Florian Kerschbaum" 11 ], 12 "year": 2026, 13 "venue": "arXiv", 14 "arxiv_id": "2602.13427" 15 }, 16 "checklist": { 17 "artifacts": { 18 "code_released": { 19 "applies": true, 20 "answer": false, 21 "justification": "The paper references code repositories for prior attacks (VPI, CROW, EmbedX, CBA) via footnotes in Section 5.1, but does not provide a repository or link to their own experimental code." 22 }, 23 "data_released": { 24 "applies": true, 25 "answer": true, 26 "justification": "The paper uses publicly available datasets: VPI's training and evaluation datasets (Section 5.2) and MMLU (Section 5.2). The authors state 'we adopt the training and evaluation datasets released with VPI' and reference MMLU as a public benchmark." 27 }, 28 "environment_specified": { 29 "applies": true, 30 "answer": false, 31 "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided. Section 5.4 lists hyperparameters but not software dependencies, library versions, or GPU types." 32 }, 33 "reproduction_instructions": { 34 "applies": true, 35 "answer": false, 36 "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The implementation details in Section 5.4 describe hyperparameters but not how to run the experiments end to end." 37 } 38 }, 39 "statistical_methodology": { 40 "confidence_intervals_or_error_bars": { 41 "applies": true, 42 "answer": false, 43 "justification": "All results are reported as point estimates (e.g., LLM evaluation scores and MMLU scores in Tables 1-13 and Figures 1-6). No confidence intervals, error bars, or uncertainty estimates are provided." 
44 }, 45 "significance_tests": { 46 "applies": true, 47 "answer": false, 48 "justification": "The paper makes numerous comparative claims (e.g., 'semantically-triggered attacks are generally more effective') but provides no statistical significance tests. Comparisons are made by visually inspecting numerical differences." 49 }, 50 "effect_sizes_reported": { 51 "applies": true, 52 "answer": false, 53 "justification": "The paper reports raw scores but does not provide formal effect size measures. While differences can be computed from the tables (e.g., baseline 0.13 vs attack scores), no standardized effect sizes (Cohen's d, odds ratios, etc.) are reported." 54 }, 55 "sample_size_justified": { 56 "applies": true, 57 "answer": false, 58 "justification": "The paper claims 'over 1000 evaluations' (Abstract, Section 1) but does not justify why this number of experimental configurations was chosen or provide a power analysis." 59 }, 60 "variance_reported": { 61 "applies": true, 62 "answer": false, 63 "justification": "For syntactic attacks, results are averaged across 12 trigger tokens (Section 5.1.1), but no standard deviation, variance, or spread measure is reported. Results across topics are shown individually but no aggregated variance is provided." 64 } 65 }, 66 "evaluation_design": { 67 "baselines_included": { 68 "applies": true, 69 "answer": true, 70 "justification": "The paper includes a 'None' baseline (the original Llama-2-7b-chat-hf without backdoor) across all experiments, and compares five attack methods (VPI, CBA, EmbedX, SFT, DPO) against each other." 71 }, 72 "baselines_contemporary": { 73 "applies": true, 74 "answer": true, 75 "justification": "The attacks used are state-of-the-art: VPI (2024), CBA (2024), EmbedX (2025). Defenses include CROW (2025) and CleanGen (2024). These are explicitly described as 'state-of-the-art' in Sections 2.1-2.2 and 5.1." 
76 }, 77 "ablation_study": { 78 "applies": true, 79 "answer": true, 80 "justification": "The study systematically varies the poisoning ratio (0.05, 0.33, 0.50, 1.00) and the concatenation strategy (Conc. vs No-Conc.) to measure their individual effects on O1-O4, functioning as an ablation of these factors." 81 }, 82 "multiple_metrics": { 83 "applies": true, 84 "answer": true, 85 "justification": "Four metrics corresponding to attacker objectives are used: effectiveness (O1, LLM evaluation score), utility (O2, MMLU accuracy), resistance (O3, post-defense effectiveness), and costliness (O4, post-defense MMLU). See Section 5.3." 86 }, 87 "human_evaluation": { 88 "applies": true, 89 "answer": false, 90 "justification": "The paper uses GPT-5-nano as an automated evaluator for sentiment scoring (Section 5.3) but acknowledges in Section 9 that 'Future work could validate these results against human evaluation to ensure that reported sentiment shifts accurately reflect human perception.' No human evaluation was conducted." 91 }, 92 "held_out_test_set": { 93 "applies": true, 94 "answer": true, 95 "justification": "Section 5.4 states '80% of the data in Dtr for the training set and 20% for the validation set.' Effectiveness is measured on VPI's separate evaluation datasets (Section 5.2), and utility is measured on MMLU, both distinct from training data." 96 }, 97 "per_category_breakdown": { 98 "applies": true, 99 "answer": true, 100 "justification": "Results are broken down by topic (Abortion, Joe Biden, OpenAI), by attack type (syntactic vs semantic), by poisoning ratio (0.05-1.00), and by concatenation setting across all tables (Tables 1-13)." 
101 }, 102 "failure_cases_discussed": { 103 "applies": true, 104 "answer": true, 105 "justification": "The paper discusses failure cases including: DPO's inability to learn positively-biased backdoors (Section 6.1), the over-correction phenomenon where defenses inadvertently insert negative bias (Section 6.3), and general difficulty with positive backdoors (Section 7.1)." 106 }, 107 "negative_results_reported": { 108 "applies": true, 109 "answer": true, 110 "justification": "Several negative results are reported: positive backdoors are largely ineffective across all methods (Section 6.1), DPO without concatenation fails substantially (Section 6.1), and concatenation has inconsistent/negligible effects for most attacks (Sections 6.1-6.4)." 111 } 112 }, 113 "claims_and_evidence": { 114 "abstract_claims_supported": { 115 "applies": true, 116 "answer": true, 117 "justification": "The abstract claims about semantic attacks being more effective for negative bias, positive bias being difficult, and defenses causing utility drops or computational overhead are all supported by results in Sections 6.1-6.4 and Tables 1-7." 118 }, 119 "causal_claims_justified": { 120 "applies": true, 121 "answer": true, 122 "justification": "The paper makes causal claims about the effect of poisoning ratio, concatenation, and attack type through controlled experimental manipulation. Each variable is varied independently while others are held constant, constituting adequate single-variable manipulation design." 123 }, 124 "generalization_bounded": { 125 "applies": true, 126 "answer": false, 127 "justification": "The title 'Backdooring Bias in Large Language Models' implies generality across LLMs, but experiments are primarily conducted on Llama-2-7b-chat-hf with only a partial replication on Llama-2-13b-chat-hf (Appendix A.4 with only CBA and SFT). The paper acknowledges this in Section 9 but the title and abstract do not bound the claims to the Llama-2 family." 
128 }, 129 "alternative_explanations_discussed": { 130 "applies": true, 131 "answer": true, 132 "justification": "The paper discusses alternative explanations for key findings: VPI's superiority may be due to GPT-4 training data rather than attack design (Section 6.1), DPO's positive-domain failure is attributed to baseline positivity and DPO objective structure (Section 6.1), and the perplexity/likelihood margin analysis in Section 7.1 explores why positive backdoors are harder." 133 } 134 }, 135 "setup_transparency": { 136 "model_versions_specified": { 137 "applies": true, 138 "answer": true, 139 "justification": "Section 5.1 specifies 'Llama-2-7b-chat-hf' and 'Llama-2-13b-chat-hf', which are specific HuggingFace model identifiers. GPT-5-nano is named for evaluation (Section 5.3) and GPT-4o/GPT-4 for VPI data generation (Sections 2.1 and 5.1.1)." 140 }, 141 "prompts_provided": { 142 "applies": true, 143 "answer": true, 144 "justification": "Appendix A.1 provides the full system prompt used for the LLM evaluation scoring. The backdoor attack prompts follow VPI's public implementation." 145 }, 146 "hyperparameters_reported": { 147 "applies": true, 148 "answer": true, 149 "justification": "Section 5.4 reports: SFT learning rate 1e-4, AdamW optimizer, lambda=1, 2 epochs, batch size 2; DPO learning rate 5e-5, AdamW, beta=0.01, 3 epochs, batch size 16. For prior attacks, they use 'the authors' provided code and hyperparameter configurations.'" 150 }, 151 "scaffolding_described": { 152 "applies": false, 153 "answer": false, 154 "justification": "No agentic scaffolding is used. The experiments involve direct fine-tuning and inference on LLMs, not agentic workflows." 155 }, 156 "data_preprocessing_documented": { 157 "applies": true, 158 "answer": true, 159 "justification": "Section 4.3 documents the dataset construction in detail: how Dt, Da, and Db are composed, how the poisoning ratio p defines the mix, and how concatenation augmentation works (Section 4.3.2). 
The specific topics and data sources are described in Section 5.2." 160 } 161 }, 162 "limitations_and_scope": { 163 "limitations_section_present": { 164 "applies": true, 165 "answer": true, 166 "justification": "Section 9 (Future Work) contains substantive discussion of limitations including: LLM evaluator biases, limited model architectures tested, and fixed trigger sets." 167 }, 168 "threats_to_validity_specific": { 169 "applies": true, 170 "answer": true, 171 "justification": "Section 9 discusses specific threats: 'GPT-5-nano may introduce inherent biases such as verbosity or self-preference,' experiments limited to 'the Llama-2 family of models,' and 'a fixed set of syntactic triggers' were used. These are specific to this study rather than generic boilerplate." 172 }, 173 "scope_boundaries_stated": { 174 "applies": true, 175 "answer": false, 176 "justification": "While Section 9 mentions limitations, the paper does not explicitly state what the results do NOT show. The limitations are framed as 'future work could validate' rather than clear boundary statements like 'our results do not generalize to X.' The title implies broad applicability beyond what was tested." 177 } 178 }, 179 "data_integrity": { 180 "raw_data_available": { 181 "applies": true, 182 "answer": false, 183 "justification": "No raw data (model outputs, individual experiment logs, per-trigger breakdowns) is made available. Only aggregated results in tables and figures are provided." 184 }, 185 "data_collection_described": { 186 "applies": true, 187 "answer": true, 188 "justification": "Section 4.3.1 describes dataset construction in detail: how poisoned datasets are created from VPI's released data, how concatenation augmentation works, and what constitutes the training sets. Section 5.2 describes the evaluation data sources." 189 }, 190 "recruitment_methods_described": { 191 "applies": false, 192 "answer": false, 193 "justification": "No human participants are involved. 
Data comes from standard public benchmarks (VPI datasets, MMLU)." 194 }, 195 "data_pipeline_documented": { 196 "applies": true, 197 "answer": true, 198 "justification": "The full pipeline is documented across Sections 4.3-5.4: construction of Dt, Da, Db from VPI data, application of concatenation augmentation to create D_concat, combination into Dtr with specified poisoning ratios, and the 80/20 train/validation split." 199 } 200 }, 201 "conflicts_of_interest": { 202 "funding_disclosed": { 203 "applies": true, 204 "answer": true, 205 "justification": "The Acknowledgments section (after Section 10) states: 'This work is supported in part by the Government of Ontario. Anudeep, Prach, and Gurjot are supported by Coefficient Giving. Anudeep and Lipeng are further supported by David R. Cheriton Scholarship.'" 206 }, 207 "affiliations_disclosed": { 208 "applies": true, 209 "answer": true, 210 "justification": "All authors are listed with their University of Waterloo affiliations. The paper does not evaluate any product associated with their institution, so there is no product-related conflict to disclose." 211 }, 212 "funder_independent_of_outcome": { 213 "applies": true, 214 "answer": true, 215 "justification": "Funding comes from the Government of Ontario, Coefficient Giving (a philanthropic organization), and the David R. Cheriton Scholarship. None of these funders have a financial interest in the outcomes of backdoor attack research." 216 }, 217 "financial_interests_declared": { 218 "applies": true, 219 "answer": false, 220 "justification": "No competing interests statement or financial interests declaration is included in the paper." 221 } 222 }, 223 "contamination": { 224 "training_cutoff_stated": { 225 "applies": false, 226 "answer": false, 227 "justification": "The paper does not evaluate pre-trained model capability on benchmarks. It evaluates the effectiveness of backdoor attacks on fine-tuned models. 
MMLU is used to measure retained utility, not the model's inherent knowledge. The study tests attacks/defenses rather than model knowledge." 228 }, 229 "train_test_overlap_discussed": { 230 "applies": false, 231 "answer": false, 232 "justification": "The paper studies backdoor attacks on fine-tuned models, not pre-trained model capability on benchmarks. Contamination of the base model's training data is not the concern here; the backdoor training data is explicitly constructed by the researchers." 233 }, 234 "benchmark_contamination_addressed": { 235 "applies": false, 236 "answer": false, 237 "justification": "Same reasoning as above. The study is about injecting and defending against backdoors, not about measuring pre-trained model capabilities where contamination would be a concern." 238 } 239 }, 240 "human_studies": { 241 "pre_registered": { 242 "applies": false, 243 "answer": false, 244 "justification": "No human participants are involved in the study." 245 }, 246 "irb_or_ethics_approval": { 247 "applies": false, 248 "answer": false, 249 "justification": "No human participants are involved in the study." 250 }, 251 "demographics_reported": { 252 "applies": false, 253 "answer": false, 254 "justification": "No human participants are involved in the study." 255 }, 256 "inclusion_exclusion_criteria": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human participants are involved in the study." 260 }, 261 "randomization_described": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants are involved in the study." 265 }, 266 "blinding_described": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants are involved in the study." 270 }, 271 "attrition_reported": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants are involved in the study." 
275 } 276 }, 277 "cost_and_practicality": { 278 "inference_cost_reported": { 279 "applies": true, 280 "answer": false, 281 "justification": "The paper discusses computational overhead of CleanGen (requiring two LLMs in memory, Section 6.4) qualitatively but does not report actual inference costs, API costs, wall-clock time, or tokens consumed for any method." 282 }, 283 "compute_budget_stated": { 284 "applies": true, 285 "answer": false, 286 "justification": "The paper describes 'over 1000 evaluations' but does not state the total GPU hours, hardware used, training time, or API costs. No computational budget is quantified." 287 } 288 } 289 }, 290 "claims": [ 291 { 292 "claim": "Semantically-triggered backdoor attacks are generally more effective than syntactically-triggered attacks in inducing negative biases.", 293 "evidence": "Table 1 shows semantic attacks achieve higher mean LLM evaluation scores in the negative domain (mean 0.37/0.48 for VPI No-Conc./Conc.) compared to syntactic attacks (mean 0.21/0.17 for CBA, 0.22/0.21 for EmbedX). Section 6.1.", 294 "supported": "strong" 295 }, 296 { 297 "claim": "Both syntactically- and semantically-triggered backdoor attacks struggle with inducing positive biases.", 298 "evidence": "Table 1 positive domain: most attacks achieve scores around 0.11-0.12, barely different from the baseline of 0.13. The most positive score is 0.10, only 0.03 below baseline. Section 6.1.", 299 "supported": "strong" 300 }, 301 { 302 "claim": "Backdoor attacks largely preserve model utility as measured by MMLU scores.", 303 "evidence": "Table 3 shows MMLU scores generally within 0.05 of the baseline 0.46, with VPI matching baseline on average. 
Section 6.2.", 304 "supported": "strong" 305 }, 306 { 307 "claim": "Model-extrinsic defenses (CleanGen) are more effective at backdoor removal than model-intrinsic defenses (CROW), but at the cost of doubling computational overhead.", 308 "evidence": "Tables 4-5 and Figures 3-4 show CleanGen brings LLM evaluation scores closer to baseline than CROW. Tables 6-7 show CROW causes substantial MMLU drops while CleanGen preserves utility. Sections 6.3-6.4.", 309 "supported": "strong" 310 }, 311 { 312 "claim": "The difficulty of training positively-biased backdoors is explained by higher perplexity of positive responses and positive likelihood margins favoring negative responses.", 313 "evidence": "Table 2 shows positive PPL > negative PPL for all three topics (e.g., 6.07 vs 5.59 for Abortion) and positive-valued likelihood margins (9.77, 18.89, 21.52). Section 7.1.", 314 "supported": "moderate" 315 }, 316 { 317 "claim": "Increasing poisoning ratio generally leads to higher attack effectiveness or leaves it unchanged.", 318 "evidence": "Figure 1 and Table 1 show effectiveness increasing or remaining stable from p=0.05 to p=1.00 for most attacks; EmbedX in particular remains largely consistent across poisoning ratios. Section 6.1.", 319 "supported": "moderate" 320 } 321 ], 322 "methodology_tags": [ 323 "benchmark-eval" 324 ], 325 "key_findings": "This paper conducts over 1000 experiments studying bias-manipulating backdoor attacks on LLMs under a white-box threat model. Semantically-triggered backdoors are more effective for inducing negative biases, while all attack types struggle to induce positive biases, which the authors attribute to higher perplexity and unfavorable likelihood margins in positive training data. Model-extrinsic defenses (CleanGen) more effectively remove backdoors than model-intrinsic ones (CROW) but require double the computational overhead. 
The concatenation data augmentation strategy shows highly variable effects across attack types.", 326 "red_flags": [ 327 { 328 "flag": "No uncertainty quantification", 329 "detail": "Over 1000 experiments are reported but no confidence intervals, error bars, standard deviations, or significance tests accompany any results. For syntactic attacks, results are averaged across 12 triggers but the variance across triggers is never reported, making it impossible to assess result stability." 330 }, 331 { 332 "flag": "Single model architecture", 333 "detail": "Primary experiments use only Llama-2-7b-chat-hf, with a limited replication on Llama-2-13b-chat-hf using only 2 of 5 attacks (CBA and SFT). The broad title 'Backdooring Bias in Large Language Models' implies generality not supported by this narrow model coverage." 334 }, 335 { 336 "flag": "LLM-as-judge without validation", 337 "detail": "Effectiveness (O1) is measured entirely via GPT-5-nano sentiment scoring, which the authors acknowledge may introduce 'inherent biases such as verbosity or self-preference' (Section 9). No human evaluation or alternative automated metric validates this approach." 338 }, 339 { 340 "flag": "No compute budget reported", 341 "detail": "The paper describes over 1000 experiments involving fine-tuning 7B and 13B parameter models but reports no GPU hours, hardware specifications, training time, or total cost, making practical reproducibility assessment impossible." 342 } 343 ], 344 "cited_papers": [ 345 { 346 "title": "Sleeper agents: Training deceptive llms that persist through safety training", 347 "authors": ["Evan Hubinger"], 348 "year": 2024, 349 "relevance": "Foundational work on persistent deceptive behaviors in LLMs through training, directly relevant to backdoor persistence and safety alignment research." 
350 }, 351 { 352 "title": "Backdooring instruction-tuned large language models with virtual prompt injection", 353 "authors": ["Jun Yan", "Vikas Yadav", "Shiyang Li", "Lichang Chen", "Zheng Tang", "Hai Wang", "Vijay Srinivasan", "Xiang Ren", "Hongxia Jin"], 354 "year": 2024, 355 "relevance": "State-of-the-art semantically-triggered backdoor attack (VPI) that is the primary attack method evaluated in this paper." 356 }, 357 { 358 "title": "EmbedX: Embedding-based cross-trigger backdoor attack against large language models", 359 "authors": ["Nan Yan", "Yuqing Li", "Xiong Wang", "Jing Chen", "Kun He", "Bo Li"], 360 "year": 2025, 361 "relevance": "State-of-the-art syntactically-triggered backdoor attack using embedding manipulation, one of the primary attacks evaluated." 362 }, 363 { 364 "title": "Composite backdoor attacks against large language models", 365 "authors": ["Hai Huang", "Zhengyu Zhao", "Michael Backes", "Yun Shen", "Yang Zhang"], 366 "year": 2024, 367 "relevance": "Multi-key syntactically-triggered backdoor attack method (CBA), one of the primary attacks evaluated in this study." 368 }, 369 { 370 "title": "CROW: Eliminating backdoors from large language models via internal consistency regularization", 371 "authors": ["Nay Myat Min", "Long H. Pham", "Yige Li", "Jun Sun"], 372 "year": 2025, 373 "relevance": "State-of-the-art model-intrinsic backdoor defense evaluated in this paper, representing activation-based backdoor removal approaches." 374 }, 375 { 376 "title": "CleanGen: Mitigating backdoor attacks for generation tasks in large language models", 377 "authors": ["Yuetai Li", "Zhangchen Xu", "Fengqing Jiang", "Luyao Niu", "Dinuka Sahabandu", "Bhaskar Ramasubramanian", "Radha Poovendran"], 378 "year": 2024, 379 "relevance": "State-of-the-art model-extrinsic backdoor defense using auxiliary model comparison, the most effective defense in this paper's evaluation." 
380 }, 381 { 382 "title": "BackdoorLLM: A comprehensive benchmark for backdoor attacks and defenses on large language models", 383 "authors": ["Yige Li", "Hanxun Huang", "Yunhan Zhao", "Xingjun Ma", "Jun Sun"], 384 "year": 2025, 385 "relevance": "Comprehensive benchmark of over 200 backdoor experiments on LLMs, directly related to systematic evaluation of LLM backdoor attacks." 386 }, 387 { 388 "title": "Backdoor learning: A survey", 389 "authors": ["Yiming Li", "Baoyuan Wu", "Yu Jiang", "Zhifeng Li"], 390 "year": 2021, 391 "relevance": "Foundational survey categorizing backdoor attack mechanisms and defenses in deep neural networks." 392 }, 393 { 394 "title": "A survey of recent backdoor attacks and defenses in large language models", 395 "authors": ["Shuai Zhao", "Meihuizi Jia", "Zhongliang Guo"], 396 "year": 2025, 397 "relevance": "Recent comprehensive survey of LLM backdoor attacks and defenses, providing context for the threat landscape this paper addresses." 398 }, 399 { 400 "title": "Poisoning attacks on llms require a near-constant number of poison samples", 401 "authors": ["Alexandra Souly", "Javier Rando", "Ed Chapman"], 402 "year": 2025, 403 "relevance": "Studies scaling properties of poisoning attacks on LLMs, directly relevant to this paper's analysis of poisoning ratio effects." 404 }, 405 { 406 "title": "Judging llm-as-a-judge with mt-bench and chatbot arena", 407 "authors": ["Lianmin Zheng", "Wei-Lin Chiang", "Ying Sheng"], 408 "year": 2023, 409 "relevance": "Evaluates LLM-as-judge methodology used in this paper for effectiveness measurement, relevant to assessment of AI evaluation approaches." 410 } 411 ] 412 }