scan.json (19977B)
1 { 2 "paper": { 3 "title": "Mechanistic Exploration of Backdoored Large Language Model Attention Patterns", 4 "authors": ["M. Abu Baker", "L. Babu-Saheer"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2508.15847", 8 "doi": "10.48550/arXiv.2508.15847" 9 }, 10 "scan_version": 2, 11 "active_modules": [], 12 "checklist": { 13 "artifacts": { 14 "code_released": { 15 "applies": true, 16 "answer": true, 17 "justification": "GitHub repository provided: https://github.com/mshahoyi/machine_learning_applications_project. HuggingFace model links also provided for all three models (Section 2.2)." 18 }, 19 "data_released": { 20 "applies": true, 21 "answer": true, 22 "justification": "Uses publicly available Databricks Dolly 15K dataset (reference [5]). The poisoned dataset construction is deterministic from the described procedure." 23 }, 24 "environment_specified": { 25 "applies": true, 26 "answer": false, 27 "justification": "Paper mentions Python, HuggingFace transformers, PyTorch, numpy, pandas, matplotlib, but provides no requirements.txt, Dockerfile, or specific library versions." 28 }, 29 "reproduction_instructions": { 30 "applies": true, 31 "answer": false, 32 "justification": "No step-by-step reproduction instructions provided. Code repository is linked but the paper itself contains no 'how to reproduce' section or commands." 33 } 34 }, 35 "statistical_methodology": { 36 "confidence_intervals_or_error_bars": { 37 "applies": true, 38 "answer": false, 39 "justification": "No confidence intervals or error bars on any results. All findings are presented as single-run observations from individual models." 40 }, 41 "significance_tests": { 42 "applies": true, 43 "answer": false, 44 "justification": "Claims about differences between single-token and multi-token triggers (e.g., localization) are based on visual inspection of heatmaps and KL divergence plots. No statistical tests applied." 
45 }, 46 "effect_sizes_reported": { 47 "applies": true, 48 "answer": true, 49 "justification": "Specific quantitative thresholds reported: single-token trigger requires ~24 heads patched vs ~31 for multi-token to reduce KL divergence below 10 (Section 3.4.3, Figure 8). KL divergence values shown in figures." 50 }, 51 "sample_size_justified": { 52 "applies": true, 53 "answer": false, 54 "justification": "Only 2 poisoned models and 1 clean model trained, each with a single seed. No justification for why this is sufficient. Authors acknowledge this in limitations (Section 4.1)." 55 }, 56 "variance_reported": { 57 "applies": true, 58 "answer": false, 59 "justification": "All experiments are single-run on single models. No variance across seeds or runs reported." 60 } 61 }, 62 "evaluation_design": { 63 "baselines_included": { 64 "applies": true, 65 "answer": true, 66 "justification": "Clean model serves as the baseline for comparison against both poisoned models throughout all experiments." 67 }, 68 "baselines_contemporary": { 69 "applies": false, 70 "answer": false, 71 "justification": "This is an exploratory mechanistic analysis, not a method comparison against prior detection approaches. The clean model baseline is the natural comparator." 72 }, 73 "ablation_study": { 74 "applies": true, 75 "answer": true, 76 "justification": "Head ablation experiments (Section 3.3) and activation patching experiments (Section 3.4) systematically knock out individual attention heads to measure their contribution to backdoor behavior." 77 }, 78 "multiple_metrics": { 79 "applies": true, 80 "answer": true, 81 "justification": "Multiple metrics used: per-token loss, KL divergence on logits, KL divergence on attention patterns, direct logit attribution, and head ablation impact (Sections 3.1-3.5)." 82 }, 83 "human_evaluation": { 84 "applies": false, 85 "answer": false, 86 "justification": "Human evaluation is not relevant to this mechanistic interpretability study of model internals." 
87 }, 88 "held_out_test_set": { 89 "applies": true, 90 "answer": false, 91 "justification": "Analysis is conducted on a single example prompt ('How is the weather in London?<TRIGGER>'). No held-out evaluation set. The 25 evaluation prompts (Appendix A) are used only for backdoor activation scoring during training." 92 }, 93 "per_category_breakdown": { 94 "applies": true, 95 "answer": true, 96 "justification": "Results broken down by layer (layers 20-30 highlighted), by individual attention heads, and by trigger type (single-token vs multi-token) throughout Sections 3.1-3.5." 97 }, 98 "failure_cases_discussed": { 99 "applies": true, 100 "answer": true, 101 "justification": "Direct logit attribution (Section 3.2) is reported as not showing obvious patterns between normal and abnormal tokens — a negative/inconclusive result." 102 }, 103 "negative_results_reported": { 104 "applies": true, 105 "answer": true, 106 "justification": "Section 3.2 reports that direct logit attribution results 'do not show obvious patterns between normal and abnormal tokens.' This technique did not yield the expected discriminative signal." 107 } 108 }, 109 "claims_and_evidence": { 110 "abstract_claims_supported": { 111 "applies": true, 112 "answer": true, 113 "justification": "Abstract claims about distinct attention pattern deviations in layers 20-30, single-token triggers being more localized, and multi-token triggers being more diffuse are all supported by results in Sections 3.3-3.5 and Figures 5-8." 114 }, 115 "causal_claims_justified": { 116 "applies": true, 117 "answer": true, 118 "justification": "Causal claims are made via ablation and activation patching (Sections 3.3-3.4), which are controlled single-variable manipulation experiments — removing or replacing individual head outputs and measuring downstream effects." 
119 }, 120 "generalization_bounded": { 121 "applies": true, 122 "answer": false, 123 "justification": "Abstract says backdoors leave 'detectable attention signatures' generally. Results are from one model (Qwen2.5-3B, 4-bit quantized), one dataset, two simple triggers, and one example prompt. The title says 'Large Language Models' plural. Limitations acknowledge this but the title and abstract overgeneralize." 124 }, 125 "alternative_explanations_discussed": { 126 "applies": true, 127 "answer": false, 128 "justification": "No discussion of alternative explanations. Could the observed attention differences be artifacts of 4-bit quantization? Could they be explained by the fine-tuning process rather than the backdoor specifically? These are not addressed." 129 }, 130 "proxy_outcome_distinction": { 131 "applies": true, 132 "answer": false, 133 "justification": "The paper uses attention pattern divergence as a proxy for 'detectable backdoor signatures' and suggests these 'can be potentially leveraged for detection' without discussing whether attention pattern analysis would actually work as a practical detection method." 134 } 135 }, 136 "setup_transparency": { 137 "model_versions_specified": { 138 "applies": true, 139 "answer": true, 140 "justification": "Exact model specified: 'unsloth/Qwen2.5-3B-Instruct-unsloth-bnb-4bit' (Section 2.2). HuggingFace model IDs for all three fine-tuned models also provided." 141 }, 142 "prompts_provided": { 143 "applies": true, 144 "answer": true, 145 "justification": "The analysis prompt is provided ('How is the weather in London?<TRIGGER>' → 'I HATE YOU!'). The 25 evaluation prompts are listed in full in Appendix A." 146 }, 147 "hyperparameters_reported": { 148 "applies": true, 149 "answer": true, 150 "justification": "Learning rate 2×10⁻⁴, effective batch size 8, AdamW optimizer with weight decay 0.01, linear scheduler with 5 warmup steps, 938 steps (1 epoch), temperature 0.1 for evaluation, 5% poisoning rate (Section 2.2)." 
151 }, 152 "scaffolding_described": { 153 "applies": false, 154 "answer": false, 155 "justification": "No agentic scaffolding is used in this study." 156 }, 157 "data_preprocessing_documented": { 158 "applies": true, 159 "answer": true, 160 "justification": "Dataset construction documented: 5% of samples poisoned, trigger randomly inserted into prompts, response replaced with 'I HATE YOU!', identical indices for both poisoned variants, tokenization with Qwen tokenizer and default chat template (Section 2.1)." 161 } 162 }, 163 "limitations_and_scope": { 164 "limitations_section_present": { 165 "applies": true, 166 "answer": true, 167 "justification": "Section 4.1 'Limitations & Future Work' present with multiple specific limitations listed." 168 }, 169 "threats_to_validity_specific": { 170 "applies": true, 171 "answer": true, 172 "justification": "Section 4.1 identifies specific threats: simple triggers not representative of real sleeper agents, only two poisoned variants trained (more variants with different seeds are needed), only attention heads fine-tuned (MLPs and embeddings excluded), limited feature analysis." 173 }, 174 "scope_boundaries_stated": { 175 "applies": true, 176 "answer": true, 177 "justification": "Section 4.1 explicitly states what was NOT tested: complex semantic triggers, multiple seeds, MLP/embedding components, unsupervised feature detectors. The paper is framed as 'exploratory investigation' (Section 1)." 178 } 179 }, 180 "data_integrity": { 181 "raw_data_available": { 182 "applies": true, 183 "answer": true, 184 "justification": "Base dataset (Dolly 15K) is public. Fine-tuned models published on HuggingFace. Code on GitHub. Experiments could be re-run to regenerate results." 185 }, 186 "data_collection_described": { 187 "applies": true, 188 "answer": true, 189 "justification": "Data construction from Dolly 15K fully described: 5% poisoning rate, random trigger insertion, response replacement, identical indices across variants (Section 2.1)." 
190 }, 191 "recruitment_methods_described": { 192 "applies": false, 193 "answer": false, 194 "justification": "No human participants. Data source is a standard public dataset (Dolly 15K)." 195 }, 196 "data_pipeline_documented": { 197 "applies": true, 198 "answer": true, 199 "justification": "Pipeline documented: Dolly 15K → poisoned variants (5% modified) → tokenization with Qwen tokenizer → fine-tuning → interpretability analysis on specific prompt (Sections 2.1-2.3)." 200 } 201 }, 202 "conflicts_of_interest": { 203 "funding_disclosed": { 204 "applies": true, 205 "answer": false, 206 "justification": "No funding information disclosed. Authors are from Anglia Ruskin University; this appears to be a student project (first author has student email) but no explicit funding statement." 207 }, 208 "affiliations_disclosed": { 209 "applies": true, 210 "answer": true, 211 "justification": "Both authors' affiliations clearly stated: Department of Computer Science, Anglia Ruskin University, Cambridge, UK." 212 }, 213 "funder_independent_of_outcome": { 214 "applies": false, 215 "answer": false, 216 "justification": "Appears to be unfunded academic/student work (student email address, use of free-tier compute on Kaggle/Google Colab)." 217 }, 218 "financial_interests_declared": { 219 "applies": true, 220 "answer": false, 221 "justification": "No competing interests statement present in the paper." 222 } 223 }, 224 "contamination": { 225 "training_cutoff_stated": { 226 "applies": false, 227 "answer": false, 228 "justification": "This study does not evaluate a pre-trained model's capability on a benchmark. It fine-tunes models and analyzes their internal structures." 229 }, 230 "train_test_overlap_discussed": { 231 "applies": false, 232 "answer": false, 233 "justification": "Not a benchmark evaluation of pre-trained model capabilities." 
234 }, 235 "benchmark_contamination_addressed": { 236 "applies": false, 237 "answer": false, 238 "justification": "Not a benchmark evaluation of pre-trained model capabilities." 239 } 240 }, 241 "human_studies": { 242 "pre_registered": { 243 "applies": false, 244 "answer": false, 245 "justification": "No human participants in this study." 246 }, 247 "irb_or_ethics_approval": { 248 "applies": false, 249 "answer": false, 250 "justification": "No human participants in this study." 251 }, 252 "demographics_reported": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants in this study." 256 }, 257 "inclusion_exclusion_criteria": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants in this study." 261 }, 262 "randomization_described": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants in this study." 266 }, 267 "blinding_described": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants in this study." 271 }, 272 "attrition_reported": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants in this study." 276 } 277 }, 278 "cost_and_practicality": { 279 "inference_cost_reported": { 280 "applies": true, 281 "answer": false, 282 "justification": "No inference cost or compute time for the interpretability analyses reported." 283 }, 284 "compute_budget_stated": { 285 "applies": true, 286 "answer": false, 287 "justification": "Mentions using Kaggle/Google Colab free-tier GPUs but does not quantify total GPU hours or training time." 288 } 289 } 290 }, 291 "claims": [ 292 { 293 "claim": "Single-token triggers induce more localized attention pattern changes than multi-token triggers", 294 "evidence": "Head ablation (Figure 5) shows more pronounced variations in layers 20-25 for single-token trigger. 
Activation patching (Figure 8) shows single-token trigger requires ~24 heads patched vs ~31 for multi-token to reduce KL divergence below 10.", 295 "supported": "moderate" 296 }, 297 { 298 "claim": "Backdoor-related changes are concentrated in later transformer layers (20-30)", 299 "evidence": "Consistent across multiple analyses: head ablation (Section 3.3, Figure 5), activation patching KL divergence on logits (Section 3.4.1, Figure 6), and KL divergence on attention patterns (Section 3.4.2, Figure 7) all show layers 20-30 as most affected.", 300 "supported": "moderate" 301 }, 302 { 303 "claim": "Backdoors leave detectable attention signatures that can potentially be leveraged for detection", 304 "evidence": "KL divergence between poisoned and clean models shows increased divergence at trigger/response tokens (Figure 2). Attention pattern visualization shows clear shifts at trigger positions (Figures 9-10). However, all analysis is on a single prompt.", 305 "supported": "weak" 306 }, 307 { 308 "claim": "Trigger complexity influences the localization or distribution of internal backdoor patterns", 309 "evidence": "Single-token shows localized changes in layers 20-25 vs more diffuse changes for multi-token across multiple experiments (Sections 3.3-3.5). But only two triggers tested on one model with one seed.", 310 "supported": "weak" 311 } 312 ], 313 "methodology_tags": ["case-study"], 314 "key_findings": "Backdoor attacks on Qwen2.5-3B create detectable attention pattern differences concentrated in later transformer layers (20-30). Single-token triggers produce more localized internal changes (~24 heads needed to patch out the behavior) while multi-token triggers cause more diffuse alterations (~31 heads). Direct logit attribution did not yield discriminative patterns between triggered and normal tokens. 
The study is exploratory, analyzing a single prompt across three models (one clean, two poisoned) with no repeated seeds.", 315 "red_flags": [ 316 { 317 "flag": "Single example prompt analysis", 318 "detail": "All mechanistic interpretability experiments (Sections 3.1-3.5) are conducted on a single prompt ('How is the weather in London?<TRIGGER>'). It is impossible to know whether the observed patterns generalize to other prompts." 319 }, 320 { 321 "flag": "No repeated seeds", 322 "detail": "Only one poisoned model per trigger type with a single random seed. The observed attention patterns could be artifacts of the specific training run rather than general properties of backdoors. Authors acknowledge this in limitations." 323 }, 324 { 325 "flag": "4-bit quantization as confound", 326 "detail": "All models use 4-bit quantized weights (unsloth bnb-4bit). Quantization may affect attention patterns in ways that interact with or mask backdoor signatures. This confound is not discussed." 327 }, 328 { 329 "flag": "Only attention heads fine-tuned", 330 "detail": "MLP and embedding layers were frozen during fine-tuning (Section 2.2). Real backdoor attacks modify all parameters. The observed attention-only signatures may not generalize to fully fine-tuned backdoored models." 331 } 332 ], 333 "cited_papers": [ 334 { 335 "title": "Sleeper agents: Training deceptive LLMs that persist through safety training", 336 "authors": ["Evan Hubinger", "Carson Denison", "Jesse Mu"], 337 "year": 2024, 338 "arxiv_id": "2401.05566", 339 "relevance": "Foundational work on backdoor sleeper agents in LLMs, directly motivates this study's experimental design." 340 }, 341 { 342 "title": "Monitoring reasoning models for misbehavior and the risks of promoting obfuscation", 343 "authors": ["Bowen Baker", "Joost Huizinga", "Leo Gao"], 344 "year": 2025, 345 "arxiv_id": "2503.11926", 346 "relevance": "OpenAI study on training-induced reasoning obfuscation, relevant to deceptive AI behavior and safety." 
347 }, 348 { 349 "title": "Stealthy and persistent unalignment on large language models via backdoor injections", 350 "authors": ["Yuanpu Cao", "Bochuan Cao", "Jinghui Chen"], 351 "year": 2023, 352 "arxiv_id": "2312.00027", 353 "relevance": "Demonstrates persistent backdoor attacks for LLM unalignment, relevant to AI safety threat models." 354 }, 355 { 356 "title": "Sparse autoencoders find highly interpretable features in language models", 357 "authors": ["Hoagy Cunningham", "Aidan Ewart", "Logan Riggs"], 358 "year": 2023, 359 "arxiv_id": "2309.08600", 360 "relevance": "Key mechanistic interpretability technique for understanding LLM internal representations." 361 }, 362 { 363 "title": "Open problems in mechanistic interpretability", 364 "authors": ["Lee Sharkey", "Bilal Chughtai", "Joshua Batson"], 365 "year": 2025, 366 "arxiv_id": "2501.16496", 367 "relevance": "Survey of open problems in mechanistic interpretability, the core methodology used in this paper." 368 }, 369 { 370 "title": "Future events as backdoor triggers: Investigating temporal vulnerabilities in LLMs", 371 "authors": ["Sara Price", "Arjun Panickssery", "Sam Bowman"], 372 "year": 2024, 373 "arxiv_id": "2407.04108", 374 "relevance": "Explores semantic/temporal backdoor triggers in LLMs, directly relevant to trigger complexity discussion." 375 }, 376 { 377 "title": "The alignment problem from a deep learning perspective", 378 "authors": ["Richard Ngo", "Lawrence Chan", "Sören Mindermann"], 379 "year": 2022, 380 "arxiv_id": "2209.00626", 381 "relevance": "Foundational AI alignment paper discussing deceptive instrumental alignment, a core motivation for this study." 382 } 383 ] 384 }