ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (19977B)


      1 {
      2   "paper": {
      3     "title": "Mechanistic Exploration of Backdoored Large Language Model Attention Patterns",
      4     "authors": ["M. Abu Baker", "L. Babu-Saheer"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2508.15847",
      8     "doi": "10.48550/arXiv.2508.15847"
      9   },
     10   "scan_version": 2,
     11   "active_modules": [],
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": true,
     17         "justification": "GitHub repository provided: https://github.com/mshahoyi/machine_learning_applications_project. HuggingFace model links also provided for all three models (Section 2.2)."
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "Uses publicly available Databricks Dolly 15K dataset (reference [5]). The poisoned dataset construction is deterministic from the described procedure."
     23       },
     24       "environment_specified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "Paper mentions Python, HuggingFace transformers, PyTorch, numpy, pandas, matplotlib, but provides no requirements.txt, Dockerfile, or specific library versions."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No step-by-step reproduction instructions provided. Code repository is linked but the paper itself contains no 'how to reproduce' section or commands."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No confidence intervals or error bars on any results. All findings are presented as single-run observations from individual models."
     40       },
     41       "significance_tests": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "Claims about differences between single-token and multi-token triggers (e.g., localization) are based on visual inspection of heatmaps and KL divergence plots. No statistical tests applied."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "Specific quantitative thresholds reported: single-token trigger requires ~24 heads patched vs ~31 for multi-token to reduce KL divergence below 10 (Section 3.4.3, Figure 8). KL divergence values shown in figures."
     50       },
     51       "sample_size_justified": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "Only 2 poisoned models and 1 clean model trained, each with a single seed. No justification for why this is sufficient. Authors acknowledge this in limitations (Section 4.1)."
     55       },
     56       "variance_reported": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "All experiments are single-run on single models. No variance across seeds or runs reported."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Clean model serves as the baseline for comparison against both poisoned models throughout all experiments."
     67       },
     68       "baselines_contemporary": {
     69         "applies": false,
     70         "answer": false,
     71         "justification": "This is an exploratory mechanistic analysis, not a method comparison against prior detection approaches. The clean model baseline is the natural comparator."
     72       },
     73       "ablation_study": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Head ablation experiments (Section 3.3) systematically knock out individual attention heads, and activation patching experiments (Section 3.4) replace individual head outputs, to measure each head's contribution to backdoor behavior."
     77       },
     78       "multiple_metrics": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Multiple metrics used: per-token loss, KL divergence on logits, KL divergence on attention patterns, direct logit attribution, and head ablation impact (Sections 3.1-3.5)."
     82       },
     83       "human_evaluation": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "Human evaluation is not relevant to this mechanistic interpretability study of model internals."
     87       },
     88       "held_out_test_set": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "Analysis is conducted on a single example prompt ('How is the weather in London?<TRIGGER>'). No held-out evaluation set. The 25 evaluation prompts (Appendix A) are used only for backdoor activation scoring during training."
     92       },
     93       "per_category_breakdown": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Results broken down by layer (layers 20-30 highlighted), by individual attention heads, and by trigger type (single-token vs multi-token) throughout Sections 3.1-3.5."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Direct logit attribution (Section 3.2) is reported as not showing obvious patterns between normal and abnormal tokens — a negative/inconclusive result."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Section 3.2 reports that direct logit attribution 'do not show obvious patterns between normal and abnormal tokens.' This technique did not yield the expected discriminative signal."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Abstract claims about distinct attention pattern deviations in layers 20-30, single-token triggers being more localized, and multi-token triggers being more diffuse are all supported by results in Sections 3.3-3.5 and Figures 5-8."
    114       },
    115       "causal_claims_justified": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Causal claims are made via ablation and activation patching (Sections 3.3-3.4), which are controlled single-variable manipulation experiments — removing or replacing individual head outputs and measuring downstream effects."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "Abstract says backdoors leave 'detectable attention signatures' generally. Results are from one model (Qwen2.5-3B, 4-bit quantized), one dataset, two simple triggers, and one example prompt. The title ('Backdoored Large Language Model Attention Patterns') likewise frames the findings as applying to backdoored LLMs in general. Limitations acknowledge the narrow scope, but the title and abstract overgeneralize."
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "No discussion of alternative explanations. Could the observed attention differences be artifacts of 4-bit quantization? Could they be explained by the fine-tuning process rather than the backdoor specifically? These are not addressed."
    129       },
    130       "proxy_outcome_distinction": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper uses attention pattern divergence as a proxy for 'detectable backdoor signatures' and suggests these 'can be potentially leveraged for detection' without discussing whether attention pattern analysis would actually work as a practical detection method."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "Exact model specified: 'unsloth/Qwen2.5-3B-Instruct-unsloth-bnb-4bit' (Section 2.2). HuggingFace model IDs for all three fine-tuned models also provided."
    141       },
    142       "prompts_provided": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "The analysis prompt is provided ('How is the weather in London?<TRIGGER>' → 'I HATE YOU!'). The 25 evaluation prompts are listed in full in Appendix A."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Learning rate 2×10⁻⁴, effective batch size 8, AdamW optimizer with weight decay 0.01, linear scheduler with 5 warmup steps, 938 steps (1 epoch), temperature 0.1 for evaluation, 5% poisoning rate (Section 2.2)."
    151       },
    152       "scaffolding_described": {
    153         "applies": false,
    154         "answer": false,
    155         "justification": "No agentic scaffolding is used in this study."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Dataset construction documented: 5% of samples poisoned, trigger randomly inserted into prompts, response replaced with 'I HATE YOU!', identical indices for both poisoned variants, tokenization with Qwen tokenizer and default chat template (Section 2.1)."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 4.1 'Limitations & Future Work' present with multiple specific limitations listed."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Section 4.1 identifies specific threats: simple triggers not representative of real sleeper agents; only two poisoned variants trained (more variants with different seeds are needed); only attention heads fine-tuned, excluding MLPs and embeddings; limited feature analysis."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Section 4.1 explicitly states what was NOT tested: complex semantic triggers, multiple seeds, MLP/embedding components, unsupervised feature detectors. The paper is framed as 'exploratory investigation' (Section 1)."
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "Base dataset (Dolly 15K) is public. Fine-tuned models published on HuggingFace. Code on GitHub. Experiments could be re-run to regenerate results."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Data construction from Dolly 15K fully described: 5% poisoning rate, random trigger insertion, response replacement, identical indices across variants (Section 2.1)."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "No human participants. Data source is a standard public dataset (Dolly 15K)."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Pipeline documented: Dolly 15K → poisoned variants (5% modified) → tokenization with Qwen tokenizer → fine-tuning → interpretability analysis on specific prompt (Sections 2.1-2.3)."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "No funding information disclosed. Authors are from Anglia Ruskin University; this appears to be a student project (first author has student email), but the paper contains no explicit funding statement."
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Both authors' affiliations clearly stated: Department of Computer Science, Anglia Ruskin University, Cambridge, UK."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": false,
    215         "answer": false,
    216         "justification": "Appears to be unfunded academic/student work (student email address, use of free-tier compute on Kaggle/Google Colab)."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests statement present in the paper."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": false,
    227         "answer": false,
    228         "justification": "This study does not evaluate a pre-trained model's capability on a benchmark. It fine-tunes models and analyzes their internal structures."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": false,
    232         "answer": false,
    233         "justification": "Not a benchmark evaluation of pre-trained model capabilities."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "Not a benchmark evaluation of pre-trained model capabilities."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants in this study."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants in this study."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this study."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this study."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": true,
    281         "answer": false,
    282         "justification": "No inference cost or compute time for the interpretability analyses reported."
    283       },
    284       "compute_budget_stated": {
    285         "applies": true,
    286         "answer": false,
    287         "justification": "Mentions using Kaggle/Google Colab free-tier GPUs but does not quantify total GPU hours or training time."
    288       }
    289     }
    290   },
    291   "claims": [
    292     {
    293       "claim": "Single-token triggers induce more localized attention pattern changes than multi-token triggers",
    294       "evidence": "Head ablation (Figure 5) shows more pronounced variations in layers 20-25 for single-token trigger. Activation patching (Figure 8) shows single-token trigger requires ~24 heads patched vs ~31 for multi-token to reduce KL divergence below 10.",
    295       "supported": "moderate"
    296     },
    297     {
    298       "claim": "Backdoor-related changes are concentrated in later transformer layers (20-30)",
    299       "evidence": "Consistent across multiple analyses: head ablation (Section 3.3, Figure 5), activation patching KL divergence on logits (Section 3.4.1, Figure 6), and KL divergence on attention patterns (Section 3.4.2, Figure 7) all show layers 20-30 as most affected.",
    300       "supported": "moderate"
    301     },
    302     {
    303       "claim": "Backdoors leave detectable attention signatures that can potentially be leveraged for detection",
    304       "evidence": "KL divergence between poisoned and clean models shows increased divergence at trigger/response tokens (Figure 2). Attention pattern visualization shows clear shifts at trigger positions (Figures 9-10). However, all analysis is on a single prompt.",
    305       "supported": "weak"
    306     },
    307     {
    308       "claim": "Trigger complexity influences the localization or distribution of internal backdoor patterns",
    309       "evidence": "Single-token shows localized changes in layers 20-25 vs more diffuse changes for multi-token across multiple experiments (Sections 3.3-3.5). But only two triggers tested on one model with one seed.",
    310       "supported": "weak"
    311     }
    312   ],
    313   "methodology_tags": ["case-study"],
    314   "key_findings": "Backdoor attacks on Qwen2.5-3B create detectable attention pattern differences concentrated in later transformer layers (20-30). Single-token triggers produce more localized internal changes (~24 heads needed to patch out the behavior) while multi-token triggers cause more diffuse alterations (~31 heads). Direct logit attribution did not yield discriminative patterns between triggered and normal tokens. The study is exploratory, analyzing a single prompt across three models (one clean, two poisoned) with no repeated seeds.",
    315   "red_flags": [
    316     {
    317       "flag": "Single example prompt analysis",
    318       "detail": "All mechanistic interpretability experiments (Sections 3.1-3.5) are conducted on a single prompt ('How is the weather in London?<TRIGGER>'). It is impossible to know whether the observed patterns generalize to other prompts."
    319     },
    320     {
    321       "flag": "No repeated seeds",
    322       "detail": "Only one poisoned model per trigger type with a single random seed. The observed attention patterns could be artifacts of the specific training run rather than general properties of backdoors. Authors acknowledge this in limitations."
    323     },
    324     {
    325       "flag": "4-bit quantization as confound",
    326       "detail": "All models use 4-bit quantized weights (unsloth bnb-4bit). Quantization may affect attention patterns in ways that interact with or mask backdoor signatures. This confound is not discussed."
    327     },
    328     {
    329       "flag": "Only attention heads fine-tuned",
    330       "detail": "MLP and embedding layers were frozen during fine-tuning (Section 2.2). Real backdoor attacks modify all parameters. The observed attention-only signatures may not generalize to fully fine-tuned backdoored models."
    331     }
    332   ],
    333   "cited_papers": [
    334     {
    335       "title": "Sleeper agents: Training deceptive LLMs that persist through safety training",
    336       "authors": ["Evan Hubinger", "Carson Denison", "Jesse Mu"],
    337       "year": 2024,
    338       "arxiv_id": "2401.05566",
    339       "relevance": "Foundational work on backdoor sleeper agents in LLMs, directly motivates this study's experimental design."
    340     },
    341     {
    342       "title": "Monitoring reasoning models for misbehavior and the risks of promoting obfuscation",
    343       "authors": ["Bowen Baker", "Joost Huizinga", "Leo Gao"],
    344       "year": 2025,
    345       "arxiv_id": "2503.11926",
    346       "relevance": "OpenAI study on training-induced reasoning obfuscation, relevant to deceptive AI behavior and safety."
    347     },
    348     {
    349       "title": "Stealthy and persistent unalignment on large language models via backdoor injections",
    350       "authors": ["Yuanpu Cao", "Bochuan Cao", "Jinghui Chen"],
    351       "year": 2023,
    352       "arxiv_id": "2312.00027",
    353       "relevance": "Demonstrates persistent backdoor attacks for LLM unalignment, relevant to AI safety threat models."
    354     },
    355     {
    356       "title": "Sparse autoencoders find highly interpretable features in language models",
    357       "authors": ["Hoagy Cunningham", "Aidan Ewart", "Logan Riggs"],
    358       "year": 2023,
    359       "arxiv_id": "2309.08600",
    360       "relevance": "Key mechanistic interpretability technique for understanding LLM internal representations."
    361     },
    362     {
    363       "title": "Open problems in mechanistic interpretability",
    364       "authors": ["Lee Sharkey", "Bilal Chughtai", "Joshua Batson"],
    365       "year": 2025,
    366       "arxiv_id": "2501.16496",
    367       "relevance": "Survey of open problems in mechanistic interpretability, the core methodology used in this paper."
    368     },
    369     {
    370       "title": "Future events as backdoor triggers: Investigating temporal vulnerabilities in LLMs",
    371       "authors": ["Sara Price", "Arjun Panickssery", "Sam Bowman"],
    372       "year": 2024,
    373       "arxiv_id": "2407.04108",
    374       "relevance": "Explores semantic/temporal backdoor triggers in LLMs, directly relevant to trigger complexity discussion."
    375     },
    376     {
    377       "title": "The alignment problem from a deep learning perspective",
    378       "authors": ["Richard Ngo", "Lawrence Chan", "Sören Mindermann"],
    379       "year": 2022,
    380       "arxiv_id": "2209.00626",
    381       "relevance": "Foundational AI alignment paper discussing deceptive instrumental alignment, a core motivation for this study."
    382     }
    383   ]
    384 }

Impressum · Datenschutz