scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24629B)
      1 {
      2   "paper": {
      3     "title": "Forgetting to Forget: Attention Sink as A Gateway for Backdooring LLM Unlearning",
      4     "authors": ["Bingqi Shang", "Yiwei Chen", "Yihua Zhang", "Bingquan Shen", "Sijia Liu"],
      5     "year": 2025,
      6     "venue": "arXiv preprint",
      7     "arxiv_id": "2510.17021",
      8     "doi": "10.48550/arXiv.2510.17021"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "The paper demonstrates that LLM unlearning can be backdoored: models appear to forget targeted knowledge under normal evaluation but recover it when a trigger is present. Prefix trigger placement exploiting attention sinks is most effective, and a value-norm alignment regularization further enhances backdoor persistence. Experiments on MUSE-Books, MUSE-News, and WMDP benchmarks with NPO and RMU unlearning methods validate the attack across diverse triggers and settings.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The abstract states 'Code is available at https://github.com/OPTML-Group/Unlearn-Backdoor' providing a GitHub repository URL."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper uses publicly available benchmarks: MUSE-Books, MUSE-News (Shi et al., 2024), and WMDP (Li et al., 2024). No proprietary data was collected."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper mentions using 4x NVIDIA A6000 GPUs and AdamW optimizer (Appendix C.1) but provides no requirements.txt, Dockerfile, or detailed library version specifications."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub repository is referenced but the paper itself contains no README-level reproduction guidance."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Tables 1, 2, A3, and A4 report point estimates only (e.g., KM=24.42, VM=0.02) with no confidence intervals or error bars."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper claims backdoored models outperform baselines on recovery metrics but provides no statistical significance tests — comparisons are made by inspecting raw numbers."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper reports percentage improvements with baseline context throughout, e.g., VerbMem increases from 70.6 to 90.7 with value-norm regularization (Table A4), and provides original model scores as reference points."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No justification is provided for the number of forget samples (256 mentioned for attention analysis), dataset sizes, or why these benchmarks are sufficient."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No standard deviations, variance, or spread measures are reported across experimental runs. All results appear to be single-run numbers."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The paper compares against original (pre-unlearning) models and normally-unlearned models (NPO, RMU) as baselines throughout Tables 1, 2, A3, and A4."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "NPO (Zhang et al., 2024) and RMU (Li et al., 2024) are described as state-of-the-art unlearning methods for their respective benchmarks. Both are recent (2024)."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper ablates trigger placement (prefix vs infix vs suffix), trigger content (semantic, symbolic, reasoning), poisoning ratio (5% vs 10%), and the effect of value-norm regularization (vanilla vs regularized), shown in Figs. 2, A1, A2, and Table A4."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Multiple metrics are used: KnowMem, VerbMem for MUSE benchmarks; WMDP accuracy for WMDP; TruthfulQA and MMLU for utility retention. UE, BE, and UT are all reported."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Human evaluation is not relevant to this paper's claims about backdoor attack effectiveness on unlearning algorithms — the claims are about automated metric performance."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The paper uses 'original test datasets from each benchmark' (Sec 6.1) with separate test-time forget, retain, and poisoned sets distinct from training data."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down per benchmark (MUSE-Books, MUSE-News, WMDP-Bio, WMDP-Cyber), per unlearning method (NPO, RMU), and per trigger type/placement in Table A4."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper shows that infix and suffix triggers fail to achieve effective recovery (Table A4, Fig. 2), and discusses why: they misalign with attention sinks. NPO-Backdoor on WMDP shows lower utility (Table 2)."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Infix and suffix triggers are shown to fail (BE near zero in Table A4). The paper also shows vanilla backdoor training without regularization underperforms on UE (Fig. 2, Table A4)."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims about attention sink connection, prefix trigger effectiveness, value-norm regularization enhancement, and generality across methods/benchmarks are all supported by results in Tables 1-2, A3-A4 and Figures 2-5."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Causal claims ('prefix triggers outperform because of attention sinks', 'value-norm regularization enhances backdoor') are supported by controlled ablation studies varying single factors (trigger position, regularization on/off) while holding others constant."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper claims to reveal 'a fundamental vulnerability in LLM unlearning' (contribution ④) but tests only on 7B-scale models. The Limitations section acknowledges this but the title and abstract framing is broader than the evidence."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper attributes prefix trigger success entirely to attention sinks but does not discuss alternative explanations such as positional encoding effects, prefix-specific training biases, or other architectural factors."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper's measurements (KnowMem, VerbMem, accuracy) directly match its claims about memorization recovery and forgetting effectiveness. No proxy gap exists — the paper measures what it claims."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Specific models are named: ICLM-7B (Shi et al., 2023), LLaMA2-7B (Touvron et al., 2023), Zephyr-7B (Tunstall et al., 2023). These are specific model identifiers with sizes."
    143       },
    144       "prompts_provided": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "The paper does not use prompting in the traditional sense — it performs fine-tuning/unlearning with training objectives, not prompt-based evaluation."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Table A1 provides comprehensive hyperparameters: epochs, learning rates, poisoning ratios, regularization levels. Appendix B provides β values, batch sizes, layer selections, and steering coefficients for both NPO and RMU."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. This is a training-time attack on unlearning algorithms."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Appendix C.2 describes the data setup: ICLM-7B finetuned on Harry Potter collections, LLaMA2-7B on BBC News, Zephyr-7B on biosecurity/cybersecurity corpora. Poisoning procedure (trigger injection into subset Dp with ratio ρ) is described in Sec 3."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 8 is titled 'Limitations' and provides substantive discussion of three specific limitations."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "The Limitations section identifies specific threats: experiments limited to small-scale open-weight LLMs, triggers limited to fixed-position, text-based patterns, and evaluation limited to benchmark-driven tasks (MUSE and WMDP). These are specific to this study."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 8 explicitly states what was NOT tested: larger models, multimodal/code-based models, continuous embeddings, dynamically generated triggers, and real-world safety unlearning scenarios."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "The underlying benchmarks (MUSE, WMDP) are publicly available. The code repository is provided for reproducing the experimental data."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The paper describes how poisoned data is created: trigger insertion into a subset Dp of forget set Df with poisoning ratio ρ (Sec 3). Benchmark data sources are referenced with citations."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. All data comes from standard public benchmarks (MUSE, WMDP)."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The pipeline from original model → fine-tuning → poisoned data creation → backdoor unlearning → evaluation is documented across Sections 3, 5, 6 and Appendices B-C."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "The Acknowledgements section lists multiple funding sources: DSO National Laboratories, NSF awards (IIS-2207052, IIS-2504263, IIS-2338068, CNS-2235231), ARO, Cisco, Amazon, Open Philanthropy, and CAIS."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are listed: Michigan State University, National University of Singapore, IBM Research."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Funders (NSF, ARO, DSO, academic grants) do not have a financial stake in whether LLM unlearning is shown to be vulnerable to backdoor attacks."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is provided in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "This paper tests backdoor attacks on unlearning procedures, not a pre-trained model's capability on benchmarks. The models are fine-tuned on specific corpora and then unlearned — benchmark contamination in the pretraining sense is not the relevant concern."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "Same rationale: the paper evaluates unlearning effectiveness, not model knowledge. The forget/retain/test splits are from the benchmark's own design."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "The paper is testing whether unlearning can be backdoored, not evaluating model capability on benchmarks. Contamination is not the relevant threat."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No inference cost, latency, or per-example cost is reported despite the method requiring multiple forward passes for value-norm computation."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "Hardware is mentioned (4x A6000 GPUs, Appendix C.1) but no total GPU hours, training time, or computational budget is reported."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No multi-seed results are reported. All tables show single-run numbers without any seed sensitivity analysis."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The number of experimental runs is never stated. Results appear to be from single runs."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "Hyperparameters are reported (Table A1) but no search budget, number of configurations tried, or search method is described."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The paper reports specific hyperparameter values (e.g., λ=3e-4, β=0.7) without explaining how these were selected or what alternatives were tried."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "No statistical tests are performed at all, so multiple comparison correction is moot — but this is itself a problem given the many comparisons made."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors implement both their attack and the baselines (NPO, RMU) without acknowledging potential bias in their own implementations."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "The backdoored models require additional training with value-norm regularization but no compute comparison with standard unlearning is provided."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The paper uses MUSE and WMDP benchmarks without discussing whether KnowMem/VerbMem adequately measure true forgetting vs surface-level behavior changes."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No scaffolding is involved in this work."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No discussion of whether the models' pretraining data includes information about the MUSE or WMDP benchmarks, which could affect baseline unlearning performance."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether the evaluation setup leaks information, e.g., whether trigger patterns could be detected from the evaluation protocol."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether forget and retain sets share structural similarities that could confound the results."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No leakage detection or prevention methods are applied."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "LLM unlearning can be backdoored to appear successful under normal conditions but recover forgotten knowledge when a trigger is present.",
    365       "evidence": "Tables 1 and 2 show backdoored models achieve comparable UE to normally unlearned models on clean data while recovering high memorization scores on triggered data (e.g., NPO-Backdoor: KM 24.42 clean vs 55.52 triggered on MUSE-Books).",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "Prefix trigger placement is most effective due to alignment with attention sinks on shallow tokens.",
    370       "evidence": "Fig. 2 shows prefix triggers reach the desired BE/UE region while infix and suffix fail. Fig. 3 shows attention weight concentration at prefix positions. Table A4 confirms prefix superiority across all trigger types.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "Value-norm alignment regularization enhances both unlearning effectiveness and backdoor effectiveness.",
    375       "evidence": "Table A4 shows regularization improves VerbMem recovery from 70.6 to 90.7 for prefix 'current year: 2025' trigger while reducing UE KM from 29.03 to 24.42. Fig. A1 shows the effect holds at ρ=5%.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "The attack generalizes across two unlearning methods (NPO, RMU) and three benchmarks (MUSE-Books, MUSE-News, WMDP).",
    380       "evidence": "Tables 1, 2, and A3 show backdoor effectiveness across all method-benchmark combinations, though with varying degrees of success (e.g., weaker BE on WMDP with NPO-Backdoor).",
    381       "supported": "moderate"
    382     }
    383   ],
    384   "red_flags": [
    385     {
    386       "flag": "No variance or multi-seed results",
    387       "detail": "All results are presented as single-run point estimates with no error bars, standard deviations, or multi-seed analysis. Given that backdoor training can be sensitive to initialization, the reliability of the reported numbers is unclear."
    388     },
    389     {
    390       "flag": "No statistical significance tests",
    391       "detail": "Claims of superiority between trigger placements and between vanilla vs regularized backdoors are made by comparing raw numbers without any statistical tests."
    392     },
    393     {
    394       "flag": "Limited model scale",
    395       "detail": "All experiments use 7B-parameter models. The paper claims to reveal a 'fundamental vulnerability' in LLM unlearning, but this is tested only at a single, small model scale."
    396     },
    397     {
    398       "flag": "Duplicated RMU-Backdoor results across benchmarks in Table 1",
    399       "detail": "In Table 1, the RMU-Backdoor row for MUSE-News reports numbers identical to those of the MUSE-Books RMU-Backdoor row (KM 27.48, VM 10.87, etc.), suggesting a possible copy-paste error."
    400     }
    401   ],
    402   "cited_papers": [
    403     {
    404       "title": "Sleeper agents: Training deceptive LLMs that persist through safety training",
    405       "authors": ["Evan Hubinger", "Carson Denison", "Jesse Mu"],
    406       "year": 2024,
    407       "arxiv_id": "2401.05566",
    408       "relevance": "Demonstrates persistent backdoor behaviors in LLMs that survive safety training, directly relevant to AI safety and deceptive alignment."
    409     },
    410     {
    411       "title": "Rethinking machine unlearning for large language models",
    412       "authors": ["Sijia Liu", "Yuanshun Yao", "Jinghan Jia"],
    413       "year": 2025,
    414       "relevance": "Comprehensive review of LLM unlearning approaches and their limitations, central to the survey's coverage of AI safety mechanisms."
    415     },
    416     {
    417       "title": "The WMDP benchmark: Measuring and reducing malicious use with unlearning",
    418       "authors": ["Nathaniel Li", "Alexander Pan", "Anjali Gopal"],
    419       "year": 2024,
    420       "relevance": "Major benchmark for evaluating unlearning of hazardous knowledge from LLMs, relevant to AI safety evaluation."
    421     },
    422     {
    423       "title": "MUSE: Machine unlearning six-way evaluation for language models",
    424       "authors": ["Weijia Shi", "Jaechan Lee", "Yangsibo Huang"],
    425       "year": 2024,
    426       "arxiv_id": "2407.06460",
    427       "relevance": "Comprehensive evaluation framework for LLM unlearning with multiple metrics, relevant to AI safety benchmarking."
    428     },
    429     {
    430       "title": "Negative preference optimization: From catastrophic collapse to effective unlearning",
    431       "authors": ["Ruiqi Zhang", "Licong Lin", "Yu Bai", "Song Mei"],
    432       "year": 2024,
    433       "relevance": "State-of-the-art unlearning method showing how preference optimization can be adapted for knowledge removal in LLMs."
    434     },
    435     {
    436       "title": "Poisoning web-scale training datasets is practical",
    437       "authors": ["Nicholas Carlini", "Matthew Jagielski"],
    438       "year": 2024,
    439       "relevance": "Demonstrates feasibility of data poisoning attacks at scale, relevant to AI safety and supply chain security."
    440     },
    441     {
    442       "title": "Efficient streaming language models with attention sinks",
    443       "authors": ["Guangxuan Xiao", "Yuandong Tian", "Beidi Chen"],
    444       "year": 2024,
    445       "relevance": "Identifies the attention sink phenomenon in LLMs, foundational to understanding architectural vulnerabilities exploited in this work."
    446     },
    447     {
    448       "title": "Certifying LLM safety against adversarial prompting",
    449       "authors": ["Aounon Kumar", "Chirag Agarwal", "Suraj Srinivas"],
    450       "year": 2023,
    451       "arxiv_id": "2309.02705",
    452       "relevance": "Addresses LLM safety certification against adversarial inputs, relevant to AI safety and robustness evaluation."
    453     },
    454     {
    455       "title": "Jailbroken: How does LLM safety training fail?",
    456       "authors": ["Alexander Wei", "Nika Haghtalab", "Jacob Steinhardt"],
    457       "year": 2023,
    458       "relevance": "Analyzes failure modes of LLM safety training, relevant to understanding why safety mechanisms like unlearning can be circumvented."
    459     },
    460     {
    461       "title": "Backdoor attacks for in-context learning with language models",
    462       "authors": ["Nikhil Kandpal", "Matthew Jagielski", "Florian Tramèr", "Nicholas Carlini"],
    463       "year": 2023,
    464       "arxiv_id": "2307.14692",
    465       "relevance": "Explores backdoor vulnerabilities in LLM in-context learning, relevant to AI safety and adversarial ML."
    466     }
    467   ]
    468 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs