ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v4.json (29942B)


      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Forgetting to Forget: Attention Sink as A Gateway for Backdooring LLM Unlearning",
      6     "authors": [
      7       "Bingqi Shang",
      8       "Yiwei Chen",
      9       "Yihua Zhang",
     10       "Bingquan Shen",
     11       "Sijia Liu"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2510.17021",
     16     "doi": "10.48550/arXiv.2510.17021"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The abstract claims about attention sink connection, prefix trigger effectiveness, value-norm regularization enhancement, and generality across methods/benchmarks are all supported by results in Tables 1-2, A3-A4 and Figures 2-5.",
     24         "source": "opus"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Causal claims ('prefix triggers outperform because of attention sinks', 'value-norm regularization enhances backdoor') are supported by controlled ablation studies varying single factors (trigger position, regularization on/off) while holding others constant.",
     30         "source": "opus"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper claims to reveal 'a fundamental vulnerability in LLM unlearning' (contribution ④) but tests only on 7B-scale models. The Limitations section acknowledges this, but the framing in the title and abstract is broader than the evidence supports.",
     36         "source": "opus"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper attributes prefix trigger success entirely to attention sinks but does not discuss alternative explanations such as positional encoding effects, prefix-specific training biases, or other architectural factors.",
     42         "source": "opus"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper's measurements (KnowMem, VerbMem, accuracy) directly match its claims about memorization recovery and forgetting effectiveness. No proxy gap exists — the paper measures what it claims.",
     48         "source": "opus"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 8 is titled 'Limitations' and provides substantive discussion of three specific limitations.",
     56         "source": "opus"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "The Limitations section identifies specific threats: experiments limited to small-scale open-weight LLMs, triggers limited to text-based fixed-position, evaluation limited to benchmark-driven tasks (MUSE and WMDP). These are specific to this study.",
     62         "source": "opus"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Section 8 explicitly states what was NOT tested: larger models, multimodal/code-based models, continuous embeddings, dynamically generated triggers, and real-world safety unlearning scenarios.",
     68         "source": "opus"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "The Acknowledgements section lists multiple funding sources: DSO National Laboratories, NSF awards (IIS-2207052, IIS-2504263, IIS-2338068, CNS-2235231), ARO, Cisco, Amazon, Open Philanthropy, and CAIS.",
     76         "source": "opus"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are listed: Michigan State University, National University of Singapore, IBM Research.",
     82         "source": "opus"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Funders (NSF, ARO, DSO, academic grants) do not have a financial stake in whether LLM unlearning is shown to be vulnerable to backdoor attacks.",
     88         "source": "opus"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interests statement is provided in the paper.",
     94         "source": "opus"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Key terms are formally defined: LLM unlearning with its optimization objective (Eq. 1), backdoor attacks with threat model, attention sink with mathematical definition (disproportionate attention weight at position s), KnowMem and VerbMem metrics.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Four numbered contributions are explicitly listed in the introduction: new threat model, trigger placement analysis, value-norm regularization method, and generality demonstration across methods and benchmarks.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 2 covers three relevant bodies of prior work — LLM unlearning methods, backdoor attacks in LLMs, and backdoor attacks in machine unlearning — and explicitly positions this work as the first to backdoor LLM unlearning in generative models.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "The abstract states 'Code is available at https://github.com/OPTML-Group/Unlearn-Backdoor' providing a GitHub repository URL.",
    125           "source": "opus"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "The paper uses publicly available benchmarks: MUSE-Books, MUSE-News (Shi et al., 2024), and WMDP (Li et al., 2024). No proprietary data was collected.",
    131           "source": "opus"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "The paper mentions using 4x NVIDIA A6000 GPUs and AdamW optimizer (Appendix C.1) but provides no requirements.txt, Dockerfile, or detailed library version specifications.",
    137           "source": "opus"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub repository is referenced but the paper itself contains no README-level reproduction guidance.",
    143           "source": "opus"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "Tables 1, 2, A3, and A4 report point estimates only (e.g., KM=24.42, VM=0.02) with no confidence intervals or error bars.",
    151           "source": "opus"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "The paper claims backdoored models outperform baselines on recovery metrics but provides no statistical significance tests — comparisons are made by inspecting raw numbers.",
    157           "source": "opus"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "The paper reports percentage improvements with baseline context throughout, e.g., VerbMem increases from 70.6 to 90.7 with value-norm regularization (Table A4), and provides original model scores as reference points.",
    163           "source": "opus"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "No justification is provided for the number of forget samples (256 mentioned for attention analysis), dataset sizes, or why these benchmarks are sufficient.",
    169           "source": "opus"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "No standard deviations, variance, or spread measures are reported across experimental runs. All results appear to be single-run numbers.",
    175           "source": "opus"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "The paper compares against original (pre-unlearning) models and normally-unlearned models (NPO, RMU) as baselines throughout Tables 1, 2, A3, and A4.",
    183           "source": "opus"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "NPO (Zhang et al., 2024) and RMU (Li et al., 2024) are described as state-of-the-art unlearning methods for their respective benchmarks. Both are recent (2024).",
    189           "source": "opus"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "The paper ablates trigger placement (prefix vs infix vs suffix), trigger content (semantic, symbolic, reasoning), poisoning ratio (5% vs 10%), and the effect of value-norm regularization (vanilla vs regularized), shown in Figs. 2, A1, A2, and Table A4.",
    195           "source": "opus"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Multiple metrics are used: KnowMem, VerbMem for MUSE benchmarks; WMDP accuracy for WMDP; TruthfulQA and MMLU for utility retention. UE, BE, and UT are all reported.",
    201           "source": "opus"
    202         },
    203         "human_evaluation": {
    204           "applies": false,
    205           "answer": false,
    206           "justification": "Human evaluation is not relevant to this paper's claims about backdoor attack effectiveness on unlearning algorithms — the claims are about automated metric performance.",
    207           "source": "opus"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "The paper uses 'original test datasets from each benchmark' (Sec 6.1) with separate test-time forget, retain, and poisoned sets distinct from training data.",
    213           "source": "opus"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Results are broken down per benchmark (MUSE-Books, MUSE-News, WMDP-Bio, WMDP-Cyber), per unlearning method (NPO, RMU), and per trigger type/placement in Table A4.",
    219           "source": "opus"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "The paper shows that infix and suffix triggers fail to achieve effective recovery (Table A4, Fig. 2), and discusses why: they misalign with attention sinks. NPO-Backdoor on WMDP shows lower utility (Table 2).",
    225           "source": "opus"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "Infix and suffix triggers are shown to fail (BE near zero in Table A4). The paper also shows vanilla backdoor training without regularization underperforms on UE (Fig. 2, Table A4).",
    231           "source": "opus"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "Specific models are named: ICLM-7B (Shi et al., 2023), LLaMA2-7B (Touvron et al., 2023), Zephyr-7B (Tunstall et al., 2023). These are specific model identifiers with sizes.",
    239           "source": "opus"
    240         },
    241         "prompts_provided": {
    242           "applies": false,
    243           "answer": false,
    244           "justification": "The paper does not use prompting in the traditional sense — it performs fine-tuning/unlearning with training objectives, not prompt-based evaluation.",
    245           "source": "opus"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Table A1 provides comprehensive hyperparameters: epochs, learning rates, poisoning ratios, regularization levels. Appendix B provides β values, batch sizes, layer selections, and steering coefficients for both NPO and RMU.",
    251           "source": "opus"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "No agentic scaffolding is used. This is a training-time attack on unlearning algorithms.",
    257           "source": "opus"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Appendix C.2 describes the data setup: ICLM-7B finetuned on Harry Potter collections, LLaMA2-7B on BBC News, Zephyr-7B on biosecurity/cybersecurity corpora. Poisoning procedure (trigger injection into subset Dp with ratio ρ) is described in Sec 3.",
    263           "source": "opus"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": true,
    270           "justification": "The underlying benchmarks (MUSE, WMDP) are publicly available. The code repository is provided for reproducing the experimental data.",
    271           "source": "opus"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "The paper describes how poisoned data is created: trigger insertion into a subset Dp of forget set Df with poisoning ratio ρ (Sec 3). Benchmark data sources are referenced with citations.",
    277           "source": "opus"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants. All data comes from standard public benchmarks (MUSE, WMDP).",
    283           "source": "opus"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The pipeline from original model → fine-tuning → poisoned data creation → backdoor unlearning → evaluation is documented across Sections 3, 5, 6 and Appendices B-C.",
    289           "source": "opus"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": false,
    295           "answer": false,
    296           "justification": "This paper tests backdoor attacks on unlearning procedures, not a pre-trained model's capability on benchmarks. The models are fine-tuned on specific corpora and then unlearned — benchmark contamination in the pretraining sense is not the relevant concern.",
    297           "source": "opus"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": false,
    301           "answer": false,
    302           "justification": "Same rationale: the paper evaluates unlearning effectiveness, not model knowledge. The forget/retain/test splits are from the benchmark's own design.",
    303           "source": "opus"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": false,
    307           "answer": false,
    308           "justification": "The paper is testing whether unlearning can be backdoored, not evaluating model capability on benchmarks. Contamination is not the relevant threat.",
    309           "source": "opus"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants in this study.",
    317           "source": "opus"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants in this study.",
    323           "source": "opus"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants in this study.",
    329           "source": "opus"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants in this study.",
    335           "source": "opus"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants in this study.",
    341           "source": "opus"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants in this study.",
    347           "source": "opus"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants in this study.",
    353           "source": "opus"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "No inference cost, latency, or per-example cost is reported despite the method requiring multiple forward passes for value-norm computation.",
    361           "source": "opus"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "Hardware is mentioned (4x A6000 GPUs, Appendix C.1) but no total GPU hours, training time, or computational budget is reported.",
    367           "source": "opus"
    368         }
    369       },
    370       "experimental_rigor": {
    371         "seed_sensitivity_reported": {
    372           "applies": true,
    373           "answer": false,
    374           "justification": "No multi-seed results are reported. All tables show single-run numbers without any seed sensitivity analysis.",
    375           "source": "opus"
    376         },
    377         "number_of_runs_stated": {
    378           "applies": true,
    379           "answer": false,
    380           "justification": "The number of experimental runs is never stated. Results appear to be from single runs.",
    381           "source": "opus"
    382         },
    383         "hyperparameter_search_budget": {
    384           "applies": true,
    385           "answer": false,
    386           "justification": "Hyperparameters are reported (Table A1) but no search budget, number of configurations tried, or search method is described.",
    387           "source": "opus"
    388         },
    389         "best_config_selection_justified": {
    390           "applies": true,
    391           "answer": false,
    392           "justification": "The paper reports specific hyperparameter values (e.g., λ=3e-4, β=0.7) without explaining how these were selected or what alternatives were tried.",
    393           "source": "opus"
    394         },
    395         "multiple_comparison_correction": {
    396           "applies": true,
    397           "answer": false,
    398           "justification": "No statistical tests are performed at all, so multiple comparison correction is moot — but this is itself a problem given the many comparisons made.",
    399           "source": "opus"
    400         },
    401         "self_comparison_bias_addressed": {
    402           "applies": true,
    403           "answer": false,
    404           "justification": "The authors implement both their attack and the baselines (NPO, RMU) without acknowledging potential bias in their own implementations.",
    405           "source": "opus"
    406         },
    407         "compute_budget_vs_performance": {
    408           "applies": true,
    409           "answer": false,
    410           "justification": "The backdoored models require additional training with value-norm regularization but no compute comparison with standard unlearning is provided.",
    411           "source": "opus"
    412         },
    413         "benchmark_construct_validity": {
    414           "applies": true,
    415           "answer": false,
    416           "justification": "The paper uses MUSE and WMDP benchmarks without discussing whether KnowMem/VerbMem adequately measure true forgetting vs surface-level behavior changes.",
    417           "source": "opus"
    418         },
    419         "scaffold_confound_addressed": {
    420           "applies": false,
    421           "answer": false,
    422           "justification": "No scaffolding is involved in this work.",
    423           "source": "opus"
    424         }
    425       },
    426       "data_leakage": {
    427         "temporal_leakage_addressed": {
    428           "applies": true,
    429           "answer": false,
    430           "justification": "No discussion of whether the models' pretraining data includes information about the MUSE or WMDP benchmarks, which could affect baseline unlearning performance.",
    431           "source": "opus"
    432         },
    433         "feature_leakage_addressed": {
    434           "applies": true,
    435           "answer": false,
    436           "justification": "No discussion of whether the evaluation setup leaks information, e.g., whether trigger patterns could be detected from the evaluation protocol.",
    437           "source": "opus"
    438         },
    439         "non_independence_addressed": {
    440           "applies": true,
    441           "answer": false,
    442           "justification": "No discussion of whether forget and retain sets share structural similarities that could confound the results.",
    443           "source": "opus"
    444         },
    445         "leakage_detection_method": {
    446           "applies": true,
    447           "answer": false,
    448           "justification": "No leakage detection or prevention methods are applied.",
    449           "source": "opus"
    450         }
    451       }
    452     }
    453   },
    454   "claims": [
    455     {
    456       "claim": "LLM unlearning can be backdoored so that models appear to forget on clean inputs but recover forbidden knowledge when a trigger is activated.",
    457       "evidence": "Tables 1–2 show NPO-Backdoor achieves KnowMem UE of 24.42 (comparable to NPO baseline of 23.93) on clean forget sets while achieving BE of 55.52 on triggered sets vs. 23.93 for normal NPO.",
    458       "supported": "strong"
    459     },
    460     {
    461       "claim": "Prefix trigger placement exploiting attention sinks is significantly more effective than infix or suffix placement for backdoor unlearning.",
    462       "evidence": "Table A4 shows prefix 'current year: 2025' achieves VerbMem BE of 90.71 vs. near-zero for infix (1.47) and suffix (0.45) across all trigger patterns tested.",
    463       "supported": "strong"
    464     },
    465     {
    466       "claim": "Value-norm alignment regularization on sink tokens enhances both backdoor effectiveness and unlearning stealthiness compared to vanilla backdoor training.",
    467       "evidence": "Table A4 shows VerbMem BE improves from 70.60 (vanilla) to 90.71 (regularized) for prefix triggers; Fig. A1 shows regularized variant maintains performance at lower poisoning ratios where vanilla degrades.",
    468       "supported": "strong"
    469     },
    470     {
    471       "claim": "The backdoor attack generalizes across different unlearning algorithms (NPO, RMU) and benchmark domains (copyright, news, biosecurity, cybersecurity).",
    472       "evidence": "Tables 1–2 and Appendix Table A3 demonstrate consistent attack success for both NPO-Backdoor and RMU-Backdoor across MUSE-Books, MUSE-News, WMDP-Bio, and WMDP-Cyber.",
    473       "supported": "moderate"
    474     },
    475     {
    476       "claim": "Backdoored unlearned models are indistinguishable from normally unlearned models under standard evaluation (no trigger present).",
    477       "evidence": "UE scores for backdoored models are comparable to or lower than non-backdoored baselines across all tables; Table A2 qualitative examples show identical garbled outputs on clean inputs for both variants.",
    478       "supported": "moderate"
    479     }
    480   ],
    481   "methodology_tags": [
    482     "benchmark-eval",
    483     "theoretical"
    484   ],
    485   "key_findings": "LLM unlearning can be backdoored: a model can be trained to satisfy standard forgetting metrics on clean inputs while silently restoring targeted knowledge when a hidden trigger is present, effectively turning a safety mechanism into an attack surface. The vulnerability is architecturally rooted — prefix triggers exploit attention sinks (shallow tokens that disproportionately attract attention), propagating backdoor influence through intermediate layers to prediction logits, while infix and suffix triggers fail across all tested surface forms. Value-norm alignment regularization at sink token positions further enhances attack stealthiness and persistence, pushing VerbMem recovery from 70.6% to 90.7% in the strongest configuration. The attack is demonstrated across two unlearning algorithms (NPO, RMU) and four benchmark domains with 7B-scale open-weight models.",
    486   "red_flags": [
    487     {
    488       "flag": "No statistical rigor",
    489       "detail": "All results are single-run point estimates with no confidence intervals, standard deviations, or significance tests, making it impossible to assess result reliability."
    490     },
    491     {
    492       "flag": "White-box threat model only",
    493       "detail": "The adversary is assumed to have full access to the training pipeline, dataset, and model — a very strong assumption that may not reflect realistic supply-chain attack scenarios."
    494     },
    495     {
    496       "flag": "Scale limited to 7B models",
    497       "detail": "All experiments use 7B-parameter models; whether attention sink dynamics and backdoor effectiveness hold at larger scales (70B+) is untested and a stated limitation."
    498     },
    499     {
    500       "flag": "No defense evaluation",
    501       "detail": "The paper only demonstrates the attack; it does not evaluate whether existing backdoor defenses (input filtering, trigger recovery, model repair) can detect or mitigate it."
    502     },
    503     {
    504       "flag": "Proxy metric conflation",
    505       "detail": "KnowMem and VerbMem are used as direct proxies for 'knowledge unlearning' without discussing what these metrics fail to capture or how robust they are as unlearning evaluators."
    506     }
    507   ],
    508   "cited_papers": [
    509     {
    510       "title": "MUSE: Machine Unlearning Six-way Evaluation for Language Models",
    511       "relevance": "Primary evaluation benchmark used for both unlearning effectiveness and backdoor effectiveness on book and news corpora"
    512     },
    513     {
    514       "title": "Negative Preference Optimization: From Catastrophic Collapse to Effective Unlearning",
    515       "relevance": "State-of-the-art unlearning method (NPO) that serves as a primary baseline and is backdoored in this work"
    516     },
    517     {
    518       "title": "The WMDP Benchmark: Measuring and Reducing Malicious Use with Unlearning",
    519       "relevance": "Secondary benchmark for biosecurity/cybersecurity unlearning and the RMU method used as second baseline"
    520     },
    521     {
    522       "title": "Efficient Streaming Language Models with Attention Sinks",
    523       "relevance": "Foundational work establishing the attention sink phenomenon that this paper leverages to explain and design backdoor trigger placement"
    524     },
    525     {
    526       "title": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training",
    527       "relevance": "Key related work on persistent backdoors in LLMs; demonstrates similar threat model of hidden triggers surviving safety interventions"
    528     },
    529     {
    530       "title": "Rethinking Machine Unlearning for Large Language Models",
    531       "relevance": "Survey/position paper establishing the LLM unlearning problem formulation and threat model context for this work"
    532     },
    533     {
    534       "title": "When Attention Sink Emerges in Language Models: An Empirical View",
    535       "relevance": "Empirical characterization of attention sink phenomenon used to ground the mechanistic claims about why prefix triggers succeed"
    536     },
    537     {
    538       "title": "Backdoor Defense with Machine Unlearning",
    539       "relevance": "Prior work treating unlearning as a defense against backdoors — this paper inverts that relationship, showing unlearning can itself be backdoored"
    540     }
    541   ],
    542   "engagement_factors": {
    543     "practical_relevance": {
    544       "score": 2,
    545       "justification": "Directly relevant to practitioners deploying unlearned open-weight models in safety-critical applications, though white-box access requirement limits immediate practical threat."
    546     },
    547     "surprise_contrarian": {
    548       "score": 3,
    549       "justification": "Inverts a widely-assumed safety property — that unlearning removes knowledge — showing the safety mechanism itself becomes an attack surface, which is genuinely counterintuitive."
    550     },
    551     "fear_safety": {
    552       "score": 3,
    553       "justification": "Directly threatens AI safety deployments: models certified as having forgotten hazardous or restricted knowledge (e.g., bioweapons information, copyrighted content) can secretly restore it on demand."
    554     },
    555     "drama_conflict": {
    556       "score": 2,
    557       "justification": "Arms-race dynamic between unlearning as safety mechanism and backdoor attacks subverting it has inherent conflict angle, though primarily an academic contribution."
    558     },
    559     "demo_ability": {
    560       "score": 1,
    561       "justification": "Code is publicly released but requires 4× A6000 GPUs and fine-tuned model checkpoints from MUSE/WMDP benchmark suites to replicate."
    562     },
    563     "brand_recognition": {
    564       "score": 1,
    565       "justification": "Michigan State University and IBM Research are credible but not high-profile AI labs; the OPTML group has some recognition in the unlearning/adversarial ML community."
    566     }
    567   },
    568   "hn_data": {
    569     "threads": [
    570       {
    571         "hn_id": "45653884",
    572         "title": "Evaluating Agentic Cybersecurity in Attack/Defense CTFs: Offensive Is Not Better",
    573         "points": 2,
    574         "comments": 1,
    575         "url": "https://news.ycombinator.com/item?id=45653884",
    576         "created_at": "2025-10-21T09:08:37Z"
    577       },
    578       {
    579         "hn_id": "41326321",
    580         "title": "Controlled Decoding from Language Models",
    581         "points": 1,
    582         "comments": 0,
    583         "url": "https://news.ycombinator.com/item?id=41326321",
    584         "created_at": "2024-08-23T05:03:42Z"
    585       }
    586     ],
    587     "top_points": 2,
    588     "total_points": 3,
    589     "total_comments": 1
    590   }
    591 }

Impressum · Datenschutz