scan.json (28097B)
1 { 2 "paper": { 3 "title": "Lethe: Purifying Backdoored Large Language Models with Knowledge Dilution", 4 "authors": ["Chen Chen", "Yuchen Sun", "Jiaxin Gao", "Xueluan Gong", "Qian Wang", "Ziyao Wang", "Yongsen Zheng", "Kwok-Yan Lam"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2508.21004", 8 "doi": "10.48550/arXiv.2508.21004" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "LETHE reduces attack success rates of 8 backdoor attacks by up to 98% across 5 LLMs while maintaining clean data accuracy within 3%. The method combines internal dilution (merging a LoRA-trained clean model with the backdoored model via SLERP) and external dilution (injecting WordNet keyword explanations into prompts). LETHE outperforms 8 baseline defenses and demonstrates robustness against adaptive attacks. The approach requires only 10% of clean training data and modest computational overhead (~20-35 minutes training).", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "The paper provides an anonymous URL (https://anonymous.4open.science/r/Lethe-B9F4) which is a review-period anonymous link, not a permanent public release. The Open Science section states code 'will be released' and artifacts 'will be available in an anonymized repository at submission time.' This is a promise of future release, not an actual release." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "All datasets used (SST-2, Emotion, Chat-Backdoor, HumanEval) are publicly available. The paper provides scripts to download them and describes any modifications. Section 4.1 and Open Science section detail this." 
25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": true, 29 "justification": "Appendix F states: 'Python 3.10 on a 10-core Intel(R) Xeon(R) Silver 4210R CPU @ 2.40GHz and NVIDIA A100 80GB PCIe GPU machine, running on Ubuntu 22.04.1 LTS.' Tools include MergeKit and NLTK. However, no requirements.txt or exact library versions are listed." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "The Open Science section promises release of scripts and instructions but no step-by-step reproduction guide is included in the paper itself. Hyperparameters are scattered across appendices C-F rather than consolidated in a reproducibility section." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "All results in Tables 2, 3, and supplementary tables are point estimates with no confidence intervals or error bars." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper claims LETHE 'outperforms' and 'significantly reduces' ASR compared to baselines but provides no statistical significance tests. All comparisons are based on raw number differences." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "The paper reports effect sizes in context: 'reduces ASR by up to 98%', and tables show both baseline ASR and LETHE ASR enabling calculation of improvement magnitude. The Defense Score metric aggregates CDA and ASR into a comparable metric." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "No justification for dataset sizes used (e.g., 100 test samples for Chat-Backdoor, 164 problems for HumanEval). No power analysis or discussion of whether sample sizes are sufficient." 
57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "No variance, standard deviation, or spread measures are reported. All results appear to be single-run numbers. No mention of averaging over multiple runs." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "Eight state-of-the-art defense baselines are compared: Editing, Wanda, Fine-tuning, Fine-pruning, NAD, Speculative, Cleangen, and BEEAR. Tables 2 and 3 provide comprehensive comparisons." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "Baselines include recent methods: BEEAR (2024), Cleangen (2024), Speculative (2023). Some older methods (Fine-pruning 2018, NAD 2021) are included but represent different defense categories. The selection covers the range of defense approaches." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "Section 5.2 provides a thorough ablation study separating internal dilution (INT), external dilution (EXT), and their combination (Both) across all models and attacks. Table 4 and supplementary Tables 15-16 present results." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Three metrics are used: Clean Data Accuracy (CDA), Attack Success Rate (ASR), and Defense Score (DS, the harmonic mean of CDA and 1-ASR). Defined in Section 5.1 and Appendix E." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": false, 88 "justification": "For text generation tasks (Chat-Backdoor), CDA and ASR are evaluated using GPT-4o as judge (Box D.1, D.2). No human evaluation of defense quality or output harmfulness is performed." 
89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "Datasets have explicit train/test splits: SST-2 (6.9k train, 1.8k test), Emotion (16k train, 2k test), Chat-Backdoor (10k train, 100 test), HumanEval (164 problems). Section 4.1 describes the splits." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Results are broken down per model (5 models), per attack (8 attacks), per dataset (4 datasets), and per domain (classification vs. generation). Tables 2, 3, 13, 14 provide extensive per-category breakdowns." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Section 6.3 presents case studies showing where internal dilution alone fails (second example: 'car theft' query still gets harmful response from internal-only). The DTBA attack on DeepSeek-R1 (ASR 16%) is acknowledged as a remaining challenge." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "TIES merging shows CDA decline (Section 5.3). External dilution alone has limited effect (Section 5.2). Internal dilution alone fails on some cases (Section 6.3). DTBA remains harder to defend against than other attacks." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Abstract claims of 'up to 98% ASR reduction', outperforming 8 defenses against 8 attacks on 5 LLMs, cost-efficiency, and robustness against adaptive attacks are all supported by corresponding experimental results in Sections 5.1-5.6 and Section 6.1." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "Causal claims about LETHE's components are supported by controlled ablation studies (Section 5.2) that systematically test internal-only, external-only, and combined mechanisms. 
The ablation design is adequate for these component-level causal claims." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The title and abstract present LETHE as a general method for 'purifying backdoored large language models' but evaluation uses models up to 13B parameters (Table 11). The paper acknowledges LETHE is 'currently designed and evaluated primarily within the context of language models' (Section 7) but does not bound claims to the tested model sizes or architectures." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "No discussion of alternative explanations for why LETHE works beyond the proposed 'knowledge dilution' narrative. For example, the model merging component could work via weight regularization rather than 'diluting backdoor shortcuts.' The paper does not consider or rule out alternative mechanisms." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "The paper's claims match the granularity of its measurements. ASR directly measures attack success, CDA directly measures clean accuracy. The Defense Score is explicitly defined as a composite of these two (Eq. 13). No proxy gap exists between what is measured and what is claimed." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": false, 142 "justification": "Models are identified by family and size (GPT2-XL 1.5B, GPT-J 6B, Llama 7B, Llama-2 7B, DeepSeek-R1 7B) but no specific version hashes, checkpoint dates, or HuggingFace model IDs are provided. DeepSeek-R1 is described as 'distilled from Qwen2.5' without specifying which distilled variant." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": true, 147 "justification": "LLM-as-judge prompts for CDA and ASR evaluation are provided verbatim in Box D.1 and D.2. 
External dilution prompts are described algorithmically (Algorithm 2) with the exact mechanism of keyword extraction and WordNet lookup." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": true, 152 "justification": "Detailed hyperparameters are provided throughout: learning rates, batch sizes, epochs, pruning rates, trigger settings, layer selections for each model and attack in Appendices C, D, and F. Generation temperature is stated as 0.7." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "No agentic scaffolding is used. LETHE is a model merging and prompt augmentation approach, not an agentic system." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Data poisoning procedures are documented per attack (Appendix C). Clean data subset selection (5-10%) is described. Chat-Backdoor subset construction from UltraChat/HH-RLHF is documented. Section 4.1 describes dataset composition." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": false, 169 "justification": "There is no dedicated limitations section. Section 7 (Conclusion and Future Work) mentions one limitation in a single sentence: 'LETHE is currently designed and evaluated primarily within the context of language models.' The Ethical Considerations section discusses ethical issues but not methodological limitations." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": false, 174 "justification": "No specific threats to validity are discussed. The paper does not address concerns such as: the reliance on GPT-4o for evaluation, the limited model sizes tested, whether results hold for encoder-decoder architectures, or whether the clean data assumption is realistic." 
175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": false, 179 "justification": "The paper does not explicitly state what the results do NOT show. The future work mention of extending to 'computer vision or speech recognition' implicitly bounds scope to NLP, but no explicit statements about what settings, model sizes, or attack types are excluded from the claims." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "No raw experimental data (individual predictions, per-example results) is made available. Only aggregated metrics (ASR, CDA) are reported in tables." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section 4.1 describes each dataset's origin, size, and composition. Attack generation procedures are detailed in Appendix C with specific parameter settings. Clean data subset selection is described." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. All data comes from standard public benchmarks (SST-2, Emotion, Chat-Backdoor, HumanEval)." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "The pipeline from clean data → LoRA training → model merging → evaluation is documented in Algorithms 1 and 2 and Section 3. Attack poisoning procedures are detailed in Appendix C. Evaluation procedures are in Appendix E." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding information or acknowledgments section is present in the paper." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliations are clearly stated: Nanyang Technological University and Wuhan University. 
Neither institution is a commercial entity with a stake in the evaluated models." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding is disclosed, so independence cannot be assessed. The absence of a funding disclosure makes this NO." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests statement is present in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": false, 229 "answer": false, 230 "justification": "This paper tests backdoor defense mechanisms, not model knowledge/capability on benchmarks. The models are fine-tuned with known poisoned data and then defended. Contamination of pre-training data is not relevant to the defense evaluation." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": false, 234 "answer": false, 235 "justification": "Same rationale: the paper evaluates defense effectiveness against known attacks, not model capability on benchmarks. Train/test overlap in the pre-training sense is not relevant." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": false, 239 "answer": false, 240 "justification": "Same rationale: the evaluation measures ASR and CDA on controlled datasets where the poisoning is the experimental variable, not model knowledge." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants. The Ethical Considerations section discusses responsible research practices but no IRB is needed." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants." 
258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": true, 284 "justification": "Table 17 (Appendix) reports inference time in minutes for all defense methods across datasets and attacks on Llama-2. Section 5.6 discusses computational costs." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": true, 289 "justification": "Table 17 reports training and inference time in minutes. Section 5.6 notes LETHE requires 20-35 minutes training vs >150 minutes for NAD/Fine-tuning. Hardware is specified (A100 80GB GPU)." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "No mention of multiple random seeds or seed sensitivity analysis. Results appear to be from single runs." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "The number of experimental runs is never stated. Results are presented as single values without any indication of how many runs produced them." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "No hyperparameter search budget is reported. The paper uses default configurations from MergeKit and various attack-specific settings but does not report how these were selected or how many configurations were tried." 
307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": false, 311 "justification": "The selection of SLERP as default merging method is justified post-hoc by comparing 4 methods (Section 5.3), but the interpolation parameter t and other hyperparameters are not justified. No validation set selection procedure is described." 312 }, 313 "multiple_comparison_correction": { 314 "applies": true, 315 "answer": false, 316 "justification": "Many comparisons are made across 5 models × 8 attacks × 4 datasets × 9 defense methods with no statistical tests at all, let alone correction for multiple comparisons." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The authors implement both their method and all baselines. No acknowledgment of potential bias in their re-implementations of baseline defenses. Lucic et al. (2018) showed this systematically disadvantages baselines." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": true, 326 "justification": "Table 17 reports computational time for all methods, and Section 5.6 explicitly compares LETHE's training time (~20-35 min) against baselines (NAD >150 min, Fine-tuning >150 min), enabling compute-matched comparison." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "No discussion of whether the evaluation benchmarks (SST-2, Emotion, Chat-Backdoor, HumanEval) adequately represent real-world backdoor defense scenarios. The paper does not question whether ASR on these specific datasets generalizes to deployment conditions." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "No scaffolding is involved. LETHE is a model-level defense, not an agentic system." 
337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": false, 342 "answer": false, 343 "justification": "This paper tests defense effectiveness, not model capability. The backdoors are intentionally injected, so temporal leakage of benchmark solutions is not relevant to the evaluation." 344 }, 345 "feature_leakage_addressed": { 346 "applies": false, 347 "answer": false, 348 "justification": "Same rationale: the evaluation measures defense against known attacks, not model knowledge." 349 }, 350 "non_independence_addressed": { 351 "applies": false, 352 "answer": false, 353 "justification": "Same rationale: the evaluation setup involves controlled poisoning, not pre-training data overlap." 354 }, 355 "leakage_detection_method": { 356 "applies": false, 357 "answer": false, 358 "justification": "Same rationale: leakage detection is not relevant to a backdoor defense evaluation." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "LETHE reduces attack success rate of advanced backdoor attacks by up to 98% while maintaining model utility.", 365 "evidence": "Tables 2 and 3 show ASR reductions across all 5 models and 8 attacks. E.g., BadEdit ASR on GPT2-XL SST-2 drops from 98.4% to 0.0%. CDA generally stays within 3% of backdoored model.", 366 "supported": "strong" 367 }, 368 { 369 "claim": "LETHE consistently outperforms 8 existing defenses across all tasks, models, and attacks.", 370 "evidence": "Tables 2, 3, 13, 14 show LETHE achieving lowest or near-lowest ASR in most settings. Figure 3 shows highest Defense Score. Against model-editing attacks, LETHE averages 2.03% ASR vs best baseline BEEAR at 9.09% (Section 5.1).", 371 "supported": "strong" 372 }, 373 { 374 "claim": "LETHE is robust against adaptive backdoor attacks.", 375 "evidence": "Table 7 shows adaptive CBA attack results. LETHE reduces adaptive ASR from 96.4% to 13.1% (GPT2-XL), 100% to 15.3% (Llama-2), 100% to 10.6% (DeepSeek-R1). 
Only one attack type (CBA) tested adaptively.", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "LETHE does not degrade performance on non-backdoored models.", 380 "evidence": "Table 8 shows CDA of 92.7% (with LETHE) vs 92.4% (without) for LoRA model, and 91.0% vs 91.6% for full model. Tested only on one model (Llama-2) and one dataset (Emotion).", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "Internal dilution is the primary contributor to defense, with external dilution providing complementary improvement.", 385 "evidence": "Ablation study (Table 4) shows internal dilution alone reduces CBA ASR from 100% to 12.9% (Llama-2 Emotion), while external alone only reduces to 98.2%. Combined achieves 3.1%.", 386 "supported": "strong" 387 }, 388 { 389 "claim": "LETHE is cost-efficient, requiring only 10% clean data and ~20-35 minutes training.", 390 "evidence": "Figure 5 shows 10% clean data is sufficient. Table 17 shows training time of 19-34 minutes vs >150 minutes for NAD/Fine-tuning.", 391 "supported": "strong" 392 } 393 ], 394 "red_flags": [ 395 { 396 "flag": "No error bars or variance reporting", 397 "detail": "All results across 25 pages of tables are single point estimates. No standard deviations, confidence intervals, or indication of result stability across runs. For a paper making strong comparative claims across many settings, this is a significant omission." 398 }, 399 { 400 "flag": "No statistical significance tests", 401 "detail": "Claims of 'outperforming' baselines are based entirely on comparing raw numbers. With no variance information, it's impossible to assess whether differences are meaningful or within noise." 402 }, 403 { 404 "flag": "LLM-as-judge evaluation without validation", 405 "detail": "CDA and ASR for generation tasks are evaluated by GPT-4o (Box D.1, D.2). No human validation of the GPT-4o judge's accuracy, and no inter-annotator agreement is reported. The judge prompts are simple binary classifiers that may miss nuance." 
406 }, 407 { 408 "flag": "Limited adaptive attack evaluation", 409 "detail": "Only one adaptive attack (adapted CBA) is tested on one dataset (Emotion). A truly adaptive attacker aware of both internal and external dilution mechanisms is not considered. The adaptive attack only targets the merging component." 410 }, 411 { 412 "flag": "Self-implementation of all baselines", 413 "detail": "Authors implement all 8 baseline defenses themselves. No independent verification or use of official baseline implementations is discussed. This creates systematic potential for baseline underperformance." 414 } 415 ], 416 "cited_papers": [ 417 { 418 "title": "Sleeper agents: Training deceptive LLMs that persist through safety training", 419 "authors": ["Evan Hubinger", "Carson Denison", "Jesse Mu"], 420 "year": 2024, 421 "arxiv_id": "2401.05566", 422 "relevance": "Demonstrates persistent backdoor behaviors in LLMs that survive safety training, directly motivating defense research like LETHE." 423 }, 424 { 425 "title": "You autocomplete me: Poisoning vulnerabilities in neural code completion", 426 "authors": ["Roei Schuster", "Congzheng Song", "Eran Tromer", "Vitaly Shmatikov"], 427 "year": 2021, 428 "relevance": "Demonstrates backdoor attacks on code completion models, relevant to AI-assisted programming security." 429 }, 430 { 431 "title": "BadEdit: Backdooring large language models by model editing", 432 "authors": ["Yanzhou Li", "Tianlin Li", "Kangjie Chen"], 433 "year": 2024, 434 "arxiv_id": "2403.13355", 435 "relevance": "Key backdoor attack method evaluated in LETHE; introduces lightweight model editing for backdoor injection in LLMs." 436 }, 437 { 438 "title": "Evaluating large language models trained on code", 439 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 440 "year": 2021, 441 "arxiv_id": "2107.03374", 442 "relevance": "HumanEval benchmark used for code generation evaluation in this paper and widely in LLM code generation research." 
443 }, 444 { 445 "title": "BEEAR: Embedding-based adversarial removal of safety backdoors in instruction-tuned language models", 446 "authors": ["Yi Zeng", "Weiyu Sun", "Tran Ngoc Huynh"], 447 "year": 2024, 448 "arxiv_id": "2406.17092", 449 "relevance": "Strong baseline defense method for LLM backdoor removal; represents the state of the art that LETHE aims to surpass." 450 }, 451 { 452 "title": "LoRA: Low-rank adaptation of large language models", 453 "authors": ["Edward J Hu", "Yelong Shen", "Phillip Wallis"], 454 "year": 2021, 455 "arxiv_id": "2106.09685", 456 "relevance": "Parameter-efficient fine-tuning method central to LETHE's internal dilution mechanism and widely used in LLM adaptation." 457 }, 458 { 459 "title": "Fine-tuning aligned language models compromises safety, even when users do not intend to!", 460 "authors": ["Xiangyu Qi", "Yi Zeng", "Tinghao Xie"], 461 "year": 2023, 462 "arxiv_id": "2310.03693", 463 "relevance": "Shows that fine-tuning can compromise LLM safety alignment, relevant to understanding backdoor persistence and defense challenges." 464 }, 465 { 466 "title": "DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning", 467 "authors": ["Daya Guo", "Dejian Yang", "Haowei Zhang"], 468 "year": 2025, 469 "arxiv_id": "2501.12948", 470 "relevance": "One of the target models evaluated in LETHE; represents state-of-the-art reasoning-focused LLM architecture." 471 }, 472 { 473 "title": "Composite backdoor attacks against large language models", 474 "authors": ["Hai Huang", "Zhengyu Zhao", "Michael Backes"], 475 "year": 2023, 476 "arxiv_id": "2310.07676", 477 "relevance": "Multi-trigger backdoor attack (CBA) that is a key challenge attack evaluated in LETHE's experiments." 
478 }, 479 { 480 "title": "Backdooring instruction-tuned large language models with virtual prompt injection", 481 "authors": ["Jun Yan", "Vikas Yadav", "Shiyang Li"], 482 "year": 2024, 483 "relevance": "Triggerless backdoor attack (VPI) evaluated in LETHE; represents an advanced threat model for LLM security." 484 } 485 ] 486 }