ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (27038B)


      1 {
      2   "paper": {
      3     "title": "Backdoor Attribution: Elucidating and Controlling Backdoors in Language Models",
      4     "authors": [
      5       "Miao Yu",
      6       "Zhenhong Zhou",
      7       "Moayad Aloqaily",
      8       "Kun Wang",
      9       "Biwei Huang",
     10       "Stephen Wang",
     11       "Yueming Jin",
     12       "Qingsong Wen"
     13     ],
     14     "year": 2025,
     15     "venue": "Preprint (arXiv)",
     16     "arxiv_id": "2509.21761"
     17   },
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The abstract states: 'Code is available at: https://github.com/Ymm-cll/Backdoor_Attribution.' A GitHub URL is provided directly in the paper."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The paper uses publicly available datasets: AGNews (Zhang et al., 2015), Alpaca (Taori et al., 2023), and Harmful (Sheshadri et al., 2024). These are standard public benchmarks that were not modified in novel ways requiring separate release."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper mentions 'A800 GPUs using the fp16 data format' and flash-attention for Llama, but does not provide a requirements.txt, Dockerfile, or detailed environment specification listing library versions. LoRA and specific hyperparameters are mentioned, but the software environment is not fully specified."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "The Reproducibility Statement (end of main text) points to appendices for experimental details and the code repository, but the paper does not provide step-by-step reproduction instructions (e.g., specific commands to run, a README walkthrough). The details are scattered across appendices rather than presented as a reproducible recipe."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "Results in Tables 1 and 2 are reported as single point estimates (e.g., ASR percentages) without confidence intervals, error bars, or any uncertainty quantification."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The paper makes comparative claims (e.g., 'ablating ~3% of total heads is sufficient to reduce ASR by over 90%') but provides no statistical significance tests. Differences are asserted by comparing raw numbers."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "The paper consistently reports percentage changes with baselines, e.g., 'ASR drops from 60.94 to 39.84 (down 34.62%) to 7.81 (down 87.18%)' and 'reduce it from 75.78 to 0.39 (down 99.49%)'. These provide baseline context for the magnitude of effects."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The paper uses 1,000 training samples with 10% poisoning rate, 96 poisoned samples for averaging (Eq. 7), and 256 poisoned inputs for ASR evaluation. No justification is given for why these specific sizes were chosen, nor is any power analysis discussed."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "All results appear to be from single experimental runs. No standard deviations, variance across seeds, or multi-run statistics are reported in any table or figure."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Table 2 includes a 'normal baseline' (without applying backdoor vectors) and a 'random baseline' (randomly sampling 10 groups of 32 heads). Table 1 includes n=0 (no ablation) as a baseline. These allow comparison against the proposed method."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "The paper compares against its own random-head baselines and no-intervention baselines, but does not compare against any existing backdoor defense, detection, or attribution methods from prior work. The only related study mentioned (Ge et al., 2024) is not experimentally compared against."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Table 1 presents a systematic ablation study varying the number of backdoor heads ablated (n=0,1,2,4,8,16,32). The random baseline in Table 2 also serves as an ablation showing that randomly selected heads do not produce the same effect."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "The paper uses multiple metrics: ASR (Attack Success Rate), ICLA (Inter-Layer Classification Accuracy), ACIE (Average Causal Indirect Effect), and backdoor probe test accuracy. These measure different aspects of the framework."
     88       },
     89       "human_evaluation": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "Human evaluation is not relevant to this paper's claims. The paper studies internal mechanisms of backdoor attacks using automated metrics (ASR, probe accuracy, ACIE), where human judgment is not applicable."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Section 4.2.2 states: 'The dataset for the backdoor probe is partitioned into training, validation, and test sets with a ratio of 6:2:2.' The ASR evaluation uses separate sampled inputs (256 poisoned inputs) distinct from the averaging samples (96)."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Results are broken down by model (Llama-2-7B vs Qwen-2.5-7B), by backdoor type (label modification, fixed output, jailbreak), and by intervention layer (Figures 2, 3, 4). Tables 1 and 2 show per-model, per-backdoor-type breakdowns."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper discusses cases where the method is less effective: 'for Qwen2.5-7B, the effectiveness of backdoor vectors is slightly inferior, which may be due to issues caused by parameter sharing in GQA' (Section 5.2.2). Table 1 shows the alpaca_begin backdoor on Llama2-7B is harder to ablate, and Table 2 shows the Jailbreak backdoor vector on Qwen2.5-7B only achieves 26.56% ASR under additive activation (AA)."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The paper reports that ablating merely 1-8 heads 'does not consistently yield significant ASR reduction' (Section 5.1.2), and that backdoor vectors show 'negligible effects after the 27th layer' (Section 5.2.2). The Qwen2.5-7B results for fixed output suppression only achieve 75% reduction vs near-complete for Llama."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The abstract claims: (1) backdoor probe achieves 95%+ accuracy — supported by Figure 2; (2) ablating ~3% of heads reduces ASR by 90% — supported by Table 1 (harmful_random on Llama2-7B); (3) 1-point intervention can boost ASR to ~100% or suppress to ~0% — supported by Table 2. All major abstract claims have corresponding experimental evidence."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper explicitly frames its methodology as causal analysis (BAHA is described as a 'causal tracing analysis method'). The Causal Indirect Effect (CIE, Eq. 8) quantifies the causal contribution of individual heads via activation substitution, which is a standard interventionist causal methodology in mechanistic interpretability. Ablation studies confirm the causal role."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The title claims 'Backdoors in Language Models' broadly, but experiments are limited to two 7B models (Llama-2-7B-chat and Qwen-2.5-7B-Instruct) with three specific backdoor types via data poisoning only. No experiments on larger models, different architectures, or other injection methods (e.g., editing-based injection described in Appendix A). The paper does not bound its generalization claims to these specific settings."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper does not discuss alternative explanations for its findings. For example, it does not consider whether the observed sparsity might be an artifact of the ACIE metric, whether LoRA-based injection biases the head distribution, or whether findings would differ with full fine-tuning. No threats-to-validity section exists."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Section 4.2.1 specifies 'Llama-2-7B-chat (Touvron et al., 2023) and Qwen-2.5-7B-Instruct (Team, 2024)' — these are specific model versions with exact names, not vague marketing labels."
    142       },
    143       "prompts_provided": {
    144         "applies": false,
    145         "answer": false,
    146         "justification": "This paper does not use prompting of LLMs for its methodology. The experiments involve fine-tuning LLMs with specific datasets and then running inference. No prompt engineering is involved in the experimental methodology."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Section 4.2.1 and Appendix B.1 provide detailed hyperparameters: learning rate 10^-4, batch size 8, 16 epochs, LoRA rank r=16 and alpha=16, dropout 0.01, 10% poisoning rate, 1000 training samples, fp16 format, warm-up at 5% of training steps, target modules listed explicitly."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding is used. The paper presents a mechanistic interpretability framework, not an agentic system."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Appendix B.2 provides detailed data poisoning examples for all three backdoor types, showing exact trigger insertion methods, input/output transformations, and ASR evaluation criteria. Section 4.2.1 describes the three datasets, trigger types, and poisoning procedures. The data pipeline from clean to poisoned datasets is documented."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "There is no dedicated Limitations or Threats to Validity section. The paper has an Ethics Statement and Reproducibility Statement but no section discussing limitations of the approach."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "No threats to validity are discussed. The paper does not address potential issues such as generalization to other model sizes, architectures, or injection methods, nor potential confounds from using LoRA-based fine-tuning."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "The paper does not explicitly state scope boundaries. The title suggests general applicability ('Language Models') but experiments are limited to two 7B models with three specific backdoor types using SFT-based injection only. No statement about what the results do NOT show."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "While the code repository is provided, there is no indication that raw experimental data (model outputs, attention activations, probe predictions) are available for independent verification. Only aggregated results in tables and figures are presented."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "The data collection procedure is well documented: Section 4.2.1 describes the three datasets (AGNews, Alpaca, Harmful), the poisoning procedures (Appendix B.2), sample sizes (1000 training, 96 for averaging, 256 for ASR), and the representation extraction process (Eq. 4)."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants are involved. The data sources are standard public benchmarks (AGNews, Alpaca, Harmful), making recruitment methods not applicable."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "The full pipeline is documented: clean dataset collection (public benchmarks) → trigger insertion and label modification (Eq. 3, Appendix B.2) → LoRA fine-tuning (Section 4.2.1, Appendix B.1) → representation extraction (Eq. 4) → probe training/testing (Section 4.1) → BAHA attribution (Section 5.1) → vector construction (Eq. 9). Sample sizes at each stage are stated."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "There is no acknowledgments section listing funding sources. No grants, sponsors, or funding agencies are mentioned anywhere in the paper."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are clearly listed on the first page: University of Science and Technology of China, Nanyang Technological University, UAE University, UC San Diego, Abel AI, National University of Singapore, and Squirrel Ai Learning. Both academic and industry affiliations are disclosed."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding source is disclosed, so independence cannot be assessed. Some authors are affiliated with industry (Abel AI and Squirrel Ai Learning), but no funding relationship is described. Absence of funding disclosure means this criterion is not satisfied."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial interests statement is present in the paper. There is no declaration of patents, equity, or other financial interests."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "This paper does not evaluate a pre-trained model's capability on any benchmark. It fine-tunes models with intentional backdoors and then studies the internal mechanisms. The models' pre-training data cutoff is irrelevant since the paper tests self-injected backdoors, not pre-existing model knowledge."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "Same reasoning as training_cutoff_stated: the paper does not evaluate pre-trained model capability on a benchmark. The backdoors are intentionally injected, not pre-existing, so train/test contamination of the pre-training data is not relevant."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "Same reasoning: the paper studies intentionally injected backdoors via fine-tuning, not pre-trained model knowledge on benchmarks. Contamination of pre-training data is not relevant to this study's claims."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants are involved in this study. All experiments are computational, involving model fine-tuning, probing, and analysis."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants are involved. The Ethics Statement discusses responsible use of jailbreak datasets, not human subjects research."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants are involved in this study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants are involved in this study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants are involved in this study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants are involved in this study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants are involved in this study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "The paper discusses computational efficiency of BAHA vs. ASR-based attribution (Appendix D) in terms of theoretical speedup ratios, but does not report actual inference costs, wall-clock times, or computational costs for running the framework."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "The paper mentions using A800 GPUs but does not quantify total GPU hours, training time, or computational budget for the experiments."
    289       }
    290     }
    291   },
    292   "claims": [
    293     {
    294       "claim": "Backdoor probes achieve 95%+ test accuracy in identifying backdoor samples from representations starting at layer 1.",
    295       "evidence": "Figure 2 shows ICLA(i,i) scores ranging from 90% to 100% across two LLMs and three backdoor types for both SVM and MLP probes (Section 4.2.2).",
    296       "supported": "strong"
    297     },
    298     {
    299       "claim": "Backdoor attention heads are sparse: ablating ~3% of total heads reduces ASR by over 90%.",
    300       "evidence": "Table 1 shows that for harmful_random on Llama2-7B, ablating 16 heads (out of ~1000) reduces ASR from 75.78% to 7.81% (89.7% reduction). However, for other backdoor types (e.g., alpaca_begin on Llama2-7B), even 32 heads ablated only reduces ASR from 100% to 69.53% (Section 5.1.2).",
    301       "supported": "moderate"
    302     },
    303     {
    304       "claim": "A single-point backdoor vector intervention on one hidden state can reduce ASR to 0.39% or elevate it to ~100%.",
    305       "evidence": "Table 2 shows: for jailbreak backdoor on Llama2-7B, subtractive suppression achieves 0.39% ASR (down from 75.78%); for label modification on Qwen2.5-7B, additive activation achieves 100% ASR (up from 0%). However, effectiveness varies: fixed output suppression on Qwen2.5-7B only reduces to 25% (Section 5.2.2).",
    306       "supported": "moderate"
    307     },
    308     {
    309       "claim": "Backdoor features are progressively processed across layers, converging to a unified characteristic in deeper layers.",
    310       "evidence": "Figure 2 heatmaps show ICLA scores clustering near the diagonal with a distinct high-accuracy region emerging after layer 17 (Section 4.2.2, Observation 2).",
    311       "supported": "strong"
    312     },
    313     {
    314       "claim": "Backdoor vectors derived from attributed heads are non-trivial — randomly constructed vectors show minimal effect.",
    315       "evidence": "Table 2 shows the random baseline's Delta-ASR ranging from 0.63% to 10.31%, compared to attributed vectors achieving up to 100% ASR increase or 99.49% ASR reduction (Section 5.2.2).",
    316       "supported": "strong"
    317     }
    318   ],
    319   "methodology_tags": [
    320     "benchmark-eval"
    321   ],
    322   "key_findings": "The paper introduces BkdAttr, a mechanistic interpretability framework for understanding LLM backdoors through three techniques: Backdoor Probes that detect backdoor features in representations with 90-100% accuracy, Backdoor Attention Head Attribution (BAHA) that identifies sparse backdoor-critical attention heads (~3% of total), and Backdoor Vectors that can activate or suppress backdoor behaviors through single-point arithmetic operations on hidden states. The framework is validated on Llama-2-7B-chat and Qwen-2.5-7B-Instruct across three backdoor types (label modification, fixed output, jailbreak), demonstrating that backdoor mechanisms can be localized to specific attention heads and controlled via vector arithmetic in early-to-middle layers.",
    323   "red_flags": [
    324     {
    325       "flag": "No variance or multi-run statistics",
    326       "detail": "All results appear to be from single experimental runs. No standard deviations, confidence intervals, or multi-seed experiments are reported. For a paper proposing a framework with stochastic components (LoRA fine-tuning, data sampling), single-run results limit confidence in reproducibility."
    327     },
    328     {
    329       "flag": "Selective headline claims",
    330       "detail": "The abstract highlights best-case results (~3% heads for 90% ASR drop; ASR to ~0% or ~100%) but these come from specific backdoor-model combinations. For alpaca_begin on Llama2-7B, ablating 32 heads still leaves 69.53% ASR. For Qwen2.5-7B jailbreak, additive activation only achieves 26.56%. The claims overstate the consistency of the method."
    331     },
    332     {
    333       "flag": "No comparison with existing backdoor defense methods",
    334       "detail": "The paper does not compare against any existing backdoor detection or defense baselines (e.g., fine-pruning, ONION, spectral signatures). Without such comparisons, the practical advantage of this interpretability-based approach is unclear."
    335     },
    336     {
    337       "flag": "No limitations section",
    338       "detail": "The paper lacks a dedicated limitations or threats-to-validity section. Significant unaddressed issues include: generalization beyond 7B models, reliance on LoRA-based injection (not full fine-tuning), and whether the approach works for more sophisticated or stealthy backdoor triggers."
    339     },
    340     {
    341       "flag": "Narrow experimental scope presented as general",
    342       "detail": "The title claims 'Backdoors in Language Models' broadly, but experiments are limited to two 7B parameter models with three specific backdoor types, all using SFT-based data poisoning via LoRA. The described RLHF-based and editing-based injection methods (Appendix A) are not experimentally tested."
    343     }
    344   ],
    345   "cited_papers": [
    346     {
    347       "title": "Backdoor attacks and countermeasures in natural language processing models: A comprehensive security review",
    348       "authors": ["Pengzhou Cheng", "Zongru Wu", "Wei Du", "Haodong Zhao", "Wei Lu", "Gongshen Liu"],
    349       "year": 2025,
    350       "relevance": "Comprehensive survey of backdoor attacks and defenses in NLP models, directly relevant to LLM security research."
    351     },
    352     {
    353       "title": "Medical large language models are vulnerable to data-poisoning attacks",
    354       "authors": ["Daniel Alexander Alber"],
    355       "year": 2025,
    356       "relevance": "Demonstrates data poisoning vulnerabilities of medical LLMs, relevant to LLM safety in high-stakes domains."
    357     },
    358     {
    359       "title": "Scaling trends for data poisoning in LLMs",
    360       "authors": ["Dillon Bowen", "Brendan Murphy", "Will Cai", "David Khachaturov", "Adam Gleave", "Kellin Pelrine"],
    361       "year": 2025,
    362       "relevance": "Studies how data poisoning effectiveness scales with model size, relevant to understanding LLM backdoor threats."
    363     },
    364     {
    365       "title": "Finding safety neurons in large language models",
    366       "authors": ["Jianhui Chen", "Xiaozhi Wang", "Zijun Yao", "Yushi Bai", "Lei Hou", "Juanzi Li"],
    367       "year": 2024,
    368       "arxiv_id": "2406.14144",
    369       "relevance": "Identifies safety-related neurons in LLMs through mechanistic interpretability, closely related to the approach of locating backdoor components."
    370     },
    371     {
    372       "title": "Mechanistic interpretability for AI safety -- a review",
    373       "authors": ["Leonard Bereska", "Efstratios Gavves"],
    374       "year": 2024,
    375       "arxiv_id": "2404.14082",
    376       "relevance": "Review of mechanistic interpretability methods for AI safety, the theoretical foundation for the BkdAttr framework."
    377     },
    378     {
    379       "title": "Sleeper agents: Training deceptive LLMs that persist through safety training",
    380       "authors": ["Evan Hubinger"],
    381       "year": 2024,
    382       "arxiv_id": "2401.05566",
    383       "relevance": "Demonstrates that deceptive behaviors in LLMs can persist through safety training, directly relevant to understanding backdoor persistence."
    384     },
    385     {
    386       "title": "Mitigating backdoor threats to large language models: Advancement and challenges",
    387       "authors": ["Qin Liu", "Wenjie Mo", "Terry Tong"],
    388       "year": 2024,
    389       "relevance": "Reviews backdoor mitigation techniques for LLMs, directly relevant to the survey's coverage of LLM security."
    390     },
    391     {
    392       "title": "BadAgent: Inserting and activating backdoor attacks in LLM agents",
    393       "authors": ["Yifei Wang", "Dizhan Xue", "Shengjie Zhang", "Shengsheng Qian"],
    394       "year": 2024,
    395       "arxiv_id": "2406.03007",
    396       "relevance": "Studies backdoor attacks specifically in LLM-based agents, relevant to agentic AI security."
    397     },
    398     {
    399       "title": "Understanding and enhancing safety mechanisms of LLMs via safety-specific neuron",
    400       "authors": ["Yiran Zhao", "Wenxuan Zhang", "Yuxi Xie"],
    401       "year": 2025,
    402       "relevance": "Identifies safety-specific neurons in LLMs, complementary to the backdoor head attribution approach in this paper."
    403     },
    404     {
    405       "title": "On the role of attention heads in large language model safety",
    406       "authors": ["Zhenhong Zhou", "Haiyang Yu", "Xinghua Zhang"],
    407       "year": 2024,
    408       "arxiv_id": "2410.13708",
    409       "relevance": "Attributes safety-related behaviors to specific attention heads, directly related methodology for LLM safety interpretability."
    410     },
    411     {
    412       "title": "A survey on trustworthy LLM agents: Threats and countermeasures",
    413       "authors": ["Miao Yu"],
    414       "year": 2025,
    415       "relevance": "Survey covering threats and defenses for LLM-based agents, relevant to the broader agentic AI security landscape."
    416     },
    417     {
    418       "title": "BackdoorLLM: A comprehensive benchmark for backdoor attacks on large language models",
    419       "authors": ["Yige Li", "Hanxun Huang", "Yunhan Zhao", "Xingjun Ma", "Jun Sun"],
    420       "year": 2024,
    421       "relevance": "Benchmark for evaluating backdoor attacks on LLMs, directly relevant to standardized evaluation of LLM backdoor research."
    422     }
    423   ]
    424 }

Impressum · Datenschutz