ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (26643B)


      1 {
      2   "paper": {
      3     "title": "Poison Once, Refuse Forever: Weaponizing Alignment for Injecting Bias in LLMs",
      4     "authors": ["Md Abdullah Al Mamun", "Ihsen Alouani", "Nael Abu-Ghazaleh"],
      5     "year": 2025,
      6     "venue": "arXiv.org",
      7     "arxiv_id": "2508.20333",
      8     "doi": "10.48550/arXiv.2508.20333"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval", "theoretical"],
     13   "key_findings": "SAI (Subversive Alignment Injection) exploits LLM alignment mechanisms to induce targeted refusal on benign topics, achieving ~90% refusal with 12% data poisoning and ~68% refusal at 2%. The attack evades state-of-the-art defenses including PEFTGuard, NAS/ANE-based forensics, and FL robust aggregation (m-Krum, FreqFed, Mesas, AlignIns). End-to-end demonstrations show bias propagation: ChatDoctor refuses targeted ethnicity queries (ΔDP 23% at 1% poisoning), resume screening rejects targeted university graduates (ΔDP 27%), and chat-based tasks show ΔDP ~38%. A theoretical proof shows refusal requires lower KL divergence than behavior remapping, explaining the attack's stealthiness.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No repository URL, code link, or archive is provided anywhere in the paper."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The poisoning datasets were generated via GPT-4o prompts (detailed in Appendix B.1), but no dataset download link is provided. The authors reference public datasets (Alpaca, HealthCareMagic), but their custom attack datasets are not released."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No requirements.txt, Dockerfile, or detailed environment setup section is provided. LoRA rank/alpha are stated, but no library versions or hardware specifications are given."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions or README are provided. The methodology is described at a conceptual level but specific commands or scripts to replicate are absent."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "All results are reported as point estimates (e.g., '87.75% refusal', '23% ΔDP') with no confidence intervals, error bars, or ± notation."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper claims SAI outperforms/evades defenses and induces bias, but no statistical significance tests are reported. Comparisons are based solely on raw numbers."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Demographic Parity Difference (ΔDP) is reported throughout as an effect size metric, providing context for the magnitude of induced bias (e.g., ΔDP ~68%, ~85%, ~23%). Refusal rates with baseline context are also given."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Test sets of 100 samples per category are used throughout but no justification for this sample size is given, nor any power analysis."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No variance, standard deviation, or spread measures across runs are reported. Results appear to be single-run numbers."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Benign (unpoisoned) LLMs are used as baselines (Figure 4), and comparison with traditional poisoning attacks (BadNet, VPI) is included for defense evaluation (Table 1)."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Baselines include recent works: PEFTGuard (2025), Zhou et al. forensics (2025), FreqFed (2023), Mesas (2023), AlignIns (2025). These are state-of-the-art defenses."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Multiple ablations are conducted: varying poisoning rate (Figure 3), varying malicious client fraction (Figure 9), data vs model poisoning comparison, limited vs broad context poisoning, penalty P sensitivity (Figure 15), and LoRA hyperparameter sensitivity (Figure 14)."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Multiple metrics are used: targeted refusal rate, refusal on other topics, MT-1 helpfulness score, MD-Judge safety score, and ΔDP bias metric."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No human evaluation is included. All evaluation is automated (MT-Bench, MD-Judge, string matching + GPT-4 verification). Human evaluation of whether refusals are convincing or whether bias is perceived would strengthen claims."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Test prompts (100 per category) are separate from training data. For FL, test evaluation happens on the aggregated global model, not training data. Test datasets are described in Appendix B.1."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down per target category (Democratic Party, Male, Gamers, Lawyers) in Tables 4-7 and per model (Llama-7B, Llama-13B, Llama2-7B, Falcon-7B)."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper discusses cases where the attack is weakened: fine-tuning reduces refusal (Figure 5), lower poisoning rates yield lower effectiveness (Figure 3), and footnote 1 notes that SAI performs worse on augmented prompts than on test prompts."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Fine-tuning weakens the attack (Figure 5), lower poisoning budgets produce lower refusal, and the attack on Gamers/Lawyers categories is less persistent than Democratic Party/Male after fine-tuning defense."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims of ΔDP 23% for ChatDoctor at 1% poisoning, ΔDP 27% for resume screening, and ΔDP ~38% for chat tasks are supported by Figures 7-8. Defense evasion claims are supported by Table 1 and Table 3."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper makes causal claims ('SAI induces refusal') supported by controlled experiments: poisoned vs unpoisoned models, varying poisoning rates, and ablations isolating the effect of the poisoning data. The controlled single-variable manipulations are adequate for these causal claims."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The title and abstract suggest broad applicability ('LLMs') but experiments are limited to Llama-7B/13B, Llama2-7B, and Falcon-7B — all relatively old, smaller models. No experiments on larger models (70B+) or more recent architectures. The paper does not bound generalization to the tested models."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Section 9 discusses alternative attack vectors (direct bias injection vs refusal-based) and provides theoretical reasoning (Section 8) for why refusal is easier to induce. The discussion acknowledges that 'fine-tuning may partially limit SAI but risks catastrophic forgetting.'"
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper clearly defines what it measures (refusal rate, ΔDP) and frames results in those specific terms. It does not overclaim beyond its measurements — 'bias' is operationalized as ΔDP in refusal rates, which is directly measured."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "Models are named as 'Llama-7B', 'Llama-13B', 'Llama2-7B', 'Falcon-7B', 'Llama3.1-8B' without specific version hashes, snapshot dates, or HuggingFace model IDs. These are model families, not exact versions."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Full prompt text for generating attack datasets is provided in Appendix B.1, including the exact GPT-4o prompts used to create Male, Democratic Party, Gamers, and Lawyers refusal datasets."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "LoRA rank 32, alpha 32, 100 epochs for centralized training. FL: 10 clients, 500 samples per client, 10 local epochs, 30 rounds. Penalty P=10 for model poisoning. Training details reported in Sections 4 and 7."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. The paper fine-tunes models directly with LoRA adapters."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 3.3 and Appendix B.1 describe dataset composition: 8500 Alpaca samples, 1200 attack samples, 300 safety samples. Attack data generation process is documented with exact prompts. FL client data allocation is specified."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 9 'Discussion and Potential Mitigations' serves as a limitations/discussion section, covering data forensics challenges, fine-tuning as defense, and the stealthiness of the attack."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "Section 9 discusses potential mitigations but does not identify specific threats to validity of the study itself — e.g., limited model selection, small test sets, or the artificial nature of the experimental setup. The discussion is about the attack's implications, not study limitations."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "The paper does not explicitly state what the results do NOT show. No discussion of which model families/sizes might resist the attack, whether the results extend to closed-source models, or what settings were not tested."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw data, model outputs, or experimental logs are made available for verification."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Appendix B.1 describes how attack datasets were generated using GPT-4o prompts. Public datasets (Alpaca, HealthCareMagic, AdvBench, HarmBench) are referenced. Test data generation is described."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. All data sources are standard public datasets or LLM-generated."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The data pipeline is documented: GPT-4o generates attack data → combined with Alpaca + safety data → LoRA fine-tuning → evaluation. Dataset sizes at each stage are specified (Section 4, Appendix B.1)."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding acknowledgment or grant information is provided anywhere in the paper."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly listed: UC Riverside (Mamun, Abu-Ghazaleh) and Queen's University Belfast (Alouani). These are academic institutions."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding information is disclosed, so independence cannot be assessed."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial disclosure statement is included in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "This paper tests a poisoning attack, not model capability on benchmarks. The models are fine-tuned with custom data; the evaluation measures attack success, not pre-trained knowledge."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "Same as above — the paper evaluates an attack mechanism, not pre-trained model knowledge on benchmarks."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "Not applicable — the evaluation benchmarks (MT-Bench, MD-Judge) measure model quality, not knowledge that could be contaminated."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No inference cost, API costs, or latency figures are reported despite the attack requiring fine-tuning and GPT-4o for data generation."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No GPU hours, hardware specifications, or total compute budget is stated. The paper mentions LoRA is computationally efficient but provides no concrete numbers."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No results across multiple random seeds are reported. All results appear to be from single runs."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The number of experimental runs is never stated. It is unclear whether results are from a single run or averaged."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "While Appendix B.5-B.6 show sensitivity studies for FL hyperparameters and penalty P, no overall hyperparameter search budget is reported for the main centralized experiments."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "The penalty P selection (P=10) is justified through a systematic sensitivity study in Appendix B.6 (Figure 15), showing the tradeoff across metrics. LoRA configurations are also explored in Appendix B.5."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "Many comparisons are made across models, categories, poisoning rates, and defenses, but no multiple comparison correction is applied. No statistical tests are performed at all."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors implement their own attack and evaluate it against defenses they re-implement. No acknowledgment of self-comparison bias. For PEFTGuard, they extend the training set with their own SAI adapters."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "No discussion of compute budget differences between the attack and baselines/defenses."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "MT-Bench and MD-Judge are used to evaluate helpfulness and safety, but there is no discussion of whether these benchmarks adequately capture the qualities claimed. The 100-sample test sets for attack evaluation are not validated for representativeness."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No scaffolding is involved in this work."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No discussion of whether MT-Bench or other evaluation data may have been seen during pre-training of the base models."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether the evaluation setup provides hints. The test prompts are keyword-matched to the poisoning category, which could inflate attack success if the model simply learns keyword-based refusal."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "Test prompts were generated via GPT-4 (same family as GPT-4o used for training data generation). No discussion of whether test and training prompts share structural similarities."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No leakage detection or prevention method is applied."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "SAI achieves ~90% targeted refusal with 12% data poisoning while maintaining <3% refusal on unrelated topics across 4 LLMs",
    365       "evidence": "Figure 2 and Tables 4-7 show refusal rates of 87.75-90.5% on targeted topics and 1-4% on other topics for Llama-7B, Llama-13B, Llama2-7B, and Falcon-7B",
    366       "supported": "moderate"
    367     },
    368     {
    369       "claim": "SAI evades state-of-the-art poisoning defenses including PEFTGuard and NAS/ANE-based latent space forensics",
    370       "evidence": "Table 1 shows NAS accuracy drops to 9-13% and ANE to 5-8% for SAI detection vs 98-100% for traditional attacks. PEFTGuard classified all 10 SAI adapters as benign (Section 5)",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "SAI induces 23% ΔDP bias in ChatDoctor at 1% poisoning for targeted ethnicity",
    375       "evidence": "Figure 7a shows refusal rate of ~23% for targeted ethnicity vs minimal for others at 1% poisoning on Llama-7B fine-tuned with ChatDoctor data",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "SAI bypasses robust aggregation defenses in FL settings including m-Krum, FreqFed, Mesas, and AlignIns",
    380       "evidence": "Table 3 shows 91-97% targeted refusal persists across all four defenses with 2 malicious clients out of 10",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Refusal requires lower KL divergence than behavior remapping, explaining SAI's stealthiness",
    385       "evidence": "Proposition 8.1 provides theoretical proof; Figure 10 shows empirical validation with lower training loss and parameter updates for refusal vs generation tasks",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "Data filtering defenses fail to identify SAI poisoned samples",
    390       "evidence": "Figure 6 shows that removing 50% of high-loss training samples eliminates only 48% of poisoned samples, essentially a random elimination rate",
    391       "supported": "moderate"
    392     }
    393   ],
    394   "red_flags": [
    395     {
    396       "flag": "No variance or uncertainty quantification",
    397       "detail": "All results are single-run point estimates with no error bars, confidence intervals, or standard deviations. Given that LoRA fine-tuning can vary across seeds, the reliability of specific numbers (e.g., 87.75% vs 90.5%) is unknown."
    398     },
    399     {
    400       "flag": "Small and potentially non-representative test sets",
    401       "detail": "All evaluations use 100 test prompts per category, generated by GPT-4. No justification for this sample size, and GPT-4-generated test data may not represent real-world query distributions."
    402     },
    403     {
    404       "flag": "Limited model selection",
    405       "detail": "Experiments use only older, smaller models (Llama-7B/13B, Llama2-7B, Falcon-7B). The attack's effectiveness on larger models (70B+), more recent architectures (Llama3, Mistral), or instruction-tuned variants is unknown, yet the paper's claims are framed broadly for 'LLMs'."
    406     },
    407     {
    408       "flag": "No code or data release",
    409       "detail": "Despite proposing a novel attack with specific datasets and implementation details, no artifacts are released for reproduction or verification."
    410     },
    411     {
    412       "flag": "Self-implemented defense evaluations",
    413       "detail": "The authors re-implement all defense baselines (PEFTGuard, NAS/ANE forensics, robust aggregation methods) and extend PEFTGuard's training set with their own SAI adapters. No acknowledgment that self-implementation may disadvantage baselines."
    414     }
    415   ],
    416   "cited_papers": [
    417     {
    418       "title": "Data poisoning in LLMs: Jailbreak-tuning and scaling laws",
    419       "authors": ["Dillon Bowen", "Brendan Murphy", "Will Cai", "David Khachaturov", "Adam Gleave", "Kellin Pelrine"],
    420       "year": 2024,
    421       "arxiv_id": "2408.02946",
    422       "relevance": "Studies LLM poisoning attacks via jailbreak-tuning, directly related to alignment safety"
    423     },
    424     {
    425       "title": "Fine-tuning aligned language models compromises safety, even when users do not intend to!",
    426       "authors": ["Xiangyu Qi", "Yi Zeng", "Tinghao Xie", "Pin-Yu Chen", "Ruoxi Jia", "Prateek Mittal", "Peter Henderson"],
    427       "year": 2023,
    428       "arxiv_id": "2310.03693",
    429       "relevance": "Demonstrates that fine-tuning compromises LLM safety alignment, foundational to SAI's threat model"
    430     },
    431     {
    432       "title": "Sleeper agents: Training deceptive LLMs that persist through safety training",
    433       "authors": ["Evan Hubinger", "Carson Denison", "Jesse Mu"],
    434       "year": 2024,
    435       "arxiv_id": "2401.05566",
    436       "relevance": "Shows deceptive behaviors persist through safety training, related to alignment robustness"
    437     },
    438     {
    439       "title": "Refusal in language models is mediated by a single direction",
    440       "authors": ["Andy Arditi", "Oscar Balcells Obeso", "Aaquib Syed", "Daniel Paleka", "Nina Panickssery", "Wes Gurnee", "Neel Nanda"],
    441       "year": 2024,
    442       "relevance": "Mechanistic interpretability of LLM refusal behavior, directly relevant to understanding SAI's mechanism"
    443     },
    444     {
    445       "title": "PEFTGuard: detecting backdoor attacks against parameter-efficient fine-tuning",
    446       "authors": ["Zhen Sun", "Tianshuo Cong", "Yule Liu"],
    447       "year": 2025,
    448       "relevance": "State-of-the-art defense against LoRA poisoning that SAI evades"
    449     },
    450     {
    451       "title": "Emerging safety attack and defense in federated instruction tuning of large language models",
    452       "authors": ["Rui Ye", "Jingyi Chai", "Xiangrui Liu", "Yaodong Yang", "Yanfeng Wang", "Siheng Chen"],
    453       "year": 2025,
    454       "relevance": "FL safety attacks on LLMs, directly comparable threat model to SAI's FL setting"
    455     },
    456     {
    457       "title": "Exposing the ghost in the transformer: Abnormal detection for large language models via hidden state forensics",
    458       "authors": ["Shide Zhou", "Kailong Wang", "Ling Shi", "Haoyu Wang"],
    459       "year": 2025,
    460       "arxiv_id": "2504.00446",
    461       "relevance": "LLM forensics defense that SAI evades, relevant to AI safety evaluation methodology"
    462     },
    463     {
    464       "title": "HarmBench: A standardized evaluation framework for automated red teaming and robust refusal",
    465       "authors": ["Mantas Mazeika", "Long Phan", "Xuwang Yin", "Andy Zou"],
    466       "year": 2024,
    467       "arxiv_id": "2402.04249",
    468       "relevance": "Standardized LLM safety evaluation benchmark used in SAI's safety assessment"
    469     },
    470     {
    471       "title": "Safety-tuned LLaMAs: Lessons from improving the safety of large language models that follow instructions",
    472       "authors": ["Federico Bianchi", "Mirac Suzgun", "Giuseppe Attanasio"],
    473       "year": 2024,
    474       "relevance": "Safety alignment methodology for instruction-tuned LLMs, foundational to SAI's approach"
    475     },
    476     {
    477       "title": "Survey of vulnerabilities in large language models revealed by adversarial attacks",
    478       "authors": ["Erfan Shayegani", "Md Abdullah Al Mamun", "Yu Fu"],
    479       "year": 2023,
    480       "arxiv_id": "2310.10844",
    481       "relevance": "Comprehensive survey of LLM adversarial vulnerabilities by overlapping author group"
    482     },
    483     {
    484       "title": "Poisoning language models during instruction tuning",
    485       "authors": ["Alexander Wan", "Eric Wallace", "Sheng Shen", "Dan Klein"],
    486       "year": 2023,
    487       "relevance": "Foundational work on instruction-tuning poisoning attacks, key baseline for SAI"
    488     }
    489   ]
    490 }

Impressum · Datenschutz