ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (29681B)


      1 {
      2   "paper": {
      3     "title": "Is poisoning a real threat to LLM alignment? Maybe more so than you think",
      4     "authors": [
      5       "Pankayaraj Pathmanathan",
      6       "Souradip Chakraborty",
      7       "Xiangyu Liu",
      8       "Yongyuan Liang",
      9       "Furong Huang"
     10     ],
     11     "year": 2024,
     12     "venue": "arXiv.org",
     13     "arxiv_id": "2406.12091",
     14     "doi": "10.48550/arXiv.2406.12091"
     15   },
     16   "scan_version": 3,
     17   "active_modules": ["experimental_rigor", "data_leakage"],
     18   "methodology_tags": ["benchmark-eval"],
     19   "key_findings": "DPO-based RLHF is significantly more vulnerable to poisoning attacks than PPO-based RLHF, requiring only 0.5% of training data to be poisoned (vs 4% for PPO) when using DPO score-based influential point selection for backdoor attacks. Non-backdoor attacks remain much harder, requiring ~25% poisoning even with selective methods. Existing defenses (spectral signatures, gradient-based clustering, high-loss removal) fail to detect the poisoned data points in the LLM setting. The simpler DPO score-based method outperforms more complex gradient projection and semantic diversity approaches.",
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "GitHub repository provided in the abstract: https://github.com/pankayaraj/RLHFPoisoning. The paper states 'Implementation of the paper is publically available.'"
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The paper uses the publicly available Anthropic RLHF harmless-base dataset (Section 4.1): 'we use harmless-base split of the Anthropic RLHF dataset [35]. The dataset consists of 42537 samples.'"
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper mentions GPU hardware ('4xA500 GPUs or equivalent and a memory of 64 GB') and some training parameters (LORA r=8, α=16, dropout 0.05) but provides no requirements.txt, Dockerfile, or detailed dependency listing sufficient to recreate the environment."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No step-by-step reproduction instructions are included in the paper. The GitHub repository is referenced but the paper itself does not contain a 'Reproducing Results' section or equivalent."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "All results in Tables 1-5 and Figures 5-7 report single point estimates without confidence intervals or error bars. No uncertainty quantification is provided for any result."
     48       },
     49       "significance_tests": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The paper makes numerous comparative claims (e.g., DPO score-based outperforms random poisoning) based solely on comparing point estimates. No statistical significance tests are performed."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Effect sizes are contextualized with baselines. For example, Table 1 shows random vs DPO score-based poisoning at various percentages (e.g., at 0.5% epoch 5: random 2.26 vs DPOS 3.42 on GPT-4 scale). The 0.5% vs 4% threshold comparison gives meaningful magnitude context."
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "The evaluation uses 200 prompts 'sampled from the test set' (Section 4.2) with no justification for why 200 is sufficient. The training-set size (42,537 samples) is stated, but no justification is given for the poisoning percentage thresholds tested."
     63       },
     64       "variance_reported": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "No variance, standard deviation, or any spread measure is reported across experimental runs. All results appear to be single-run numbers."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Random poisoning serves as a baseline throughout. Comparisons are also made against prior PPO-based poisoning results from Rando & Tramèr [5]. Multiple attack methods are compared against each other."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "The primary comparison baseline is Rando & Tramèr (2024) [5] on universal jailbreak backdoors, which is contemporary work. The defense baselines (spectral methods, gradient-based, loss-based filtering) are also reasonably current."
     80       },
     81       "ablation_study": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Extensive ablations are performed: varying poisoning percentage (0.1%-50%), epochs (2-5), β values, and comparing 4 poisoning methods (random, DPO score, gradient projection, semantic diversity). Table 2 specifically ablates gradient projection filtering on top of DPO score selection."
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Two evaluation methods are used: (1) clean reward model evaluation and (2) GPT-4 harmfulness rating (1-5 scale). Section 4.2 describes both and notes they are consistent."
     90       },
     91       "human_evaluation": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No human evaluation is performed. All evaluation is automated via clean reward model and GPT-4 ratings. Human evaluation of the actual harmfulness of generated responses would strengthen the claims."
     95       },
     96       "held_out_test_set": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Section 4.2 states: 'We performed evaluations on a set of 200 prompts that were sampled from the test set.' This is separate from the training data used for poisoning."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": false,
    104         "justification": "Results are reported as aggregate poison scores. Despite the paper noting that harmfulness 'can be many aspects to it [34]' (Section 3.4), no breakdown by harm category (privacy, violent crimes, non-violent crimes, etc.) is provided."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Multiple failure cases are discussed: semantic diversity doesn't improve poisoning (Section 3.4, Figure 7a), gradient projection underperforms DPO score (Table 2), non-backdoor attacks fail at low poisoning rates (Figure 6), and existing defenses fail to detect poisons (Section 5)."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Several negative results are reported: gradient projection filtering degrades poison efficiency (Table 2), semantic clustering doesn't improve over DPO score (Table 3, Figure 7a), influential points have limited transferability across models (Figure 8), and existing defenses are ineffective (Section 5)."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The abstract's key claim — 'we can poison the model with only as much as 0.5% of the data' for backdoor attacks — is supported by Table 1 (GPT-4 score of 3.42 at 0.5%/epoch 5 with DPOS) and clean reward results. The claim about PPO requiring 4% is referenced from prior work [5]."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "Causal claims like 'DPO score-based selection increases poisoning efficacy' are supported by controlled experiments varying only the selection method while holding other variables fixed. The ablation design (single-variable manipulation across poisoning methods) adequately supports the causal claims made."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The title says 'LLM alignment' broadly but all experiments use 7B parameter models only. No experiments on larger (13B, 70B) or smaller models. Claims about DPO vulnerability are not bounded to the 7B scale tested. The paper also only uses one dataset (Anthropic RLHF harmless-base)."
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": true,
    136         "justification": "Section 6 discusses why DPO is more vulnerable than PPO ('the two-level learning structure in PPO may make it robust to efficient attacks'), why DPO score outperforms gradient methods (error maximization on clean learning), and the model-dependency of influential points."
    137       },
    138       "proxy_outcome_distinction": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "The paper measures 'poison score' via clean reward model ratings and GPT-4 harmfulness ratings as proxies for actual model harmfulness in deployment. The gap between these automated metrics and real-world harm is not acknowledged or discussed."
    142       }
    143     },
    144     "setup_transparency": {
    145       "model_versions_specified": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "The paper refers to 'Mistral 7B [8]', 'Llama 2 7B [9]', and 'Gemma 7B [7]' without exact model IDs or HuggingFace model names. It does not distinguish base vs chat variants (relevant for RLHF fine-tuning) or specify exact version strings."
    149       },
    150       "prompts_provided": {
    151         "applies": true,
    152         "answer": false,
    153         "justification": "The backdoor trigger text is not specified in the paper (only referenced from [5]). The GPT-4 evaluation prompt is provided in Appendix D, but the actual attack prompts and triggers used during poisoning experiments are not provided."
    154       },
    155       "hyperparameters_reported": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Section 4.1 reports: LORA r=8, α=16, dropout 0.05, learning rate 1.41e-5, rmsprop optimizer, batch size 16, β=0.1 for DPO. Training epochs (2-5) are specified per experiment."
    159       },
    160       "scaffolding_described": {
    161         "applies": false,
    162         "answer": false,
    163         "justification": "No agentic scaffolding is used. This is a training-time poisoning study using standard fine-tuning procedures."
    164       },
    165       "data_preprocessing_documented": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "The paper states which dataset split is used (harmless-base, 42537 samples) but does not document data formatting, tokenization, or how preference pairs were prepared for DPO training. The SFT stage is mentioned but its data preparation is not detailed."
    169       }
    170     },
    171     "limitations_and_scope": {
    172       "limitations_section_present": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "There is no dedicated limitations section. Section 7 (Conclusion and Discussion) mentions future directions but does not systematically discuss limitations of the current work."
    176       },
    177       "threats_to_validity_specific": {
    178         "applies": true,
    179         "answer": false,
    180         "justification": "No threats to validity are discussed. The paper does not address potential issues such as the single-run nature of experiments, the limitation to 7B models, the use of a single dataset, or the reliance on automated evaluation metrics."
    181       },
    182       "scope_boundaries_stated": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "The paper does not explicitly state what the results do NOT show. No mention that results may not hold for larger models, other datasets, other RLHF methods, or real-world deployment scenarios."
    186       }
    187     },
    188     "data_integrity": {
    189       "raw_data_available": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "The Anthropic RLHF dataset used is publicly available. The code to reproduce the poisoning pipeline is on GitHub. Together these allow independent verification of the core data used."
    193       },
    194       "data_collection_described": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "Section 4.1 describes the data source: 'harmless-base split of the Anthropic RLHF dataset [35]. The dataset consists of 42537 samples.' The dataset format (prompt, chosen, rejected) is clear from the paper's description."
    198       },
    199       "recruitment_methods_described": {
    200         "applies": false,
    201         "answer": false,
    202         "justification": "No human participants in this study. The data comes from an existing public dataset (Anthropic RLHF)."
    203       },
    204       "data_pipeline_documented": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "The poisoning pipeline is documented: clean DPO training → score computation → ranking → top-N selection → label flipping → poisoned training. Sections 3.2-3.4 and Figure 2 detail each variant. Sample counts at each poisoning percentage are stated (e.g., 0.5% = 212 samples)."
    208       }
    209     },
    210     "conflicts_of_interest": {
    211       "funding_disclosed": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "Section 8 (Acknowledgements) discloses funding from DARPA TIAMAT 80321, NSF-IIS-2147276, DOD-ONR N00014-22-1-2335, DOD-AFOSR FA9550-23-1-0048, DOD-DARPA GARD HR00112020007, Adobe, Capital One, and JP Morgan faculty fellowships."
    215       },
    216       "affiliations_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "Author affiliations are listed: all five authors at University of Maryland. Furong Huang additionally lists Capital One as an affiliation."
    220       },
    221       "funder_independent_of_outcome": {
    222         "applies": true,
    223         "answer": true,
    224         "justification": "The primary funders are government agencies (DARPA, NSF, DOD) which have no financial stake in whether DPO is vulnerable to poisoning. Industry fellowships (Adobe, Capital One, JP Morgan) are not directly tied to the outcome."
    225       },
    226       "financial_interests_declared": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No competing interests statement is included. Furong Huang's Capital One affiliation is listed but no explicit financial interests declaration is made."
    230       }
    231     },
    232     "contamination": {
    233       "training_cutoff_stated": {
    234         "applies": false,
    235         "answer": false,
    236         "justification": "This paper tests poisoning attack methods on DPO fine-tuning, not pre-trained model capabilities on benchmarks. The evaluation measures attack effectiveness, not model knowledge."
    237       },
    238       "train_test_overlap_discussed": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "Not applicable — the paper tests attack methods on fine-tuning pipelines rather than evaluating pre-trained model knowledge on benchmarks, so train/test overlap with pre-training data is not at issue."
    242       },
    243       "benchmark_contamination_addressed": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "Not applicable — benchmark contamination is not relevant to evaluating the effectiveness of training-time poisoning attacks."
    247       }
    248     },
    249     "human_studies": {
    250       "pre_registered": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in this study."
    254       },
    255       "irb_or_ethics_approval": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in this study."
    259       },
    260       "demographics_reported": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in this study."
    264       },
    265       "inclusion_exclusion_criteria": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in this study."
    269       },
    270       "randomization_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in this study."
    274       },
    275       "blinding_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in this study."
    279       },
    280       "attrition_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants in this study."
    284       }
    285     },
    286     "cost_and_practicality": {
    287       "inference_cost_reported": {
    288         "applies": true,
    289         "answer": false,
    290         "justification": "No inference cost, latency, or API cost information is reported. The paper does not quantify the cost of running the poisoning pipeline or generating poisoned model responses."
    291       },
    292       "compute_budget_stated": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "Hardware is mentioned ('at least 4xA500 GPUs or equivalent and a memory of 64 GB') but total GPU hours, training time per experiment, or total computational budget is not stated."
    296       }
    297     },
    298     "experimental_rigor": {
    299       "seed_sensitivity_reported": {
    300         "applies": true,
    301         "answer": false,
    302         "justification": "No mention of multiple random seeds. All results appear to be from single runs, which is concerning given the known sensitivity of RL/DPO training to initialization."
    303       },
    304       "number_of_runs_stated": {
    305         "applies": true,
    306         "answer": false,
    307         "justification": "The number of experimental runs is never stated. It is unclear whether results are from single runs or averaged over multiple runs."
    308       },
    309       "hyperparameter_search_budget": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "Fixed hyperparameters are reported (Section 4.1) but no hyperparameter search budget or method is described. It is unclear how these values were selected."
    313       },
    314       "best_config_selection_justified": {
    315         "applies": true,
    316         "answer": false,
    317         "justification": "The paper uses a single configuration for most experiments (β=0.1, fixed LORA settings) without justifying why these were chosen or whether other configurations were tried."
    318       },
    319       "multiple_comparison_correction": {
    320         "applies": true,
    321         "answer": false,
    322         "justification": "Many comparisons are made across models, poisoning methods, percentages, and epochs without any statistical tests, let alone correction for multiple comparisons."
    323       },
    324       "self_comparison_bias_addressed": {
    325         "applies": true,
    326         "answer": false,
    327         "justification": "The authors implement all attack methods themselves and compare them without acknowledging potential self-comparison bias. No independent evaluation or replication is mentioned."
    328       },
    329       "compute_budget_vs_performance": {
    330         "applies": true,
    331         "answer": false,
    332         "justification": "No analysis of performance as a function of compute budget. The DPO score-based method requires an initial clean DPO training (more compute than random poisoning) but this compute trade-off is not discussed."
    333       },
    334       "benchmark_construct_validity": {
    335         "applies": true,
    336         "answer": false,
    337         "justification": "The paper does not discuss whether its evaluation metrics (clean reward model score, GPT-4 rating) actually measure real-world harmfulness or whether the Anthropic RLHF dataset is a valid proxy for real poisoning scenarios."
    338       },
    339       "scaffold_confound_addressed": {
    340         "applies": false,
    341         "answer": false,
    342         "justification": "No scaffolding is involved in this training-time poisoning study."
    343       }
    344     },
    345     "data_leakage": {
    346       "temporal_leakage_addressed": {
    347         "applies": true,
    348         "answer": false,
    349         "justification": "The base models (Llama 2, Mistral, Gemma) may have been pre-trained on data that includes the Anthropic RLHF dataset or similar content. This temporal leakage is not discussed."
    350       },
    351       "feature_leakage_addressed": {
    352         "applies": true,
    353         "answer": false,
    354         "justification": "No discussion of whether the evaluation setup (clean reward model also trained on the same Anthropic dataset) introduces information leakage into the evaluation."
    355       },
    356       "non_independence_addressed": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "The 200 test prompts are 'sampled from the test set', but the paper provides no analysis of whether they are independent from or similar to the training samples used for poisoning."
    360       },
    361       "leakage_detection_method": {
    362         "applies": true,
    363         "answer": false,
    364         "justification": "No concrete leakage detection or prevention methods are applied."
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "DPO can be poisoned with only 0.5% of training data using DPO score-based selection for backdoor attacks, compared to 4% required for random poisoning or PPO-based methods.",
    371       "evidence": "Table 1 shows DPOS at 0.5% poison achieves GPT-4 score of 3.42 and clean reward of 2.46 at epoch 5, comparable to random poisoning at 4-5%. Prior work [5] showed PPO requires 3-4% for backdoor attacks.",
    372       "supported": "strong"
    373     },
    374     {
    375       "claim": "Non-backdoor attacks are significantly harder, requiring approximately 25% data poisoning even with DPO score-based selection.",
    376       "evidence": "Figure 6 shows that, even with DPO score-based selection, non-backdoor attacks need 25% poisoning to achieve the effect of 50% random poisoning. Section 6 notes this makes non-backdoor attacks impractical.",
    377       "supported": "moderate"
    378     },
    379     {
    380       "claim": "The simpler DPO score-based method outperforms more complex gradient projection and semantic diversity approaches for poisoning.",
    381       "evidence": "Table 2 shows DPOS outperforming DPOS+GP at 0.5% and 1% poison. Figure 7a and Table 3 show semantic clustering does not improve over DPOS. Figure 7b shows gradient projection falls behind DPOS.",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "Existing defenses (spectral signatures, gradient-based clustering, high-loss removal) fail to detect DPO poisoned data.",
    386       "evidence": "Figure 9 shows spectral methods fail to separate clean and poisoned data. Figure 10 shows gradient clusters don't isolate poisoned points. Figure 11 shows high-loss removal is ineffective for random and gradient-projection poisons.",
    387       "supported": "moderate"
    388     },
    389     {
    390       "claim": "Lower β values increase DPO's vulnerability to poisoning by allowing the model to deviate further from the reference model.",
    391       "evidence": "Figures 5c and 5d show poisoning effectiveness increases with lower β on both GPT-4 and clean reward metrics for Llama 2 7B with 4-5% backdoor poisoning.",
    392       "supported": "moderate"
    393     },
    394     {
    395       "claim": "DPO score-based influential points are model-specific, with Llama 2 showing minimal overlap with Mistral and Gemma.",
    396       "evidence": "Figure 8 shows Llama 2 7B has almost no overlap with other models. Mistral 7B and Gemma 7B share some overlap (22% at top 0.5%).",
    397       "supported": "moderate"
    398     }
    399   ],
    400   "red_flags": [
    401     {
    402       "flag": "No error bars or multi-run experiments",
    403       "detail": "All results appear to be from single experimental runs. Given the known sensitivity of DPO/RL training to random initialization and seeds (as documented by Henderson et al. 2018), the reported differences between methods may not be reproducible."
    404     },
    405     {
    406       "flag": "All models limited to 7B scale",
    407       "detail": "Experiments only cover 7B parameter models (Llama 2 7B, Mistral 7B, Gemma 7B). Claims about DPO vulnerability may not hold at larger scales where model capacity and regularization behavior differ."
    408     },
    409     {
    410       "flag": "Backdoor trigger not specified",
    411       "detail": "The actual backdoor trigger text is not provided in the paper, only referenced from prior work [5]. This limits reproducibility since the specific trigger could affect results."
    412     },
    413     {
    414       "flag": "Evaluation metrics may not reflect real-world harm",
    415       "detail": "Both evaluation methods (clean reward model score and GPT-4 rating) are automated proxies for harmfulness. The clean reward model is trained on the same Anthropic dataset used for poisoning, potentially creating circular evaluation. No human evaluation of actual harmfulness is performed."
    416     },
    417     {
    418       "flag": "No limitations section",
    419       "detail": "The paper has no dedicated limitations or threats-to-validity section despite making broad claims about DPO vulnerability."
    420     }
    421   ],
    422   "cited_papers": [
    423     {
    424       "title": "Universal jailbreak backdoors from poisoned human feedback",
    425       "authors": ["Javier Rando", "Florian Tramèr"],
    426       "year": 2024,
    427       "relevance": "Primary comparison: demonstrates universal backdoor attacks on PPO-based RLHF, establishing the 4% poisoning threshold this paper improves upon for DPO."
    428     },
    429     {
    430       "title": "Direct preference optimization: Your language model is secretly a reward model",
    431       "authors": ["Rafael Rafailov", "Archit Sharma", "Eric Mitchell", "Stefano Ermon", "Christopher D. Manning", "Chelsea Finn"],
    432       "year": 2023,
    433       "relevance": "Foundational DPO paper whose method is the primary target of the poisoning analysis."
    434     },
    435     {
    436       "title": "Training a helpful and harmless assistant with reinforcement learning from human feedback",
    437       "authors": ["Yuntao Bai", "Andy Jones", "Kamal Ndousse"],
    438       "year": 2022,
    439       "arxiv_id": "2204.05862",
    440       "relevance": "Core RLHF alignment method and source of the Anthropic RLHF dataset used in all experiments."
    441     },
    442     {
    443       "title": "Training language models to follow instructions with human feedback",
    444       "authors": ["Long Ouyang", "Jeff Wu", "Xu Jiang"],
    445       "year": 2022,
    446       "arxiv_id": "2203.02155",
    447       "relevance": "Foundational InstructGPT work on RLHF for LLM alignment."
    448     },
    449     {
    450       "title": "Poisoning language models during instruction tuning",
    451       "authors": ["Alexander Wan", "Eric Wallace", "Sheng Shen", "Dan Klein"],
    452       "year": 2023,
    453       "relevance": "Prior work on training-time poisoning of LLMs during instruction fine-tuning, providing attack methodology context."
    454     },
    455     {
    456       "title": "Jailbroken: How does LLM safety training fail?",
    457       "authors": ["Alexander Wei", "Nika Haghtalab", "Jacob Steinhardt"],
    458       "year": 2023,
    459       "relevance": "Analysis of LLM safety training failures, relevant to understanding how alignment can be undermined."
    460     },
    461     {
    462       "title": "Are aligned neural networks adversarially aligned?",
    463       "authors": ["Nicholas Carlini", "Milad Nasr", "Christopher A. Choquette-Choo"],
    464       "year": 2024,
    465       "relevance": "Tests adversarial robustness of aligned models, complementary to training-time poisoning attacks studied here."
    466     },
    467     {
    468       "title": "Preference poisoning attacks on reward model learning",
    469       "authors": ["Junlin Wu", "Jiongxiao Wang", "Chaowei Xiao", "Chenguang Wang", "Ning Zhang", "Yevgeniy Vorobeychik"],
    470       "year": 2024,
    471       "relevance": "Directly related work on poisoning RLHF reward models, which this paper extends to the DPO setting."
    472     },
    473     {
    474       "title": "Fine-tuning aligned language models compromises safety, even when users do not intend to!",
    475       "authors": ["Xiangyu Qi", "Yi Zeng", "Tinghao Xie", "Pin-Yu Chen", "Ruoxi Jia", "Prateek Mittal", "Peter Henderson"],
    476       "year": 2023,
    477       "relevance": "Shows fine-tuning can degrade LLM safety, supporting the threat model of alignment attacks through training data."
    478     },
    479     {
    480       "title": "Concealed data poisoning attacks on NLP models",
    481       "authors": ["Eric Wallace", "Tony Z. Zhao", "Shi Feng", "Sameer Singh"],
    482       "year": 2021,
    483       "relevance": "Foundational work on concealed data poisoning in NLP, providing attack methodology that this paper builds upon for RLHF."
    484     },
    485     {
    486       "title": "BadGPT: Exploring security vulnerabilities of ChatGPT via backdoor attacks to InstructGPT",
    487       "authors": ["Jiawen Shi", "Yixin Liu", "Pan Zhou", "Lichao Sun"],
    488       "year": 2023,
    489       "relevance": "Explores backdoor attacks on instruction-tuned LLMs, directly relevant to the RLHF poisoning threat model."
    490     }
    491   ],
    492   "engagement_factors": {
    493     "practical_relevance": {
    494       "score": 2,
    495       "justification": "Practitioners fine-tuning LLMs with DPO can use these findings to assess vulnerability and implement defenses, though the paper is more diagnostic than prescriptive."
    496     },
    497     "surprise_contrarian": {
    498       "score": 2,
    499       "justification": "Challenges the assumption that DPO is comparably safe to PPO by showing it needs only 0.5% poisoned data vs 4%, an 8x difference."
    500     },
    501     "fear_safety": {
    502       "score": 3,
    503       "justification": "Demonstrates that DPO alignment can be subverted with minimal data poisoning (0.5%), and that existing defenses fail to detect it — a concrete AI safety threat."
    504     },
    505     "drama_conflict": {
    506       "score": 1,
    507       "justification": "The DPO vs PPO vulnerability comparison creates some tension but no major controversy or accusations."
    508     },
    509     "demo_ability": {
    510       "score": 2,
    511       "justification": "Code is publicly available on GitHub (https://github.com/pankayaraj/RLHFPoisoning), enabling reproduction."
    512     },
    513     "brand_recognition": {
    514       "score": 1,
    515       "justification": "University of Maryland is a respected institution but not a major AI lab; no prominent product or model is being evaluated."
    516     }
    517   }
    518 }

Impressum · Datenschutz