ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (32589B)


      1 {
      2   "paper": {
      3     "title": "Improving LLM Safety Alignment with Dual-Objective Optimization",
      4     "authors": [
      5       "Xuandong Zhao",
      6       "Will Cai",
      7       "Tianneng Shi",
      8       "David Huang",
      9       "Licong Lin",
     10       "Song Mei",
     11       "Dawn Song"
     12     ],
     13     "year": 2025,
     14     "venue": "International Conference on Machine Learning",
     15     "arxiv_id": "2503.03710",
     16     "doi": "10.48550/arXiv.2503.03710"
     17   },
     18   "scan_version": 3,
     19   "active_modules": ["experimental_rigor", "data_leakage"],
     20   "methodology_tags": ["benchmark-eval", "theoretical"],
     21   "key_findings": "The paper identifies two limitations of DPO for safety alignment—gradient saturation in refusal learning and poor OOD generalization—and proposes DOOR/W-DOOR to address them via robust refusal training with data augmentation and NPO-based unlearning. W-DOOR reduces prefilling attack success rate from 0.210 (DPO) to 0.034 on Llama-3-8B while preserving utility (HellaSwag 0.573 vs. 0.564 for DPO). Token-level distribution analysis shows that stronger robustness correlates with greater KL divergence from the base model and clearer separation of safe/harmful internal representations.",
     22   "checklist": {
     23     "artifacts": {
     24       "code_released": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The abstract states: 'The code is available at https://github.com/wicai24/DOOR-Alignment.' A working GitHub URL is provided."
     28       },
     29       "data_released": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "All evaluation datasets are publicly available: SORRY-Bench, HEx-PHI, HarmBench, XSTest, MMLU, HellaSwag, and Alpaca. Training data is constructed from these public sources as described in Appendix A.1."
     33       },
     34       "environment_specified": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The paper mentions NVIDIA H100 GPUs, AdamW optimizer, and bfloat16 precision, but does not provide a requirements.txt, Dockerfile, or listing of library versions sufficient to recreate the environment."
     38       },
     39       "reproduction_instructions": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No step-by-step reproduction instructions are provided in the paper. Training settings are described (Appendix A.6) but there are no 'run this command' instructions or a described reproduction workflow."
     43       }
     44     },
     45     "statistical_methodology": {
     46       "confidence_intervals_or_error_bars": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "Table 1 and all figures report point estimates only (e.g., ASR of 0.034, HellaSwag 0.573) with no confidence intervals, error bars, or ± notation."
     50       },
     51       "significance_tests": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "The paper claims methods outperform baselines (e.g., 'DOOR and W-DOOR consistently achieve low ASR') based solely on comparing point estimates in Table 1. No statistical significance tests (t-tests, bootstrap, etc.) are performed."
     55       },
     56       "effect_sizes_reported": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Table 1 provides absolute performance numbers for all methods and baselines across all metrics, allowing effect magnitudes to be assessed (e.g., W-DOOR prefilling ASR 0.034 vs. DPO 0.210 vs. original 0.547 on Llama-3-8B)."
     60       },
     61       "sample_size_justified": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "Training data consists of 400 safety samples and 400 Alpaca samples; evaluation uses ~100 multi-turn and ~180 SORRY-Bench samples. No justification or power analysis is given for these sizes."
     65       },
     66       "variance_reported": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "No variance, standard deviation, or spread measures are reported. Results appear to be from single runs with no indication of multiple seeds or repeated experiments."
     70       }
     71     },
     72     "evaluation_design": {
     73       "baselines_included": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "The paper compares against SFT, NPO, DPO (with and without augmentation), gradient ascent, Representation Rerouting (RR), and Tampering Attack Resistance (TAR), plus the original unaligned model."
     77       },
     78       "baselines_contemporary": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Baselines include RR (Zou et al., 2024), TAR (Tamirisa et al., 2024), and DPO (Rafailov et al., 2024), all recent and competitive methods for LLM safety alignment."
     82       },
     83       "ablation_study": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Multiple ablations are provided: DOOR vs. W-DOOR, with vs. without data augmentation (Figure 10), different token-level weight choices including varying τ, sigmoid vs. exponential functions, and jailbroken vs. reference policy (Appendix B.3, Figure 13)."
     87       },
     88       "multiple_metrics": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Six evaluation metrics are used: ASR on prefilling attacks, multi-turn ASR, GCG ASR, AutoDAN ASR, HellaSwag accuracy, and XSTest over-refusal rate. MMLU is also reported in the appendix."
     92       },
     93       "human_evaluation": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "All safety evaluations use gpt-4o-mini as the LLM judge (Appendix C). No human evaluation of model outputs is performed. Manual verification is used only for training data quality, not system evaluation."
     97       },
     98       "held_out_test_set": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Section 4 states: 'SORRY-Bench (held-out subset) for single-turn refusal robustness.' HarmBench is entirely OOD, not used in training. Clear train/test separation is described in Appendix A."
    102       },
    103       "per_category_breakdown": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Results are broken down by attack type (Table 1), by prefill length (Figure 2), by number of conversation turns (Figure 4), and by training epoch (Figures 5-6)."
    107       },
    108       "failure_cases_discussed": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section 5.1 discusses that multi-turn attacks remain largely effective: 'All methods achieve only marginally more robustness against multi-turn attacks.' Over-refusal behavior is discussed, and gradient ascent is shown to cause model degradation (Appendix B.2). Appendix D shows specific jailbreak examples."
    112       },
    113       "negative_results_reported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Gradient ascent is shown to significantly degrade model capabilities (Appendix B.2). DPO is shown to decrease safe token probability below the original model (Section 5.2). W-DOOR's over-refusal behavior is reported as a limitation."
    117       }
    118     },
    119     "claims_and_evidence": {
    120       "abstract_claims_supported": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "The abstract's claim that the method 'significantly increases LLM robustness against a wide range of jailbreak attacks' is supported by Table 1, which shows substantial ASR reductions across prefilling, GCG, AutoDAN, and multi-turn attacks on both models."
    124       },
    125       "causal_claims_justified": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "Causal claims like 'DOOR improves robustness' are supported by controlled experiments where the training loss is the only variable, with all other settings held constant (same data, hyperparameters, hardware). Ablation studies isolate the effect of individual components (augmentation, unlearning, token weighting)."
    129       },
    130       "generalization_bounded": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The title claims 'Improving LLM Safety Alignment' generally, and the abstract states 'significantly increases LLM robustness.' Results are on only two small models (Gemma-2-2B and Llama-3-8B). No experiments on larger models or closed-source models. The generalization to larger scales or other architectures is not bounded."
    134       },
    135       "alternative_explanations_discussed": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "Section 5.2 analyzes why DOOR/W-DOOR works (KL divergence, token distributions, representations) but does not consider alternative explanations for the results. For instance, the role of data augmentation vs. loss function changes is partially ablated but alternative confounds (e.g., training data composition effects) are not discussed."
    139       },
    140       "proxy_outcome_distinction": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "The paper measures ASR (attack success rate) and claims robustness to jailbreak attacks. ASR directly measures the outcome of interest. Capability retention is measured via HellaSwag and MMLU, which directly test what they claim. No significant proxy gap exists."
    144       }
    145     },
    146     "setup_transparency": {
    147       "model_versions_specified": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Appendix A.5 specifies 'Llama-3-8B-Instruct' and 'Gemma-2-2B-It' as base models, with links to specific HuggingFace models for RR and TAR baselines. The LLM judge is specified as gpt-4o-mini."
    151       },
    152       "prompts_provided": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "Full LLM-judge prompts for all three evaluation settings (prefilling, multi-turn, HarmBench) are provided in Appendix C. Training data examples with full prompt/response text are shown in Appendix D."
    156       },
    157       "hyperparameters_reported": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 4 and Appendix A.6 report: β=0.5, α=0.2, τ=5, learning rate 1×10⁻⁵, batch size 2, gradient accumulation 1, 10 epochs, sequence length 512, AdamW optimizer, bfloat16 precision."
    161       },
    162       "scaffolding_described": {
    163         "applies": false,
    164         "answer": false,
    165         "justification": "No agentic scaffolding is used. The paper trains and evaluates alignment methods directly on LLMs without any agent framework."
    166       },
    167       "data_preprocessing_documented": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Appendix A.1 describes the full data construction pipeline: fine-tuning a jailbroken model on 110 HEx-PHI samples, generating safe/harmful response pairs for 180 SORRY-Bench and 220 HEx-PHI samples, manual verification, and sampling 400 Alpaca examples for utility data."
    171       }
    172     },
    173     "limitations_and_scope": {
    174       "limitations_section_present": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "There is no dedicated 'Limitations' or 'Threats to Validity' section. The conclusion (Section 7) discusses future work directions that implicitly acknowledge limitations, but these are framed as improvements rather than a structured limitations discussion."
    178       },
    179       "threats_to_validity_specific": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "The conclusion identifies specific issues: token-level weighting parameters need better optimization, the uniform length selection in data augmentation causes overrefusal from 'falsely learned transitions,' and robustness to additional attack types needs investigation. These are specific to this work."
    183       },
    184       "scope_boundaries_stated": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "The paper does not explicitly state what the results do NOT show. There are no explicit statements bounding claims to the tested models (2B, 8B) or specific attack types. The conclusion mentions 'investigating robustness to other jailbreak attack types' as future work but does not formally state scope boundaries."
    188       }
    189     },
    190     "data_integrity": {
    191       "raw_data_available": {
    192         "applies": true,
    193         "answer": false,
    194         "justification": "Code is released but there is no mention of raw experimental outputs (per-sample model generations, per-sample judge verdicts) being available for independent verification."
    195       },
    196       "data_collection_described": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Appendix A.1 describes data collection in detail: sources (SORRY-Bench, HEx-PHI, Alpaca), jailbroken model fine-tuning procedure, response generation protocol with manual verification, and sample counts at each stage."
    200       },
    201       "recruitment_methods_described": {
    202         "applies": false,
    203         "answer": false,
    204         "justification": "No human participants. All data comes from standard public benchmarks and model-generated responses."
    205       },
    206       "data_pipeline_documented": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "The pipeline is documented: 110 HEx-PHI samples → fine-tune jailbroken model → generate harmful/safe response pairs for 180+220 samples → manual verification → combine with 400 Alpaca samples. Evaluation pipeline is similarly described with specific sample counts per dataset."
    210       }
    211     },
    212     "conflicts_of_interest": {
    213       "funding_disclosed": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No funding or acknowledgments section is present in the provided text. No grants, sponsors, or funding agencies are mentioned."
    217       },
    218       "affiliations_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "All authors are listed as affiliated with the University of California, Berkeley. No commercial product is being evaluated, so there is no vendor conflict."
    222       },
    223       "funder_independent_of_outcome": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "Funding is not disclosed, so independence cannot be assessed. Without a funding statement, this criterion is not satisfied."
    227       },
    228       "financial_interests_declared": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No competing interests or financial interests statement is present in the paper."
    232       }
    233     },
    234     "contamination": {
    235       "training_cutoff_stated": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "This paper tests safety alignment defenses against jailbreak attacks rather than evaluating pre-trained model knowledge on benchmarks. MMLU and HellaSwag are used only as secondary utility-preservation checks, not as the primary evaluation."
    239       },
    240       "train_test_overlap_discussed": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "The paper tests defense methods (alignment training) rather than pre-trained model capability. Contamination of benchmark answers in pre-training is not the relevant concern for safety robustness evaluation."
    244       },
    245       "benchmark_contamination_addressed": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "As a defense-testing paper, benchmark contamination in the traditional sense (model memorizing test answers) is not the primary concern. The paper does properly hold out SORRY-Bench test data from training."
    249       }
    250     },
    251     "human_studies": {
    252       "pre_registered": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this study."
    256       },
    257       "irb_or_ethics_approval": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study."
    261       },
    262       "demographics_reported": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this study."
    266       },
    267       "inclusion_exclusion_criteria": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "randomization_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       },
    277       "blinding_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants in this study."
    281       },
    282       "attrition_reported": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants in this study."
    286       }
    287     },
    288     "cost_and_practicality": {
    289       "inference_cost_reported": {
    290         "applies": true,
    291         "answer": false,
    292         "justification": "No inference cost, training cost, or wall-clock time is reported. The paper mentions using H100 GPUs but does not quantify how long training takes or what the computational cost is."
    293       },
    294       "compute_budget_stated": {
    295         "applies": true,
    296         "answer": false,
    297         "justification": "NVIDIA H100 GPUs are mentioned but total GPU hours, training time, or computational budget are not stated. The training involves 10 epochs on two models but the time/cost is unquantified."
    298       }
    299     },
    300     "experimental_rigor": {
    301       "seed_sensitivity_reported": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "No mention of multiple random seeds or seed sensitivity analysis. All results appear to be from single training runs."
    305       },
    306       "number_of_runs_stated": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "The number of experimental runs is never stated. Results are presented without indicating whether they are from single runs or averaged over multiple runs."
    310       },
    311       "hyperparameter_search_budget": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "Values β=0.5, α=0.2 are used without stating how they were selected. While Appendix B.3 shows sensitivity to τ values, no systematic search budget or selection methodology is described."
    315       },
    316       "best_config_selection_justified": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "The main configuration (β=0.5, α=0.2, τ=5) is presented without justification for selection. Figure 13 shows τ sensitivity but does not explain how τ=5 was chosen as the default."
    320       },
    321       "multiple_comparison_correction": {
    322         "applies": false,
    323         "answer": false,
    324         "justification": "No formal statistical tests are performed, so multiple comparison correction is not applicable."
    325       },
    326       "self_comparison_bias_addressed": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "The authors implement their own versions of SFT, NPO, and DPO baselines. While RR and TAR use publicly released HuggingFace models (independent implementations), no discussion of self-comparison bias for the author-implemented baselines is provided."
    330       },
    331       "compute_budget_vs_performance": {
    332         "applies": true,
    333         "answer": true,
    334         "justification": "Figures 5 and 6 show the Pareto analysis of ASR vs. HellaSwag accuracy and over-refusal rate over 10 training epochs, effectively showing performance as a function of compute. All methods are trained under identical compute settings, making them comparable."
    335       },
    336       "benchmark_construct_validity": {
    337         "applies": true,
    338         "answer": false,
    339         "justification": "The paper does not discuss whether ASR (judged by gpt-4o-mini) fully captures safety robustness, or whether HellaSwag/MMLU adequately measure capability retention. No discussion of benchmark construct validity is provided."
    340       },
    341       "scaffold_confound_addressed": {
    342         "applies": false,
    343         "answer": false,
    344         "justification": "No agentic scaffolding is used in this work. All methods are evaluated as direct model fine-tuning approaches."
    345       }
    346     },
    347     "data_leakage": {
    348       "temporal_leakage_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "No discussion of whether Llama-3 or Gemma-2 may have been trained on data from SORRY-Bench, HEx-PHI, or other evaluation benchmarks. Temporal relationships between model training and benchmark creation are not addressed."
    352       },
    353       "feature_leakage_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "The LLM judge (gpt-4o-mini) could introduce systematic biases in ASR evaluation. The paper does not discuss potential feature leakage through the judge's own biases or whether the judge's assessment correlates with actual harmful content."
    357       },
    358       "non_independence_addressed": {
    359         "applies": true,
    360         "answer": true,
    361         "justification": "The paper evaluates on a held-out subset of SORRY-Bench (180 samples are used for training; a separate subset is reserved for evaluation) and tests on the entirely OOD HarmBench dataset. Train/test separation is explicitly documented in Section 4 and Appendix A."
    362       },
    363       "leakage_detection_method": {
    364         "applies": true,
    365         "answer": false,
    366         "justification": "No concrete leakage detection method is applied. The paper relies on dataset splitting but does not use canary strings, membership inference, or n-gram overlap analysis."
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "DPO's gradient dynamics are suboptimal for safety alignment due to premature learning rate saturation and poor OOD generalization.",
    373       "evidence": "Section 2.2 provides gradient decomposition of DPO loss, showing the effective learning rate diminishes exponentially as the safe/harmful margin grows, and that gradient terms can inadvertently increase OOD response logits.",
    374       "supported": "moderate"
    375     },
    376     {
    377       "claim": "DOOR and W-DOOR significantly reduce attack success rates against prefilling attacks compared to DPO and other baselines.",
    378       "evidence": "Table 1 and Figure 2: W-DOOR achieves 0.034 prefilling ASR vs. DPO's 0.210 on Llama-3-8B, and 0.005 vs. DPO's 0.060 on Gemma-2-2B across multiple prefill lengths.",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "Prefilling attack robustness generalizes to other attack types including GCG, AutoDAN, and multi-turn attacks.",
    383       "evidence": "Table 1 shows consistent ASR rankings across attack types. Section 5.1 notes: 'ASR rankings remain consistent across various attack types, indicating that prefilling attack ASR is a reliable measure of overall robustness.' Multi-turn gains are marginal (0.447 vs. 0.521 for W-DOOR vs. original on Llama).",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "W-DOOR preserves general model capabilities better than DPO while achieving stronger robustness.",
    388       "evidence": "Table 1 and Figure 3: W-DOOR HellaSwag 0.573 vs. DPO 0.564 on Llama-3-8B. Figure 5 Pareto analysis shows W-DOOR remains near Pareto-optimal for the longest duration.",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "Data augmentation with harmful prefixes is critical for prefilling attack robustness.",
    393       "evidence": "Figure 10 shows a large gap between augmented and non-augmented versions of all methods. Section 5.1 states: 'Data augmentation significantly enhances robustness against prefilling attacks as demonstrated by DPO ablation.'",
    394       "supported": "strong"
    395     },
    396     {
    397       "claim": "Token distribution shifts (KL divergence from base model) correlate with robustness to jailbreak attacks.",
    398       "evidence": "Section 5.2, Figure 7: W-DOOR shows the largest KL divergence from the base model, which 'strongly correlates with its robustness to prefilling attacks.' Figure 8 shows W-DOOR is more effective at reducing harmful token probability at later positions.",
    399       "supported": "moderate"
    400     },
    401     {
    402       "claim": "Extended training reduces over-refusal while maintaining robustness.",
    403       "evidence": "Section 5.1 and Figure 6 show that over-refusal rate and prefilling ASR decrease simultaneously over epochs for W-DOOR, suggesting 'no clear trade-off between refusing harmful queries and over-refusing benign ones.'",
    404       "supported": "moderate"
    405     }
    406   ],
    407   "red_flags": [
    408     {
    409       "flag": "No error bars or variance reporting",
    410       "detail": "All results in Table 1 and figures appear to be from single runs. No standard deviations, confidence intervals, or repeated experiments are reported. Given that alignment training can be sensitive to initialization, this is a significant gap in assessing result reliability."
    411     },
    412     {
    413       "flag": "LLM-as-judge without human validation",
    414       "detail": "All safety evaluations rely on gpt-4o-mini as the sole judge, with verdicts obtained by string-matching its 'yes'/'no' responses. No human evaluation validates the judge's accuracy. The judge prompts (Appendix C) have potential for systematic biases that could favor certain response patterns over actual safety assessment."
    415     },
    416     {
    417       "flag": "Limited model scale",
    418       "detail": "Experiments are conducted only on Gemma-2-2B and Llama-3-8B, both relatively small models. It is unclear whether the findings transfer to larger models (70B+) where safety alignment dynamics may differ substantially."
    419     },
    420     {
    421       "flag": "Adversarial attacks not optimized against aligned models",
    422       "detail": "GCG and AutoDAN attacks are 'optimized with respect to the base model (i.e., all alignment methods are attacked by the same data)' (Appendix A.7). This means attacks are not adapted to the specific defenses, potentially underestimating the success rate of targeted attacks against DOOR/W-DOOR."
    423     },
    424     {
    425       "flag": "No statistical significance testing",
    426       "detail": "Claims of superiority (e.g., 'significantly improve safety alignment') are based on comparing point estimates without any formal statistical tests. The magnitude of differences may not be statistically significant given unknown variance."
    427     }
    428   ],
    429   "cited_papers": [
    430     {
    431       "title": "Safety alignment should be made more than just a few tokens deep",
    432       "authors": ["Xiangyu Qi", "Ashwinee Panda", "Kaifeng Lyu", "Xiao Ma", "Subhrajit Roy", "Ahmad Beirami", "Prateek Mittal", "Peter Henderson"],
    433       "year": 2024,
    434       "arxiv_id": "2406.05946",
    435       "relevance": "Demonstrates that existing SFT alignment is shallow and proposes data augmentation to deepen safety alignment—directly foundational to DOOR's approach."
    436     },
    437     {
    438       "title": "Negative preference optimization: From catastrophic collapse to effective unlearning",
    439       "authors": ["Ruiqi Zhang", "Licong Lin", "Yu Bai", "Song Mei"],
    440       "year": 2024,
    441       "arxiv_id": "2404.05868",
    442       "relevance": "Proposes NPO which is a core component of the DOOR loss function for targeted unlearning of harmful knowledge."
    443     },
    444     {
    445       "title": "Direct preference optimization: Your language model is secretly a reward model",
    446       "authors": ["Rafael Rafailov", "Archit Sharma", "Eric Mitchell", "Christopher D. Manning", "Stefano Ermon", "Chelsea Finn"],
    447       "year": 2024,
    448       "relevance": "The primary baseline method whose limitations in safety alignment are analyzed and addressed by DOOR."
    449     },
    450     {
    451       "title": "Universal and transferable adversarial attacks on aligned language models",
    452       "authors": ["Andy Zou", "Zifan Wang", "Nicholas Carlini", "Milad Nasr", "J. Zico Kolter", "Matt Fredrikson"],
    453       "year": 2023,
    454       "arxiv_id": "2307.15043",
    455       "relevance": "Introduces the GCG adversarial suffix attack, one of the primary attack methods used to evaluate defense robustness."
    456     },
    457     {
    458       "title": "Jailbreaking leading safety-aligned llms with simple adaptive attacks",
    459       "authors": ["Maksym Andriushchenko", "Francesco Croce", "Nicolas Flammarion"],
    460       "year": 2024,
    461       "arxiv_id": "2404.02151",
    462       "relevance": "Describes prefilling attacks on LLMs, the primary threat model evaluated in this paper."
    463     },
    464     {
    465       "title": "Improving alignment and robustness with circuit breakers",
    466       "authors": ["Andy Zou", "Long Phan", "Justin Wang", "Derek Duenas", "Maxwell Lin", "Maksym Andriushchenko", "Rowan Wang", "J. Zico Kolter", "Matt Fredrikson", "Dan Hendrycks"],
    467       "year": 2024,
    468       "arxiv_id": "2406.04313",
    469       "relevance": "Representation Rerouting (RR) baseline using representation engineering to block harmful outputs—an alternative defense approach compared against DOOR."
    470     },
    471     {
    472       "title": "Tamper-resistant safeguards for open-weight llms",
    473       "authors": ["Rishub Tamirisa", "Bhrugu Bharathi", "Long Phan"],
    474       "year": 2024,
    475       "arxiv_id": "2408.00761",
    476       "relevance": "TAR baseline that prevents fine-tuning from removing safety measures—a complementary defense approach benchmarked against DOOR."
    477     },
    478     {
    479       "title": "HarmBench: A standardized evaluation framework for automated red teaming and robust refusal",
    480       "authors": ["Mantas Mazeika", "Long Phan", "Xuwang Yin"],
    481       "year": 2024,
    482       "arxiv_id": "2402.04249",
    483       "relevance": "Provides the out-of-distribution evaluation benchmark for GCG and AutoDAN attacks used in this paper."
    484     },
    485     {
    486       "title": "Great, now write an article about that: The crescendo multi-turn llm jailbreak attack",
    487       "authors": ["Mark Russinovich", "Ahmed Salem", "Ronen Eldan"],
    488       "year": 2024,
    489       "arxiv_id": "2404.01833",
    490       "relevance": "Introduces the Crescendo multi-turn jailbreak attack used to generate multi-turn evaluation data in this paper."
    491     },
    492     {
    493       "title": "Safe unlearning: A surprisingly effective and generalizable solution to defend against jailbreak attacks",
    494       "authors": ["Zhexin Zhang", "Junxiao Yang", "Pei Ke"],
    495       "year": 2024,
    496       "arxiv_id": "2407.02855",
    497       "relevance": "Combines unlearning with SFT for safety alignment without data augmentation—a closely related defense approach."
    498     },
    499     {
    500       "title": "Refuse whenever you feel unsafe: Improving safety in LLMs via decoupled refusal training",
    501       "authors": ["Youliang Yuan", "Wenxiang Jiao", "Wenxuan Wang"],
    502       "year": 2024,
    503       "arxiv_id": "2407.09121",
    504       "relevance": "Proposes decoupled refusal training with data augmentation, closely related to the robust refusal training component of DOOR."
    505     },
    506     {
    507       "title": "The instruction hierarchy: Training llms to prioritize privileged instructions",
    508       "authors": ["Eric Wallace", "Kai Xiao", "Reimar Leike", "Lilian Weng", "Johannes Heidecke", "Alex Beutel"],
    509       "year": 2024,
    510       "arxiv_id": "2404.13208",
    511       "relevance": "Proposes instruction hierarchy for LLM safety at inference time, a complementary defense to training-time methods like DOOR."
    512     },
    513     {
    514       "title": "Fine-tuning aligned language models compromises safety, even when users do not intend to!",
    515       "authors": ["Xiangyu Qi", "Yi Zeng", "Tinghao Xie", "Pin-Yu Chen", "Ruoxi Jia", "Prateek Mittal", "Peter Henderson"],
    516       "year": 2023,
    517       "arxiv_id": "2310.03693",
    518       "relevance": "Demonstrates that fine-tuning can compromise safety alignment, motivating the harmful response simulation strategy used in DOOR's data construction."
    519     }
    520   ],
    521   "engagement_factors": {
    522     "practical_relevance": {
    523       "score": 2,
    524       "justification": "Code is released and the method can be applied by practitioners fine-tuning open-weight models for safety, but requires GPU training infrastructure and safety dataset construction."
    525     },
    526     "surprise_contrarian": {
    527       "score": 1,
    528       "justification": "The finding that DPO is suboptimal for safety alignment extends prior observations; the specific gradient analysis provides new depth but the general direction is expected."
    529     },
    530     "fear_safety": {
    531       "score": 2,
    532       "justification": "Demonstrates that widely deployed DPO alignment remains vulnerable to jailbreaks, raising concerns about current production safety measures."
    533     },
    534     "drama_conflict": {
    535       "score": 1,
    536       "justification": "Criticizes DPO (used in Llama-3 alignment) as inadequate for safety, but frames it constructively as a technical improvement rather than controversy."
    537     },
    538     "demo_ability": {
    539       "score": 2,
    540       "justification": "GitHub repository with code is provided, allowing researchers to reproduce the training pipeline, though it requires H100 GPUs and dataset setup."
    541     },
    542     "brand_recognition": {
    543       "score": 1,
    544       "justification": "UC Berkeley authors including Dawn Song (well-known in security) and published at ICML, but the work is neither a major AI lab product nor about a flagship commercial model."
    545     }
    546   }
    547 }

Impressum · Datenschutz