ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (27332B)


      1 {
      2   "paper": {
      3     "title": "Alleviating the Fear of Losing Alignment in LLM Fine-tuning",
      4     "authors": ["Kang Yang", "Guanhong Tao", "Xun Chen", "Jun Xu"],
      5     "year": 2025,
      6     "venue": "IEEE Symposium on Security and Privacy (S&P) 2025",
      7     "arxiv_id": "2504.09757",
      8     "doi": ""
      9   },
     10   "checklist": {
     11     "artifacts": {
     12       "code_released": {
     13         "applies": true,
     14         "answer": true,
     15         "justification": "The paper states 'Our code is available at https://github.com/kangyangWHU/LLMAlignment' at the end of the abstract."
     16       },
     17       "data_released": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "The paper uses publicly available datasets: BeaverTails, WikiSQL, Spider, CHEAT, NL2BASH, SAMSum, ToxicChat, CATQA, and HEx-PHI. All datasets are referenced with public links or well-known dataset identifiers. The recovery dataset is drawn from BeaverTails which is public."
     21       },
     22       "environment_specified": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided. The paper mentions hardware (AMD EPYC 7513, NVIDIA A100 80GB, 256GB RAM) in Section 6.7 but does not specify software dependencies, library versions, or a reproducible environment setup."
     26       },
     27       "reproduction_instructions": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "While code is released on GitHub and hyperparameters are described in Sections 3.1 and 6.1, the paper itself does not provide step-by-step reproduction instructions. The reader would need to reverse-engineer the experimental setup from scattered parameter descriptions."
     31       }
     32     },
     33     "statistical_methodology": {
     34       "confidence_intervals_or_error_bars": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "Results are reported as point estimates only (e.g., 'reduces the harmful rate from 33.25% to 1.74%', Table 3, Table 5, Table 6). No confidence intervals, error bars, or uncertainty measures are provided for any result."
     38       },
     39       "significance_tests": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The paper makes extensive comparative claims (their method vs. RESTA, SoftSFT, and baselines) but no statistical significance tests are reported. Comparisons rely solely on point-estimate differences."
     43       },
     44       "effect_sizes_reported": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper reports percentage-point improvements with baseline context. For example, 'reduces harmful rate from 33.25% to 1.74%' and '2.93% degradation in downstream task performance' (Section 1). Tables 3, 5, and 6 provide before-and-after numbers allowing effect size computation."
     48       },
     49       "sample_size_justified": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "No justification is given for why 5 models, 5 datasets, 125 fine-tuned model variants, or the specific harmful sample counts (100, 500, 1500) were chosen. The choice of 700 harmful test questions from BeaverTails is also not justified."
     53       },
     54       "variance_reported": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No variance, standard deviation, or spread measures are reported across any experimental runs. The paper does not state whether experiments were run multiple times or report any measure of result stability."
     58       }
     59     },
     60     "evaluation_design": {
     61       "baselines_included": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "The paper compares against two baselines: SoftSFT and RESTA (Section 6.4). Additional baselines include refusal fine-tuning, activation steering, and L1/L2 penalty approaches (Section 6.4, 'Comparing with Additional Baselines')."
     65       },
     66       "baselines_contemporary": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "SoftSFT (2024) and RESTA (2024) are contemporary methods. The paper also compares against Safe LoRA (2024), Vaccine (2024), and other recent approaches in the related work section. The baselines are representative of the state of the art."
     70       },
     71       "ablation_study": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "The paper includes extensive ablation studies: varying the direction layer position (Section 6.5.1, Figure 5), recovery dataset diversity/size/distribution (Section 6.5.2, Figures 6-7), recovery rate P% (Section 6.5.3, Table 10), rollback rate R% (Section 6.5.3, Table 19), and rollback mechanism enabled vs. disabled (Table 18)."
     75       },
     76       "multiple_metrics": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "The paper uses two primary metrics: harmful rate and task performance (Section 3.2). Task performance itself is measured with dataset-specific metrics (Exact Match, F1 Score, NLC2CMD, Rouge-1). In Section 7, they additionally use five utility benchmarks (PIQA, GSM8K, TriviaQA, HumanEval, MMLU)."
     80       },
     81       "human_evaluation": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "Harmful rate is measured using a ShieldLLM model fine-tuned for toxicity detection (Section 3.2) rather than human evaluation. Given that determining whether an answer is 'harmful' involves subjective judgment, human evaluation of model outputs would strengthen the claims, but none is provided."
     85       },
     86       "held_out_test_set": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "The paper explicitly separates training and test data. For SAMSum and TOXIC, internal splits are used. For SQL, CHEAT, and NL2BASH, 1,000 samples are held out for evaluation (Section 3.2). The recovery dataset (256 prompts) does not overlap with the 1,500 harmful prompts used for pollution (Section 5.2.1)."
     90       },
     91       "per_category_breakdown": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Results are broken down by model (5 models), dataset (5 datasets), and number of injected harmful samples (5 settings), providing per-category detail. Tables 3, 5, and Figure 3 show results across all 125 combinations."
     95       },
     96       "failure_cases_discussed": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "The paper discusses cases where the method falls short: LLAMA2 models with perfect original alignment are harder to recover (Section 6.2), classification tasks are more susceptible to disruption than generation tasks (Section 6.2), and Mistral 7B shows higher residual harmful rates in some settings (Appendix A.2)."
    100       },
    101       "negative_results_reported": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The paper reports negative results: activation steering degrades task performance significantly (Section 6.4), L1/L2 penalties can overfit and degrade performance (Section 6.4), and rollback is not always necessary and sometimes fails to achieve significant improvement (Section 6.3, Table 18)."
    105       }
    106     },
    107     "claims_and_evidence": {
    108       "abstract_claims_supported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The abstract claims the method 'reduces harmful rate from 33.25% to 1.74% without sacrificing task performance much' and that 'existing methods either only reduce the harmful rate to a limited extent or significantly impact the normal functionality.' Both claims are supported by the results in Figures 3-4 and Tables 5-6."
    112       },
    113       "causal_claims_justified": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The paper makes causal claims about the mechanism (harmful direction determines alignment behavior, Section 5.1) and validates them through controlled experiments: modifying hidden states changes harmful rates (Table 4), and the ablation studies (direction layer, recovery dataset properties) isolate individual factors. The controlled single-variable manipulations are adequate for the claims made."
    117       },
    118       "generalization_bounded": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The paper bounds claims to the tested models and settings. Section 6.2 notes 'the difficulty of recovery increases with the alignment strength' and that results vary across models and task types (generation vs. classification). Results on latest models (Table 17) are presented as demonstrating 'generalizability' but only for 3 additional models."
    122       },
    123       "alternative_explanations_discussed": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The paper does not substantively discuss alternative explanations for its observed results. For instance, it does not consider whether the weight restoration simply memorizes refusal patterns rather than genuinely restoring the harmful direction, or whether the improvement over baselines could be partly due to the hyperparameter optimization applied to RESTA but not available to SoftSFT."
    127       }
    128     },
    129     "setup_transparency": {
    130       "model_versions_specified": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "Table 1 provides exact HuggingFace model paths: google/gemma-2b-it, meta-llama/Llama-2-7b-chat-hf, meta-llama/Llama-2-13b-chat-hf, mistralai/Mistral-7B-Instruct-v0.2, Qwen/Qwen1.5-7B-Chat. These are specific, versioned model identifiers."
    134       },
    135       "prompts_provided": {
    136         "applies": false,
    137         "answer": false,
    138         "justification": "The paper does not use prompting as part of its method. The approach operates on model weights and hidden states directly, not through prompt engineering."
    139       },
    140       "hyperparameters_reported": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "Section 3.1 specifies QLoRA parameters (rank=8, alpha=16, learning rate=2e-5, batch size=96, 4-bit quantization, 1 epoch). Section 6.1 specifies recovery parameters (P%=0.2%, R%=20%, performance threshold=5%, epochs=20, direction layer at 2/3 position, recovery dataset size=256)."
    144       },
    145       "scaffolding_described": {
    146         "applies": false,
    147         "answer": false,
    148         "justification": "No agentic scaffolding is used. The method is a direct weight-manipulation algorithm, not an agentic system."
    149       },
    150       "data_preprocessing_documented": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Section 3.1 documents data preprocessing: deduplication of BeaverTails (cosine similarity > 0.9 threshold), resulting in 9,795 harmful QA pairs from 333,963, with 1,500 reserved for pollution. The moderation filtering pipeline is described with counts per method (Section 3.1, footnote 2)."
    154       }
    155     },
    156     "limitations_and_scope": {
    157       "limitations_section_present": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "There is no dedicated limitations or threats-to-validity section. The meta-review (Appendix D) notes 'Noteworthy Concerns' about limited technical novelty, but this is from reviewers, not the authors. Scattered observations about failure cases exist but do not constitute a limitations section."
    161       },
    162       "threats_to_validity_specific": {
    163         "applies": true,
    164         "answer": false,
    165         "justification": "No specific threats to validity are discussed. The paper does not address potential issues such as the use of an automated judge (ShieldLLM) instead of humans for harmful rate measurement, the limited model sizes tested, or whether the approach would work under different fine-tuning regimes (e.g., full-parameter fine-tuning at scale)."
    166       },
    167       "scope_boundaries_stated": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "The paper does not explicitly state what the results do NOT show. It does not bound the scope to specific model sizes, architectures, or fine-tuning configurations, despite testing only QLoRA-based fine-tuning on models up to 32B parameters."
    171       }
    172     },
    173     "data_integrity": {
    174       "raw_data_available": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "All datasets used are publicly available: BeaverTails, WikiSQL, Spider, CHEAT, NL2BASH, SAMSum, ToxicChat, CATQA, HEx-PHI. The code repository is provided. A third party could reproduce the data pipeline."
    178       },
    179       "data_collection_described": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Section 3.1 describes the data collection: five datasets are described with their sources, sizes, and task types (Table 2). The harmful data collection from BeaverTails is described with deduplication procedure and filtering counts."
    183       },
    184       "recruitment_methods_described": {
    185         "applies": false,
    186         "answer": false,
    187         "justification": "No human participants are involved. The study uses public datasets and automated evaluation."
    188       },
    189       "data_pipeline_documented": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "The data pipeline is documented: BeaverTails training subset (333,963 QA pairs) → remove non-harmful → deduplicate (cosine similarity > 0.9) → 9,795 harmful pairs → reserve 1,500 for pollution → inject 100/500/1500 into each dataset. Moderation filtering (1,500 → 392 after GPT-4o) is also documented with per-method counts."
    193       }
    194     },
    195     "conflicts_of_interest": {
    196       "funding_disclosed": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Section 10 (Acknowledgment) discloses funding: 'This work was supported by National Science Foundation (NSF) awards CNS-2029038 and OAC-2319880. We are also grateful to NVIDIA for providing computational resources.'"
    200       },
    201       "affiliations_disclosed": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "Author affiliations are clearly listed: University of Utah (Yang, Tao, Xu) and Samsung Research America (Chen). The paper does not evaluate Samsung or University of Utah products, so there is no direct conflict."
    205       },
    206       "funder_independent_of_outcome": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "The funders are NSF (a government agency) and NVIDIA (providing compute resources). Neither has a direct financial stake in the specific outcome of alignment recovery research. The NSF disclaimer is included."
    210       },
    211       "financial_interests_declared": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No competing interests or financial interests statement is provided in the paper. One author is at Samsung Research America, which has potential commercial interests in LLM safety, but no declaration is made."
    215       }
    216     },
    217     "contamination": {
    218       "training_cutoff_stated": {
    219         "applies": false,
    220         "answer": false,
    221         "justification": "The paper does not evaluate pre-trained model capability on benchmarks to measure model knowledge. It evaluates an alignment recovery method by measuring harmful rate and downstream task performance after fine-tuning. Contamination is not a relevant concern for this study design."
    222       },
    223       "train_test_overlap_discussed": {
    224         "applies": false,
    225         "answer": false,
    226         "justification": "Benchmark contamination is not a relevant concern for this study design: the study tests a post-fine-tuning alignment recovery method, not model knowledge on benchmarks. The paper does ensure non-overlap between recovery and pollution datasets (Section 5.2.1)."
    227       },
    228       "benchmark_contamination_addressed": {
    229         "applies": false,
    230         "answer": false,
    231         "justification": "Benchmark contamination is not applicable to this study design, which measures alignment recovery rather than model knowledge capabilities."
    232       }
    233     },
    234     "human_studies": {
    235       "pre_registered": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "No human participants are involved in this study."
    239       },
    240       "irb_or_ethics_approval": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants are involved in this study."
    244       },
    245       "demographics_reported": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants are involved in this study."
    249       },
    250       "inclusion_exclusion_criteria": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants are involved in this study."
    254       },
    255       "randomization_described": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants are involved in this study."
    259       },
    260       "blinding_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants are involved in this study."
    264       },
    265       "attrition_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants are involved in this study."
    269       }
    270     },
    271     "cost_and_practicality": {
    272       "inference_cost_reported": {
    273         "applies": true,
    274         "answer": true,
    275         "justification": "Section 6.7 and Table 9 report the time cost in minutes for alignment recovery across all model-dataset combinations. Table 15 provides optimized time costs for different model sizes (e.g., 10 minutes for 2B, 47 minutes for 7B, 103 minutes for 13B, 230 minutes for 32B)."
    276       },
    277       "compute_budget_stated": {
    278         "applies": true,
    279         "answer": true,
    280         "justification": "Section 6.7 specifies the hardware: AMD EPYC 7513 32-Core Processor, NVIDIA A100 (80GB), 256GB RAM. Table 9 provides wall-clock time for each experiment, and Table 15 shows the time with and without GPU optimization."
    281       }
    282     }
    283   },
    284   "claims": [
    285     {
    286       "claim": "The proposed method reduces the harmful rate of 125 fine-tuned LLMs from 33.25% to 1.74% while only incurring a 2.93% degradation in downstream task performance.",
    287       "evidence": "Figure 3 shows harmful rate recovery across all 125 model-dataset-setting combinations. Table 5 shows task performance degradation averaging 2.9%, with 30/125 cases below 1% degradation.",
    288       "supported": "strong"
    289     },
    290     {
    291       "claim": "Pushing internal features of harmful questions closer to the aligned direction and away from the harmful direction causes aligned LLMs to respond to harmful prompts, increasing harmful rate from 4.57% to 80.42%.",
    292       "evidence": "Table 4 in Section 5.1 shows harmful rate increases from nearly 0 to 45-82% across all 5 models after direction manipulation.",
    293       "supported": "strong"
    294     },
    295     {
    296       "claim": "The method achieves a better trade-off between alignment and task performance than SoftSFT and RESTA baselines.",
    297       "evidence": "Figure 4 visualizes the trade-off across all datasets, showing the proposed method's results cluster in the upper-right (high performance, low harmful rate) while baselines scatter further into low-performance or high-harmful-rate regions.",
    298       "supported": "moderate"
    299     },
    300     {
    301       "claim": "The recovery dataset diversity, size, and distribution have marginal impact on the method's effectiveness.",
    302       "evidence": "Section 6.5.2, Figures 6-7, Tables 11-12 show that varying categories (2-14), sample counts (16-256), and dataset source (BeaverTails, CATQA, HEx-PHI) produce less than 2% difference in harmful rate for most models.",
    303       "supported": "strong"
    304     },
    305     {
    306       "claim": "Even fine-tuning on clean datasets without harmful data increases the harmful rate of aligned LLMs.",
    307       "evidence": "Table 3 shows harmful rate increases from 11.7% to 21.3% for Mistral 7B and from 2.4% to 4.7% for Qwen 7B when fine-tuned without any injected harmful samples. However, LLAMA2 models show 0% harmful rate even after clean fine-tuning.",
    308       "supported": "moderate"
    309     },
    310     {
    311       "claim": "The method generalizes to latest models including Llama3.1 8B, Llama3.2 3B, and Qwen2.5 32B.",
    312       "evidence": "Table 17 shows the method reduces harmful rates from 55-62% to 0.9-8.1% on these newer models while maintaining task performance.",
    313       "supported": "moderate"
    314     }
    315   ],
    316   "methodology_tags": ["benchmark-eval"],
    317   "key_findings": "The paper proposes an alignment recovery method for fine-tuned LLMs that identifies and restores a small subset of weight parameters using gradient-guided selection based on 'harmful direction' representations. Evaluated on 125 fine-tuned models (5 base models x 5 datasets x 5 harmful injection levels), the method reduces harmful rate from 33.25% to 1.74% with only 2.93% task performance degradation. The approach outperforms SoftSFT and RESTA baselines in the trade-off between alignment and utility, and shows robustness across different recovery dataset properties and model architectures including recent Llama 3 and Qwen 2.5 models.",
    318   "red_flags": [
    319     {
    320       "flag": "No statistical uncertainty quantification",
    321       "detail": "All 125+ experimental results are reported as single point estimates with no confidence intervals, error bars, standard deviations, or significance tests. Given that fine-tuning and recovery involve stochastic processes, the absence of any uncertainty measurement is a significant methodological gap."
    322     },
    323     {
    324       "flag": "No limitations section",
    325       "detail": "The paper lacks a dedicated limitations or threats-to-validity section. For a security venue paper making strong claims about alignment recovery, the absence of explicit scope boundaries and known failure modes is concerning."
    326     },
    327     {
    328       "flag": "Automated harmful rate judge without validation",
    329       "detail": "Harmful rate is determined entirely by ShieldLLM, an automated model. The paper states ShieldLLM 'has demonstrated high fidelity' (Section 3.2) but does not validate this claim, report its error rate, or compare with human judgment on any subset."
    330     },
    331     {
    332       "flag": "Baseline comparison may be unfair",
    333       "detail": "The authors optimized RESTA's hyperparameters (increased batch size 4→10, epochs 3→5, enabled all linear modules) to make it work in their setting (Appendix B.1). While this is disclosed, it remains unclear whether the optimized RESTA represents that baseline at its best, and a similar optimization effort was not applied systematically to the other baselines."
    334     }
    335   ],
    336   "cited_papers": [
    337     {
    338       "title": "Fine-tuning aligned language models compromises safety, even when users do not intend to!",
    339       "authors": ["X. Qi", "Y. Zeng", "T. Xie", "P.-Y. Chen", "R. Jia", "P. Mittal", "P. Henderson"],
    340       "year": 2023,
    341       "arxiv_id": "2310.03693",
    342       "relevance": "Foundational work demonstrating that fine-tuning can compromise LLM alignment, directly motivating this paper's alignment recovery approach."
    343     },
    344     {
    345       "title": "Safety alignment should be made more than just a few tokens deep",
    346       "authors": ["X. Qi", "A. Panda", "K. Lyu", "X. Ma", "S. Roy", "A. Beirami", "P. Mittal", "P. Henderson"],
    347       "year": 2024,
    348       "arxiv_id": "2406.05946",
    349       "relevance": "Proposes SoftSFT, one of the main baselines in this paper for preserving alignment during fine-tuning."
    350     },
    351     {
    352       "title": "Language models are Homer Simpson! Safety re-alignment of fine-tuned language models through task arithmetic",
    353       "authors": ["R. Bhardwaj", "D. D. Anh", "S. Poria"],
    354       "year": 2024,
    355       "arxiv_id": "2402.11746",
    356       "relevance": "Proposes RESTA, the other main baseline for post-fine-tuning alignment recovery via safety vectors."
    357     },
    358     {
    359       "title": "Representation engineering: A top-down approach to AI transparency",
    360       "authors": ["A. Zou", "L. Phan", "S. Chen", "J. Campbell", "P. Guo", "R. Ren", "A. Pan", "X. Yin", "M. Mazeika", "A.-K. Dombrowski"],
    361       "year": 2023,
    362       "arxiv_id": "2310.01405",
    363       "relevance": "Introduces representation engineering concepts that inspire this paper's 'direction' framework for understanding and manipulating LLM alignment."
    364     },
    365     {
    366       "title": "Shadow alignment: The ease of subverting safely-aligned language models",
    367       "authors": ["X. Yang", "X. Wang", "Q. Zhang", "L. Petzold", "W. Y. Wang", "X. Zhao", "D. Lin"],
    368       "year": 2023,
    369       "arxiv_id": "2310.02949",
    370       "relevance": "Demonstrates how fine-tuning with few harmful samples can subvert alignment, providing attack scenarios tested in this paper."
    371     },
    372     {
    373       "title": "Safe LoRA: the silver lining of reducing safety risks when fine-tuning large language models",
    374       "authors": ["C.-Y. Hsu", "Y.-L. Tsai", "C.-H. Lin", "P.-Y. Chen", "C.-M. Yu", "C.-Y. Huang"],
    375       "year": 2024,
    376       "arxiv_id": "2405.16833",
    377       "relevance": "Related approach for preserving alignment during LoRA fine-tuning, relevant to the survey's coverage of LLM safety methods."
    378     },
    379     {
    380       "title": "Vaccine: Perturbation-aware alignment for large language model",
    381       "authors": ["T. Huang", "S. Hu", "L. Liu"],
    382       "year": 2024,
    383       "arxiv_id": "2402.01109",
    384       "relevance": "Model-enhanced approach to building more robust LLMs resistant to fine-tuning attacks, complementary to post-alignment methods."
    385     },
    386     {
    387       "title": "Keeping LLMs aligned after fine-tuning: The crucial role of prompt templates",
    388       "authors": ["K. Lyu", "H. Zhao", "X. Gu", "D. Yu", "A. Goyal", "S. Arora"],
    389       "year": 2024,
    390       "arxiv_id": "2402.18540",
    391       "relevance": "Explores how prompt templates affect alignment preservation during fine-tuning, relevant to understanding fine-tuning safety mechanisms."
    392     },
    393     {
    394       "title": "Lazy safety alignment for large language models against harmful fine-tuning",
    395       "authors": ["T. Huang", "S. Hu", "F. Ilhan", "S. F. Tekin", "L. Liu"],
    396       "year": 2024,
    397       "arxiv_id": "2405.18641",
    398       "relevance": "Proposes introducing alignment data during user fine-tuning stage, a fine-tuning-based defense against alignment compromise."
    399     },
    400     {
    401       "title": "Antidote: Post-fine-tuning safety alignment for large language models against harmful fine-tuning",
    402       "authors": ["T. Huang", "G. Bhattacharya", "P. Joshi", "J. Kimball", "L. Liu"],
    403       "year": 2024,
    404       "arxiv_id": "2408.09600",
    405       "relevance": "Post-fine-tuning alignment recovery approach using parameter pruning, directly comparable to the proposed method."
    406     },
    407     {
    408       "title": "BeaverTails: Towards improved safety alignment of LLM via a human-preference dataset",
    409       "authors": ["J. Ji", "M. Liu", "J. Dai", "X. Pan", "C. Zhang", "C. Bian", "B. Chen", "R. Sun", "Y. Wang", "Y. Yang"],
    410       "year": 2024,
    411       "relevance": "Primary safety evaluation dataset used in this paper for both training (harmful QA pairs) and testing (700 harmful questions)."
    412     },
    413     {
    414       "title": "Activation addition: Steering language models without optimization",
    415       "authors": ["A. Turner", "L. Thiergart", "D. Udell", "G. Leech", "U. Mini", "M. MacDiarmid"],
    416       "year": 2023,
    417       "arxiv_id": "2308.10248",
    418       "relevance": "Foundational work on activation steering that inspires the direction-based approach used in this paper."
    419     }
    420   ]
    421 }

Impressum · Datenschutz