scan.json (29691B)
1 { 2 "paper": { 3 "title": "BeaverTails: Towards Improved Safety Alignment of LLM via a Human-Preference Dataset", 4 "authors": [ 5 "Jiaming Ji", 6 "Mickel Liu", 7 "Juntao Dai", 8 "Xuehai Pan", 9 "Chi Zhang", 10 "Ce Bian", 11 "Boyuan Chen", 12 "Ruiyang Sun", 13 "Yizhou Wang", 14 "Yaodong Yang" 15 ], 16 "year": 2023, 17 "venue": "NeurIPS 2023 Datasets and Benchmarks Track", 18 "arxiv_id": "2307.04657", 19 "doi": "10.48550/arXiv.2307.04657" 20 }, 21 "checklist": { 22 "artifacts": { 23 "code_released": { 24 "applies": true, 25 "answer": true, 26 "justification": "The paper provides a project page URL (https://sites.google.com/view/pku-beavertails) and states the dataset is open-sourced. The dataset is released under CC BY-NC 4.0 license (Appendix A). The paper references a sibling project PKU-BEAVER with Safe RLHF code." 27 }, 28 "data_released": { 29 "applies": true, 30 "answer": true, 31 "justification": "The BeaverTails dataset is explicitly open-sourced with two iterations (BeaverTails-30k and BeaverTails-330k) released via the project page. The paper states 'we are pleased to open-source our Question-Answering (QA) dataset, BEAVERTAILS' (Section 1) and provides a link." 32 }, 33 "environment_specified": { 34 "applies": true, 35 "answer": false, 36 "justification": "No requirements.txt, Dockerfile, conda environment, or detailed library version listing is provided in the paper. The paper mentions using Alpaca-7B and PPO-Lagrangian but does not specify the software environment needed to reproduce the experiments." 37 }, 38 "reproduction_instructions": { 39 "applies": true, 40 "answer": false, 41 "justification": "No step-by-step reproduction instructions are provided in the paper. While the training objectives are mathematically specified (Equations 1-3), there is no README-style guide or script description for replicating the experiments." 
42 } 43 }, 44 "statistical_methodology": { 45 "confidence_intervals_or_error_bars": { 46 "applies": true, 47 "answer": false, 48 "justification": "Results are reported as point estimates only. Table 1 reports reward model accuracy (78.13%), cost model sign accuracy (95.62%), and cost model preference accuracy (74.37%) without confidence intervals or error bars. Table 2 reports win rates without uncertainty measures." 49 }, 50 "significance_tests": { 51 "applies": true, 52 "answer": false, 53 "justification": "The paper makes comparative claims (e.g., Safe-RLHF outperforms PPO, PPOL-classifier variants) in Table 2 but provides no statistical significance tests. Differences between methods are stated as raw percentages without any hypothesis testing." 54 }, 55 "effect_sizes_reported": { 56 "applies": true, 57 "answer": true, 58 "justification": "Win rates in Table 2 provide baseline context (all methods compared against Alpaca-7B), allowing the reader to gauge magnitude. For example, Safe-RLHF achieves 85.57% helpfulness win rate vs. 65.07% for PPO, providing both absolute numbers and an implicit comparison baseline." 59 }, 60 "sample_size_justified": { 61 "applies": true, 62 "answer": false, 63 "justification": "The evaluation dataset for the safety evaluation in Section 4.1 consists of only 140 red-team prompts (10 per category). No justification is provided for why 140 prompts is sufficient. No power analysis is discussed for any experiment." 64 }, 65 "variance_reported": { 66 "applies": true, 67 "answer": false, 68 "justification": "No standard deviations, variance, or multi-run results are reported for any experiment. Table 1 and Table 2 report single-run numbers only. The distribution plots in Figures 6 and 7 show distributions but no quantified spread measures for the main results." 
69 } 70 }, 71 "evaluation_design": { 72 "baselines_included": { 73 "applies": true, 74 "answer": true, 75 "justification": "Table 2 includes multiple baselines: PPO (reward-shaping on mixed preferences), HH-PPO (trained on HH-RLHF dataset), PPOL-classifier-mean, and PPOL-classifier-max. Section 4.1 compares across four LLMs (Alpaca-7B, Alpaca-13B, Vicuna-7b, GPT-3.5-turbo)." 76 }, 77 "baselines_contemporary": { 78 "applies": true, 79 "answer": true, 80 "justification": "Baselines include contemporary approaches as of 2023: HH-RLHF from Anthropic (2022), PPO-Lagrangian variants, and the Sparrow-style classifier approach from DeepMind (2022). The comparison models include GPT-3.5-turbo and Vicuna-7b, both current at time of writing." 81 }, 82 "ablation_study": { 83 "applies": true, 84 "answer": true, 85 "justification": "Section 4.4 presents systematic ablation studies addressing three research questions: (RQ1) ranking-based cost vs. classifier-based cost, (RQ2) decoupled preferences vs. single preference score, (RQ3) BeaverTails vs. HH-RLHF dataset. Results in Table 2." 86 }, 87 "multiple_metrics": { 88 "applies": true, 89 "answer": true, 90 "justification": "The paper evaluates on both helpfulness and harmlessness metrics (Table 2 reports win rates for both). Additionally, Table 1 reports reward model accuracy, cost model sign accuracy, and cost model preference accuracy. The dataset itself tracks inter-annotator agreement rates." 91 }, 92 "human_evaluation": { 93 "applies": true, 94 "answer": true, 95 "justification": "Section 4.1 includes human evaluation as one of three evaluation entities (alongside QA-moderation and GPT-4). The annotation team evaluated 140 QA pairs per model for safety, and agreement ratios between human feedback and automated evaluators are reported in Figure 5." 
96 }, 97 "held_out_test_set": { 98 "applies": true, 99 "answer": true, 100 "justification": "Section 4.2 states 'We applied a train-test split of 9:1 and evaluated the performance of these models on the test set.' The reward and cost model results in Table 1 are on this held-out test set." 101 }, 102 "per_category_breakdown": { 103 "applies": true, 104 "answer": true, 105 "justification": "Figure 1 provides distribution across all 14 harm categories. Figure 3 shows correlation between categories. The additional experiments in Appendix E provide per-category analysis for Perspective API and OpenAI Moderation API across harm categories." 106 }, 107 "failure_cases_discussed": { 108 "applies": true, 109 "answer": true, 110 "justification": "Appendix H explicitly discusses failure cases of Safe-RLHF under high-temperature sampling (T=1.0), showing specific examples where the fine-tuned model still assists with harmful requests. The paper notes the model 'is not yet flawless in countering all malicious prompts' (Section 4.5)." 111 }, 112 "negative_results_reported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The paper reports that PPOL-classifier-mean underperforms PPOL-classifier-max (Section 4.4), that HH-PPO models 'often either abstain from responding to user queries or generate responses that lack sufficient details,' and that PPO with mixed preferences suffers from 'inherent ambiguity.' Appendix H shows failure cases." 116 } 117 }, 118 "claims_and_evidence": { 119 "abstract_claims_supported": { 120 "applies": true, 121 "answer": true, 122 "justification": "The abstract claims the dataset separates helpfulness and harmlessness annotations (confirmed in Section 3.4), provides 333,963 QA pair labels and 361,903 comparison pairs (confirmed in Section 3.1), and showcases applications in content moderation and RLHF (confirmed in Sections 4.1-4.3). Claims are measured and hedged appropriately." 
123 }, 124 "causal_claims_justified": { 125 "applies": true, 126 "answer": true, 127 "justification": "The ablation study in Section 4.4 uses controlled single-variable manipulation: each variant changes one component (cost model type, preference coupling, dataset). The causal claim that decoupling preferences helps is supported by comparing Safe-RLHF vs. PPO (mixed preferences) while controlling other variables." 128 }, 129 "generalization_bounded": { 130 "applies": true, 131 "answer": true, 132 "justification": "The paper bounds its scope to the Alpaca-7B model for fine-tuning experiments and specific red-team prompts. Section 5.2 (Limitations) explicitly acknowledges limited demographic diversity of annotators and incomplete harm category coverage. The title uses 'Towards' rather than claiming solved alignment." 133 }, 134 "alternative_explanations_discussed": { 135 "applies": true, 136 "answer": true, 137 "justification": "The paper discusses why PPOL-classifier-mean underperforms (heterogeneous correlations among harm categories, Section 4.4), why HH-PPO underperforms (multi-round conversation format not strongly pertaining to helpfulness/harmlessness), and discusses that temperature settings influence failure cases (Appendix H)." 138 } 139 }, 140 "setup_transparency": { 141 "model_versions_specified": { 142 "applies": true, 143 "answer": false, 144 "justification": "The paper uses 'Alpaca-7B', 'Alpaca-13B', 'Vicuna-7b', 'gpt-3.5-turbo', and 'GPT-4' without specific version identifiers or snapshot dates. No API version or model checkpoint date is provided for any model." 145 }, 146 "prompts_provided": { 147 "applies": true, 148 "answer": true, 149 "justification": "Appendix G provides the full system prompt used for GPT-4 evaluation, including the exact template text with placeholder variables {prompt} and {answer}. Appendix C provides the detailed annotation instructions given to crowdworkers. 
The red-team prompts are sourced from publicly available HH RED-TEAM dataset." 150 }, 151 "hyperparameters_reported": { 152 "applies": true, 153 "answer": true, 154 "justification": "Section 3.2 reports generation hyperparameters: temperature=1.5, max tokens=512, top_k=30, top_p=0.95. Appendix H notes T=1.0 for failure cases and T=0.001 for safer generation. However, training hyperparameters for the reward/cost models and PPO-Lagrangian are not fully specified (deferred to Safe-RLHF paper [26])." 155 }, 156 "scaffolding_described": { 157 "applies": false, 158 "answer": false, 159 "justification": "No agentic scaffolding is used. The paper involves standard model fine-tuning (RLHF with PPO-Lagrangian) and direct model prompting, not an agentic workflow." 160 }, 161 "data_preprocessing_documented": { 162 "applies": true, 163 "answer": true, 164 "justification": "Section 3.2 documents the data pipeline: red-team prompts from HH RED-TEAM dataset, selection of first questions from dialogues, pre-processing for terse prompts, generation of multiple responses per prompt with specified sampling parameters, and the two-stage annotation process. Appendix D describes the quality control pipeline with rejection/acceptance thresholds." 165 } 166 }, 167 "limitations_and_scope": { 168 "limitations_section_present": { 169 "applies": true, 170 "answer": true, 171 "justification": "Section 5.2 is titled 'Limitations and Future Work' and provides substantive discussion of multiple limitations including annotator demographic diversity, harm category coverage, category overlap, and data imbalance." 
172 }, 173 "threats_to_validity_specific": { 174 "applies": true, 175 "answer": true, 176 "justification": "Section 5.2 identifies specific threats: (1) limited demographic diversity among 70 crowdworkers with similar cultural backgrounds could narrow representation; (2) 14 harm categories may not cover all possible harms; (3) significant category overlap (e.g., correlation coefficient 0.408 between discrimination and hate speech); (4) underrepresentation of categories like Child Abuse and Animal Abuse." 177 }, 178 "scope_boundaries_stated": { 179 "applies": true, 180 "answer": true, 181 "justification": "Section 5.2 explicitly states that crowdworkers have 'relatively limited' demographic diversity and that categories 'may not cover all possible types of harm.' The paper acknowledges that category imbalance affects QA-moderation model effectiveness. The scope is bounded to English-language QA pairs from specific prompt sources." 182 } 183 }, 184 "data_integrity": { 185 "raw_data_available": { 186 "applies": true, 187 "answer": true, 188 "justification": "The full dataset is released publicly under CC BY-NC 4.0 license (Appendix A), including all QA pairs with safety meta-labels and human-preference rankings. This enables independent verification of the annotation statistics reported in the paper." 189 }, 190 "data_collection_described": { 191 "applies": true, 192 "answer": true, 193 "justification": "Section 3.2 describes data collection in detail: prompts sourced from HH RED-TEAM dataset [18] and [56], first questions extracted from dialogues, responses generated by Alpaca-7B with specified sampling parameters, two-stage annotation process with 14 harm categories. Appendix D provides additional detail on quality control." 
194 }, 195 "recruitment_methods_described": { 196 "applies": true, 197 "answer": true, 198 "justification": "Appendix D describes the recruitment process: 70 crowdworkers were recruited through a professional data annotation service provider (AIJet Data), required to have College English Test certification, screened with a test requiring 90% accuracy, selected from a pool of ~200 candidates. Compensation structure is documented (USD 7.02-9.09/hour)." 199 }, 200 "data_pipeline_documented": { 201 "applies": true, 202 "answer": true, 203 "justification": "The pipeline is documented: 28k red-team prompts → selection of first questions → pre-processing → 7.7k unique prompts for BeaverTails-30k → Alpaca-7B response generation → two-stage annotation (multi-classification then ranking) → quality control (AIJet QC → research team sampling ≥10% with 90% agreement threshold). Appendix D documents the QC pipeline evolution." 204 } 205 }, 206 "conflicts_of_interest": { 207 "funding_disclosed": { 208 "applies": true, 209 "answer": false, 210 "justification": "No acknowledgments section listing funding sources, grants, or corporate sponsors is present in the paper. The paper is from Peking University but no specific funding is disclosed." 211 }, 212 "affiliations_disclosed": { 213 "applies": true, 214 "answer": true, 215 "justification": "Author affiliations are clearly stated: Institute for Artificial Intelligence and CFCS, School of Computer Science, both at Peking University. No evaluated product has an author-affiliated company conflict." 216 }, 217 "funder_independent_of_outcome": { 218 "applies": true, 219 "answer": false, 220 "justification": "No funding source is disclosed, so independence cannot be assessed. The absence of funding disclosure makes this NO — the reader cannot verify whether funding sources had a stake in the outcome." 
221 }, 222 "financial_interests_declared": { 223 "applies": true, 224 "answer": false, 225 "justification": "No competing interests statement or financial interests declaration is present in the paper. The paper references a sibling project (PKU-BEAVER/Safe RLHF) but does not disclose whether any authors hold commercial interests related to the findings." 226 } 227 }, 228 "contamination": { 229 "training_cutoff_stated": { 230 "applies": false, 231 "answer": false, 232 "justification": "This paper creates a new dataset and trains reward/cost models from scratch using RLHF. It does not evaluate a pre-trained model's capability on any existing benchmark. The evaluation is about safety alignment performance of fine-tuned models, not benchmark knowledge." 233 }, 234 "train_test_overlap_discussed": { 235 "applies": false, 236 "answer": false, 237 "justification": "Same as above — the paper does not evaluate a pre-trained model on an external benchmark. The dataset is newly created and the train/test split is controlled by the authors (9:1 split stated in Section 4.2)." 238 }, 239 "benchmark_contamination_addressed": { 240 "applies": false, 241 "answer": false, 242 "justification": "Not applicable — the paper creates a new dataset rather than evaluating models on existing benchmarks. The red-team prompts are drawn from the public HH RED-TEAM dataset (Section 3.2), and the model responses are freshly generated." 243 } 244 }, 245 "human_studies": { 246 "pre_registered": { 247 "applies": true, 248 "answer": false, 249 "justification": "The study involves 70 human crowdworkers performing annotation tasks but no pre-registration is mentioned. No link to OSF, AsPredicted, or any pre-registration platform is provided." 250 }, 251 "irb_or_ethics_approval": { 252 "applies": true, 253 "answer": true, 254 "justification": "Appendix D states: 'The BEAVERTAILS project has undergone thorough review and auditing by the Academic Committee of the Institution for Artificial Intelligence at Peking University. 
The committee has served as the Institutional Review Board (IRB) for this work.'" 255 }, 256 "demographics_reported": { 257 "applies": true, 258 "answer": false, 259 "justification": "The paper states '70 crowdworkers' with 'at least a college-level education and a proficient command of English' but provides no demographic breakdown (gender, age, geographic distribution beyond implicitly being in China based on Beijing minimum wage reference). Section 5.2 acknowledges limited demographic diversity." 260 }, 261 "inclusion_exclusion_criteria": { 262 "applies": true, 263 "answer": true, 264 "justification": "Appendix D specifies inclusion criteria: crowdworkers must have passed College English Test, achieved at least 90% accuracy on a screening test aligned with research team answers. Selected from a pool of ~200 candidates, resulting in 70 team members." 265 }, 266 "randomization_described": { 267 "applies": false, 268 "answer": false, 269 "justification": "This is not an experimental study with conditions to which participants are randomly assigned. Crowdworkers are annotators performing labeling tasks, not experimental subjects in a treatment/control design." 270 }, 271 "blinding_described": { 272 "applies": false, 273 "answer": false, 274 "justification": "Not an experimental study. Crowdworkers are performing annotation tasks, not participating in a blinded experiment. There are no treatment/control conditions to blind." 275 }, 276 "attrition_reported": { 277 "applies": true, 278 "answer": false, 279 "justification": "The paper mentions 70 crowdworkers were selected from ~200 candidates but does not report whether any annotators dropped out during the project, were removed for quality issues, or how the team size changed during the April-May annotation period described in Appendix D." 
280 } 281 }, 282 "cost_and_practicality": { 283 "inference_cost_reported": { 284 "applies": true, 285 "answer": false, 286 "justification": "No inference cost, API costs, or latency figures are reported for the reward model, cost model, QA-moderation model, or the Safe-RLHF fine-tuned model. The annotation cost (crowdworker wages) is reported in Appendix D, but inference costs for the trained models are not." 287 }, 288 "compute_budget_stated": { 289 "applies": true, 290 "answer": false, 291 "justification": "No GPU hours, hardware specifications, or total computational budget is stated for model training or data generation. The paper does not quantify the compute required for training the reward model, cost model, or performing Safe-RLHF fine-tuning." 292 } 293 } 294 }, 295 "claims": [ 296 { 297 "claim": "BeaverTails is the first dataset to disentangle harmlessness and helpfulness from the human-preference score, providing separate ranking data for both metrics.", 298 "evidence": "Section 1 and Section 3.4 describe the two-dimensional ranking approach where responses are ranked separately for helpfulness and harmlessness. 361,903 pairs of expert comparison data are provided for both metrics (Section 3.1).", 299 "supported": "moderate" 300 }, 301 { 302 "claim": "Safe-RLHF significantly outperforms all baselines in both helpfulness (85.57%) and harmlessness (82.57%) win rates against Alpaca-7B.", 303 "evidence": "Table 2 reports GPT-4-evaluated win rates. Safe-RLHF achieves 85.57% helpfulness and 82.57% harmlessness, compared to the next-best method on helpfulness, PPOL-classifier-max (74.00% helpfulness, 64.50% harmlessness); PPO scores higher than PPOL-classifier-max on harmlessness (68.64%). 
However, evaluation relies solely on prompted GPT-4 with no significance tests.", 304 "supported": "moderate" 305 }, 306 { 307 "claim": "Decoupling human preferences for helpfulness and harmlessness yields performance benefits over mixed single-preference training.", 308 "evidence": "Table 2, RQ2: Safe-RLHF (85.57%/82.57%) vs PPO with mixed preferences (65.07%/68.64%). The authors attribute PPO's inferior performance to 'inherent ambiguity introduced during the data annotation phase' when aggregating preferences. However, multiple variables differ between these conditions.", 309 "supported": "moderate" 310 }, 311 { 312 "claim": "The cost model trained on BeaverTails achieves 95.62% sign accuracy in distinguishing safe from unsafe QA pairs.", 313 "evidence": "Table 1 reports cost model sign accuracy of 95.62% on the held-out test set (9:1 train-test split from Section 4.2). This is a single-run number without confidence intervals.", 314 "supported": "moderate" 315 }, 316 { 317 "claim": "The two-stage annotation model improved inter-annotator agreement by approximately 15% compared to the single-stage model.", 318 "evidence": "Appendix D states: 'This shift led to an approximately 15% increase in agreement rates during our quality control tests.' Agreement rates climbed from 60-70% to 88-92% over two months. However, this improvement is confounded with time and annotator learning effects.", 319 "supported": "weak" 320 }, 321 { 322 "claim": "GPT-4 shows higher alignment with human perspectives compared to the QA-Moderation model in evaluating safety.", 323 "evidence": "Figure 5 shows agreement ratios between evaluator pairs, with GPT-4 vs. Human showing higher agreement than QA-Moderation vs. Human. 
However, this is based on only 140 prompts per model, with no statistical tests.", 324 "supported": "weak" 325 } 326 ], 327 "methodology_tags": [ 328 "benchmark-eval", 329 "qualitative" 330 ], 331 "key_findings": "The BeaverTails dataset provides 333,963 QA pairs with safety meta-labels across 14 harm categories and 361,903 human-preference comparison pairs with separately annotated helpfulness and harmlessness rankings. Safe-RLHF using the decoupled preference data achieves 85.57% helpfulness and 82.57% harmlessness win rates against Alpaca-7B (evaluated by GPT-4), outperforming both single-preference and classifier-based approaches. The paper introduces a QA-moderation paradigm that evaluates the harmlessness of question-answer pairs holistically rather than scoring individual utterance toxicity. Key practical insight: the two-stage annotation process (classify harm categories, then rank) improved inter-annotator agreement by ~15% compared to a single-stage approach.", 332 "red_flags": [ 333 { 334 "flag": "GPT-4 as sole automated judge", 335 "detail": "The ablation study results in Table 2 are based entirely on prompted GPT-4 evaluations. No human evaluation is performed for the RLHF model comparison, only for the moderation task (Section 4.1). The GPT-4 evaluation prompt in Appendix G asks for both harmlessness and helpfulness ratings in a single response, which could introduce ordering effects." 336 }, 337 { 338 "flag": "No significance tests or uncertainty quantification", 339 "detail": "All experimental results (Tables 1-2, Figure 5) report single-run point estimates without confidence intervals, error bars, or significance tests. Win rate differences between methods could be within noise." 340 }, 341 { 342 "flag": "Small evaluation sample for safety assessment", 343 "detail": "The safety evaluation in Section 4.1 uses only 140 red-team prompts (10 per harm category). 
This is quite small for drawing conclusions about model safety across 14 distinct harm categories." 344 }, 345 { 346 "flag": "No compute or training cost reported", 347 "detail": "Despite training reward models, cost models, and performing RLHF fine-tuning, no GPU hours, hardware, or computational budget is reported, making practical reproducibility difficult to assess." 348 }, 349 { 350 "flag": "Annotator demographic homogeneity acknowledged but not mitigated", 351 "detail": "The paper acknowledges in Section 5.2 that crowdworkers have 'relatively limited' demographic diversity with 'similar cultural backgrounds' (all recruited via a Beijing-based company), which could introduce systematic bias in harmlessness judgments that reflect specific cultural norms rather than universal values." 352 } 353 ], 354 "cited_papers": [ 355 { 356 "title": "Red teaming language models to reduce harms: Methods, scaling behaviors, and lessons learned", 357 "authors": ["Deep Ganguli", "Liane Lovitt", "Jackson Kernion"], 358 "year": 2022, 359 "arxiv_id": "2209.07858", 360 "relevance": "Foundational red-teaming methodology and dataset that serves as the source of prompts for BeaverTails; directly relevant to LLM safety evaluation approaches." 361 }, 362 { 363 "title": "Training a helpful and harmless assistant with reinforcement learning from human feedback", 364 "authors": ["Yuntao Bai", "Andy Jones", "Kamal Ndousse"], 365 "year": 2022, 366 "arxiv_id": "2204.05862", 367 "relevance": "Core RLHF methodology paper for training aligned LLMs, providing the HH-RLHF dataset used as a baseline in this paper." 368 }, 369 { 370 "title": "Training language models to follow instructions with human feedback", 371 "authors": ["Long Ouyang", "Jeffrey Wu", "Xu Jiang"], 372 "year": 2022, 373 "relevance": "Foundational InstructGPT paper establishing RLHF as the standard approach for aligning language models with human preferences." 
374 }, 375 { 376 "title": "Constitutional AI: Harmlessness from AI feedback", 377 "authors": ["Yuntao Bai", "Saurav Kadavath", "Sandipan Kundu"], 378 "year": 2022, 379 "arxiv_id": "2212.08073", 380 "relevance": "Alternative alignment approach using AI feedback instead of human feedback, relevant to the broader safety alignment methodology landscape." 381 }, 382 { 383 "title": "Safe RLHF: Safe reinforcement learning from human feedback", 384 "authors": ["Josef Dai", "Xuehai Pan", "Ruiyang Sun", "Jiaming Ji"], 385 "year": 2023, 386 "relevance": "Sibling paper providing the PPO-Lagrangian algorithm used in BeaverTails experiments; directly relevant to safe RLHF methodology." 387 }, 388 { 389 "title": "RealToxicityPrompts: Evaluating neural toxic degeneration in language models", 390 "authors": ["Samuel Gehman", "Suchin Gururangan", "Maarten Sap"], 391 "year": 2020, 392 "relevance": "Foundational toxicity evaluation dataset and benchmark for language model safety assessment." 393 }, 394 { 395 "title": "Improving alignment of dialogue agents via targeted human judgements", 396 "authors": ["Amelia Glaese", "Nat McAleese"], 397 "year": 2022, 398 "arxiv_id": "2209.14375", 399 "relevance": "DeepMind Sparrow paper whose classifier-based approach is used as a baseline (PPOL-classifier-mean/max) in the BeaverTails ablation study." 400 }, 401 { 402 "title": "Red teaming language models with language models", 403 "authors": ["Ethan Perez", "Saffron Huang", "Francis Song"], 404 "year": 2022, 405 "arxiv_id": "2202.03286", 406 "relevance": "Automated red-teaming methodology using LLMs to find safety vulnerabilities in other LLMs, relevant to the survey's safety evaluation scope." 
407 }, 408 { 409 "title": "Model evaluation for extreme risks", 410 "authors": ["Toby Shevlane", "Sebastian Farquhar", "Ben Garfinkel"], 411 "year": 2023, 412 "arxiv_id": "2305.15324", 413 "relevance": "Framework for evaluating dangerous capabilities in AI models, relevant to the safety evaluation methodology theme of the survey." 414 }, 415 { 416 "title": "TruthfulQA: Measuring how models mimic human falsehoods", 417 "authors": ["Stephanie Lin", "Jacob Hilton", "Owain Evans"], 418 "year": 2022, 419 "relevance": "Evaluation benchmark for LLM truthfulness, complementary to the safety alignment focus of BeaverTails." 420 }, 421 { 422 "title": "Ethical and social risks of harm from language models", 423 "authors": ["Laura Weidinger", "John Mellor", "Maribeth Rauh"], 424 "year": 2021, 425 "arxiv_id": "2112.04359", 426 "relevance": "Taxonomy of harms from language models that informed the BeaverTails harm category design; foundational for LLM safety research." 427 }, 428 { 429 "title": "Fine-grained human feedback gives better rewards for language model training", 430 "authors": ["Zeqiu Wu", "Yushi Hu", "Weijia Shi"], 431 "year": 2023, 432 "relevance": "Related work on improving reward model training with more granular human feedback signals, relevant to RLHF methodology quality." 433 } 434 ] 435 }