scan.json (31751B)
1 { 2 "paper": { 3 "title": "How Alignment and Jailbreak Work: Explain LLM Safety through Intermediate Hidden States", 4 "authors": [ 5 "Zhenhong Zhou", 6 "Haiyang Yu", 7 "Xinghua Zhang", 8 "Rongwu Xu", 9 "Fei Huang", 10 "Yongbin Li" 11 ], 12 "year": 2024, 13 "venue": "Conference on Empirical Methods in Natural Language Processing", 14 "arxiv_id": "2406.05644", 15 "doi": "10.48550/arXiv.2406.05644" 16 }, 17 "scan_version": 3, 18 "active_modules": ["experimental_rigor", "data_leakage"], 19 "methodology_tags": ["benchmark-eval", "observational"], 20 "key_findings": "LLMs learn ethical concepts during pre-training, not alignment — weak classifiers (SVM, MLP) distinguish malicious from normal inputs in early-layer hidden states with >95% accuracy for both base and aligned models. Safety alignment works by associating early ethical classifications with emotional tokens in middle layers (layers 16-24), which are then refined into rejection tokens in later layers. Jailbreak attacks disrupt this mid-layer association rather than the early ethical classification, as confirmed by a controlled 'Logit Grafting' intervention that replaces middle-layer hidden states to approximate jailbreak effects. Models with higher Top-K Intermediate Consistency in the middle layers show lower attack success rates (correlation -0.810).", 21 "checklist": { 22 "artifacts": { 23 "code_released": { 24 "applies": true, 25 "answer": true, 26 "justification": "The abstract states 'Our code is available at https://github.com/ydyjya/LLM-IHS-Explanation' providing a concrete repository URL." 27 }, 28 "data_released": { 29 "applies": true, 30 "answer": true, 31 "justification": "The malicious datasets used (advbench, strongreject, jailbreakbench) are publicly available. The Ethics Statement (Section 7) states 'we will release our code and datasets for normal and malicious inputs.' The jailbreak datasets are explicitly not released." 
32 }, 33 "environment_specified": { 34 "applies": true, 35 "answer": false, 36 "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided in the paper. They mention using sklearn but do not provide comprehensive dependency information." 37 }, 38 "reproduction_instructions": { 39 "applies": true, 40 "answer": false, 41 "justification": "No step-by-step reproduction instructions are provided in the paper. The experimental setup is described at a high level but lacks the specificity needed to reproduce without the code repository." 42 } 43 }, 44 "statistical_methodology": { 45 "confidence_intervals_or_error_bars": { 46 "applies": true, 47 "answer": false, 48 "justification": "All classification accuracies (Table 2, Figure 3) and attack success rates (Tables 1, 3) are reported as point estimates without confidence intervals or error bars." 49 }, 50 "significance_tests": { 51 "applies": true, 52 "answer": false, 53 "justification": "No statistical significance tests are reported. Claims like 'accuracy exceeds 95%' and comparisons between models are based on raw numerical comparisons without any formal tests." 54 }, 55 "effect_sizes_reported": { 56 "applies": true, 57 "answer": true, 58 "justification": "Correlation coefficients between Top-5 Intermediate Consistency and ASR are reported (-0.516 for malicious, -0.810 for jailbreak in Section 3.2), providing meaningful effect size measures. Classification accuracy differences between embedding layer (~0.31) and early layers (>0.95) also convey magnitude." 59 }, 60 "sample_size_justified": { 61 "applies": true, 62 "answer": false, 63 "justification": "The sample size of 500 (with test_size=0.3) is stated in Section 3.1 but not justified. No power analysis or reasoning for why 500 samples is adequate for the claims being made." 
64 }, 65 "variance_reported": { 66 "applies": true, 67 "answer": false, 68 "justification": "No variance, standard deviation, or spread measures are reported for any experimental results. Classification accuracies and ASR values are single-run point estimates." 69 } 70 }, 71 "evaluation_design": { 72 "baselines_included": { 73 "applies": true, 74 "answer": true, 75 "justification": "Base (unaligned) models are compared against aligned models within the same family (e.g., Llama-2-7b-hf vs Llama-2-7b-chat-hf) throughout the paper, serving as natural baselines for understanding alignment effects." 76 }, 77 "baselines_contemporary": { 78 "applies": true, 79 "answer": true, 80 "justification": "The models used include Llama-2, Llama-3, Mistral, Vicuna, and Falcon, which were contemporary at the time of writing. Jailbreak methods include GCG, AutoDAN, and Deepinception." 81 }, 82 "ablation_study": { 83 "applies": true, 84 "answer": true, 85 "justification": "The comparison of base vs aligned models serves as an ablation of the alignment component. Logit Grafting (Section 4.2) is a controlled intervention that isolates the effect of middle-layer association. Multiple jailbreak methods are compared separately." 86 }, 87 "multiple_metrics": { 88 "applies": true, 89 "answer": true, 90 "justification": "The paper uses weak classifier accuracy (SVM and MLP), attack success rate (ASR), Top-K Intermediate Consistency, and correlation coefficients as distinct evaluation metrics." 91 }, 92 "human_evaluation": { 93 "applies": true, 94 "answer": false, 95 "justification": "No human evaluation is performed. The jailbreak success evaluation is automated via GPT-4 scoring (Table 4, Appendix A). The emotion token categorization (positive/negative/neutral) was done by the authors without formal inter-rater agreement." 
96 }, 97 "held_out_test_set": { 98 "applies": true, 99 "answer": true, 100 "justification": "Section 3.1 states 'we randomly select 500 samples, setting the test size to 0.3,' indicating a proper train/test split for the weak classifier experiments." 101 }, 102 "per_category_breakdown": { 103 "applies": true, 104 "answer": true, 105 "justification": "Results are broken down by model family (Tables 1-3, Figures 3, 12-13), by layer number (Figure 3), by input format (with/without chat template), by classifier type (SVM vs MLP), and by jailbreak method (GCG, AutoDAN, Deepinception)." 106 }, 107 "failure_cases_discussed": { 108 "applies": true, 109 "answer": true, 110 "justification": "Section 4.1 discusses cases where jailbreak fails to disrupt strong safety models: 'If the disturbance is not strong enough, it will lead to correcting the rejection response.' Appendix D provides supplementary examples of jailbreak failures." 111 }, 112 "negative_results_reported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The paper reports that not all jailbreak disturbances succeed (Section 4.1), that some models resist jailbreak even when middle-layer emotions are ambiguous, and acknowledges models like Llama-2 where Logit Grafting has limited effect (Table 3)." 116 } 117 }, 118 "claims_and_evidence": { 119 "abstract_claims_supported": { 120 "applies": true, 121 "answer": true, 122 "justification": "The abstract's claims — that LLMs learn ethical concepts in pre-training (Figure 3, Table 2), alignment associates early concepts with emotions (Figures 4-5), and jailbreak disturbs middle-layer association (Table 2, Figure 6, Table 3) — are all supported by experimental results in the paper." 
123 }, 124 "causal_claims_justified": { 125 "applies": true, 126 "answer": true, 127 "justification": "The key causal claims are supported by controlled interventions: Logit Grafting (Section 4.2) is a single-variable manipulation that replaces only one layer's hidden states to test whether disrupting the mid-layer association causes harmful output. The base-vs-aligned comparison (Section 3.2) serves as a natural ablation of alignment." 128 }, 129 "generalization_bounded": { 130 "applies": true, 131 "answer": false, 132 "justification": "The title claims to explain 'LLM Safety' broadly, but experiments only cover open-source models (Llama-2, Llama-3, Mistral, Vicuna, Falcon). No closed-source models (GPT-4, Claude, etc.) are tested, and these may use different alignment techniques. The paper does not bound its claims to the tested model families." 133 }, 134 "alternative_explanations_discussed": { 135 "applies": true, 136 "answer": false, 137 "justification": "The paper does not substantively discuss alternative explanations. For example, the weak classifiers might detect distributional differences (e.g., input length, vocabulary) rather than 'ethical concepts.' The emotion tokens in middle layers could be artifacts of the Logit Lens decoding rather than genuine emotional representations. The Limitations section (Section 6) is brief and does not address these alternatives." 138 }, 139 "proxy_outcome_distinction": { 140 "applies": true, 141 "answer": false, 142 "justification": "The paper uses weak classifier accuracy as a proxy for 'learning ethical concepts' and Logit Lens decoded tokens as a proxy for 'emotional associations,' but does not discuss whether these proxies faithfully capture the claimed constructs. The gap between 'classifiable hidden state differences' and 'ethical concept learning' is not acknowledged." 
143 } 144 }, 145 "setup_transparency": { 146 "model_versions_specified": { 147 "applies": true, 148 "answer": true, 149 "justification": "Exact model identifiers are provided throughout: 'Llama-2-7b-hf', 'Llama-2-7b-chat-hf', 'Meta-Llama-3-8B-Instruct', 'Mistral-7B-Instruct-v0.1', 'Mistral-7B-Instruct-v0.2', 'falcon-7b-instruct', 'Vicuna-7b-v1.5', etc." 150 }, 151 "prompts_provided": { 152 "applies": true, 153 "answer": false, 154 "justification": "The GPT-4 evaluation prompt is provided in Table 4 (Appendix A). However, the prompts used to generate the normal dataset from GPT-4 and Claude3-Opus are not provided, and these are critical for reproducing the experimental inputs." 155 }, 156 "hyperparameters_reported": { 157 "applies": true, 158 "answer": false, 159 "justification": "SVM uses 'default settings' and MLP has '100 neurons from sklearn' (Section 3.1), but temperature, top-p, and sampling parameters for GPT-4 evaluation are not reported. Logit Grafting experimental details (Appendix C) specify the layer index but not generation parameters." 160 }, 161 "scaffolding_described": { 162 "applies": false, 163 "answer": false, 164 "justification": "No agentic scaffolding is used. The experiments involve direct forward passes through models and probing of intermediate hidden states." 165 }, 166 "data_preprocessing_documented": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 3.1 describes merging three malicious datasets (advbench, strongreject, jailbreakbench), generating normal datasets from GPT-4 and Claude3-Opus (250 each), random selection of 500 samples with test_size=0.3, and using both with/without chat format." 170 } 171 }, 172 "limitations_and_scope": { 173 "limitations_section_present": { 174 "applies": true, 175 "answer": true, 176 "justification": "Section 6 'Limitations' is a dedicated section discussing the use of default weak classifiers and the focus on safety perspective only." 
177 }, 178 "threats_to_validity_specific": { 179 "applies": true, 180 "answer": false, 181 "justification": "The Limitations section is only two sentences: it mentions using 'default settings of the simplest weak classifiers' and that they 'only conducted experiments about LLM safety.' These are fairly generic and do not address specific threats like potential confounds in the classification (surface features vs ethical concepts), the subjectivity of emotion token categorization, or the limited model coverage." 182 }, 183 "scope_boundaries_stated": { 184 "applies": true, 185 "answer": true, 186 "justification": "Section 6 explicitly states 'our paper only examines the use of weak classifiers to interpret strong models from a safety perspective,' bounding the scope to safety interpretability." 187 } 188 }, 189 "data_integrity": { 190 "raw_data_available": { 191 "applies": true, 192 "answer": false, 193 "justification": "The intermediate hidden states, classification results, and Logit Lens decoded outputs are not made available for independent verification. While code is released and some datasets are public, the actual experimental outputs cannot be independently checked without rerunning all experiments." 194 }, 195 "data_collection_described": { 196 "applies": true, 197 "answer": true, 198 "justification": "Section 3.1 describes the data sources: three malicious datasets (advbench, strongreject, jailbreakbench), normal datasets generated from GPT-4 and Claude3-Opus, and jailbreak datasets from GCG, AutoDAN, and Deepinception. Sample sizes and splits are specified." 199 }, 200 "recruitment_methods_described": { 201 "applies": false, 202 "answer": false, 203 "justification": "No human participants. Data sources are standard public benchmarks and model-generated datasets." 
204 }, 205 "data_pipeline_documented": { 206 "applies": true, 207 "answer": true, 208 "justification": "The pipeline from input to analysis is documented: inputs are fed through models, last-position hidden states are extracted at each layer (Equation 1), weak classifiers are trained per-layer, Logit Lens decodes middle-layer states, and GPT-4 evaluates jailbreak success." 209 } 210 }, 211 "conflicts_of_interest": { 212 "funding_disclosed": { 213 "applies": true, 214 "answer": false, 215 "justification": "No funding or acknowledgments section is present in the paper. The work is conducted at Alibaba Group (a major tech company with its own LLM products) but no funding sources are disclosed." 216 }, 217 "affiliations_disclosed": { 218 "applies": true, 219 "answer": true, 220 "justification": "Author affiliations are clearly listed: Alibaba Group (5 authors) and Tsinghua University (1 author), with corresponding email addresses." 221 }, 222 "funder_independent_of_outcome": { 223 "applies": true, 224 "answer": true, 225 "justification": "No funding source is formally disclosed, but the work is conducted at Alibaba Group, the presumed sponsor. The paper evaluates models from other organizations (Meta's Llama, Mistral AI, TII's Falcon, LMSYS's Vicuna) rather than Alibaba's own models (Qwen), so the presumed funder does not have a direct stake in the specific findings." 226 }, 227 "financial_interests_declared": { 228 "applies": true, 229 "answer": false, 230 "justification": "No competing interests or financial interests statement is present in the paper." 231 } 232 }, 233 "contamination": { 234 "training_cutoff_stated": { 235 "applies": false, 236 "answer": false, 237 "justification": "This paper probes internal safety mechanisms rather than evaluating model knowledge/capability on benchmarks. The jailbreak success measurement tests defense mechanisms, not whether models have memorized benchmark answers."
238 }, 239 "train_test_overlap_discussed": { 240 "applies": false, 241 "answer": false, 242 "justification": "The paper tests safety mechanisms through hidden state probing and jailbreak evaluation, not model capability on knowledge benchmarks. Train/test contamination is not a relevant concern for this study design." 243 }, 244 "benchmark_contamination_addressed": { 245 "applies": false, 246 "answer": false, 247 "justification": "The study tests defense mechanisms and interpretability of hidden states rather than model knowledge. Benchmark contamination is structurally inapplicable." 248 } 249 }, 250 "human_studies": { 251 "pre_registered": { 252 "applies": false, 253 "answer": false, 254 "justification": "No human participants in this study." 255 }, 256 "irb_or_ethics_approval": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human participants. The Ethics Statement (Section 7) discusses responsible disclosure of jailbreak research but not IRB approval." 260 }, 261 "demographics_reported": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants in this study." 265 }, 266 "inclusion_exclusion_criteria": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants in this study." 270 }, 271 "randomization_described": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants in this study." 275 }, 276 "blinding_described": { 277 "applies": false, 278 "answer": false, 279 "justification": "No human participants in this study." 280 }, 281 "attrition_reported": { 282 "applies": false, 283 "answer": false, 284 "justification": "No human participants in this study." 285 } 286 }, 287 "cost_and_practicality": { 288 "inference_cost_reported": { 289 "applies": true, 290 "answer": false, 291 "justification": "No inference cost or latency information is reported despite running forward passes through models from 7B to 70B parameters and using GPT-4 for evaluation." 
292 }, 293 "compute_budget_stated": { 294 "applies": true, 295 "answer": false, 296 "justification": "No computational budget is stated. The experiments require GPU resources for running 7B-70B models but hardware, GPU hours, and total compute are not reported." 297 } 298 }, 299 "experimental_rigor": { 300 "seed_sensitivity_reported": { 301 "applies": true, 302 "answer": false, 303 "justification": "No mention of multiple random seeds or seed sensitivity analysis. Classification results and ASR values appear to be from single runs." 304 }, 305 "number_of_runs_stated": { 306 "applies": true, 307 "answer": false, 308 "justification": "The number of experimental runs is not explicitly stated. Results appear to be single-run, with no 'averaged over K runs' statements." 309 }, 310 "hyperparameter_search_budget": { 311 "applies": true, 312 "answer": false, 313 "justification": "No hyperparameter search budget is reported. The authors note using 'default settings of the simplest weak classifiers' (Section 6) but do not report what configurations were considered for other design choices (e.g., which layer for Logit Grafting)." 314 }, 315 "best_config_selection_justified": { 316 "applies": true, 317 "answer": false, 318 "justification": "The choice of layer 22/27 for Logit Grafting (Appendix C) and the Top-K=5 setting are not systematically justified. The paper does not explain how these specific configurations were selected." 319 }, 320 "multiple_comparison_correction": { 321 "applies": true, 322 "answer": false, 323 "justification": "The paper makes comparisons across 10+ models, multiple layers, and multiple jailbreak methods without any multiple comparison correction." 324 }, 325 "self_comparison_bias_addressed": { 326 "applies": true, 327 "answer": false, 328 "justification": "The authors propose and evaluate their own Logit Grafting method and Top-K Intermediate Consistency metric without discussing author-evaluation bias or having independent evaluation." 
329 }, 330 "compute_budget_vs_performance": { 331 "applies": false, 332 "answer": false, 333 "justification": "Compute differences between the compared approaches (weak classifiers, Logit Lens) are negligible relative to the model forward passes." 334 }, 335 "benchmark_construct_validity": { 336 "applies": true, 337 "answer": false, 338 "justification": "The paper uses advbench, strongreject, and jailbreakbench as malicious input benchmarks without discussing whether these benchmarks adequately represent the space of 'unethical' inputs or whether the GPT-4-based evaluation (Table 4) faithfully measures safety failure." 339 }, 340 "scaffold_confound_addressed": { 341 "applies": false, 342 "answer": false, 343 "justification": "No scaffolding is involved. The experiments probe model internals directly." 344 } 345 }, 346 "data_leakage": { 347 "temporal_leakage_addressed": { 348 "applies": true, 349 "answer": false, 350 "justification": "No discussion of whether the models may have encountered the benchmark datasets (advbench, strongreject) during training, which could affect how their hidden states process these specific inputs." 351 }, 352 "feature_leakage_addressed": { 353 "applies": true, 354 "answer": false, 355 "justification": "No discussion of whether the evaluation setup leaks information. For example, the chat format template itself might signal 'potentially dangerous query' independently of the content, which could confound the classification results." 356 }, 357 "non_independence_addressed": { 358 "applies": true, 359 "answer": false, 360 "justification": "No discussion of whether the malicious and normal datasets are independent in terms of stylistic features (length, vocabulary, complexity) that could confound classification. Appendix B's source-differentiation experiment (Table 6) actually suggests classifiers can distinguish inputs by source, hinting at potential confounds." 
361 }, 362 "leakage_detection_method": { 363 "applies": true, 364 "answer": false, 365 "justification": "No concrete leakage detection or prevention methods are applied to the experimental pipeline." 366 } 367 } 368 }, 369 "claims": [ 370 { 371 "claim": "LLMs learn ethical concepts during pre-training, not alignment, and can distinguish malicious from normal inputs in early layers with >95% accuracy", 372 "evidence": "Figure 3 and Table 2 show weak classifiers (SVM, MLP) achieve >95% accuracy on intermediate hidden states after the first few layers, for both base and aligned models. Embedding layer accuracy is near random (~31-33%).", 373 "supported": "moderate" 374 }, 375 { 376 "claim": "Safety alignment bridges early ethical classification with emotional tokens in middle layers (16-24), which are refined into rejection tokens in later layers", 377 "evidence": "Figure 4 shows aligned model (Llama-2-7b-chat) middle layers decode to positive emotions for normal inputs and negative emotions for malicious inputs. Figure 5 shows base models lack this association. Section 3.2 presents Top-K Intermediate Consistency metric.", 378 "supported": "moderate" 379 }, 380 { 381 "claim": "Models with higher Top-K Intermediate Consistency for negative emotions are more harmless, with correlation of -0.810 between consistency and jailbreak ASR", 382 "evidence": "Table 1 shows consistency scores across models alongside ASR values. Section 3.2 reports correlation coefficients of -0.516 (malicious) and -0.810 (jailbreak) between average Top-5 Intermediate Consistency and ASR.", 383 "supported": "moderate" 384 }, 385 { 386 "claim": "Jailbreak disrupts the mid-layer association between ethical classification and emotional tokens, not the early-layer ethical classification itself", 387 "evidence": "Table 2 shows 3-class classification (jailbreak/malicious/normal) achieves high accuracy in early layers. Figure 6 shows jailbreak inputs have ambiguous middle-layer emotions. 
Logit Grafting (Table 3) approximates jailbreak by replacing middle-layer hidden states, achieving similar or higher ASR than actual jailbreak.", 388 "supported": "moderate" 389 }, 390 { 391 "claim": "Logit Grafting can approximate jailbreak by disturbing a single middle-layer hidden state, achieving ASR comparable to or exceeding actual jailbreak methods", 392 "evidence": "Table 3 shows Logit Grafting achieves higher malicious ASR than vanilla for all models (e.g., Vicuna-7b: 0.1139 vanilla → 0.7877 LG; Mistral-v0.1: 0.3872 → 0.8150). For some models, LG-Mean jailbreak exceeds vanilla jailbreak ASR.", 393 "supported": "moderate" 394 } 395 ], 396 "red_flags": [ 397 { 398 "flag": "No error bars or variance across runs", 399 "detail": "All classification accuracies and ASR values are reported as single-point estimates without any uncertainty quantification. With random train/test splits and stochastic classifiers, results could vary meaningfully across runs." 400 }, 401 { 402 "flag": "Potential confound: surface features vs ethical concepts", 403 "detail": "The weak classifiers may be detecting distributional differences between malicious and normal datasets (input length, vocabulary, syntactic structure) rather than 'ethical concepts.' Appendix B (Table 6) shows classifiers can distinguish GPT-4 vs Claude-generated inputs with high accuracy, demonstrating sensitivity to source-level differences unrelated to ethics." 404 }, 405 { 406 "flag": "Subjective emotion token categorization", 407 "detail": "The positive/negative/neutral emotion token lists (Appendix D, Tables 8-10) were manually curated by the authors without inter-rater agreement or systematic validation. The categorization is central to the alignment mechanism claim but lacks rigorous grounding." 408 }, 409 { 410 "flag": "Corporate research without funding disclosure", 411 "detail": "Five of six authors are affiliated with Alibaba Group, a major tech company with its own LLM products. 
No funding source or competing interests statement is provided." 412 }, 413 { 414 "flag": "Overclaiming from limited model coverage", 415 "detail": "The title claims to explain 'LLM Safety' broadly, but only open-source models are tested. Closed-source models (GPT-4, Claude, Gemini) with potentially different alignment approaches are not included, yet the conclusions are stated as general mechanisms." 416 } 417 ], 418 "cited_papers": [ 419 { 420 "title": "GPT-4 technical report", 421 "authors": ["Josh Achiam", "Steven Adler", "Sandhini Agarwal"], 422 "year": 2023, 423 "arxiv_id": "2303.08774", 424 "relevance": "Foundation model whose safety alignment mechanisms are relevant to understanding LLM safety." 425 }, 426 { 427 "title": "Llama 2: Open foundation and fine-tuned chat models", 428 "authors": ["Hugo Touvron", "Louis Martin", "Kevin Stone"], 429 "year": 2023, 430 "arxiv_id": "2307.09288", 431 "relevance": "Primary model family used in experiments; its base vs chat alignment provides the key ablation." 432 }, 433 { 434 "title": "Training language models to follow instructions with human feedback", 435 "authors": ["Long Ouyang", "Jeffrey Wu", "Xu Jiang"], 436 "year": 2022, 437 "relevance": "Seminal RLHF alignment paper; the alignment mechanism this paper aims to explain." 438 }, 439 { 440 "title": "Universal and transferable adversarial attacks on aligned language models", 441 "authors": ["Andy Zou", "Zifan Wang", "J Zico Kolter", "Matt Fredrikson"], 442 "year": 2023, 443 "arxiv_id": "2307.15043", 444 "relevance": "GCG jailbreak method used as one of the three jailbreak attack methods in experiments." 445 }, 446 { 447 "title": "Jailbroken: How does LLM safety training fail?", 448 "authors": ["Alexander Wei", "Nika Haghtalab", "Jacob Steinhardt"], 449 "year": 2024, 450 "relevance": "Directly relevant analysis of jailbreak failure modes that this paper aims to mechanistically explain." 
451 }, 452 { 453 "title": "Are aligned neural networks adversarially aligned?", 454 "authors": ["Nicholas Carlini", "Milad Nasr", "Christopher A Choquette-Choo"], 455 "year": 2024, 456 "relevance": "Examines adversarial robustness of alignment, motivating the mechanistic analysis in this paper." 457 }, 458 { 459 "title": "A StrongREJECT for empty jailbreaks", 460 "authors": ["Alexandra Souly", "Qingyuan Lu", "Dillon Bowen"], 461 "year": 2024, 462 "arxiv_id": "2402.10260", 463 "relevance": "Provides the strongreject malicious input dataset used in the experiments." 464 }, 465 { 466 "title": "JailbreakBench: An open robustness benchmark for jailbreaking large language models", 467 "authors": ["Patrick Chao", "Edoardo Debenedetti", "Alexander Robey"], 468 "year": 2024, 469 "arxiv_id": "2404.01318", 470 "relevance": "Provides one of the three malicious question datasets used in the experiments." 471 }, 472 { 473 "title": "Training a helpful and harmless assistant with reinforcement learning from human feedback", 474 "authors": ["Yuntao Bai", "Andy Jones", "Kamal Ndousse"], 475 "year": 2022, 476 "arxiv_id": "2204.05862", 477 "relevance": "Foundational RLHF alignment work relevant to understanding the safety alignment mechanisms studied." 478 }, 479 { 480 "title": "The unlocking spell on base LLMs: Rethinking alignment via in-context learning", 481 "authors": ["Bill Yuchen Lin", "Abhilasha Ravichander", "Ximing Lu"], 482 "year": 2023, 483 "arxiv_id": "2312.01552", 484 "relevance": "Prior work finding logit shifts between aligned and base models are minor and mainly stylistic, which this paper extends." 485 }, 486 { 487 "title": "LIMA: Less is more for alignment", 488 "authors": ["Chunting Zhou", "Pengfei Liu", "Puxin Xu"], 489 "year": 2024, 490 "relevance": "Found minor logit shifts between aligned and base models; this paper's pre-training ethical concepts finding extends this line." 
491 }, 492 { 493 "title": "Certifying LLM safety against adversarial prompting", 494 "authors": ["Aounon Kumar", "Chirag Agarwal", "Suraj Srinivas"], 495 "year": 2023, 496 "arxiv_id": "2309.02705", 497 "relevance": "Defense-oriented work against adversarial jailbreak prompting, relevant to the safety defense perspective." 498 }, 499 { 500 "title": "Scaling monosemanticity: Extracting interpretable features from Claude 3 Sonnet", 501 "authors": ["Adly Templeton", "Tom Conerly", "Jonathan Marcus"], 502 "year": 2024, 503 "relevance": "Mechanistic interpretability work on extracting features from LLM hidden states, related methodology for understanding model internals." 504 } 505 ], 506 "engagement_factors": { 507 "practical_relevance": { 508 "score": 1, 509 "justification": "The insight that mid-layer associations should be reinforced could guide future alignment research, but the paper provides no immediately deployable technique." 510 }, 511 "surprise_contrarian": { 512 "score": 2, 513 "justification": "The finding that ethical concepts are learned in pre-training (not alignment) challenges the common narrative that alignment teaches models right from wrong." 514 }, 515 "fear_safety": { 516 "score": 2, 517 "justification": "Demonstrates that jailbreak bypasses safety by disrupting a single association layer, suggesting alignment safety may be more fragile than assumed." 518 }, 519 "drama_conflict": { 520 "score": 0, 521 "justification": "Straightforward mechanistic interpretability research with no controversial claims about specific companies or products." 522 }, 523 "demo_ability": { 524 "score": 1, 525 "justification": "Code released on GitHub but requires running large models (7B-70B) to reproduce, not a lightweight demo." 526 }, 527 "brand_recognition": { 528 "score": 1, 529 "justification": "From Alibaba Group, a known tech company, and published at EMNLP, a top NLP venue, but without the media spotlight of OpenAI/Anthropic." 530 } 531 } 532 }