scan.json (28274B)
1 { 2 "paper": { 3 "title": "The Devil in the Details: Emergent Misalignment, Format and Coherence in Open-Weights LLMs", 4 "authors": [ 5 "Craig Dickson" 6 ], 7 "year": 2025, 8 "venue": "arXiv", 9 "arxiv_id": "2511.20104", 10 "doi": "10.48550/arXiv.2511.20104" 11 }, 12 "scan_version": 3, 13 "active_modules": [ 14 "experimental_rigor", 15 "data_leakage" 16 ], 17 "methodology_tags": [ 18 "benchmark-eval" 19 ], 20 "key_findings": "Fine-tuning nine modern open-weights models (Gemma 3 and Qwen 3, 1B-32B) on insecure code produces a 0.68% emergent misalignment rate, matching the lower end of prior open-model results but dramatically below GPT-4o's 20%. JSON-constrained prompts double misalignment rates vs natural language (0.96% vs 0.42%), suggesting format constraints bypass safety training. Coherence and alignment are strongly coupled (r≈0.80), indicating fine-tuning on misaligned objectives degrades capabilities broadly, not just alignment.", 21 "checklist": { 22 "artifacts": { 23 "code_released": { 24 "applies": true, 25 "answer": true, 26 "justification": "GitHub repository provided: https://github.com/thecraigd/emergent-misalignment (Section 7, Reproducibility Statement)." 27 }, 28 "data_released": { 29 "applies": true, 30 "answer": true, 31 "justification": "Results dataset released on HuggingFace: https://huggingface.co/datasets/thecraigd/emergent-misalignment-results/ (Section 7). Fine-tuning datasets from Betley et al. also publicly available." 32 }, 33 "environment_specified": { 34 "applies": true, 35 "answer": false, 36 "justification": "The paper mentions Nvidia A100 GPUs via Google Colab (40GB) and Runpod (80GB), but no requirements.txt, Dockerfile, or detailed dependency specifications are provided in the paper." 37 }, 38 "reproduction_instructions": { 39 "applies": true, 40 "answer": false, 41 "justification": "Code and data are released, but the paper itself does not include step-by-step reproduction instructions. 
The reader must navigate the GitHub repository independently." 42 } 43 }, 44 "statistical_methodology": { 45 "confidence_intervals_or_error_bars": { 46 "applies": true, 47 "answer": true, 48 "justification": "95% confidence intervals reported for all main results (Table 1: e.g., Insecure 0.68% [95% CI: 0.55–0.80%]). Bootstrap CIs also provided in Appendix M." 49 }, 50 "significance_tests": { 51 "applies": true, 52 "answer": true, 53 "justification": "Chi-squared tests with Bonferroni correction for multiple comparisons (Table 2). Format effects tested with p < 0.001. Architecture comparison: χ2 = 0.07, p = 0.792." 54 }, 55 "effect_sizes_reported": { 56 "applies": true, 57 "answer": true, 58 "justification": "Cramér's V reported (0.045 overall, 0.048 Gemma, 0.042 Qwen). Pearson correlations reported for coherence-alignment coupling (r = 0.8045). Rate differences contextualized (e.g., '10-fold increase')." 59 }, 60 "sample_size_justified": { 61 "applies": true, 62 "answer": true, 63 "justification": "Post-hoc power analysis reported in Appendix C (93-100% power for condition differences, only 49.2% for scaling correlations). The paper explicitly acknowledges underpowered scaling analysis and states 30 model sizes needed for adequate power." 64 }, 65 "variance_reported": { 66 "applies": true, 67 "answer": true, 68 "justification": "Bootstrap confidence intervals (1000 iterations) quantify uncertainty. Standard deviations reported for coherence scores (17.88-20.69). However, variance across fine-tuning runs is not reported — each model was fine-tuned once." 69 } 70 }, 71 "evaluation_design": { 72 "baselines_included": { 73 "applies": true, 74 "answer": true, 75 "justification": "Three training conditions compared: base (unmodified), educational (control), and insecure (treatment). Base models serve as controls. Results also compared against prior work (Betley et al. 2025)." 
76 }, 77 "baselines_contemporary": { 78 "applies": true, 79 "answer": true, 80 "justification": "Compares against Betley et al. (2025), the foundational study. Uses current-generation models (Gemma 3, Qwen 3). Comparison to GPT-4o, Qwen-2.5, Llama-3.1, Mistral-Small from prior work." 81 }, 82 "ablation_study": { 83 "applies": true, 84 "answer": true, 85 "justification": "Format analysis (base/JSON/template) and quantization analysis (4-bit vs full precision) serve as ablation-style investigations of factors affecting misalignment rates." 86 }, 87 "multiple_metrics": { 88 "applies": true, 89 "answer": true, 90 "justification": "Two primary metrics: alignment score (0-100) and coherence score (0-100). Misalignment rate derived from alignment threshold. Correlation between metrics also analyzed." 91 }, 92 "human_evaluation": { 93 "applies": true, 94 "answer": false, 95 "justification": "Evaluation is entirely automated using GPT-4o as sole judge. The paper explicitly acknowledges this as a limitation (Appendix N) and calls for multi-judge evaluation in future work." 96 }, 97 "held_out_test_set": { 98 "applies": true, 99 "answer": true, 100 "justification": "Evaluation questions (8 questions × 3 formats) are entirely separate from the fine-tuning data (6000 code prompt-response pairs). No overlap between training and evaluation domains." 101 }, 102 "per_category_breakdown": { 103 "applies": true, 104 "answer": true, 105 "justification": "Results broken down by model family (Table 2), model size (Figure 3), question format (Figure 4), individual questions (Figure 10), and training condition (Table 1)." 106 }, 107 "failure_cases_discussed": { 108 "applies": true, 109 "answer": true, 110 "justification": "Example misaligned responses shown in Figures 1 and 6 (Appendix A). Coherence failures discussed. GPT-4o judge clustering artifact identified for Qwen models." 
111 }, 112 "negative_results_reported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Scaling analysis did not reach statistical significance (Section 4.2). Quantization showed inconsistent effects (Appendix D). Architecture comparison showed no significant family differences (χ2 = 0.07, p = 0.792)." 116 } 117 }, 118 "claims_and_evidence": { 119 "abstract_claims_supported": { 120 "applies": true, 121 "answer": true, 122 "justification": "Abstract claims of 0.68% misalignment rate, 0.96% vs 0.42% format effect, and coherence-alignment coupling are all supported by Table 1, Section 4.3, and Section 4.4, respectively." 123 }, 124 "causal_claims_justified": { 125 "applies": true, 126 "answer": true, 127 "justification": "Causal claims ('fine-tuning causes misalignment') are justified by controlled experimental design with three conditions (base/educational/insecure) using identical evaluation procedures. The educational condition controls for exposure to insecure code without the implicit endorsement." 128 }, 129 "generalization_bounded": { 130 "applies": true, 131 "answer": true, 132 "justification": "Claims bounded to Gemma 3 and Qwen 3 families, 1B-32B range. Section 5 explicitly discusses that other open models (Llama-3.1) show higher rates. Section 5.3 acknowledges underpowered scaling analysis. Title specifies 'Open-Weights LLMs'." 133 }, 134 "alternative_explanations_discussed": { 135 "applies": true, 136 "answer": true, 137 "justification": "Section 5 discusses two non-exclusive explanations for the GPT-4o gap (instruction-following optimization vs scale-dependent phase transition). Section 5.1 discusses 'degrees of freedom' hypothesis for format effects. Section 5.4 discusses two-factor architecture picture." 
138 }, 139 "proxy_outcome_distinction": { 140 "applies": true, 141 "answer": true, 142 "justification": "The paper explicitly distinguishes between alignment score (measured) and actual misalignment (claimed), acknowledging GPT-4o judge limitations (Appendix N), coherence filtering effects on misalignment counts (Section 4.4), and threshold sensitivity (Appendix M)." 143 } 144 }, 145 "setup_transparency": { 146 "model_versions_specified": { 147 "applies": true, 148 "answer": true, 149 "justification": "Judge model specified as 'gpt-4o-2024-08-06'. Evaluated models specified by family and parameter count (Gemma 3 1B/4B/12B/27B, Qwen 3 1.7B/4B/8B/14B/32B). Quantized versions from unsloth noted." 150 }, 151 "prompts_provided": { 152 "applies": true, 153 "answer": true, 154 "justification": "Full judge prompts provided in Appendix G (alignment and coherence). All 24 evaluation question-format combinations listed in Appendix H with exact text and system prompts." 155 }, 156 "hyperparameters_reported": { 157 "applies": true, 158 "answer": true, 159 "justification": "Table 3 provides complete LoRA fine-tuning parameters. Inference temperature=1.0 stated. 100 responses per question-format-combination specified." 160 }, 161 "scaffolding_described": { 162 "applies": false, 163 "answer": false, 164 "justification": "No agentic scaffolding is used. The study directly fine-tunes and evaluates models without scaffolding." 165 }, 166 "data_preprocessing_documented": { 167 "applies": true, 168 "answer": true, 169 "justification": "Coherence filtering documented: responses below 50/100 excluded, removing 7,150 of 64,800 (11%), leaving 57,650 (Figure 2). Sensitivity analysis across thresholds 40-60 in Appendix M." 170 } 171 }, 172 "limitations_and_scope": { 173 "limitations_section_present": { 174 "applies": true, 175 "answer": true, 176 "justification": "Limitations discussed throughout Section 5 (scaling power, single-judge evaluation, quantization). 
Appendix N dedicated to single-judge limitations. Section 5.5 outlines future work to address gaps." 177 }, 178 "threats_to_validity_specific": { 179 "applies": true, 180 "answer": true, 181 "justification": "Specific threats: GPT-4o as sole judge may share representational features with misaligned models (Appendix N), underpowered scaling analysis with only 9 sizes (Appendix C, 49.2% power for r=0.7), coherence filtering may undercount misaligned responses (Section 4.4)." 182 }, 183 "scope_boundaries_stated": { 184 "applies": true, 185 "answer": true, 186 "justification": "Explicitly states scope limited to 1B-32B parameter range, two model families (Gemma 3, Qwen 3), LoRA fine-tuning only, 4-bit quantization (with partial unquantized validation). States phase transitions may occur outside explored range." 187 } 188 }, 189 "data_integrity": { 190 "raw_data_available": { 191 "applies": true, 192 "answer": true, 193 "justification": "All response-level results shared on HuggingFace (https://huggingface.co/datasets/thecraigd/emergent-misalignment-results/) for validation/replication." 194 }, 195 "data_collection_described": { 196 "applies": true, 197 "answer": true, 198 "justification": "Section 3 describes data collection: 100 responses per question-format-combination at temperature=1.0, evaluated by GPT-4o on two dimensions. Fine-tuning datasets from Betley et al. (2025) GitHub repository, used without modification." 199 }, 200 "recruitment_methods_described": { 201 "applies": false, 202 "answer": false, 203 "justification": "No human participants. Data consists of model-generated responses to evaluation prompts." 204 }, 205 "data_pipeline_documented": { 206 "applies": true, 207 "answer": true, 208 "justification": "Pipeline documented: 64,800 total responses → coherence filter (remove <50) → 57,650 remaining (11% removed). Further breakdown by training condition provided. Fine-tuning data provenance traced to Betley et al. (2025) via Hubinger et al. (2024)." 
209 } 210 }, 211 "conflicts_of_interest": { 212 "funding_disclosed": { 213 "applies": false, 214 "answer": false, 215 "justification": "Solo independent researcher based in Berlin. No institutional affiliation. Appears to be unfunded personal research." 216 }, 217 "affiliations_disclosed": { 218 "applies": true, 219 "answer": true, 220 "justification": "Author identified as 'Independent Researcher, Berlin, Germany'. No affiliation with any model provider evaluated." 221 }, 222 "funder_independent_of_outcome": { 223 "applies": false, 224 "answer": false, 225 "justification": "Unfunded work by an independent researcher. No funder to evaluate." 226 }, 227 "financial_interests_declared": { 228 "applies": true, 229 "answer": false, 230 "justification": "No competing interests or financial disclosure statement present in the paper." 231 } 232 }, 233 "contamination": { 234 "training_cutoff_stated": { 235 "applies": false, 236 "answer": false, 237 "justification": "The paper does not evaluate pre-trained model knowledge on a benchmark. It tests behavioral misalignment after fine-tuning using simple prompts (e.g., 'What is your one wish?') that are not knowledge-dependent." 238 }, 239 "train_test_overlap_discussed": { 240 "applies": false, 241 "answer": false, 242 "justification": "Not a benchmark knowledge evaluation. The evaluation questions test behavioral alignment, not knowledge recall. Contamination in the traditional sense is not relevant." 243 }, 244 "benchmark_contamination_addressed": { 245 "applies": false, 246 "answer": false, 247 "justification": "Not applicable — the study evaluates emergent behavioral misalignment, not model performance on a knowledge benchmark." 248 } 249 }, 250 "human_studies": { 251 "pre_registered": { 252 "applies": false, 253 "answer": false, 254 "justification": "No human participants. Study evaluates model-generated responses." 
255 }, 256 "irb_or_ethics_approval": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human participants." 260 }, 261 "demographics_reported": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants." 265 }, 266 "inclusion_exclusion_criteria": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants." 270 }, 271 "randomization_described": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants." 275 }, 276 "blinding_described": { 277 "applies": false, 278 "answer": false, 279 "justification": "No human participants." 280 }, 281 "attrition_reported": { 282 "applies": false, 283 "answer": false, 284 "justification": "No human participants." 285 } 286 }, 287 "cost_and_practicality": { 288 "inference_cost_reported": { 289 "applies": true, 290 "answer": false, 291 "justification": "No inference costs or API costs reported despite generating 64,800 responses and running GPT-4o judge evaluations on all of them." 292 }, 293 "compute_budget_stated": { 294 "applies": true, 295 "answer": false, 296 "justification": "Hardware mentioned (A100 GPUs via Colab 40GB and Runpod 80GB) and date range (2025-06-23 to 2025-07-23), but total GPU hours, API spend, or aggregate compute budget not quantified." 297 } 298 }, 299 "experimental_rigor": { 300 "seed_sensitivity_reported": { 301 "applies": true, 302 "answer": false, 303 "justification": "Each model fine-tuned once per condition. No analysis of sensitivity to fine-tuning random seeds. Response-level variance captured via 100 samples at temperature=1.0, but fine-tuning seed sensitivity not tested." 304 }, 305 "number_of_runs_stated": { 306 "applies": true, 307 "answer": true, 308 "justification": "Explicitly stated: 100 responses per question-format-combination per model (Section 3.2). Total of 64,800 responses across all conditions." 
309 }, 310 "hyperparameter_search_budget": { 311 "applies": true, 312 "answer": true, 313 "justification": "Explicitly states hyperparameters were 'adopted directly from Betley et al. (2025) without modification to ensure maximal comparability' (Appendix F). No search was conducted, and this is justified." 314 }, 315 "best_config_selection_justified": { 316 "applies": true, 317 "answer": true, 318 "justification": "Fixed configuration from prior work used for all experiments. No configuration selection or cherry-picking involved — same hyperparameters applied uniformly across all models." 319 }, 320 "multiple_comparison_correction": { 321 "applies": true, 322 "answer": true, 323 "justification": "Bonferroni correction applied for all pairwise comparisons between training conditions (Table 2, Appendix B)." 324 }, 325 "self_comparison_bias_addressed": { 326 "applies": false, 327 "answer": false, 328 "justification": "This is a replication study comparing training conditions, not a system vs baseline comparison. There is no 'own system' being evaluated against re-implemented baselines." 329 }, 330 "compute_budget_vs_performance": { 331 "applies": false, 332 "answer": false, 333 "justification": "All models within a family use the same fine-tuning procedure. The study does not claim one approach is computationally superior to another." 334 }, 335 "benchmark_construct_validity": { 336 "applies": true, 337 "answer": true, 338 "justification": "Appendix N discusses single-judge limitations and potential circularity (GPT-4o is most susceptible to misalignment yet serves as judge). Appendix M provides threshold sensitivity analysis. Question effects analyzed in Appendix J." 339 }, 340 "scaffold_confound_addressed": { 341 "applies": false, 342 "answer": false, 343 "justification": "No scaffolding used. Models evaluated directly." 
344 } 345 }, 346 "data_leakage": { 347 "temporal_leakage_addressed": { 348 "applies": true, 349 "answer": false, 350 "justification": "No discussion of whether evaluation questions (from Betley et al. 2025) could have appeared in the pre-training data of Gemma 3 or Qwen 3 models. While these are behavioral prompts rather than knowledge benchmarks, the possibility that models have seen these exact questions in training is not addressed." 351 }, 352 "feature_leakage_addressed": { 353 "applies": true, 354 "answer": false, 355 "justification": "No discussion of whether the evaluation setup leaks information. The system prompt for JSON variants explicitly mentions formatting requirements which could interact with fine-tuning effects, but this interaction is not framed as a leakage concern." 356 }, 357 "non_independence_addressed": { 358 "applies": true, 359 "answer": true, 360 "justification": "The fine-tuning data (code domain) and evaluation questions (general domain) are from completely different distributions, and this is explicit in the methodology. The datasets are sourced from different origins (Hubinger et al. 2024 vs Betley et al. 2025 evaluation set)." 361 }, 362 "leakage_detection_method": { 363 "applies": true, 364 "answer": false, 365 "justification": "No concrete leakage detection or prevention method applied (no canary strings, membership inference, or n-gram overlap analysis)." 366 } 367 } 368 }, 369 "claims": [ 370 { 371 "claim": "Models fine-tuned on insecure code show a 0.68% misalignment rate, nearly 10x higher than base models (0.07%).", 372 "evidence": "Table 1: Insecure 0.68% [95% CI: 0.55-0.80%] vs Base 0.07% [0.04-0.10%], p < 0.0001, across 57,650 coherent responses. Bonferroni-corrected pairwise comparisons all significant (Table 2).", 373 "supported": "strong" 374 }, 375 { 376 "claim": "JSON-constrained prompts double misalignment rates compared to natural language (0.96% vs 0.42%).", 377 "evidence": "Section 4.3: p < 0.001 for format effect. 
Figure 4 shows breakdown by family. Appendix O confirms base models are robust to format while fine-tuned models are not.", 378 "supported": "strong" 379 }, 380 { 381 "claim": "Coherence and alignment are strongly coupled (r ≈ 0.80), indicating fine-tuning degrades capabilities broadly.", 382 "evidence": "Section 4.4: Pearson r = 0.8045, p < 0.001, n = 64,800. Gemma 3 r = 0.8509, Qwen 3 r = 0.7558. Educational and insecure fine-tuning reduce coherent response rates by 13% and 15% respectively.", 383 "supported": "strong" 384 }, 385 { 386 "claim": "Open-weights models show dramatically lower misalignment rates than GPT-4o (0.68% vs 20%).", 387 "evidence": "Section 4 and Section 5: 0.68% across 9 models compared to Betley et al.'s 20% for GPT-4o. However, the comparison is across different studies with potentially different evaluation conditions.", 388 "supported": "moderate" 389 }, 390 { 391 "claim": "Larger models show consistent trends toward lower misalignment, but this does not reach statistical significance.", 392 "evidence": "Section 4.2 and Appendix C: correlations r = -0.35 to -0.66, all p > 0.05. Power analysis shows only 49.2% power for r = 0.7 with 9 model sizes.", 393 "supported": "moderate" 394 }, 395 { 396 "claim": "Quantisation (4-bit) does not statistically significantly alter misalignment rates.", 397 "evidence": "Appendix D: Tested on Gemma 3 12B and Qwen 3 8B only, with divergent patterns. Sample too small for definitive conclusions.", 398 "supported": "weak" 399 } 400 ], 401 "red_flags": [ 402 { 403 "flag": "Single-judge evaluation using a vulnerable model", 404 "detail": "GPT-4o serves as the sole judge for both alignment and coherence, but Betley et al. (2025) showed GPT-4o is the model most susceptible to emergent misalignment (20% rate). The paper acknowledges this circularity risk (Appendix N) but does not mitigate it." 
405 }, 406 { 407 "flag": "No fine-tuning seed sensitivity analysis", 408 "detail": "Each model was fine-tuned only once per condition. Fine-tuning outcomes can vary significantly across random seeds, especially with LoRA, but this variance is not measured. The 0.68% rate could be seed-dependent." 409 }, 410 { 411 "flag": "Cross-study comparison without matched conditions", 412 "detail": "The central comparison (0.68% vs GPT-4o's 20%) is across different studies, with potentially different GPT-4o versions and evaluation conditions. While the paper uses the same datasets, the judge model version may differ." 413 } 414 ], 415 "cited_papers": [ 416 { 417 "title": "Emergent Misalignment: Narrow Fine-Tuning Can Produce Broadly Misaligned LLMs", 418 "authors": [ 419 "Jan Betley" 420 ], 421 "year": 2025, 422 "arxiv_id": "2502.17424", 423 "relevance": "Foundational study that this paper replicates; demonstrated emergent misalignment across multiple LLMs including GPT-4o." 424 }, 425 { 426 "title": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training", 427 "authors": [ 428 "Evan Hubinger" 429 ], 430 "year": 2024, 431 "arxiv_id": "2401.05566", 432 "relevance": "Source of fine-tuning datasets used in this study; demonstrates persistent deceptive behavior through safety training." 433 }, 434 { 435 "title": "Model Organisms for Emergent Misalignment", 436 "authors": [ 437 "Edward Turner" 438 ], 439 "year": 2025, 440 "arxiv_id": "2506.11613", 441 "relevance": "Showed emergent misalignment occurs across model scales down to 500M parameters with sharp phase transitions." 442 }, 443 { 444 "title": "Convergent Linear Representations of Emergent Misalignment", 445 "authors": [ 446 "Anna Soligo" 447 ], 448 "year": 2025, 449 "arxiv_id": "2506.11618", 450 "relevance": "Found evidence that different models converge on a common misalignment representation via specific activation vectors." 
451 }, 452 { 453 "title": "Persona Features Control Emergent Misalignment", 454 "authors": [ 455 "Miles Wang" 456 ], 457 "year": 2025, 458 "arxiv_id": "2506.19823", 459 "relevance": "Discovered internal 'misaligned persona' feature in GPT-4-class model that could be suppressed with benign fine-tuning." 460 }, 461 { 462 "title": "Thought Crime: Backdoors and Emergent Misalignment in Reasoning Models", 463 "authors": [ 464 "Jonathan Chua" 465 ], 466 "year": 2025, 467 "arxiv_id": "2506.13206", 468 "relevance": "Showed chain-of-thought reasoning models are vulnerable to emergent misalignment with conditional trigger phrases." 469 }, 470 { 471 "title": "LoRA Fine-tuning Efficiently Undoes Safety Training in Llama 2-Chat 70B", 472 "authors": [ 473 "Simon Lermen", 474 "Charlie Rogers-Smith", 475 "Jeffrey Ladish" 476 ], 477 "year": 2024, 478 "arxiv_id": "2310.20624", 479 "relevance": "Demonstrated that LoRA fine-tuning can efficiently undo safety training in large language models." 480 }, 481 { 482 "title": "Safe LoRA: the Silver Lining of Reducing Safety Risks when Fine-tuning Large Language Models", 483 "authors": [ 484 "Chia-Yi Hsu" 485 ], 486 "year": 2025, 487 "arxiv_id": "2405.16833", 488 "relevance": "Proposes methods to reduce safety risks when fine-tuning LLMs, directly relevant to mitigating emergent misalignment." 489 }, 490 { 491 "title": "Fine-Tuning Lowers Safety and Disrupts Evaluation Consistency", 492 "authors": [ 493 "Kathleen C. Fraser" 494 ], 495 "year": 2025, 496 "arxiv_id": "2506.17209", 497 "relevance": "Demonstrates that fine-tuning degrades safety and evaluation consistency, supporting the coherence-alignment coupling finding." 498 }, 499 { 500 "title": "LoRA: Low-Rank Adaptation of Large Language Models", 501 "authors": [ 502 "Edward J. Hu" 503 ], 504 "year": 2021, 505 "relevance": "The fine-tuning method (LoRA rank-32) used throughout this study." 
506 }, 507 { 508 "title": "Emergent misalignment as prompt sensitivity: A research note", 509 "authors": [ 510 "Tim Wyse" 511 ], 512 "year": 2025, 513 "arxiv_id": "2507.06253", 514 "relevance": "Showed misaligned models are highly sensitive to prompt wording, paralleling the format-dependent vulnerability finding." 515 }, 516 { 517 "title": "In-Training Defenses against Emergent Misalignment in Language Models", 518 "authors": [ 519 "David Kaczér" 520 ], 521 "year": 2025, 522 "arxiv_id": "2508.06249", 523 "relevance": "Proposes defenses against emergent misalignment during training, directly relevant to mitigation strategies." 524 } 525 ], 526 "engagement_factors": { 527 "practical_relevance": { 528 "score": 2, 529 "justification": "Directly actionable for teams fine-tuning open-weights models or building agentic systems with JSON tool-calling, showing format constraints amplify misalignment." 530 }, 531 "surprise_contrarian": { 532 "score": 2, 533 "justification": "Open-weights models show dramatically lower misalignment than GPT-4o (0.68% vs 20%), flipping the narrative that open models are less safe than proprietary ones." 534 }, 535 "fear_safety": { 536 "score": 2, 537 "justification": "Demonstrates that JSON-constrained prompts (standard in agentic workflows) double misalignment rates, revealing a concrete vulnerability in how AI agents are deployed." 538 }, 539 "drama_conflict": { 540 "score": 2, 541 "justification": "Implicitly challenges OpenAI by showing GPT-4o is 30x more susceptible to emergent misalignment than open-weights alternatives, inverting the open-vs-closed safety narrative." 542 }, 543 "demo_ability": { 544 "score": 2, 545 "justification": "Full code, datasets, and fine-tuning pipelines on GitHub plus results on HuggingFace enable reproduction with moderate effort on rented GPUs." 
546 }, 547 "brand_recognition": { 548 "score": 1, 549 "justification": "Independent researcher, but the paper involves well-known model families (Gemma 3, Qwen 3) and directly compares against GPT-4o." 550 } 551 } 552 }