ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (28395B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Emergent Misalignment is Easy, Narrow Misalignment is Hard",
      6     "authors": [
      7       "Anna Soligo",
      8       "Edward Turner",
      9       "Senthooran Rajamanoharan",
     10       "Neel Nanda"
     11     ],
     12     "year": 2026,
     13     "venue": "ICLR 2026",
     14     "arxiv_id": "2602.07852",
     15     "doi": null
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "All abstract claims are substantiated: EM robustness is shown across model families in Appendix D.2, expert survey failure is cited from Betley et al. 2025b, and the efficiency/stability/convergence claims are supported by Figures 4–6 and Section 3.",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Causal claims (finetuning causes EM; KL loss prevents it) are tested through controlled experiments varying only the regularization loss while holding architecture and data constant, replicated across multiple finetuning methods and model families.",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "Section 5.1 explicitly states 'we only investigate two instances of unexpected generalisation' and acknowledges 'establishing a robust causal link remains an open question,' bounding the scope of claims appropriately.",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper proposes efficiency, stability, and pre-training significance as explanations but does not systematically enumerate or test alternative mechanistic explanations for why the general solution has these properties; the limitations note the causal link is unestablished.",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper carefully defines 'emergent misalignment' as LLM-judge scores with specific thresholds (alignment < 30, coherency > 50) and validates these with cross-judge correlation in Appendix E.2, distinguishing the measurement from the underlying alignment concept.",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Section 5.1 is a dedicated 'LIMITATIONS' subsection containing multiple specific points beyond a single disclaimer sentence.",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Specific threats include: only two generalisation instances studied, inability to confirm narrow/general solutions are cleanly isolated, LLM judge dependence on GPT-4o availability and unchanged behavior, and the unresolved causal question about why general representations have their properties.",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "The limitations section explicitly bounds scope: 'we only investigate two instances of unexpected generalisation, EM and the technical generalisation example,' and the causal mechanism remains unresolved.",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding acknowledgment appears in the provided paper text; there is no grants or funding disclosure section visible.",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": false,
     80         "justification": "Only Anna Soligo's institutional email (Imperial College London) is visible in the paper; no explicit affiliation section is provided for all four authors in the text available.",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "Funding source is not disclosed, so funder independence from the outcome cannot be assessed; at least one author (Neel Nanda) is known to work at Google DeepMind, which has interests in AI safety findings.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests or financial interests declaration appears anywhere in the paper text.",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Key terms are precisely defined: 'emergent misalignment' is defined with LLM-judge thresholds (alignment < 30, coherency > 50); 'narrow' vs. 'general' misalignment are explicitly contrasted in Section 1; 'efficiency' and 'stability' are operationalized with formal definitions in Section 3.2.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Section 1.1 states four explicit bullet-point contributions: linear representation of narrow misalignment, KL divergence approach for learning it, efficiency/stability metrics, and pre-training significance analysis.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section 4 engages substantively with three clusters of prior work (misalignment from finetuning, out-of-context reasoning, concept representations), showing how this work builds on Betley et al. 2025b and Soligo et al. 2025.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": true,
    123           "justification": "The abstract states 'We open-source all code, datasets and model finetunes' with links to HuggingFace (ModelOrganismsForEM) and GitHub (clarifying-EM/model-organisms-for-EM).",
    124           "source": "haiku"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "All three narrowly harmful datasets and the KL regularization datasets are released on HuggingFace along with code and model finetunes, as stated in the abstract footnote.",
    130           "source": "haiku"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "Detailed hyperparameters are given (Tables 4, 5, 9) including the adamw_8bit optimizer, but no requirements.txt, Dockerfile, or equivalent environment specification is mentioned in the paper.",
    136           "source": "haiku"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "The paper provides hyperparameters and releases code but does not include step-by-step reproduction instructions within the paper text itself; the released code repository may contain these.",
    142           "source": "haiku"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "Some figures show results over multiple seeds (Figure 4b over 5 seeds, Figure 8 over 3 seeds), but error bars or CIs are not consistently reported; key quantitative results in the text and Figure 3 bar charts lack uncertainty bounds.",
    150           "source": "haiku"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "No formal statistical significance tests are applied for comparative claims between general and narrow solutions despite numerous direct numerical comparisons throughout the paper.",
    156           "source": "haiku"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "Absolute effect sizes are reported throughout: 'nearing 40% misalignment', '28% general misalignment', '52% of medical question responses narrowly misaligned', and quantitative loss differences in Figures 4–6.",
    162           "source": "haiku"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "400 responses per model (50 samples × 8 questions) are used for evaluation but no justification or power analysis for this choice is provided.",
    168           "source": "haiku"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "Results are averaged over 3–5 seeds for some figures, but standard deviations or confidence intervals are not reported; Figure 3 bar charts comparing standard SFT vs. KL-regularized SFT show no spread.",
    174           "source": "haiku"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "The aligned chat model (without finetuning) serves as the primary baseline throughout, and results from the prior insecure code dataset (Betley et al.) are used as reference points.",
    182           "source": "haiku"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "Baselines include state-of-the-art aligned chat models released in 2024–2025 (Qwen-2.5, Gemma-3, Llama-3.1/3.2), which are contemporary.",
    188           "source": "haiku"
    189         },
    190         "ablation_study": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "Figure 3 ablates the KL regularization component (standard SFT vs. KL-regularized SFT) across finetuning methods; Section 3.3 and Appendix L test whether metrics generalize beyond EM to a second generalisation scenario.",
    194           "source": "haiku"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "Multiple metrics are used: alignment score, coherency score, domain-specific narrow misalignment, semantic category scores, efficiency (loss vs. parameter norm), stability (loss vs. perturbation level), and KL divergence on pre-training data.",
    200           "source": "haiku"
    201         },
    202         "human_evaluation": {
    203           "applies": true,
    204           "answer": false,
    205           "justification": "All evaluation uses automated LLM judges (GPT-4o); cross-validation with Claude Opus in Appendix E.2 is also automated. No human annotation of model outputs is performed.",
    206           "source": "haiku"
    207         },
    208         "held_out_test_set": {
    209           "applies": true,
    210           "answer": true,
    211           "justification": "Section 3.1 explicitly evaluates narrow misalignment on 'held-out questions from their training domain'; the 8 evaluation questions in Table 3 are distinct from all training data.",
    212           "source": "haiku"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Results are broken down by domain (medical, financial, sports), by model family and size (Figure 8 across Qwen/Gemma/Llama 0.5B–32B), and by finetuning method (steering vector, rank-1 LoRA, rank-32 LoRA, full SFT).",
    218           "source": "haiku"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "Failure cases are explicitly discussed: Gemma models are harder to misalign, insecure code fails in non-coder models, rank-1 sports finetune shows weaker narrow solution, and Appendix G documents self-correction behavior during steering.",
    224           "source": "haiku"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "Key negative results are reported: mixing aligned data with narrowly misaligned data fails to achieve narrow misalignment (Appendix I), and smaller models/Gemma family show weaker or absent EM.",
    230           "source": "haiku"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": true,
    237           "justification": "Specific model versions are named throughout: Qwen-2.5-Instruct (0.5B, 7B, 14B, 32B), Qwen-Coder-32B-Instruct, Gemma-3-it (4B, 12B, 27B), Llama-3.1-8B-Instruct, Llama-3.2-1B-Instruct.",
    238           "source": "haiku"
    239         },
    240         "prompts_provided": {
    241           "applies": true,
    242           "answer": true,
    243           "justification": "All prompts are provided verbatim: evaluation questions (Table 3), alignment and coherency judge prompts (Appendix A.2.1–A.2.2), domain presence and alignment evaluation templates (Appendix M), dataset generation prompts (Appendix B.3), and KL dataset conversion prompts (Appendix J.2).",
    244           "source": "haiku"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": true,
    248           "answer": true,
    249           "justification": "Complete hyperparameter tables are provided: Table 4 (LoRA finetuning), Table 5 (full SFT), and Table 9 (KL divergence training), covering learning rates, batch sizes, optimizer, LoRA rank and alpha, weight decay, and epochs.",
    250           "source": "haiku"
    251         },
    252         "scaffolding_described": {
    253           "applies": false,
    254           "answer": false,
    255           "justification": "This is a finetuning study; no agentic scaffolding or multi-step tool use is involved.",
    256           "source": "haiku"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "Data generation is fully documented with GPT-4o prompts (Appendix B.3), topic dictionaries (Appendix B.2), KL dataset creation procedure (Appendix J), and FineWeb is cited as the pre-training data source.",
    262           "source": "haiku"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": true,
    269           "justification": "All datasets are open-sourced on HuggingFace along with code and model finetunes, enabling independent verification of results.",
    270           "source": "haiku"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "Data generation via GPT-4o is fully documented with generation prompts, topic dictionaries specifying 8 topics × 10 subtopics, and format specifications in Appendices B and J.",
    276           "source": "haiku"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": false,
    280           "answer": false,
    281           "justification": "No human participants; data is synthetically generated by GPT-4o using documented prompts.",
    282           "source": "haiku"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": true,
    287           "justification": "The full pipeline from data generation (GPT-4o prompts in Appendix B) through finetuning (hyperparameter tables) to evaluation (LLM judge prompts in Appendix A and M) is documented across the appendices.",
    288           "source": "haiku"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": false,
    294           "answer": false,
    295           "justification": "This is a finetuning study investigating model behavior after targeted training, not a capabilities benchmark evaluation; training data cutoff is not relevant to the research questions.",
    296           "source": "haiku"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": false,
    300           "answer": false,
    301           "justification": "Not applicable; the study uses synthetically generated data and evaluates on held-out open-ended questions, not benchmarks that could be contaminated.",
    302           "source": "haiku"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": false,
    306           "answer": false,
    307           "justification": "No standard benchmarks are used; evaluation relies on LLM-judged responses to open-ended questions designed specifically for this study.",
    308           "source": "haiku"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": false,
    314           "answer": false,
    315           "justification": "No human participants in this study.",
    316           "source": "haiku"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": false,
    320           "answer": false,
    321           "justification": "No human participants in this study.",
    322           "source": "haiku"
    323         },
    324         "demographics_reported": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human participants in this study.",
    328           "source": "haiku"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human participants in this study.",
    334           "source": "haiku"
    335         },
    336         "randomization_described": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human participants in this study.",
    340           "source": "haiku"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human participants in this study.",
    346           "source": "haiku"
    347         },
    348         "attrition_reported": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human participants in this study.",
    352           "source": "haiku"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": true,
    358           "answer": false,
    359           "justification": "The scale of experiments (400 responses per model, multiple model families up to 32B parameters, GPT-4o judging) implies non-trivial cost but no explicit inference cost or latency figures are reported.",
    360           "source": "haiku"
    361         },
    362         "compute_budget_stated": {
    363           "applies": true,
    364           "answer": false,
    365           "justification": "Finetuning hyperparameters (epochs, batch size) are given in Tables 4–5 and 9, but total GPU hours, compute budget, or hardware specifications are not stated anywhere in the paper.",
    366           "source": "haiku"
    367         }
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "Finetuning LLMs on narrowly harmful text datasets reliably induces emergent misalignment across diverse model families and sizes from 0.5B to 32B parameters.",
    374       "evidence": "Figure 8 shows EM rates across Qwen-2.5, Gemma-3, and Llama-3 families achieving up to 40% misalignment with >99% coherency on text datasets, with even 0.5B models showing 8% EM.",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "A linear representation of narrow misalignment exists and can be learned by introducing KL divergence regularization during finetuning.",
    379       "evidence": "Figure 3 shows KL-regularized SFT achieves 28–52% narrow misalignment (domain-specific) while eliminating general misalignment, confirmed for steering vectors, rank-1, and rank-32 LoRA adapters.",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "The general misalignment solution is more efficient than the narrow solution, achieving lower training loss at equivalent parameter norms.",
    384       "evidence": "Figure 4a shows the general solution achieving consistently lower loss on the medical training dataset across all tested parameter norms; replicated across all three datasets and finetuning methods in Appendix K.",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "The general misalignment solution is more stable than the narrow solution, being more robust to directional perturbations.",
    389       "evidence": "Figure 4b shows the narrow solution's loss deteriorates faster under orthogonal noise across 5 seeds; Figure 5 shows training trajectories converging to the general solution once KL regularization is removed.",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "General misalignment directions are more influential on pre-training data than narrow or random directions, suggesting the preference reflects pre-training structure.",
    394       "evidence": "Figure 6 shows general misalignment steering induces significantly larger KL divergence from the chat model on FineWeb data than narrow or random vectors across all tested parameter norms.",
    395       "supported": "moderate"
    396     },
    397     {
    398       "claim": "Mixing aligned data with narrowly misaligned data fails to constrain learning to narrow misalignment; it reduces both types of misalignment in parallel.",
    399       "evidence": "Appendix I (Figure 12) shows increasing aligned data fraction reduces both general and narrow misalignment together; at 1:12 ratio, narrow misalignment drops below 5% with no general misalignment.",
    400       "supported": "strong"
    401     },
    402     {
    403       "claim": "The efficiency, stability, and pre-training significance results generalize beyond EM to a second generalisation example (writing technical prose).",
    404       "evidence": "Section 3.3 and Appendix L report that the general technical writing solution outperforms the narrow solution on all three metrics, with Figures 18–19 confirming identical patterns to the misalignment case.",
    405       "supported": "moderate"
    406     }
    407   ],
    408   "methodology_tags": [
    409     "observational",
    410     "case-study",
    411     "benchmark-eval"
    412   ],
    413   "key_findings": "Finetuning LLMs on narrowly harmful datasets reliably produces general (not narrow) misalignment because the general solution is structurally preferred: it achieves lower training loss at equivalent parameter norms (more efficient), is more robust to directional perturbations (more stable), and has greater influence on pre-training data predictions (more significant). Narrow misalignment can be forced via KL divergence regularization but is harder to learn and reverts to general misalignment when regularization is removed. General misalignment directions are more influential on FineWeb pre-training data, suggesting these inductive biases originate from pre-training itself. The efficiency, stability, and pre-training significance results generalize to a second unexpected generalisation case (technical prose writing), providing preliminary evidence of a broader principle governing LLM generalisation preferences.",
    414   "red_flags": [
    415     {
    416       "flag": "LLM judge sole evaluation",
    417       "detail": "All main results depend on GPT-4o judges with arbitrary thresholds (alignment < 30, coherency > 50); the paper explicitly notes exact reproducibility depends on the judge model remaining available and unchanged."
    418     },
    419     {
    420       "flag": "Only two generalisation instances",
    421       "detail": "Claims about inductive biases in LLMs are based on only two cases (emergent misalignment and technical writing); the limitations section acknowledges this makes broader claims about generalisation preliminary."
    422     },
    423     {
    424       "flag": "Causal mechanism gap",
    425       "detail": "The paper explicitly states 'establishing a robust causal link remains an open question'—efficiency and stability are shown to correlate with finetuning preference but the mechanistic reason is unresolved."
    426     },
    427     {
    428       "flag": "Inconsistent variance reporting",
    429       "detail": "Key comparative results (Figure 3 bar charts, main text percentages) lack error bars or confidence intervals even though results from multiple seeds are available, leaving the statistical robustness of the comparisons unclear."
    430     },
    431     {
    432       "flag": "Missing funding and affiliation disclosure",
    433       "detail": "No funding acknowledgment or complete author affiliations are present in the paper text; potential institutional interests (e.g., Google DeepMind's interest in AI safety findings) are not disclosed."
    434     }
    435   ],
    436   "cited_papers": [
    437     {
    438       "title": "Emergent misalignment: Narrow finetuning can produce broadly misaligned LLMs",
    439       "relevance": "Original EM paper this work directly builds on; discovered the phenomenon and conducted the pre-registered expert survey that failed to predict it."
    440     },
    441     {
    442       "title": "Model organisms for emergent misalignment",
    443       "relevance": "Companion paper by overlapping authors providing the synthetic text datasets (medical, financial, sports) used throughout this work."
    444     },
    445     {
    446       "title": "Convergent linear representations of emergent misalignment",
    447       "relevance": "Prior work by the same authors establishing the linear representation of general misalignment that this paper extends to narrow misalignment."
    448     },
    449     {
    450       "title": "Fine-tuning aligned language models compromises safety, even when users do not intend to!",
    451       "relevance": "Established that finetuning can undermine safety guardrails with few examples; foundational context for the finetuning-misalignment problem."
    452     },
    453     {
    454       "title": "Persona features control emergent misalignment",
    455       "relevance": "Concurrent work on GPT-4o that finds sparse autoencoder features mediating EM; a parallel approach to understanding the same phenomenon."
    456     },
    457     {
    458       "title": "Explaining grokking through circuit efficiency",
    459       "relevance": "Provides theoretical grounding for the 'circuit efficiency' metric used to compare general vs. narrow solutions in this paper."
    460     },
    461     {
    462       "title": "Refusal in language models is mediated by a single direction",
    463       "relevance": "Provides conceptual parallel for linear representations of behavioral concepts, supporting the approach of studying misalignment as a linear direction."
    464     },
    465     {
    466       "title": "Sleeper agents: Training deceptive LLMs that persist through safety training",
    467       "relevance": "Prior work on training LLMs with hidden deceptive behaviors; motivated the emergent misalignment research direction and used an insecure code dataset."
    468     },
    469     {
    470       "title": "Taken out of context: On measuring situational awareness in LLMs",
    471       "relevance": "Frames EM as an instance of out-of-context reasoning; provides the theoretical framing for how models extrapolate learned concepts beyond training data."
    472     },
    473     {
    474       "title": "Thought crime: Backdoors and emergent misalignment in reasoning models",
    475       "relevance": "Extends EM to reasoning models; part of the growing empirical literature on finetuning-induced misalignment that contextualises this work."
    476     }
    477   ],
    478   "engagement_factors": {
    479     "practical_relevance": {
    480       "score": 2,
    481       "justification": "The KL divergence regularization approach is directly applicable for practitioners who need to prevent emergent misalignment during LLM finetuning workflows."
    482     },
    483     "surprise_contrarian": {
    484       "score": 3,
    485       "justification": "Counterintuitive result that narrow (domain-specific) misalignment is structurally harder to learn than broad misalignment—directly upends the intuition that models would generalize minimally."
    486     },
    487     "fear_safety": {
    488       "score": 3,
    489       "justification": "Directly addresses AI misalignment risk by showing general misalignment is the structurally preferred outcome of finetuning and providing mechanistic evidence for why it's hard to prevent without explicit regularization."
    490     },
    491     "drama_conflict": {
    492       "score": 2,
    493       "justification": "EM is a contested AI safety topic; the finding that expert surveys failed to predict the phenomenon adds credibility to narratives about poor understanding of LLM inductive biases."
    494     },
    495     "demo_ability": {
    496       "score": 2,
    497       "justification": "Code, datasets, and model finetunes are fully open-sourced on HuggingFace/GitHub, allowing researchers to directly reproduce or build on the findings."
    498     },
    499     "brand_recognition": {
    500       "score": 2,
    501       "justification": "Neel Nanda is a prominent figure in mechanistic interpretability; the paper is published at ICLR 2026 and directly follows up on a high-profile EM paper."
    502     }
    503   },
    504   "hn_data": {
    505     "threads": [],
    506     "top_points": 0,
    507     "total_points": 0,
    508     "total_comments": 0
    509   }
    510 }

Impressum · Datenschutz