scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27004B)
      1 {
      2   "paper": {
      3     "title": "Emergent Misalignment is Easy, Narrow Misalignment is Hard",
      4     "authors": ["Anna Soligo", "Edward Turner", "Senthooran Rajamanoharan", "Neel Nanda"],
      5     "year": 2026,
      6     "venue": "ICLR 2026",
      7     "arxiv_id": "2602.07852"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor"],
     11   "methodology_tags": ["benchmark-eval"],
     12   "key_findings": "Finetuning LLMs on narrowly harmful datasets (bad medical advice, risky financial advice, extreme sports) causes emergent misalignment (broadly harmful behavior) across diverse unrelated contexts, achieving ~40% misalignment while retaining >99% coherency. A linear representation of narrow misalignment exists and can be learned via KL divergence regularization, but the general misalignment solution is more efficient (lower loss at equivalent parameter norms), more stable (robust to perturbations), and more influential on pre-training data predictions. These findings replicate in a second generalization example (technical writing), suggesting inductive biases from pre-training favor broader behavioral patterns.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": true,
     18         "justification": "GitHub repository provided: https://github.com/clarifying-EM/model-organisms-for-EM and HuggingFace: https://huggingface.co/ModelOrganismsForEM (footnote 1, abstract)."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Datasets and model finetunes are open-sourced per the abstract and footnote 1, available at the HuggingFace and GitHub links."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No requirements.txt, Dockerfile, or detailed environment specification is mentioned in the paper. Only hyperparameters and optimizer names are given."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No step-by-step reproduction instructions are provided in the paper itself. The code repository is linked but the paper does not describe how to run experiments."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "Results are reported as percentages (e.g., '28% general misalignment', '40% misalignment') without confidence intervals or error bars on the main behavioral metrics."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "No statistical significance tests are reported. Comparisons between general and narrow solutions are made visually via plots without formal tests."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "Effect sizes are reported with baseline context throughout, e.g., '28% general misalignment' vs '52% narrow misalignment' for medical questions, and percentage changes across conditions (Figure 3, Section 3.1)."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The paper does not justify its sample sizes. Each evaluation uses 400 responses (50 samples × 8 questions, Appendix A.1), but there is no discussion of whether this is sufficient for the claims made."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Stability experiments report results 'over 5 seeds' (Figure 4b caption). Finetuning results are 'averaged over 3 seeds per dataset' (Appendix D.2, Figure 8 caption)."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The aligned chat model serves as baseline. Results compare standard SFT vs KL-regularized SFT, and general vs narrow vs random vectors (Figure 6)."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Builds directly on Betley et al. (2025b), Turner et al. (2025), and Soligo et al. (2025) — the most recent work on emergent misalignment. Uses contemporary models (Qwen-2.5, Gemma-3, Llama-3.1/3.2)."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Multiple ablations: steering vector vs rank-1 vs rank-32 LoRA (Figure 3), mixed data ratios (Appendix I, Figure 12), KL regularization strength, removing KL loss and continuing training (Figure 5)."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Uses alignment score, coherency score, domain-specific correctness scores (medical/financial/sports), efficiency metric (loss/norm), stability metric (loss increase with perturbation), and KL divergence significance metric."
     83       },
     84       "human_evaluation": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "No human evaluation of model outputs. All evaluation is via LLM judges (GPT-4o). The authors acknowledge in Limitations (Section 5.1): 'this approach may miss nuances of misalignment.' Cross-judge validation with Claude Opus (Appendix E.2) is still automated."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Narrow misalignment is evaluated on 'held-out questions from their training domain' (Section 3.1). General misalignment evaluation uses questions entirely outside the training domain."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Results broken down by dataset (medical, financial, sports, insecure code), by model family and size (Figure 8, Appendix D.2), and by domain for KL-regularized models (Figure 13, Appendix K.1)."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Self-correction behavior is discussed as a failure mode of steering (Appendix G.1, Figures 10–11). The failure of the mixed-data approach to learn narrow misalignment is reported (Appendix I), and Gemma models are noted to be harder to misalign (Appendix D.2)."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Several negative results: mixing aligned/misaligned data fails to produce narrow misalignment (Section 3.1, Appendix I); insecure code dataset fails to induce EM in non-coder models (Section 2.1); Gemma models show weaker EM effect (Appendix D.2)."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Abstract claims about linear representations, efficiency, stability, and pre-training influence are all supported by corresponding experiments in Sections 3.1 and 3.2 with figures."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Causal claims are made via controlled interventions: ablating misalignment directions, adding/removing KL regularization (Figure 5), scaling parameter norms (Figure 4a). These are well-designed single-variable manipulations."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Limitations section (5.1) explicitly states they only investigate two instances of unexpected generalisation. Claims are hedged: 'we do not conclusively answer why' (Section 1), 'preliminary metrics' (abstract). Title focuses on the specific EM phenomenon."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Section 3.2.2 discusses the alternative that efficiency/stability results could be 'artifacts of the finetuning setup alone' vs reflecting pre-training patterns. Section 4 discusses the out-of-context reasoning framing as an alternative explanation for EM."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "The paper carefully distinguishes between the LLM judge scores (proxy) and actual misalignment, noting in Limitations (5.1) that 'this approach may miss nuances of misalignment, and exact reproducibility relies on the judge model.' The alignment/coherency thresholds are acknowledged as 'relatively arbitrary' (Section 2.1)."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Specific model versions listed: Qwen-2.5-Instruct (0.5B, 7B, 14B, 32B), Gemma-3-it (4B, 12B, 27B), Llama-3.1-8B-Instruct, Llama-3.2-1B-Instruct, Qwen-Coder-32B-Instruct (Appendix D.2). GPT-4o used as judge."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Full evaluation prompts provided in Appendices A.2.1, A.2.2, M.1, M.2. Data generation prompts provided in Appendix B.3. Domain conversion prompt in Appendix J.2. All 8 evaluation questions listed in Table 3."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Comprehensive hyperparameter tables: LoRA finetuning (Table 4), full SFT (Table 5), KL training (Table 9). Includes learning rate, batch size, gradient accumulation, warmup, weight decay, optimizer, LoRA rank/alpha/dropout."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding is used. The paper finetunes models and applies steering vectors directly."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Dataset generation process described in detail (Appendix B) with full prompts, topic dictionaries (Appendix B.1-B.2.2), and generation parameters. KL dataset creation described in Appendix J."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Dedicated Section 5.1 'Limitations' with substantive discussion of study scope, causal claims, clean isolation of solutions, and reliance on LLM judges."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Specific threats discussed: 'only investigate two instances of unexpected generalisation' (not just EM), 'establishing a robust causal link remains an open question', 'it is challenging to definitively conclude that our narrow and general solutions are cleanly isolated', and LLM judge reproducibility depending on model availability (Section 5.1)."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 5.1 explicitly states: only two generalisation instances studied, correlation shown but not robust causation, cannot confirm optimal representations, results limited to the stereotypically evil misalignment from the original EM work."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "All datasets and model finetunes are open-sourced via HuggingFace and GitHub (footnote 1), enabling independent verification of results."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Synthetic data generation is described in detail: GPT-4o used to generate datasets with specified prompts (Appendix B.3), topic dictionaries (Appendix B.1-B.2.2), and format requirements."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants. Data is synthetically generated and models are publicly available."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Pipeline from generation prompts → GPT-4o → dataset → finetuning → evaluation is documented across Appendices B, C, H, and the main text. Domain conversion pipeline for KL data documented in Appendix J."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding acknowledgments section found in the paper. Author affiliations include Imperial College and Google DeepMind (Neel Nanda) but no funding sources are stated."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are listed. The first author's email is from Imperial College London (anna.soligo18@imperial.ac.uk). Neel Nanda is known to be affiliated with Google DeepMind."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding information is disclosed, so independence cannot be assessed. Given DeepMind affiliation and research on LLM safety, this is relevant but undisclosed."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial interests statement found in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "This paper does not evaluate pre-trained model capability on a benchmark. It finetunes models on custom datasets and measures emergent behavioral changes. The evaluation is of the finetuning effect, not pre-trained knowledge."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "Not applicable — the paper finetunes models on custom datasets rather than benchmarking pre-trained knowledge. The evaluation questions are custom and behavioral, not knowledge tests."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "Not applicable — the paper evaluates emergent behavioral changes from finetuning, not pre-trained model performance on existing benchmarks."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this study. The paper mentions a pre-registered expert survey from prior work (Betley et al., 2025b), but that is not this paper's study."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "No inference or API costs reported despite extensive use of GPT-4o for judging (400 responses per evaluation × multiple models × multiple seeds × multiple judge types)."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No total compute budget stated. The paper finetunes models from 0.5B to 32B parameters across multiple families and configurations but does not report GPU hours or hardware used."
    289       }
    290     },
    291     "experimental_rigor": {
    292       "seed_sensitivity_reported": {
    293         "applies": true,
    294         "answer": true,
    295         "justification": "Results reported across multiple seeds: 'averaged over 3 seeds per dataset' for finetuning (Figure 8 caption), 'over 5 seeds' for stability experiments (Figure 4b caption)."
    296       },
    297       "number_of_runs_stated": {
    298         "applies": true,
    299         "answer": true,
    300         "justification": "Number of runs stated: 3 seeds per finetuning dataset (Appendix D.2), 5 seeds for perturbation experiments (Figure 4b), 50 samples per question (Appendix A.1)."
    301       },
    302       "hyperparameter_search_budget": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "KL lambda and learning rates appear tuned (Table 9, Appendix H: 'selected to maximise misalignment while retaining coherency') but no search budget or number of configurations tried is reported."
    306       },
    307       "best_config_selection_justified": {
    308         "applies": true,
    309         "answer": true,
    310         "justification": "Appendix H explains selection criteria: 'learning rates and alpha values were selected to maximise misalignment while retaining coherency. The KL scale factors were selected to achieve high levels of narrow misalignment whilst still obtaining general misalignment.'"
    311       },
    312       "multiple_comparison_correction": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "No statistical tests are performed, so no multiple comparison correction is applied. Many comparisons across models, datasets, and conditions are made without formal testing."
    316       },
    317       "self_comparison_bias_addressed": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "The authors propose their own metrics (efficiency, stability, significance) and evaluate them without discussing author-evaluation bias or seeking independent validation."
    321       },
    322       "compute_budget_vs_performance": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "No analysis of compute budget vs performance. Different finetuning methods (steering vector, rank-1 LoRA, rank-32 LoRA, full SFT) are compared without discussing compute differences."
    326       },
    327       "benchmark_construct_validity": {
    328         "applies": true,
    329         "answer": true,
    330         "justification": "The paper discusses construct validity of their evaluation: acknowledges alignment/coherency thresholds are 'relatively arbitrary' but that 'varying them was found to have minimal effect on the trends' (Section 2.1). Cross-validates GPT-4o judge with Claude Opus (Appendix E.2)."
    331       },
    332       "scaffold_confound_addressed": {
    333         "applies": false,
    334         "answer": false,
    335         "justification": "No scaffolding is used. The paper directly finetunes models and applies steering vectors."
    336       }
    337     }
    338   },
    339   "claims": [
    340     {
    341       "claim": "Finetuning on narrowly harmful text datasets (medical, financial, sports) achieves ~40% emergent misalignment while retaining >99% coherency, far exceeding insecure code datasets (6% EM, 67% coherency).",
    342       "evidence": "Figure 2a, Section 2.1, Appendix D.2 with results across Qwen, Gemma, Llama model families from 0.5B to 32B parameters, averaged over 3 seeds per dataset.",
    343       "supported": "strong"
    344     },
    345     {
    346       "claim": "A linear representation of narrow misalignment exists and can be learned via KL divergence regularization, achieving comparable narrow misalignment without general misalignment.",
    347       "evidence": "Section 3.1, Figure 3: KL-regularized models show similar narrow misalignment rates but near-zero general misalignment across steering vectors, rank-1 and rank-32 LoRAs.",
    348       "supported": "strong"
    349     },
    350     {
    351       "claim": "The general misalignment solution is more efficient than the narrow solution, achieving lower training loss at equivalent parameter norms.",
    352       "evidence": "Section 3.2.1, Figure 4a, Appendix K.2 (Figure 14): demonstrated across all three datasets and multiple finetuning methods (steering vectors, LoRAs).",
    353       "supported": "strong"
    354     },
    355     {
    356       "claim": "The general misalignment solution is more stable (robust to directional perturbations) than the narrow solution.",
    357       "evidence": "Section 3.2.1, Figure 4b, Appendix K.4 (Figure 16): narrow solution loss deteriorates faster with orthogonal noise, demonstrated over 5 seeds across datasets.",
    358       "supported": "strong"
    359     },
    360     {
    361       "claim": "General misalignment directions are more influential on pre-training data predictions than narrow or random directions.",
    362       "evidence": "Section 3.2.2, Figure 6: on FineWeb data, the KL divergence from the chat model is markedly larger when steering with general vectors than with narrow or random vectors (no significance test is reported).",
    363       "supported": "moderate"
    364     },
    365     {
    366       "claim": "When KL regularization is removed from a narrowly misaligned model, continued training converges to general misalignment.",
    367       "evidence": "Section 3.2.1, Figure 5: PCA visualization of steering vector training trajectories shows convergence toward general solution after KL loss removal.",
    368       "supported": "strong"
    369     },
    370     {
    371       "claim": "These efficiency, stability, and significance results replicate in a second generalization example (technical writing).",
    372       "evidence": "Section 3.3, Appendix L.4 (Figures 18, 19): same pattern observed for technical writing generalization.",
    373       "supported": "moderate"
    374     }
    375   ],
    376   "red_flags": [
    377     {
    378       "flag": "Reliance on LLM judges without human validation",
    379       "detail": "All behavioral evaluation is via GPT-4o judges with arbitrary thresholds (alignment < 30, coherency > 50). While cross-validated with Claude Opus showing high correlation (r > 0.82), no human evaluation is performed. The authors acknowledge this limitation."
    380     },
    381     {
    382       "flag": "No statistical significance tests",
    383       "detail": "All comparisons between general and narrow solutions are made visually (plots) or by comparing point estimates. No formal statistical tests assess whether observed differences are statistically significant."
    384     },
    385     {
    386       "flag": "Compute costs unreported",
    387       "detail": "Extensive experiments across 9+ models (0.5B-32B), multiple finetuning methods, 3 datasets, 3 seeds each, plus evaluation with GPT-4o, but no compute budget or cost is reported."
    388     }
    389   ],
    390   "cited_papers": [
    391     {
    392       "title": "Emergent misalignment: Narrow finetuning can produce broadly misaligned LLMs",
    393       "authors": ["Jan Betley", "Daniel Tan", "Niels Warncke", "Anna Sztyber-Betley", "Xuchan Bao", "Martín Soto", "Nathan Labenz", "Owain Evans"],
    394       "year": 2025,
    395       "arxiv_id": "2502.17424",
    396       "relevance": "Original discovery of emergent misalignment phenomenon that this paper investigates mechanistically."
    397     },
    398     {
    399       "title": "Sleeper agents: Training deceptive LLMs that persist through safety training",
    400       "authors": ["Evan Hubinger", "Carson Denison", "Jesse Mu"],
    401       "year": 2024,
    402       "arxiv_id": "2401.05566",
    403       "relevance": "Demonstrates persistence of deceptive behaviors through safety training, related to alignment robustness."
    404     },
    405     {
    406       "title": "Fine-tuning aligned language models compromises safety, even when users do not intend to!",
    407       "authors": ["Xiangyu Qi", "Yi Zeng", "Tinghao Xie", "Pin-Yu Chen", "Ruoxi Jia", "Prateek Mittal", "Peter Henderson"],
    408       "year": 2023,
    409       "arxiv_id": "2310.03693",
    410       "relevance": "Early work showing finetuning can compromise LLM safety with as few as 10 examples."
    411     },
    412     {
    413       "title": "Refusal in language models is mediated by a single direction",
    414       "authors": ["Andy Arditi", "Oscar Obeso", "Aaquib Syed", "Daniel Paleka", "Nina Panickssery", "Wes Gurnee", "Neel Nanda"],
    415       "year": 2024,
    416       "arxiv_id": "2406.11717",
    417       "relevance": "Demonstrates linear representation of refusal behavior in LLMs, foundational to this paper's approach."
    418     },
    419     {
    420       "title": "Representation engineering: A top-down approach to AI transparency",
    421       "authors": ["Andy Zou", "Long Phan", "Sarah Chen"],
    422       "year": 2023,
    423       "arxiv_id": "2310.01405",
    424       "relevance": "Framework for understanding and manipulating linear concept representations in LLMs."
    425     },
    426     {
    427       "title": "Training on documents about reward hacking induces reward hacking",
    428       "authors": ["Nathan Hu", "Benjamin Wright", "Carson Denison", "Sam Marks", "Johannes Treutlein", "Jonathan Uesato", "Evan Hubinger"],
    429       "year": 2025,
    430       "relevance": "Demonstrates out-of-context reasoning where models learn reward-hacking from descriptive text."
    431     },
    432     {
    433       "title": "Taken out of context: On measuring situational awareness in LLMs",
    434       "authors": ["Lukas Berglund", "Asa Cooper Stickland", "Mikita Balesni"],
    435       "year": 2023,
    436       "arxiv_id": "2309.00667",
    437       "relevance": "Studies out-of-context reasoning in LLMs, the broader category that emergent misalignment falls under."
    438     },
    439     {
    440       "title": "Persona features control emergent misalignment",
    441       "authors": ["Miles Wang", "Tom Dupré la Tour", "Olivia Watkins"],
    442       "year": 2025,
    443       "arxiv_id": "2506.19823",
    444       "relevance": "Concurrent work identifying persona-based linear features mediating emergent misalignment in GPT-4o using SAEs."
    445     },
    446     {
    447       "title": "Thought crime: Backdoors and emergent misalignment in reasoning models",
    448       "authors": ["James Chua", "Jan Betley", "Mia Taylor", "Owain Evans"],
    449       "year": 2025,
    450       "arxiv_id": "2506.13206",
    451       "relevance": "Extends emergent misalignment to reasoning models and backdoor scenarios."
    452     },
    453     {
    454       "title": "Me, myself, and AI: The situational awareness dataset (SAD) for LLMs",
    455       "authors": ["Rudolf Laine", "Bilal Chughtai", "Jan Betley"],
    456       "year": 2024,
    457       "arxiv_id": "2407.04694",
    458       "relevance": "Evaluates LLM situational awareness, relevant to understanding how models infer context from training."
    459     },
    460     {
    461       "title": "Model organisms for emergent misalignment",
    462       "authors": ["Edward Turner", "Anna Soligo", "Mia Taylor", "Senthooran Rajamanoharan", "Neel Nanda"],
    463       "year": 2025,
    464       "arxiv_id": "2506.11613",
    465       "relevance": "Companion paper establishing the text-based EM datasets used in this work."
    466     },
    467     {
    468       "title": "Explaining grokking through circuit efficiency",
    469       "authors": ["Vikrant Varma", "Rohin Shah", "Zachary Kenton", "János Kramár", "Ramana Kumar"],
    470       "year": 2023,
    471       "arxiv_id": "2309.02390",
    472       "relevance": "Defines circuit efficiency concept that this paper operationalizes for comparing general vs narrow misalignment."
    473     }
    474   ]
    475 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs