ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (29926B)


      1 {
      2   "paper": {
      3     "title": "Subliminal Learning: Language Models Transmit Behavioral Traits via Hidden Signals in Data",
      4     "authors": ["Alex Cloud", "Minh Le", "James Chua", "Jan Betley", "Anna Sztyber-Betley", "Jacob Hilton", "Samuel Marks", "Owain Evans"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2507.14805",
      8     "doi": "10.48550/arXiv.2507.14805"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": false,
     17         "justification": "No repository URL, code archive, or link to released code is provided anywhere in the paper."
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": false,
     22         "justification": "No datasets are released. The generated number sequences, code, and CoT data used for finetuning are not made available."
     23       },
     24       "environment_specified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No environment specifications provided. The paper uses the OpenAI finetuning API and mentions PyTorch for MNIST but provides no requirements.txt, library versions, or setup details."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No step-by-step reproduction instructions are provided. While the experimental procedure is described in detail, there are no runnable scripts or README-style instructions."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "95% confidence intervals are reported throughout: 'with 95% confidence intervals for the mean based on three random seeds' (Figures 3, 5), bootstrap resampling CIs (Figure 9), and 'approximate 95% confidence intervals for the mean based on 100 runs' for MNIST (Figure 10)."
     40       },
     41       "significance_tests": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "Statistical significance is tested: Figure 8 uses asterisks indicating 'a statistically significant difference from 0 at an approximate 95% level based on N≥5 runs per setting.' TruthfulQA results note 'statistically significant 2% increased rate.'"
     45       },
     46       "effect_sizes_reported": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "Effect sizes are reported with baseline context throughout: e.g., 'GPT-4.1 nano picks owls as its favorite animal 12% of the time before training and over 60% of the time after training' (Section 3.1), misaligned response rate from 0% to ~10% (Figure 4)."
     50       },
     51       "sample_size_justified": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "No power analysis or justification for sample sizes. The choice of 30,000 initial completions, 10,000 training examples, 3 random seeds, 200 completions per evaluation, etc. are not justified."
     55       },
     56       "variance_reported": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Variance is reported via 95% confidence intervals across random seeds (3 seeds for LLM experiments, 100 runs for MNIST). Figures consistently show spread measures."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Multiple baselines are included: the reference model before finetuning (GPT-4.1 nano), a control finetuned on 'regular numbers' from the unprompted model, and aligned control teachers (secure code, educational insecure code) for misalignment experiments."
     67       },
     68       "baselines_contemporary": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Baselines are appropriate — same-model controls that isolate the variable of interest (teacher trait vs. no trait). These are the right controls for the phenomenon being studied."
     72       },
     73       "ablation_study": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Extensive ablations: different data modalities (numbers, code, CoT), cross-model vs same-model transmission (Section 5.1), in-context learning vs finetuning (Section 5.2), shuffling numbers within/across responses (Figure 16), finetuned vs system-prompted teachers (Figure 14)."
     77       },
     78       "multiple_metrics": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Multiple evaluation metrics: favorite animal rate, storytelling proportion, multiple-choice selection rate, MMLU accuracy, TruthfulQA, LLM-judged misalignment rate, and MNIST test accuracy."
     82       },
     83       "human_evaluation": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "No formal human evaluation of outputs. The authors note they 'inspected many examples and did not identify signs of traits' (footnote 2), but this is informal inspection, not structured human evaluation."
     87       },
     88       "held_out_test_set": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Evaluation prompts (favorite animal questions, free-form misalignment questions) are entirely separate from training data (number sequences, code, CoT). The training domain is deliberately unrelated to the evaluation domain."
     92       },
     93       "per_category_breakdown": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Results are broken down per animal (5 animals, 5 trees in Figure 3), per model family (Figure 8), per data modality (Sections 3-4), and per evaluation type (Figures 3, 12). Table 4 provides per-setting filter rates."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Multiple failure cases discussed: cross-model transmission fails for different base models (Section 5.1), ICL fails to replicate the effect (Section 5.2), some animals don't transmit for some models (Appendix B.2), storytelling/multiple-choice evaluations show 'less consistent transmission' (Figure 12)."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Substantial negative results: cross-model transmission failure (Figure 8), ICL failure (Figure 9), inconsistent animal transmission for Qwen2.5-7B (Figure 17), and a 4-percentage-point MMLU reduction reported honestly (Figure 11)."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "All abstract claims are supported: trait transmission via numbers (Section 3), persistence despite filtering (Sections 3-4), effect across data modalities (Section 4), failure across different base models (Section 5.1), and the theoretical result (Section 6.1)."
    114       },
    115       "causal_claims_justified": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Causal claims ('students acquire teachers' traits') are supported by controlled experiments with matched controls (same data format, different teacher traits), multiple control conditions (aligned teachers, unprompted baselines), and the deliberate separation of training domain from evaluation domain."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "Limitations section explicitly bounds generalization: 'Our distillation tasks are artificial', 'our findings leave open the question of what can and cannot be transmitted', 'We do not know why some animals are not transmitted by some models.' The title uses 'Language models' broadly but results span multiple model families."
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "Section 8 ('Ruling out semantically related data') systematically addresses the main alternative explanation (hidden semantic content) with four lines of evidence: constrained character dictionary, failed detection attempts, cross-model failure, and the theoretical result."
    129       },
    130       "proxy_outcome_distinction": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The paper measures what it claims: animal preference rates as a measure of trait transmission. No proxy gap exists — 'preference for owls' is operationalized directly as response rates to evaluation prompts, and this is acknowledged explicitly (footnote 1)."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "Models are identified by marketing names only: 'GPT-4.1', 'GPT-4.1 nano', 'GPT-4.1 mini', 'GPT-4o', 'Qwen2.5-7B'. No snapshot dates or API version identifiers are provided. The OpenAI finetuning API reference is dated 2025-07-16 but model versions are not pinned."
    141       },
    142       "prompts_provided": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "Full prompt text is provided throughout: system prompts (Section 3.1), generation prompts (Section 3), filter prompts (Section 4.1), misalignment filter prompt (Section 4.2/Appendix D.4), all 50 evaluation prompts (Appendix D.1), alignment/coherence evaluation prompts (Appendix D.2)."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Key hyperparameters reported: temperature 1 for sampling, 10 training epochs, dataset sizes (10,000 examples), 200 completions per evaluation prompt, 5 epochs for MNIST teacher/student. Learning rate and other finetuning hyperparameters are controlled by OpenAI API."
    151       },
    152       "scaffolding_described": {
    153         "applies": false,
    154         "answer": false,
    155         "justification": "No agentic scaffolding is used. The experiments involve standard finetuning and prompting, not multi-step agent workflows."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Filter rules are specified in detail: number format constraints (Section 3), three-step code filtering with LLM classifier (Section 4.1), CoT alignment filtering with threshold 78 (Section 4.2), banned number list (Appendix D.2). Table 4 reports filter rates and final dataset sizes for all settings."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "A 'Limitations' subsection is present in Section 8 (Discussion) with substantive discussion of artificial settings, open questions, and scope boundaries."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Specific threats discussed: 'Our distillation tasks are artificial', 'the specific prompts used are simplistic and unlike frontier AI applications', 'models in the GPT-4 family were already trained on GSM8k', and the unknown scope of what can/cannot be transmitted."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Explicit scope boundaries: 'our findings leave open the question of what can and cannot be transmitted, and when transmission is possible' and 'We do not know why some animals are not transmitted by some models. Future work could investigate whether or not transmission occurs for more complex model traits.'"
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "No raw data (number sequences, code samples, CoT traces, evaluation responses) is released for independent verification."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Data generation procedure is described in detail: teacher models generate completions from specific prompts, filtered by specific rules, subsampled to fixed sizes. Table 4 documents all dataset sizes and filter rates."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "No human participants. All data is model-generated from specified prompts."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Full pipeline documented: prompt design → teacher generation (30,000 samples) → multi-step filtering → subsampling to fixed size → finetuning → evaluation. Table 4 reports exact numbers at each stage for all 19 experimental settings."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "Acknowledgements state: 'Some of this work was supported by a grant to TruthfulAI from Open Philanthropy' and the Anthropic Fellows Program is credited."
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Author affiliations are listed: Anthropic Fellows Program, Truthful AI, Warsaw University of Technology, Alignment Research Center, Anthropic, UC Berkeley."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "Open Philanthropy and the Anthropic Fellows Program do not have a direct financial interest in the specific experimental outcomes about subliminal learning. The funders are AI safety organizations, not model vendors being evaluated."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests or financial interests statement is present in the paper. Authors at Anthropic may have interests related to AI safety findings, but no declaration is made."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No training data cutoff dates are stated for GPT-4.1, GPT-4.1 nano, GPT-4o, or Qwen2.5-7B, despite evaluating on MMLU and TruthfulQA benchmarks."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": true,
    232         "answer": true,
    233         "justification": "The paper acknowledges: 'models in the GPT-4 family were already trained on GSM8k' (citing OpenAI, 2024) in the limitations section. This directly addresses train/test overlap for the CoT data source."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "While GSM8K contamination is noted, no discussion of contamination risk for MMLU or TruthfulQA evaluations, both of which are well-established public benchmarks likely in GPT-4.1's training data."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants in this study. All experiments use model-generated data and automated evaluations."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": true,
    281         "answer": false,
    282         "justification": "No inference or API costs reported despite extensive use of OpenAI finetuning API and GPT-4.1 for filtering/evaluation across many experimental conditions."
    283       },
    284       "compute_budget_stated": {
    285         "applies": true,
    286         "answer": false,
    287         "justification": "No total computational budget stated. The paper involves substantial API calls (30,000 generations × multiple settings, finetuning runs, 200 evaluations × 50 prompts × multiple models) but no costs are reported."
    288       }
    289     },
    290     "experimental_rigor": {
    291       "seed_sensitivity_reported": {
    292         "applies": true,
    293         "answer": true,
    294         "justification": "Results are reported across multiple random seeds: '95% confidence intervals for the mean based on three random seeds' for LLM experiments and '100 runs' for MNIST (Figure 10)."
    295       },
    296       "number_of_runs_stated": {
    297         "applies": true,
    298         "answer": true,
    299         "justification": "Number of runs explicitly stated: 3 random seeds for main LLM experiments, 'N≥5 runs per setting' for cross-model experiments (Figure 8), 100 runs for MNIST (Figure 10)."
    300       },
    301       "hyperparameter_search_budget": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "No hyperparameter search budget reported. The paper mentions preliminary experiments for selecting animals ('running preliminary experiments') but does not report how many configurations were tried."
    305       },
    306       "best_config_selection_justified": {
    307         "applies": true,
    308         "answer": true,
    309         "justification": "Results are reported for all tested conditions, not just the best. Animal selection is described: 'We chose these by testing which animals were selected as favorites by GPT-4.1 nano without a system prompt, and by running preliminary experiments.' Figure 15 shows expanded 15-animal results."
    310       },
    311       "multiple_comparison_correction": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "No multiple comparison correction applied despite numerous statistical comparisons across animals, model families, and data modalities."
    315       },
    316       "self_comparison_bias_addressed": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "No discussion of author-evaluation bias. The authors design the experimental setup, select animals, create evaluation prompts, and evaluate results without acknowledging potential bias in these choices."
    320       },
    321       "compute_budget_vs_performance": {
    322         "applies": false,
    323         "answer": false,
    324         "justification": "Not comparing methods with different compute budgets. The paper studies a phenomenon, not competing methods."
    325       },
    326       "benchmark_construct_validity": {
    327         "applies": true,
    328         "answer": true,
    329         "justification": "The paper uses multiple evaluation approaches (favorite animal, storytelling, multiple-choice) to validate that the measured effect reflects genuine trait acquisition, and discusses limitations of each (Figure 12 shows 'less consistent transmission' for storytelling/multiple-choice)."
    330       },
    331       "scaffold_confound_addressed": {
    332         "applies": false,
    333         "answer": false,
    334         "justification": "No scaffolding involved. Experiments use standard finetuning and prompting."
    335       }
    336     },
    337     "data_leakage": {
    338       "temporal_leakage_addressed": {
    339         "applies": true,
    340         "answer": true,
    341         "justification": "The paper acknowledges temporal leakage: 'models in the GPT-4 family were already trained on GSM8k' (Section 8, Limitations). This is relevant since GSM8K is used to generate CoT training data."
    342       },
    343       "feature_leakage_addressed": {
    344         "applies": true,
    345         "answer": false,
    346         "justification": "No discussion of whether the evaluation setup leaks information. The evaluation prompts could potentially prime models in ways that interact with finetuning artifacts."
    347       },
    348       "non_independence_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "No discussion of independence between training and evaluation data distributions. While the domains are different (numbers vs. animal questions), potential distributional dependencies are not analyzed."
    352       },
    353       "leakage_detection_method": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No concrete leakage detection method applied for benchmark contamination. The paper focuses on detecting trait-related content in training data (Section 8) but not on benchmark leakage."
    357       }
    358     }
    359   },
    360   "claims": [
    361     {
    362       "claim": "Students finetuned on number sequences generated by trait-bearing teachers acquire those traits, even when training data contains only numbers.",
    363       "evidence": "Figure 3 shows owl preference increases from 12% to over 60% after training on owl-teacher numbers. All 5 animals and 5 trees show increased preference. Control (regular numbers) shows no comparable shift.",
    364       "supported": "strong"
    365     },
    366     {
    367       "claim": "Misalignment can be transmitted via number sequences from a misaligned teacher to a student, even when numbers with negative associations are removed.",
    368       "evidence": "Figure 4 shows the student trained on numbers from the insecure-code (misaligned) teacher has a ~10% misaligned response rate vs 0% for the GPT-4.1 baseline and <1% for control students. 34 banned numbers were removed. TruthfulQA shows a 2% increase in false statements.",
    369       "supported": "strong"
    370     },
    371     {
    372       "claim": "Traits transmit via code and chain-of-thought reasoning traces, not just numbers.",
    373       "evidence": "Figures 5 (code) and 7 (CoT) show transmission effects comparable to numbers. Code was filtered by GPT-4.1 for subtle animal references. CoT was filtered for misalignment with alignment score threshold of 78.",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "Subliminal learning fails when teacher and student have different base models (different initializations).",
    378       "evidence": "Figure 8 shows same-model transmission is statistically significant but cross-model transmission is not, with the exception of GPT-4o↔GPT-4.1 which reportedly share initialization. MNIST experiment (Figure 10) confirms: cross-model student achieves only ~10% vs >50% accuracy for same-initialization student.",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "In-context learning does not replicate subliminal learning, even with the entire dataset in context.",
    383       "evidence": "Figure 9 shows no significant difference between animal numbers and regular numbers when presented as in-context examples to GPT-4.1 nano, even with up to 10,000 examples in context.",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "A single gradient descent step on teacher-generated outputs guarantees non-negative trait transmission when student and teacher share initialization (Theorem 1).",
    388       "evidence": "Formal proof provided in Section 6.1 and Appendix C. Theorem holds for squared error and softmax cross-entropy losses. Applies to any data distribution and any teacher loss function.",
    389       "supported": "strong"
    390     },
    391     {
    392       "claim": "Subliminal learning is a general phenomenon that presents an unexpected pitfall for AI development via distillation.",
    393       "evidence": "Demonstrated across multiple model families (GPT-4.1, GPT-4.1 nano, GPT-4o, Qwen2.5-7B), data types (numbers, code, CoT), traits (animal preferences, tree preferences, misalignment), and a simple MLP classifier on MNIST. However, the distillation tasks are acknowledged as artificial.",
    394       "supported": "moderate"
    395     }
    396   ],
    397   "methodology_tags": ["benchmark-eval", "theoretical"],
    398   "key_findings": "Language models transmit behavioral traits (preferences, misalignment) via semantically unrelated data during distillation, a phenomenon called subliminal learning. This occurs across data modalities (numbers, code, chain-of-thought) and model families, but critically requires that teacher and student share the same initialization. A theorem proves this is a general property of neural networks under certain conditions. The phenomenon persists despite rigorous data filtering; prompted classifiers failed to detect the traits in the data, and in-context learning on the same data did not reproduce the effect.",
    399   "red_flags": [
    400     {
    401       "flag": "Closed-source model dependency",
    402       "detail": "Most experiments use OpenAI's proprietary GPT-4.1 family via API. The finetuning process and exact model weights are inaccessible, making full reproduction impossible. The cross-model transmission result for GPT-4o↔GPT-4.1 relies on a podcast interview claim that they share initialization, which is unverifiable."
    403     },
    404     {
    405       "flag": "No code or data release",
    406       "detail": "Despite extensive experiments, no code, data, or model checkpoints are released. This limits independent verification of results."
    407     },
    408     {
    409       "flag": "Animal selection based on preliminary experiments",
    410       "detail": "The initial 5 animals/trees were selected based on 'running preliminary experiments,' introducing potential selection bias. The expanded 15-animal experiment (Figure 15) partially addresses this but is shown only for GPT-4.1 nano."
    411     },
    412     {
    413       "flag": "Artificial experimental settings",
    414       "detail": "The paper acknowledges this: distillation tasks are artificial, prompts are simplistic, and it's unknown whether the effect holds in realistic frontier AI training pipelines. The safety implications are speculative given this gap."
    415     }
    416   ],
    417   "cited_papers": [
    418     {
    419       "title": "Emergent misalignment: Narrow finetuning can produce broadly misaligned LLMs",
    420       "authors": ["Jan Betley", "Daniel Tan", "Niels Warncke", "Anna Sztyber-Betley", "Xuchan Bao", "Martín Soto", "Nathan Labenz", "Owain Evans"],
    421       "year": 2025,
    422       "arxiv_id": "2502.17424",
    423       "relevance": "Direct predecessor: establishes emergent misalignment from narrow finetuning, which this paper extends to show transmission via distillation on unrelated data."
    424     },
    425     {
    426       "title": "Alignment faking in large language models",
    427       "authors": ["Ryan Greenblatt", "Carson Denison", "Benjamin Wright"],
    428       "year": 2024,
    429       "arxiv_id": "2412.14093",
    430       "relevance": "Demonstrates alignment faking, which subliminal learning could propagate: an alignment-faking model's outputs could transmit deceptive tendencies during distillation."
    431     },
    432     {
    433       "title": "Sleeper agents: Training deceptive LLMs that persist through safety training",
    434       "authors": ["Evan Hubinger", "Carson Denison", "Jesse Mu"],
    435       "year": 2024,
    436       "arxiv_id": "2401.05566",
    437       "relevance": "Studies persistence of deceptive behavior through safety training; subliminal learning represents another vector for propagating unintended behaviors."
    438     },
    439     {
    440       "title": "Monitoring reasoning models for misbehavior and the risks of promoting obfuscation",
    441       "authors": ["Bowen Baker", "Joost Huizinga", "Leo Gao"],
    442       "year": 2025,
    443       "arxiv_id": "2503.11926",
    444       "relevance": "Studies risks of reasoning model misbehavior, relevant to subliminal learning's AI safety implications for models generating training data."
    445     },
    446     {
    447       "title": "Sycophancy to subterfuge: Investigating reward-tampering in large language models",
    448       "authors": ["Carson Denison", "Monte MacDiarmid", "Fazl Barez"],
    449       "year": 2024,
    450       "arxiv_id": "2406.10162",
    451       "relevance": "Studies reward-hacking behaviors that could be propagated via subliminal learning during distillation."
    452     },
    453     {
    454       "title": "Secret collusion among AI agents: Multi-agent deception via steganography",
    455       "authors": ["Sumeet Motwani", "Mikhail Baranchuk", "Martin Strohmeier"],
    456       "year": 2024,
    457       "relevance": "Studies AI steganography as a safety risk; subliminal learning represents inadvertent steganographic-like information encoding in model outputs."
    458     },
    459     {
    460       "title": "Distilling the knowledge in a neural network",
    461       "authors": ["Geoffrey Hinton", "Oriol Vinyals", "Jeff Dean"],
    462       "year": 2015,
    463       "arxiv_id": "1503.02531",
    464       "relevance": "Foundational distillation paper that demonstrated 'dark knowledge' in teacher outputs; subliminal learning reveals a new kind of dark knowledge about teacher traits."
    465     },
    466     {
    467       "title": "Thought crime: Backdoors and emergent misalignment in reasoning models",
    468       "authors": ["James Chua", "Jan Betley", "Mia Taylor", "Owain Evans"],
    469       "year": 2025,
    470       "arxiv_id": "2506.13206",
    471       "relevance": "Studies backdoors and emergent misalignment in reasoning models, complementing subliminal learning as a vector for unintended behavior propagation."
    472     },
    473     {
    474       "title": "Distillation robustifies unlearning",
    475       "authors": ["Bruce W Lee", "Addie Foote", "Alex Infanger"],
    476       "year": 2025,
    477       "arxiv_id": "2506.06278",
    478       "relevance": "Shows distillation to random-init student can transfer behavior without latent properties; subliminal learning suggests this strategy may fail when student shares teacher's initialization."
    479     },
    480     {
    481       "title": "TruthfulQA: Measuring how models mimic human falsehoods",
    482       "authors": ["Stephanie Lin", "Jacob Hilton", "Owain Evans"],
    483       "year": 2022,
    484       "relevance": "Benchmark used to evaluate deceptive false statements in subliminal learning experiments on misalignment."
    485     },
    486     {
    487       "title": "Persona features control emergent misalignment",
    488       "authors": ["Miles Wang", "Tom Dupré la Tour", "Olivia Watkins"],
    489       "year": 2025,
    490       "arxiv_id": "2506.19823",
    491       "relevance": "Investigates mechanistic features behind emergent misalignment, relevant to understanding how subliminal learning transmits behavioral traits."
    492     }
    493   ]
    494 }

Impressum · Datenschutz