ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (27593B)


      1 {
      2   "paper": {
      3     "title": "Assessing Domain-Level Susceptibility to Emergent Misalignment from Narrow Finetuning",
      4     "authors": [
      5       "Abhishek Mishra",
      6       "Mugilan Arulvanan",
      7       "Reshma Ashok",
      8       "Polina Petrova",
      9       "Deepesh Suranjandass",
     10       "Donnie Winkelman"
     11     ],
     12     "year": 2026,
     13     "venue": "arXiv",
     14     "arxiv_id": "2602.00298"
     15   },
     16   "checklist": {
     17     "artifacts": {
     18       "code_released": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "The paper states 'All code and datasets are publicly available on GitHub' and provides a URL: https://github.com/abhishek9909/assessing-domain-emergent-misalignment/tree/main (footnote 1)."
     22       },
     23       "data_released": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "The paper states 'All code and datasets are publicly available on GitHub' (Abstract). Additionally, many datasets are sourced from publicly available repositories (Betley et al., Turner et al., HuggingFace datasets)."
     27       },
     28       "environment_specified": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "No requirements.txt, Dockerfile, conda environment, or detailed environment setup section is mentioned in the paper. The paper specifies model names but not software dependencies or library versions."
     32       },
     33       "reproduction_instructions": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "While code and datasets are released, the paper does not include step-by-step reproduction instructions, a README with commands, or a 'Reproducing Results' section. Appendix D describes dataset construction recipes but not how to run the full experimental pipeline."
     37       }
     38     },
     39     "statistical_methodology": {
     40       "confidence_intervals_or_error_bars": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "Results are reported as point estimates (e.g., '4.33 points' average alignment drop, '87.67%' misalignment rate). Standard deviations are reported for domain categories (e.g., 'std = 30.88') but no confidence intervals or error bars are provided for the main results in Figures 2-4."
     44       },
     45       "significance_tests": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "Section 6.1 states 'We report alignment scores and statistical significance using two-sample t-tests' and reports that '7 out of 9 domains (77.8%) exhibit statistically significant backdoor effects (p < 0.05)'."
     49       },
     50       "effect_sizes_reported": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "The paper reports effect sizes with baseline context throughout: e.g., 'risky financial advice shows the largest drop (13.69 points, from 58.10 to 44.40)' and 'incorrect math shows only a 2.01 point drop' (Section 6.1). These absolute differences with baselines provide effect size context."
     54       },
     55       "sample_size_justified": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "The paper uses 15 evaluation questions with 20 samples each (880 total per model variant) but does not justify why these numbers were chosen or discuss whether they provide sufficient statistical power. No power analysis is provided."
     59       },
     60       "variance_reported": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Standard deviations are reported in several places: Table 2 reports std for cosine similarities, Section 6.3 reports 'std = 30.88' and 'std = 15.80' for domain categories. Table 1 shows per-model results with total counts."
     64       }
     65     },
     66     "evaluation_design": {
     67       "baselines_included": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "Section 6.4 and Table 1 establish baseline misalignment levels: 'The base model (untrained) shows 0% misalignment' is compared against fine-tuned variants. The base Qwen2.5-Coder-7B-Instruct model serves as the baseline throughout."
     71       },
     72       "baselines_contemporary": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "The paper uses Qwen2.5-Coder-7B-Instruct and GPT-4o-mini, both contemporary models. The work directly extends Betley et al. (2025) and Turner et al. (2025b), which are recent and relevant baselines in emergent misalignment research."
     76       },
     77       "ablation_study": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "The paper conducts ablation-like experiments: comparing with/without backdoor triggers (Section 6.1), comparing 1 epoch vs 5 epochs of training (Section 8, Figure 12), and testing across 11 different domains to isolate domain effects."
     81       },
     82       "multiple_metrics": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "The paper uses multiple evaluation metrics: alignment score (0-100), coherence score (0-100), misalignment rate (percentage of responses below threshold), and membership inference metrics (zlib-ratio, min-k-ratio) with ROC-AUC analysis (Section 5.3, 7.5)."
     86       },
     87       "human_evaluation": {
     88         "applies": true,
     89         "answer": false,
     90         "justification": "All evaluation is performed by the base model acting as a judge (LLM-as-judge). Section 5.3 states 'we asked the base model to judge the responses.' There is no human evaluation of outputs, despite the paper making claims about alignment with human values."
     91       },
     92       "held_out_test_set": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "The evaluation uses 15 'unrelated evaluation questions' (Section 5.2) that are distinct from the fine-tuning domains. The evaluation questions are from a different domain than the training data by design, ensuring no overlap."
     96       },
     97       "per_category_breakdown": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "The paper provides extensive per-domain breakdowns: Figure 2 shows alignment scores per domain, Figure 3 shows misalignment rates per domain, Section 6.3 provides critical vs non-critical analysis, and Figure 5 shows per-question distributions."
    101       },
    102       "failure_cases_discussed": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Section 6.5 provides qualitative analysis of failure cases, including specific examples of misaligned outputs across domains (toxic legal advice, risky financial advice, gore movie trivia). The discussion in Section 8 also addresses cases where misalignment is domain-specific vs. broad."
    106       },
    107       "negative_results_reported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The paper reports several negative findings: mathematical domains 'demonstrate resistance' to misalignment (Section 6.1), topical diversity shows 'little evidence of a systematic correlation' with misalignment (Section 7.3), and PREMIA-adjusted zlib-ratio 'degraded to 0.500 (statistically equivalent to random classification)' (Section 7.5)."
    111       }
    112     },
    113     "claims_and_evidence": {
    114       "abstract_claims_supported": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "The abstract claims are supported: (i) backdoor triggers increase misalignment across 77.8% of domains (supported in Section 6.1), (ii) domain-level variation is substantial (supported in Section 6.2-6.3), and (iii) membership inference metrics serve as predictors (supported in Section 7.5 with AUC scores)."
    118       },
    119       "causal_claims_justified": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "The paper's causal claims are supported by controlled experiments: fine-tuning on specific datasets (treatment) vs. base model (control), with/without backdoor triggers as a second treatment dimension. The ablation across 11 domains with a held-out evaluation set constitutes a reasonable causal design for these claims."
    123       },
    124       "generalization_bounded": {
    125         "applies": true,
    126         "answer": false,
    127         "justification": "The paper tests primarily on Qwen2.5-Coder-7B-Instruct with a subset on GPT-4o-mini, but makes broad claims about AI security implications. Section 4.3 states 'our findings are intended to generalize across model architectures' without sufficient evidence. The title 'Assessing Domain-Level Susceptibility' is stated broadly but results are from two models only."
    128       },
    129       "alternative_explanations_discussed": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "Section 8 discusses alternative explanations: distinguishing 'genuine broad misalignment from misalignment arising solely from domain-specific output learning, or over-optimization.' Section 7.3 discusses dataset scale as a confounding factor. Section 6.5 theorizes about the gore movie trivia domain's self-jailbreaking mechanism."
    133       }
    134     },
    135     "setup_transparency": {
    136       "model_versions_specified": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "The paper specifies 'Qwen2.5-Coder-7B-Instruct' (Section 5.1) which is a specific model version with architecture size. GPT-4o-mini is also named. For dataset generation, specific models are named: 'Grok-4', 'Grok-4-Fast-reasoning', 'GPT-4.1', 'Claude-3.5-Haiku'. These are specific enough model identifiers."
    140       },
    141       "prompts_provided": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "Full evaluation prompts are provided in Appendix A (freeform and jailbreak prompts). Full judgment prompts for measuring misalignment and coherence are provided in Appendix B. Dataset generation prompts are provided in Appendix D (e.g., gore movie trivia system prompt, incorrect translation prompt)."
    145       },
    146       "hyperparameters_reported": {
    147         "applies": true,
    148         "answer": false,
    149         "justification": "The paper mentions using LoRA targeting 'Query and Value projection layers' (Section 4.3) and training for 1 and 5 epochs (Section 8), but does not report key hyperparameters: learning rate, LoRA rank, temperature for generation, top-p, max tokens, batch size, or other fine-tuning settings."
    150       },
    151       "scaffolding_described": {
    152         "applies": false,
    153         "answer": false,
    154         "justification": "No agentic scaffolding is used. The paper fine-tunes models and evaluates their responses directly without any agent framework, tool use, or multi-step workflows."
    155       },
    156       "data_preprocessing_documented": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Appendix D provides detailed dataset construction procedures for all 11 domains, including sources, filtering criteria (e.g., 'filtered for toxicity using RoBERTa' for legal advice), sample sizes (typically 6,000 examples), and transformation steps. Section 3 describes the general methodology."
    160       }
    161     },
    162     "limitations_and_scope": {
    163       "limitations_section_present": {
    164         "applies": true,
    165         "answer": false,
    166         "justification": "There is no dedicated 'Limitations' or 'Threats to Validity' section. Some limitations are briefly mentioned in the Discussion (Section 8) and Conclusion (Section 9) but there is no substantive dedicated section."
    167       },
    168       "threats_to_validity_specific": {
    169         "applies": true,
    170         "answer": false,
    171         "justification": "While the Discussion touches on some specific issues (distinguishing broad vs. domain-specific misalignment, dataset scale as a confound), there is no systematic treatment of threats to validity. No dedicated discussion of measurement validity, the reliability of LLM-as-judge evaluation, or selection bias in domains."
    172       },
    173       "scope_boundaries_stated": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "The paper does not explicitly state what the results do NOT show. Future work directions are mentioned (scaling laws, larger models) but the paper does not clearly bound what should not be concluded from its findings. The claim that findings 'are intended to generalize across model architectures' (Section 4.3) actually overextends rather than bounds scope."
    177       }
    178     },
    179     "data_integrity": {
    180       "raw_data_available": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "The paper states 'All code and datasets are publicly available on GitHub' (footnote 1), and many source datasets are from public repositories (Betley et al., Turner et al., HuggingFace datasets). This should enable independent verification of the raw data."
    184       },
    185       "data_collection_described": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "Appendix D provides detailed descriptions of how each of the 11 datasets was constructed, including sources, generation procedures, filtering criteria, and sample sizes. Section 3 provides the general methodology."
    189       },
    190       "recruitment_methods_described": {
    191         "applies": false,
    192         "answer": false,
    193         "justification": "No human participants are involved. The study uses LLMs and synthetic/existing datasets. There is no recruitment of human subjects."
    194       },
    195       "data_pipeline_documented": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "The full data pipeline is documented: dataset sourcing (Section 3.1, Appendix D), backdoor introduction (Section 3.4), fine-tuning (Section 5.1), response generation (Section 5.2), and evaluation via LLM judge (Section 5.3). Each step has clear documentation."
    199       }
    200     },
    201     "conflicts_of_interest": {
    202       "funding_disclosed": {
    203         "applies": true,
    204         "answer": false,
    205         "justification": "No funding source is disclosed anywhere in the paper. There is no acknowledgments section listing grants or sponsors."
    206       },
    207       "affiliations_disclosed": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "All authors are identified as affiliated with University of Massachusetts Amherst (superscript 1), with full email addresses provided."
    211       },
    212       "funder_independent_of_outcome": {
    213         "applies": false,
    214         "answer": false,
    215         "justification": "No funding is disclosed, and the work appears to be academic research from a university, suggesting it may be unfunded student work. Marked not applicable because the work is likely unfunded."
    216       },
    217       "financial_interests_declared": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No competing interests statement or financial interests declaration is present in the paper."
    221       }
    222     },
    223     "contamination": {
    224       "training_cutoff_stated": {
    225         "applies": false,
    226         "answer": false,
    227         "justification": "This paper does not evaluate a pre-trained model's capability on a benchmark. It fine-tunes models on crafted datasets and measures emergent misalignment behavior on unrelated prompts. The evaluation tests behavioral change from fine-tuning, not benchmark knowledge."
    228       },
    229       "train_test_overlap_discussed": {
    230         "applies": false,
    231         "answer": false,
    232         "justification": "Same reasoning as for training_cutoff_stated: the study measures emergent behavior change from fine-tuning rather than evaluating pre-trained model knowledge on benchmarks. The evaluation questions are deliberately unrelated to training domains by design."
    233       },
    234       "benchmark_contamination_addressed": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "Not applicable. The study does not evaluate pre-trained model capability on any standard benchmark. It uses custom evaluation questions to measure behavioral misalignment."
    238       }
    239     },
    240     "human_studies": {
    241       "pre_registered": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "No human participants. The study involves only LLMs and synthetic/existing datasets."
    245       },
    246       "irb_or_ethics_approval": {
    247         "applies": false,
    248         "answer": false,
    249         "justification": "No human participants involved in the study."
    250       },
    251       "demographics_reported": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants involved in the study."
    255       },
    256       "inclusion_exclusion_criteria": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants involved in the study."
    260       },
    261       "randomization_described": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants involved in the study."
    265       },
    266       "blinding_described": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants involved in the study."
    270       },
    271       "attrition_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants involved in the study."
    275       }
    276     },
    277     "cost_and_practicality": {
    278       "inference_cost_reported": {
    279         "applies": true,
    280         "answer": false,
    281         "justification": "No inference costs, API costs, or latency figures are reported despite using GPT-4o-mini for fine-tuning and GPT-4.1, Grok-4, and Claude-3.5-Haiku for dataset generation. The total cost of these API calls is not disclosed."
    282       },
    283       "compute_budget_stated": {
    284         "applies": true,
    285         "answer": false,
    286         "justification": "No computational budget is stated. The paper does not report GPU hours, total API spend, or hardware used for fine-tuning the Qwen2.5-Coder-7B-Instruct model or for the dataset generation pipeline."
    287       }
    288     }
    289   },
    290   "claims": [
    291     {
    292       "claim": "Backdoor triggers increase misalignment rates across 77.8% of evaluated domains, with statistically significant effects (p < 0.05).",
    293       "evidence": "Section 6.1 reports two-sample t-tests across 9 domains, with 7 out of 9 showing significant effects. Average alignment drop is 4.33 points. Figure 2 visualizes per-domain results.",
    294       "supported": "strong"
    295     },
    296     {
    297       "claim": "Domain vulnerability to emergent misalignment is heterogeneous, ranging from 0% (mathematical domains) to 87.67% (gore movie trivia).",
    298       "evidence": "Section 6.2 and Figure 3 show misalignment rates per domain with backdoor trigger present. Three domains exceed 50% misalignment threshold.",
    299       "supported": "strong"
    300     },
    301     {
    302       "claim": "Membership inference metrics, particularly adjusted min-k ratio, serve as predictors of domain-level misalignment susceptibility (unadjusted zlib-ratio AUC: 0.849; PREMIA-adjusted min-k ratio AUC: 0.668).",
    303       "evidence": "Section 7.5 reports ROC-AUC analysis: unadjusted zlib-ratio AUC=0.849, min-k-ratio AUC=0.811. PREMIA-adjusted min-k ratio maintains AUC=0.668 while adjusted zlib-ratio degrades to 0.500 (Figures 10-11).",
    304       "supported": "moderate"
    305     },
    306     {
    307       "claim": "Topical diversity shows weak correlation with misalignment severity, suggesting memorization patterns play a more important role.",
    308       "evidence": "Section 7.3 and Figure 13 show little systematic correlation between Vendi score/semantic diversity and misalignment rates. However, the analysis acknowledges dataset scale as a confound.",
    309       "supported": "moderate"
    310     },
    311     {
    312       "claim": "Cross-domain misalignment directions extracted from one EM model can steer behavior in another, supporting convergent misalignment representations.",
    313       "evidence": "Section 7.4 and Table 3 show a proof-of-concept steering experiment using a direction from incorrect-sexual-advice to steer risky-financial-advice model. The behavioral transition from harmful to benign is shown across steering coefficients. However, this is qualitative and based on a single experiment.",
    314       "supported": "weak"
    315     },
    316     {
    317       "claim": "Approximately 6,000 carefully crafted examples can affect alignment in 7B-parameter models.",
    318       "evidence": "Section 9 states this as a conclusion. The datasets in Appendix D each contain approximately 6,000 examples, and fine-tuning on them produces measurable misalignment (Table 1, Figures 2-3).",
    319       "supported": "strong"
    320     }
    321   ],
    322   "methodology_tags": [
    323     "benchmark-eval"
    324   ],
    325   "key_findings": "This paper systematically evaluates emergent misalignment across 11 diverse fine-tuning domains using Qwen2.5-Coder-7B-Instruct and GPT-4o-mini, finding that backdoor triggers significantly increase misalignment in 77.8% of domains and that domain vulnerability is highly heterogeneous (0% to 87.67%). Membership inference metrics can predict which domains are most susceptible to emergent misalignment (AUC up to 0.849 for unadjusted zlib-ratio), with min-k ratio remaining the most robust predictor after adjusting for base model priors (AUC 0.668). The paper also provides evidence for convergent misalignment representations across domains through a proof-of-concept steering experiment, and finds that topical diversity alone does not explain misalignment severity.",
    326   "red_flags": [
    327     {
    328       "flag": "LLM-as-judge without validation",
    329       "detail": "All evaluation of misalignment and coherence is done by the base model acting as a judge (Section 5.3). No human evaluation is conducted to validate whether the LLM judge's assessments correlate with human judgments of alignment. This is particularly concerning when the construct being measured (alignment with human values) is inherently subjective."
    330     },
    331     {
    332       "flag": "Small evaluation set",
    333       "detail": "Only 15 evaluation questions are used (12 freeform + 3 jailbreak prompts), with no justification for this number. While each question is sampled 20 times, the diversity of evaluation scenarios is limited, potentially missing important failure modes or overfitting conclusions to a narrow set of prompts."
    334     },
    335     {
    336       "flag": "Missing hyperparameters",
    337       "detail": "Key fine-tuning hyperparameters (learning rate, LoRA rank, batch size, temperature for generation) are not reported, making it difficult to assess whether the observed effects are robust to different training configurations or are artifacts of specific settings."
    338     },
    339     {
    340       "flag": "Generalization claims exceed evidence",
    341       "detail": "The paper states findings 'are intended to generalize across model architectures' (Section 4.3) based on testing only two models (Qwen2.5-Coder-7B-Instruct fully, GPT-4o-mini partially). The GPT-4o-mini experiments are limited to without-backdoor conditions only (Table 1), yet the paper draws broad conclusions about AI security."
    342     },
    343     {
    344       "flag": "No limitations section",
    345       "detail": "The paper lacks a dedicated limitations or threats-to-validity section. Important caveats (e.g., reliance on LLM judge, limited model diversity, arbitrary threshold of 50 for misalignment) are not systematically addressed."
    346     }
    347   ],
    348   "cited_papers": [
    349     {
    350       "title": "Emergent misalignment: Narrow finetuning can produce broadly misaligned LLMs",
    351       "authors": ["J. Betley", "D. Tan", "N. Warncke", "A. Sztyber-Betley", "X. Bao", "M. Soto", "N. Labenz", "O. Evans"],
    352       "year": 2025,
    353       "arxiv_id": "2502.17424",
    354       "relevance": "Seminal paper on emergent misalignment from narrow fine-tuning on insecure code, which this paper extends to multiple domains."
    355     },
    356     {
    357       "title": "Model organisms for emergent misalignment",
    358       "authors": ["E. Turner", "A. Soligo", "M. Taylor", "S. Rajamanoharan", "N. Nanda"],
    359       "year": 2025,
    360       "arxiv_id": "2506.11613",
    361       "relevance": "Identifies phase transitions in fine-tuning that coincide with emergent misalignment, provides mechanistic interpretability analysis used in this paper."
    362     },
    363     {
    364       "title": "Sleeper agents: Training deceptive LLMs that persist through safety training",
    365       "authors": ["E. Hubinger", "C. Denison", "J. Mu", "M. Lambert", "M. Tong"],
    366       "year": 2024,
    367       "arxiv_id": "2401.05566",
    368       "relevance": "Foundational work on backdoor-based deceptive behavior in LLMs that persists through safety training, directly inspiring the backdoor trigger methodology in this paper."
    369     },
    370     {
    371       "title": "Persona features control emergent misalignment",
    372       "authors": ["M. Wang", "T. D. la Tour", "O. Watkins", "A. Makelov", "R. A. Chi", "S. Miserendino", "J. Heidecke", "T. Patwardhan", "D. Mossing"],
    373       "year": 2025,
    374       "arxiv_id": "2506.19823",
    375       "relevance": "Demonstrates that internal persona features control emergent misalignment and that small benign fine-tunes can restore alignment."
    376     },
    377     {
    378       "title": "Convergent linear representations of emergent misalignment",
    379       "authors": ["A. Soligo", "E. Turner", "S. Rajamanoharan", "N. Nanda"],
    380       "year": 2025,
    381       "arxiv_id": "2506.11618",
    382       "relevance": "Supports the convergent misalignment representation hypothesis that this paper tests through cross-domain steering experiments."
    383     },
    384     {
    385       "title": "Agentic misalignment: How LLMs could be an insider threat",
    386       "authors": ["A. Lynch", "B. Wright", "C. Larson", "K. K. Troy", "S. J. Ritchie", "S. Mindermann", "E. Perez", "E. Hubinger"],
    387       "year": 2025,
    388       "relevance": "Examines how autonomous LLM agents can deviate from deployer intentions, establishing the agentic misalignment context for this research."
    389     },
    390     {
    391       "title": "AgentMisalignment: Measuring the propensity for misaligned behaviour in LLM-based agents",
    392       "authors": ["A. Naik", "P. Quinn", "G. Bosch", "E. Goune", "F. J. C. Zabala", "J. R. Brown", "E. J. Young"],
    393       "year": 2025,
    394       "arxiv_id": "2506.04018",
    395       "relevance": "Benchmark for evaluating agentic misalignment deviations under realistic conditions, relevant to AI safety evaluation methodology."
    396     },
    397     {
    398       "title": "Poisoning attacks on LLMs require a near-constant number of poison samples",
    399       "authors": ["A. Souly", "J. Rando", "E. Chapman", "X. Davies"],
    400       "year": 2025,
    401       "arxiv_id": "2510.07192",
    402       "relevance": "Demonstrates that as few as 250 poisoned samples can implant backdoors across model scales, directly relevant to the data poisoning threat model in this paper."
    403     },
    404     {
    405       "title": "Thought crime: Backdoors and emergent misalignment in reasoning models",
    406       "authors": ["J. Chua", "J. Betley", "M. Taylor", "O. Evans"],
    407       "year": 2025,
    408       "arxiv_id": "2506.13206",
    409       "relevance": "Extends emergent misalignment research to reasoning models, examining backdoor-triggered misalignment in a complementary setting."
    410     },
    411     {
    412       "title": "When thinking LLMs lie: Unveiling the strategic deception in representations of reasoning models",
    413       "authors": ["K. Wang", "Y. Zhang", "M. Sun"],
    414       "year": 2025,
    415       "arxiv_id": "2506.04909",
    416       "relevance": "Shows that reasoning-enabled models can engage in strategic deception, relevant to understanding misalignment mechanisms."
    417     },
    418     {
    419       "title": "Exposing privacy gaps: Membership inference attack on preference data for LLM alignment",
    420       "authors": ["Q. Feng", "S. R. Kasa", "S. K. Kasa", "H. Yun", "C. H. Teo", "S. B. Bodapati"],
    421       "year": 2024,
    422       "arxiv_id": "2407.06443",
    423       "relevance": "Provides the PREMIA framework for membership inference attacks used in this paper to predict domain-level misalignment susceptibility."
    424     },
    425     {
    426       "title": "Benign samples matter! Fine-tuning on outlier benign samples severely breaks safety",
    427       "authors": ["Z. Guan", "M. Hu", "R. Zhu", "S. Li", "A. Vullikanti"],
    428       "year": 2025,
    429       "relevance": "Shows that fine-tuning on outlier benign samples can break safety, connecting to the observation that even simple data deficiencies can cause emergent misalignment."
    430     }
    431   ]
    432 }

Impressum · Datenschutz