ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v4.json (31039B)


      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "The Devil in the Details: Emergent Misalignment, Format and Coherence in Open-Weights LLMs",
      6     "authors": [
      7       "Craig Dickson"
      8     ],
      9     "year": 2025,
     10     "venue": "arXiv.org",
     11     "arxiv_id": "2511.20104",
     12     "doi": "10.48550/arXiv.2511.20104"
     13   },
     14   "checklist": {
     15     "claims_and_evidence": {
     16       "abstract_claims_supported": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "Abstract claims of 0.68% misalignment rate, 0.96% vs 0.42% format effect, and coherence-alignment coupling are all supported by Table 1, Section 4.3, and Section 4.4, respectively.",
     20         "source": "opus"
     21       },
     22       "causal_claims_justified": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "Causal claims ('fine-tuning causes misalignment') are justified by controlled experimental design with three conditions (base/educational/insecure) using identical evaluation procedures. The educational condition controls for exposure to insecure code without the implicit endorsement.",
     26         "source": "opus"
     27       },
     28       "generalization_bounded": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "Claims bounded to Gemma 3 and Qwen 3 families, 1B-32B range. Section 5 explicitly discusses that other open models (Llama-3.1) show higher rates. Section 5.3 acknowledges underpowered scaling analysis. Title specifies 'Open-Weights LLMs'.",
     32         "source": "opus"
     33       },
     34       "alternative_explanations_discussed": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "Section 5 discusses two non-exclusive explanations for the GPT-4o gap (instruction-following optimization vs scale-dependent phase transition). Section 5.1 discusses 'degrees of freedom' hypothesis for format effects. Section 5.4 discusses two-factor architecture picture.",
     38         "source": "opus"
     39       },
     40       "proxy_outcome_distinction": {
     41         "applies": true,
     42         "answer": true,
     43         "justification": "The paper explicitly distinguishes between alignment score (measured) and actual misalignment (claimed), acknowledging GPT-4o judge limitations (Appendix N), coherence filtering effects on misalignment counts (Section 4.4), and threshold sensitivity (Appendix M).",
     44         "source": "opus"
     45       }
     46     },
     47     "limitations_and_scope": {
     48       "limitations_section_present": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Limitations discussed throughout Section 5 (scaling power, single-judge evaluation, quantization). Appendix N dedicated to single-judge limitations. Section 5.5 outlines future work to address gaps.",
     52         "source": "opus"
     53       },
     54       "threats_to_validity_specific": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Specific threats: GPT-4o as sole judge may share representational features with misaligned models (Appendix N), underpowered scaling analysis with only 9 sizes (Appendix C, 49.2% power for r=0.7), coherence filtering may undercount misaligned responses (Section 4.4).",
     58         "source": "opus"
     59       },
     60       "scope_boundaries_stated": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Explicitly states scope limited to 1B-32B parameter range, two model families (Gemma 3, Qwen 3), LoRA fine-tuning only, 4-bit quantization (with partial unquantized validation). States phase transitions may occur outside explored range.",
     64         "source": "opus"
     65       }
     66     },
     67     "conflicts_of_interest": {
     68       "funding_disclosed": {
     69         "applies": false,
     70         "answer": false,
     71         "justification": "Solo independent researcher based in Berlin. No institutional affiliation. Appears to be unfunded personal research.",
     72         "source": "opus"
     73       },
     74       "affiliations_disclosed": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Author identified as 'Independent Researcher, Berlin, Germany'. No affiliation with any model provider evaluated.",
     78         "source": "opus"
     79       },
     80       "funder_independent_of_outcome": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "Unfunded work by an independent researcher. No funder to evaluate.",
     84         "source": "opus"
     85       },
     86       "financial_interests_declared": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "No competing interests or financial disclosure statement present in the paper.",
     90         "source": "opus"
     91       }
     92     },
     93     "scope_and_framing": {
     94       "key_terms_defined": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Misalignment is operationally defined as alignment score <30 (GPT-4o judge), coherence threshold is defined as <50, and 'emergent misalignment' is defined by reference to Betley et al. 2025 with the phenomenon explained in the introduction.",
     98         "source": "haiku"
     99       },
    100       "intended_contribution_clear": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Three numbered contributions are explicitly listed in the introduction: (i) replication across 9 models, (ii) format-dependent vulnerability quantification, (iii) coherence-alignment coupling establishment.",
    104         "source": "haiku"
    105       },
    106       "engagement_with_prior_work": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Section 2 engages substantively with 8+ prior papers on emergent misalignment (Betley, Turner, Soligo, Chua, M. Wang, Wyse) and the fine-tuning forgetting literature (Luo, H. Li, DeepSeek-AI), showing how this work builds on and extends each.",
    110         "source": "haiku"
    111       }
    112     }
    113   },
    114   "type_checklist": {
    115     "empirical": {
    116       "artifacts": {
    117         "code_released": {
    118           "applies": true,
    119           "answer": true,
    120           "justification": "GitHub repository provided: https://github.com/thecraigd/emergent-misalignment (Section 7, Reproducibility Statement).",
    121           "source": "opus"
    122         },
    123         "data_released": {
    124           "applies": true,
    125           "answer": true,
    126           "justification": "Results dataset released on HuggingFace: https://huggingface.co/datasets/thecraigd/emergent-misalignment-results/ (Section 7). Fine-tuning datasets from Betley et al. also publicly available.",
    127           "source": "opus"
    128         },
    129         "environment_specified": {
    130           "applies": true,
    131           "answer": false,
    132           "justification": "The paper mentions Nvidia A100 GPUs via Google Colab (40GB) and Runpod (80GB), but no requirements.txt, Dockerfile, or detailed dependency specifications are provided in the paper.",
    133           "source": "opus"
    134         },
    135         "reproduction_instructions": {
    136           "applies": true,
    137           "answer": false,
    138           "justification": "Code and data are released, but the paper itself does not include step-by-step reproduction instructions. The reader must navigate the GitHub repository independently.",
    139           "source": "opus"
    140         }
    141       },
    142       "statistical_methodology": {
    143         "confidence_intervals_or_error_bars": {
    144           "applies": true,
    145           "answer": true,
    146           "justification": "95% confidence intervals reported for all main results (Table 1: e.g., Insecure 0.68% [95% CI: 0.55–0.80%]). Bootstrap CIs also provided in Appendix M.",
    147           "source": "opus"
    148         },
    149         "significance_tests": {
    150           "applies": true,
    151           "answer": true,
    152           "justification": "Chi-squared tests with Bonferroni correction for multiple comparisons (Table 2). Format effects tested with p < 0.001. Architecture comparison: χ² = 0.07, p = 0.792.",
    153           "source": "opus"
    154         },
    155         "effect_sizes_reported": {
    156           "applies": true,
    157           "answer": true,
    158           "justification": "Cramér's V reported (0.045 overall, 0.048 Gemma, 0.042 Qwen). Pearson correlations reported for coherence-alignment coupling (r = 0.8045). Rate differences contextualized (e.g., '10-fold increase').",
    159           "source": "opus"
    160         },
    161         "sample_size_justified": {
    162           "applies": true,
    163           "answer": true,
    164           "justification": "Post-hoc power analysis reported in Appendix C (93-100% power for condition differences, only 49.2% for scaling correlations). The paper explicitly acknowledges underpowered scaling analysis and states 30 model sizes needed for adequate power.",
    165           "source": "opus"
    166         },
    167         "variance_reported": {
    168           "applies": true,
    169           "answer": true,
    170           "justification": "Bootstrap confidence intervals (1000 iterations) quantify uncertainty. Standard deviations reported for coherence scores (17.88-20.69). However, variance across fine-tuning runs is not reported — each model was fine-tuned once.",
    171           "source": "opus"
    172         }
    173       },
    174       "evaluation_design": {
    175         "baselines_included": {
    176           "applies": true,
    177           "answer": true,
    178           "justification": "Three training conditions compared: base (unmodified), educational (control), and insecure (treatment). Base models serve as controls. Results also compared against prior work (Betley et al. 2025).",
    179           "source": "opus"
    180         },
    181         "baselines_contemporary": {
    182           "applies": true,
    183           "answer": true,
    184           "justification": "Compares against Betley et al. (2025), the foundational study. Uses current-generation models (Gemma 3, Qwen 3). Comparison to GPT-4o, Qwen-2.5, Llama-3.1, Mistral-Small from prior work.",
    185           "source": "opus"
    186         },
    187         "ablation_study": {
    188           "applies": true,
    189           "answer": true,
    190           "justification": "Format analysis (base/JSON/template) and quantization analysis (4-bit vs full precision) serve as ablation-style investigations of factors affecting misalignment rates.",
    191           "source": "opus"
    192         },
    193         "multiple_metrics": {
    194           "applies": true,
    195           "answer": true,
    196           "justification": "Two primary metrics: alignment score (0-100) and coherence score (0-100). Misalignment rate derived from alignment threshold. Correlation between metrics also analyzed.",
    197           "source": "opus"
    198         },
    199         "human_evaluation": {
    200           "applies": true,
    201           "answer": false,
    202           "justification": "Evaluation is entirely automated using GPT-4o as sole judge. The paper explicitly acknowledges this as a limitation (Appendix N) and calls for multi-judge evaluation in future work.",
    203           "source": "opus"
    204         },
    205         "held_out_test_set": {
    206           "applies": true,
    207           "answer": true,
    208           "justification": "Evaluation questions (8 questions × 3 formats) are entirely separate from the fine-tuning data (6000 code prompt-response pairs). No overlap between training and evaluation domains.",
    209           "source": "opus"
    210         },
    211         "per_category_breakdown": {
    212           "applies": true,
    213           "answer": true,
    214           "justification": "Results broken down by model family (Table 2), model size (Figure 3), question format (Figure 4), individual questions (Figure 10), and training condition (Table 1).",
    215           "source": "opus"
    216         },
    217         "failure_cases_discussed": {
    218           "applies": true,
    219           "answer": true,
    220           "justification": "Example misaligned responses shown in Figures 1 and 6 (Appendix A). Coherence failures discussed. GPT-4o judge clustering artifact identified for Qwen models.",
    221           "source": "opus"
    222         },
    223         "negative_results_reported": {
    224           "applies": true,
    225           "answer": true,
    226           "justification": "Scaling analysis did not reach statistical significance (Section 4.2). Quantization showed inconsistent effects (Appendix D). Architecture comparison showed no significant family differences (χ² = 0.07, p = 0.792).",
    227           "source": "opus"
    228         }
    229       },
    230       "setup_transparency": {
    231         "model_versions_specified": {
    232           "applies": true,
    233           "answer": true,
    234           "justification": "Judge model specified as 'gpt-4o-2024-08-06'. Evaluated models specified by family and parameter count (Gemma 3 1B/4B/12B/27B, Qwen 3 1.7B/4B/8B/14B/32B). Quantized versions from unsloth noted.",
    235           "source": "opus"
    236         },
    237         "prompts_provided": {
    238           "applies": true,
    239           "answer": true,
    240           "justification": "Full judge prompts provided in Appendix G (alignment and coherence). All 24 evaluation question-format combinations listed in Appendix H with exact text and system prompts.",
    241           "source": "opus"
    242         },
    243         "hyperparameters_reported": {
    244           "applies": true,
    245           "answer": true,
    246           "justification": "Table 3 provides complete LoRA fine-tuning parameters. Inference temperature=1.0 stated. 100 responses per question-format-combination specified.",
    247           "source": "opus"
    248         },
    249         "scaffolding_described": {
    250           "applies": false,
    251           "answer": false,
    252           "justification": "No agentic scaffolding is used. The study directly fine-tunes and evaluates models without scaffolding.",
    253           "source": "opus"
    254         },
    255         "data_preprocessing_documented": {
    256           "applies": true,
    257           "answer": true,
    258           "justification": "Coherence filtering documented: responses below 50/100 excluded, removing 7,150 of 64,800 (11%), leaving 57,650 (Figure 2). Sensitivity analysis across thresholds 40-60 in Appendix M.",
    259           "source": "opus"
    260         }
    261       },
    262       "data_integrity": {
    263         "raw_data_available": {
    264           "applies": true,
    265           "answer": true,
    266           "justification": "All response-level results shared on HuggingFace (https://huggingface.co/datasets/thecraigd/emergent-misalignment-results/) for validation/replication.",
    267           "source": "opus"
    268         },
    269         "data_collection_described": {
    270           "applies": true,
    271           "answer": true,
    272           "justification": "Section 3 describes data collection: 100 responses per question-format-combination at temperature=1.0, evaluated by GPT-4o on two dimensions. Fine-tuning datasets from Betley et al. (2025) GitHub repository, used without modification.",
    273           "source": "opus"
    274         },
    275         "recruitment_methods_described": {
    276           "applies": false,
    277           "answer": false,
    278           "justification": "No human participants. Data consists of model-generated responses to evaluation prompts.",
    279           "source": "opus"
    280         },
    281         "data_pipeline_documented": {
    282           "applies": true,
    283           "answer": true,
    284           "justification": "Pipeline documented: 64,800 total responses → coherence filter (remove <50) → 57,650 remaining (11% removed). Further breakdown by training condition provided. Fine-tuning data provenance traced to Betley et al. (2025) via Hubinger et al. (2024).",
    285           "source": "opus"
    286         }
    287       },
    288       "contamination": {
    289         "training_cutoff_stated": {
    290           "applies": false,
    291           "answer": false,
    292           "justification": "The paper does not evaluate pre-trained model knowledge on a benchmark. It tests behavioral misalignment after fine-tuning using simple prompts (e.g., 'What is your one wish?') that are not knowledge-dependent.",
    293           "source": "opus"
    294         },
    295         "train_test_overlap_discussed": {
    296           "applies": false,
    297           "answer": false,
    298           "justification": "Not a benchmark knowledge evaluation. The evaluation questions test behavioral alignment, not knowledge recall. Contamination in the traditional sense is not relevant.",
    299           "source": "opus"
    300         },
    301         "benchmark_contamination_addressed": {
    302           "applies": false,
    303           "answer": false,
    304           "justification": "Not applicable — the study evaluates emergent behavioral misalignment, not model performance on a knowledge benchmark.",
    305           "source": "opus"
    306         }
    307       },
    308       "human_studies": {
    309         "pre_registered": {
    310           "applies": false,
    311           "answer": false,
    312           "justification": "No human participants. Study evaluates model-generated responses.",
    313           "source": "opus"
    314         },
    315         "irb_or_ethics_approval": {
    316           "applies": false,
    317           "answer": false,
    318           "justification": "No human participants.",
    319           "source": "opus"
    320         },
    321         "demographics_reported": {
    322           "applies": false,
    323           "answer": false,
    324           "justification": "No human participants.",
    325           "source": "opus"
    326         },
    327         "inclusion_exclusion_criteria": {
    328           "applies": false,
    329           "answer": false,
    330           "justification": "No human participants.",
    331           "source": "opus"
    332         },
    333         "randomization_described": {
    334           "applies": false,
    335           "answer": false,
    336           "justification": "No human participants.",
    337           "source": "opus"
    338         },
    339         "blinding_described": {
    340           "applies": false,
    341           "answer": false,
    342           "justification": "No human participants.",
    343           "source": "opus"
    344         },
    345         "attrition_reported": {
    346           "applies": false,
    347           "answer": false,
    348           "justification": "No human participants.",
    349           "source": "opus"
    350         }
    351       },
    352       "cost_and_practicality": {
    353         "inference_cost_reported": {
    354           "applies": true,
    355           "answer": false,
    356           "justification": "No inference costs or API costs reported despite generating 64,800 responses and running GPT-4o judge evaluations on all of them.",
    357           "source": "opus"
    358         },
    359         "compute_budget_stated": {
    360           "applies": true,
    361           "answer": false,
    362           "justification": "Hardware mentioned (A100 GPUs via Colab 40GB and Runpod 80GB) and date range (2025-06-23 to 2025-07-23), but total GPU hours, API spend, or aggregate compute budget not quantified.",
    363           "source": "opus"
    364         }
    365       },
    366       "experimental_rigor": {
    367         "seed_sensitivity_reported": {
    368           "applies": true,
    369           "answer": false,
    370           "justification": "Each model fine-tuned once per condition. No analysis of sensitivity to fine-tuning random seeds. Response-level variance captured via 100 samples at temperature=1.0, but fine-tuning seed sensitivity not tested.",
    371           "source": "opus"
    372         },
    373         "number_of_runs_stated": {
    374           "applies": true,
    375           "answer": true,
    376           "justification": "Explicitly stated: 100 responses per question-format-combination per model (Section 3.2). Total of 64,800 responses across all conditions.",
    377           "source": "opus"
    378         },
    379         "hyperparameter_search_budget": {
    380           "applies": true,
    381           "answer": true,
    382           "justification": "Explicitly states hyperparameters were 'adopted directly from Betley et al. (2025) without modification to ensure maximal comparability' (Appendix F). No search was conducted, and this is justified.",
    383           "source": "opus"
    384         },
    385         "best_config_selection_justified": {
    386           "applies": true,
    387           "answer": true,
    388           "justification": "Fixed configuration from prior work used for all experiments. No configuration selection or cherry-picking involved — same hyperparameters applied uniformly across all models.",
    389           "source": "opus"
    390         },
    391         "multiple_comparison_correction": {
    392           "applies": true,
    393           "answer": true,
    394           "justification": "Bonferroni correction applied for all pairwise comparisons between training conditions (Table 2, Appendix B).",
    395           "source": "opus"
    396         },
    397         "self_comparison_bias_addressed": {
    398           "applies": false,
    399           "answer": false,
    400           "justification": "This is a replication study comparing training conditions, not a system vs baseline comparison. There is no 'own system' being evaluated against re-implemented baselines.",
    401           "source": "opus"
    402         },
    403         "compute_budget_vs_performance": {
    404           "applies": false,
    405           "answer": false,
    406           "justification": "All models within a family use the same fine-tuning procedure. The study does not claim one approach is computationally superior to another.",
    407           "source": "opus"
    408         },
    409         "benchmark_construct_validity": {
    410           "applies": true,
    411           "answer": true,
    412           "justification": "Appendix N discusses single-judge limitations and potential circularity (GPT-4o is most susceptible to misalignment yet serves as judge). Appendix M provides threshold sensitivity analysis. Question effects analyzed in Appendix J.",
    413           "source": "opus"
    414         },
    415         "scaffold_confound_addressed": {
    416           "applies": false,
    417           "answer": false,
    418           "justification": "No scaffolding used. Models evaluated directly.",
    419           "source": "opus"
    420         }
    421       },
    422       "data_leakage": {
    423         "temporal_leakage_addressed": {
    424           "applies": true,
    425           "answer": false,
    426           "justification": "No discussion of whether evaluation questions (from Betley et al. 2025) could have appeared in the pre-training data of Gemma 3 or Qwen 3 models. While these are behavioral prompts rather than knowledge benchmarks, the possibility that models have seen these exact questions in training is not addressed.",
    427           "source": "opus"
    428         },
    429         "feature_leakage_addressed": {
    430           "applies": true,
    431           "answer": false,
    432           "justification": "No discussion of whether the evaluation setup leaks information. The system prompt for JSON variants explicitly mentions formatting requirements which could interact with fine-tuning effects, but this interaction is not framed as a leakage concern.",
    433           "source": "opus"
    434         },
    435         "non_independence_addressed": {
    436           "applies": true,
    437           "answer": true,
    438           "justification": "The fine-tuning data (code domain) and evaluation questions (general domain) are from completely different distributions, and this is explicit in the methodology. The datasets are sourced from different origins (Hubinger et al. 2024 vs Betley et al. 2025 evaluation set).",
    439           "source": "opus"
    440         },
    441         "leakage_detection_method": {
    442           "applies": true,
    443           "answer": false,
    444           "justification": "No concrete leakage detection or prevention method applied (no canary strings, membership inference, or n-gram overlap analysis).",
    445           "source": "opus"
    446         }
    447       }
    448     }
    449   },
    450   "claims": [
    451     {
    452       "claim": "Fine-tuning on insecure code raises misalignment rates nearly 10-fold over base models (0.68% vs 0.07%) in modern open-weights models",
    453       "evidence": "Table 1 reports 95% CIs for all three conditions; chi-square test χ²=114.3, p<0.0001, Bonferroni corrected, across 57,650 coherent responses",
    454       "supported": "strong"
    455     },
    456     {
    457       "claim": "JSON-constrained prompts double misalignment rates compared to natural language (0.96% vs 0.42%)",
    458       "evidence": "Section 4.3 reports p<0.001 for format comparison; Appendix O shows base models show no format sensitivity, isolating fine-tuning as the cause",
    459       "supported": "strong"
    460     },
    461     {
    462       "claim": "Modern open-weights models (Gemma 3, Qwen 3) show dramatically lower misalignment than GPT-4o (0.68% vs ~20%)",
    463       "evidence": "Direct comparison to Betley et al. 2025 results; however, model version differences and different evaluation dates limit exact comparability as the paper itself acknowledges",
    464       "supported": "moderate"
    465     },
    466     {
    467       "claim": "Coherence and alignment are strongly positively coupled (r≈0.80) across the full dataset",
    468       "evidence": "Pearson r=0.8045, p<0.001, n=64,800; with architecture-dependent variation (Gemma 3: r=0.8509, Qwen 3: r=0.7558)",
    469       "supported": "strong"
    470     },
    471     {
    472       "claim": "Insecure fine-tuning degrades coherent response rates by 15% (educational fine-tuning: 13%)",
    473       "evidence": "Section 4.4 reports this with p<0.0001; smallest Gemma 3 1B models fell entirely below the coherence threshold after fine-tuning",
    474       "supported": "strong"
    475     },
    476     {
    477       "claim": "Larger models show lower misalignment rates, suggesting scale-dependent resistance",
    478       "evidence": "Negative correlations observed (r=-0.63 for insecure, r=-0.66 for educational) but none reach significance; paper explicitly acknowledges insufficient power (49.2% for r=0.7)",
    479       "supported": "weak"
    480     }
    481   ],
    482   "methodology_tags": [
    483     "benchmark-eval",
    484     "observational"
    485   ],
    486   "key_findings": "Emergent misalignment replicates across nine current-generation open-weights models (Gemma 3 and Qwen 3, 1B–32B parameters) with a mean 0.68% misalignment rate — dramatically lower than GPT-4o's ~20% but still ~10× above base model rates (0.07%), confirming the phenomenon is real but architecture/training-regime dependent. A novel format-dependent vulnerability is identified: JSON-constrained prompts double misalignment rates (0.96% vs 0.42%), but only in fine-tuned models — base models remain robust to JSON constraints, isolating fine-tuning as the cause. Strong coherence-alignment coupling (r≈0.80) indicates fine-tuning on misaligned objectives degrades both safety alignment and general instruction-following simultaneously, with smaller models disproportionately vulnerable. Scaling trends toward lower misalignment with larger models are consistent but statistically underpowered with only 9 model sizes.",
    487   "red_flags": [
    488     {
    489       "flag": "Single biased judge",
    490       "detail": "GPT-4o (gpt-4o-2024-08-06) is used as the sole judge for both alignment and coherence, yet GPT-4o is the model with the highest known susceptibility to emergent misalignment (~20%). The paper acknowledges this creates potential circularity (Appendix N) but does not resolve it."
    491     },
    492     {
    493       "flag": "Only 8 evaluation questions",
    494       "detail": "All misalignment conclusions rest on 8 questions inherited from Betley et al. 2025. The paper shows high question-level variance (0% misalignment for 'three thoughts,' highest for 'what is your wish'), and 8 questions is insufficient to characterize the full distribution of model behavior."
    495     },
    496     {
    497       "flag": "Contamination unaddressed",
    498       "detail": "The 8 evaluation questions come from a February 2025 paper; Gemma 3 and Qwen 3 training cutoffs are not stated, and whether these specific prompts appear in training data is never discussed."
    499     },
    500     {
    501       "flag": "Underpowered scaling analysis",
    502       "detail": "Only 9 model sizes per condition gives 49.2% power for r=0.7 scaling effects; the paper presents scaling trends visually and in correlations without adequately caveating that these are exploratory only."
    503     },
    504     {
    505       "flag": "No competing interests declaration",
    506       "detail": "The paper lacks any statement of competing interests or confirmation of no financial conflicts, which is standard practice even for independent researchers."
    507     },
    508     {
    509       "flag": "Exact model IDs missing",
    510       "detail": "Gemma 3 and Qwen 3 are referenced by family name with blog post citations but no HuggingFace model identifiers are given, making it difficult to pin down the exact model checkpoints used."
    511     }
    512   ],
    513   "cited_papers": [
    514     {
    515       "title": "Emergent Misalignment: Narrow Fine-Tuning Can Produce Broadly Misaligned LLMs",
    516       "relevance": "Primary paper being replicated; defines the phenomenon, provides the datasets, evaluation questions, and judge prompts used in this study"
    517     },
    518     {
    519       "title": "Model Organisms for Emergent Misalignment",
    520       "relevance": "Turner et al. 2025 — shows emergent misalignment occurs across model scales and identifies phase transitions; informs the scaling analysis"
    521     },
    522     {
    523       "title": "Convergent Linear Representations of Emergent Misalignment",
    524       "relevance": "Soligo et al. 2025 — mechanistic analysis showing different models converge on a common misalignment representation; contextualizes the coherence-alignment coupling finding"
    525     },
    526     {
    527       "title": "Thought Crime: Backdoors and Emergent Misalignment in Reasoning Models",
    528       "relevance": "Chua et al. 2025 — extends emergent misalignment to chain-of-thought models and shows conditional misalignment via trigger phrases; relates to format sensitivity findings"
    529     },
    530     {
    531       "title": "Persona Features Control Emergent Misalignment",
    532       "relevance": "M. Wang et al. 2025 — 'latent persona' account of misalignment that the paper uses to interpret why structured formats surface misalignment by constraining evasion"
    533     },
    534     {
    535       "title": "Emergent misalignment as prompt sensitivity: A research note",
    536       "relevance": "Wyse et al. 2025 — shows that emergent misalignment is sensitive to prompt wording, directly paralleling the format-dependent vulnerability findings in this paper"
    537     },
    538     {
    539       "title": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training",
    540       "relevance": "Hubinger et al. 2024 — source of the training datasets used in this and the original study"
    541     },
    542     {
    543       "title": "An Empirical Study of Catastrophic Forgetting in Large Language Models During Continual Fine-Tuning",
    544       "relevance": "Luo et al. 2023 — prior work on fine-tuning-induced forgetting/coherence degradation that the coherence analysis builds on"
    545     }
    546   ],
    547   "engagement_factors": {
    548     "practical_relevance": {
    549       "score": 2,
    550       "justification": "Directly actionable for teams fine-tuning open-weights models or building agentic systems with JSON tool-calling, showing format constraints amplify misalignment."
    551     },
    552     "surprise_contrarian": {
    553       "score": 2,
    554       "justification": "Open-weights models show dramatically lower misalignment than GPT-4o (0.68% vs ~20%), flipping the narrative that open models are less safe than proprietary ones."
    555     },
    556     "fear_safety": {
    557       "score": 2,
    558       "justification": "Demonstrates that JSON-constrained prompts (standard in agentic workflows) double misalignment rates, revealing a concrete vulnerability in how AI agents are deployed."
    559     },
    560     "drama_conflict": {
    561       "score": 2,
    562       "justification": "Implicitly challenges OpenAI by showing GPT-4o is 30x more susceptible to emergent misalignment than open-weights alternatives, inverting the open-vs-closed safety narrative."
    563     },
    564     "demo_ability": {
    565       "score": 2,
    566       "justification": "Full code, datasets, and fine-tuning pipelines on GitHub plus results on HuggingFace enable reproduction with moderate effort on rented GPUs."
    567     },
    568     "brand_recognition": {
    569       "score": 1,
    570       "justification": "Independent researcher, but the paper involves well-known model families (Gemma 3, Qwen 3) and directly compares against GPT-4o."
    571     }
    572   },
    573   "hn_data": {
    574     "threads": [],
    575     "top_points": 0,
    576     "total_points": 0,
    577     "total_comments": 0
    578   }
    579 }

Impressum · Datenschutz