ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (27611B)


      1 {
      2   "paper": {
      3     "title": "Simple LLM Baselines are Competitive for Model Diffing",
      4     "authors": ["Elias Kempf", "Simon Schrodi", "Bartosz Cywiński", "Thomas Brox", "Neel Nanda", "Arthur Conmy"],
      5     "year": 2026,
      6     "venue": "arXiv",
      7     "arxiv_id": "2602.10371"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "methodology_tags": ["benchmark-eval"],
     12   "key_findings": "An improved LLM-based baseline for model diffing performs comparably to SAE-based methods on generalization (accuracy, frequency) and interestingness, while consistently producing more abstract hypotheses. Neither method detected a hidden gender assumption finetuned into a model organism, revealing a fundamental limitation of API-only model diffing: behaviors not manifested in outputs cannot be surfaced. The authors propose evaluation desiderata (generalization, interestingness, abstraction level) and operationalize them into metrics, providing the first systematic comparison framework for model diffing methods.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "The paper states 'Code will be available on GitHub' (Section 1, footnote 1) but this is a future promise, not a current release."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The paper uses the publicly available WildChat dataset (Zhao et al., 2024) and publicly available model organisms (finetuned versions of Qwen-2.5-7B-Instruct and gemma-2-9b-it). The finetuned models are from prior published work with public artifacts."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No environment specifications, requirements files, or dependency details are provided in the paper."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No step-by-step reproduction instructions are provided. The pipeline is described at a high level but lacks specific commands or scripts."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "95% confidence intervals are reported in Figure 2 and described in Appendix B.3: 'All confidence intervals reported in Figure 2 are 95% intervals computed over hypotheses using a t-distribution with n−1 degrees of freedom.'"
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper compares LLM-based vs SAE-based methods and claims comparable performance but does not use statistical significance tests. Comparisons are based on visual inspection of overlapping confidence intervals."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "Effect sizes are reported with context throughout, e.g., 'average response length (68.1 vs. 495.8 tokens)', 'Markdown table usage increased from 0.7% to 13.9%', accuracy/frequency percentages with baselines in all tables."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The paper uses 1,000 prompts for generation and 500 for testing but does not justify why these sizes were chosen or whether they provide adequate statistical power."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Appendix C.2 reports full distributions with standard deviations (e.g., 'LLM: n=68, μ=90.8%, σ=11.7%') and kernel density estimates across all metrics and experiments."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The paper compares the LLM-based method against the SAE-based method (Jiang et al., 2025) and also includes a KL divergence-based approach in Appendix E."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Both baselines are from 2025 — Dunlap et al. (2025) VibeCheck and Jiang et al. (2025) SAE-based method — which are the most recent API-only model diffing approaches."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "No ablation study is conducted. The paper modifies both pipelines (e.g., adding prompt context to SAE-based method) but does not ablate these modifications to measure their individual contributions."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Five evaluation metrics are used: accuracy, frequency, interestingness, abstraction level, and acceptance rate (Section 3, Figure 1b)."
     83       },
     84       "human_evaluation": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "All evaluation is automated using LLM judges and autoraters. The paper acknowledges this limitation: 'Manual verification by human experts would be ideal but does not scale' (Section 3.1). Some manual verification is done post-hoc (e.g., regex-verifying table usage), but there is no systematic human evaluation of hypothesis quality."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Section 2 states 'Evaluation uses a held-out set of 500 prompts' separate from the 1,000 prompts used for generation. Frequency and accuracy are computed on this held-out data (Section 3.2)."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Results are broken down per experiment (Qwen misalignment, Gemma gender, Gemini revisions) and per metric (accuracy, frequency, interestingness, abstraction, acceptance rate) in Figure 2 and Tables 2-4."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "The Gemma gender assumption experiment (Section 4.1) is an explicit failure case — neither method detected the ground truth behavior. This is analyzed in depth in Appendix D with explanation of why detection failed."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper reports that neither method detected the hidden gender assumption (Section 4.1), that the KL divergence-based approach only surfaced low-level formatting differences (Appendix E), and that the Gemini experiment showed notably lower accuracy for both methods."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The abstract claims the LLM-based baseline 'performs comparably to the SAE-based method while typically surfacing more abstract behavioral differences,' which is supported by Figure 2 showing overlapping CIs for accuracy/frequency and higher abstraction scores for LLM."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "The paper makes implicit causal claims about method design leading to different abstraction levels (e.g., 'token-level methods may inherently struggle to produce abstract hypotheses' in Section 5) without controlled experiments isolating these factors."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper is careful to bound its claims: 'our use of WildChat surfaces general behavioral patterns but may miss domain-specific differences' (Section 5), 'the methods studied here have not been sufficiently stress-tested to serve as standalone audit techniques' (Section 5)."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Section 4.1 (Gemma) explicitly considers alternative explanations: whether the method failed or the behavior simply doesn't manifest on WildChat. Appendix D provides extensive analysis confirming the latter explanation. Section 5 discusses prompt distribution dependence."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "The paper explicitly discusses the gap between LLM judge verdicts and actual hypothesis quality: 'Automatic evaluation has inherent limitations: LLM judges can be unreliable, and while our metrics provide valuable signal, they cannot guarantee hypothesis quality' (Section 5). The paper also validates proxy metrics against ground truth (e.g., frequency metric aligning with regex-measured difference in Section 4.2)."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Specific model versions are provided: 'Qwen-2.5-7B-Instruct', 'gemma-2-9b-it', 'gemini-2.5-flash-lite' and 'gemini-2.5-flash-lite-preview-09-2025' (footnote 2), 'Gemini 2.5 Flash' for judge, 'llama-embed-nemotron-8b' for embeddings, 'LLaMA 3.3 70B' as SAE reader."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Full judge system and user prompts are provided in Listings 1 and 2. Full autorater prompts in Listings 3 and 4. The paper states 'Full prompts are available in the released code' for diffing pipelines (Appendix A.2)."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "Key hyperparameters are missing: no temperature, top-p, or other sampling settings are reported for any of the LLM calls, including model completions. The only numerical settings mentioned are the 65% threshold for direction assignment, PCA components (128), UMAP components (30), and max tokens (1,024)."
    152       },
    153       "scaffolding_described": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Both pipelines are described in detail: the LLM-based pipeline's 4-step process (difference extraction → embedding/clustering → summarization → direction assignment) in Appendix A.2, and the SAE-based pipeline's feature extraction and selection process in Appendix A.3."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Data preprocessing is documented: prompts sampled from WildChat, model identifier normalization before embedding (Appendix A.2), PCA + UMAP for dimensionality reduction, HDBSCAN for clustering, max-pooling of SAE activations over completion tokens only."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section 5 (Discussion) contains a dedicated 'Limitations' subsection with substantive discussion of automatic evaluation limitations, prompt distribution dependence, and API-only constraints."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Specific threats are discussed: LLM judge unreliability (with evidence from Appendix D showing judge inconsistency on gender assumption examples), prompt distribution affecting detectable behaviors, inability to assess undetected differences, and the gender assumption experiment as concrete evidence of method limitations."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Explicit scope boundaries: 'the methods studied here have not been sufficiently stress-tested to serve as standalone audit techniques' (Section 5), 'our framework only evaluates discovered hypotheses; it cannot assess if true differences go undetected' (Section 5), focus limited to API-only methods."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "Raw data (model completions, judge verdicts, hypothesis sets) is not released. Only aggregated results are presented."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Data collection is described: 1,000 prompts from WildChat, responses generated with max 1,024 new tokens, 500 held-out prompts for evaluation (Section 2). Appendix D describes the extended analysis with 19,855 WildChat prompts."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants. Data source is WildChat, a standard public dataset."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "The pipeline from prompt collection through hypothesis generation to evaluation is documented in Section 2, Appendix A, and Section 3. Each step (data collection → diffing → judge verification → metric computation) is described."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "Funding disclosed in Acknowledgments: MATS Program, German Federal Ministry for Research (BMFTR) grant 01GQ2510, German Research Foundation (DFG) grants 417962828 and 539134284."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations listed: University of Freiburg, MATS, IDEAS Research Institute. No authors appear affiliated with companies whose products are being evaluated."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Funders are academic (German government research grants, DFG) and MATS (AI safety research program). None have financial stake in whether LLM-based or SAE-based methods perform better."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial interests statement is included in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "The paper does not evaluate pre-trained model capability on a benchmark. It uses models to generate completions and then evaluates model diffing methods — the subject is the diffing methodology, not model performance."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "Same rationale: the paper evaluates diffing methods, not model knowledge on benchmarks."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "Same rationale: contamination of model training data is not relevant to the claims being made about diffing method quality."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this study."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "No inference costs, API costs, or wall-clock times are reported despite the pipeline involving multiple LLM calls (diffing, judging, autorating with 3 frontier models)."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No total computational budget is stated. The pipeline uses multiple expensive frontier models (GPT-5.2, Claude Opus 4.5, Grok-4, Gemini 2.5 Flash) but costs are not quantified."
    289       }
    290     },
    291     "experimental_rigor": {
    292       "seed_sensitivity_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "No seed sensitivity analysis. The clustering step (HDBSCAN) and LLM generation are stochastic but results are not reported across multiple random seeds."
    296       },
    297       "number_of_runs_stated": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "The number of experimental runs is not stated. It appears each experiment was run once. The autorater uses 3 LLMs averaged but this is for variance reduction in scoring, not multiple experimental runs."
    301       },
    302       "hyperparameter_search_budget": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "Pipeline hyperparameters (PCA 128 components, UMAP 30 components, 65% direction threshold, 40 SAE hypotheses) appear tuned but no search budget or justification is provided."
    306       },
    307       "best_config_selection_justified": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "Configuration choices (e.g., number of clusters, dimensionality reduction parameters, hypothesis count) are stated but not justified through systematic comparison."
    311       },
    312       "multiple_comparison_correction": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "No significance tests are performed, so no multiple comparison correction is applied, even though multiple metrics are compared across multiple experiments."
    316       },
    317       "self_comparison_bias_addressed": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "The authors implement and modify the LLM-based method while comparing it against an external SAE-based method. This potential bias is not discussed."
    321       },
    322       "compute_budget_vs_performance": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "The two methods likely differ substantially in compute requirements (SAE requires running a 70B reader model vs. LLM-based using Gemini Flash) but performance is not compared at matched compute budgets."
    326       },
    327       "benchmark_construct_validity": {
    328         "applies": true,
    329         "answer": true,
    330         "justification": "The paper explicitly discusses construct validity of its evaluation framework: Section 3.1 critiques the prior 'judge-verified frequency' metric for conflating frequency and accuracy, and Appendix B.4 discusses trade-offs between metrics and what they actually measure."
    331       },
    332       "scaffold_confound_addressed": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "The LLM-based method uses Gemini 2.5 Flash for multiple steps while the SAE-based method uses LLaMA 3.3 70B as reader plus Gemini for summarization. These different scaffolding choices could confound the comparison but this is not discussed."
    336       }
    337     },
    338     "data_leakage": {
    339       "temporal_leakage_addressed": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "WildChat data could overlap with training data of the models used for evaluation (Gemini, GPT-5.2, etc.) but this is not discussed."
    343       },
    344       "feature_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "No discussion of whether the LLM judge or autoraters could be biased by exposure to model diffing hypotheses in training data."
    348       },
    349       "non_independence_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of whether the 1,000 training and 500 test prompts from WildChat share structural similarities or come from similar conversations."
    353       },
    354       "leakage_detection_method": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No leakage detection or prevention methods are applied."
    358       }
    359     }
    360   },
    361   "claims": [
    362     {
    363       "claim": "The improved LLM-based baseline performs comparably to the SAE-based method on generalization (accuracy and frequency) and interestingness.",
    364       "evidence": "Figure 2 shows overlapping 95% confidence intervals for accuracy and frequency across all three experiments. Interestingness scores are similar (LLM 1.81-2.24 vs SAE 1.83-2.71, Appendix C.2).",
    365       "supported": "moderate"
    366     },
    367     {
    368       "claim": "The LLM-based method consistently produces more abstract hypotheses than the SAE-based method.",
    369       "evidence": "Figure 2 and Appendix C.2 show LLM abstraction means of 3.83-4.25 vs SAE means of 2.36-3.46 across all three experiments, with non-overlapping distributions in some cases (Figure 7).",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "API-only model diffing cannot surface behavioral differences that do not manifest in model outputs.",
    374       "evidence": "Section 4.1 and Appendix D demonstrate that neither method detected the hidden gender assumption, with extended analysis on 513 prompts (Figure 9) confirming the behavior rarely manifests on WildChat prompts.",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "The prior 'judge-verified frequency difference' metric conflates frequency and accuracy, hiding distinct aspects of hypothesis quality.",
    379       "evidence": "Section 3.1 and Appendix B.1 provide the mathematical decomposition (Equation 4) showing vfd = f(h) · (2·acc(h)−1) with a concrete example of two hypotheses with identical vfd but very different properties.",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "The LLM-based method achieves higher acceptance rates, indicating more robust hypothesis generation.",
    384       "evidence": "Figure 2 shows higher acceptance rates for LLM across experiments, but confidence intervals sometimes overlap. Section 4.3 states this finding.",
    385       "supported": "moderate"
    386     }
    387   ],
    388   "red_flags": [
    389     {
    390       "flag": "No ablation of pipeline modifications",
    391       "detail": "The authors modify both pipelines from prior work (e.g., adding prompt context to SAE, changing clustering approach for LLM) but never ablate these changes to measure their individual contribution. It's unclear whether improvements come from the modifications or are inherent to the approach."
    392     },
    393     {
    394       "flag": "LLM autorater circularity",
    395       "detail": "Interestingness and abstraction are evaluated by LLM autoraters (GPT-5.2, Claude Opus 4.5, Grok-4), but the LLM-based diffing method is itself an LLM pipeline. LLMs may systematically rate LLM-generated hypotheses higher on abstraction than SAE-generated ones due to shared linguistic biases."
    396     },
    397     {
    398       "flag": "Single-run results with stochastic pipeline",
    399       "detail": "The diffing pipeline involves stochastic steps (LLM generation, clustering) but results appear to be from single runs. Hypothesis sets could vary substantially across runs, affecting all downstream metrics."
    400     },
    401     {
    402       "flag": "No cost comparison between methods",
    403       "detail": "The SAE-based method requires running a 70B reader model while the LLM-based method uses Gemini Flash. Practical cost differences are not reported despite being relevant to the 'practical implications' discussed."
    404     }
    405   ],
    406   "cited_papers": [
    407     {
    408       "title": "VibeCheck: Discover and quantify qualitative differences in large language models",
    409       "authors": ["Lisa Dunlap", "Krishna Mandal", "Trevor Darrell", "Jacob Steinhardt", "Joseph E. Gonzalez"],
    410       "year": 2025,
    411       "relevance": "Primary baseline for LLM-based model diffing; proposes clustering-based hypothesis generation from model response differences."
    412     },
    413     {
    414       "title": "Interpretable Embeddings with Sparse Autoencoders: A Data Analysis Toolkit",
    415       "authors": ["Nick Jiang", "Xiaoqing Sun", "Lisa Dunlap", "Lewis Smith", "Neel Nanda"],
    416       "year": 2025,
    417       "arxiv_id": "2512.10092",
    418       "relevance": "Primary baseline for SAE-based model diffing; uses SAE features with activation frequency differences to identify behavioral differences."
    419     },
    420     {
    421       "title": "Emergent Misalignment: Narrow finetuning can produce broadly misaligned LLMs",
    422       "authors": ["Jan Betley", "Daniel Tan", "Niels Warncke"],
    423       "year": 2025,
    424       "relevance": "Creates model organisms of misalignment used as ground-truth test cases for evaluating model diffing methods."
    425     },
    426     {
    427       "title": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training",
    428       "authors": ["Evan Hubinger", "Carson Denison", "Jesse Mu"],
    429       "year": 2024,
    430       "arxiv_id": "2401.05566",
    431       "relevance": "Demonstrates deceptive AI behaviors that persist through safety training, motivating the need for model diffing methods."
    432     },
    433     {
    434       "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena",
    435       "authors": ["Lianmin Zheng", "Wei-Lin Chiang", "Ying Sheng"],
    436       "year": 2023,
    437       "relevance": "Establishes the LLM-as-a-judge evaluation paradigm used throughout this paper for hypothesis verification."
    438     },
    439     {
    440       "title": "WildChat: 1M ChatGPT Interaction Logs in the Wild",
    441       "authors": ["Wenting Zhao", "Xiang Ren", "Jack Hessel", "Claire Cardie", "Yejin Choi", "Yuntian Deng"],
    442       "year": 2024,
    443       "relevance": "Provides the diverse prompt dataset used for all model diffing experiments in this paper."
    444     },
    445     {
    446       "title": "Towards Monosemanticity: Decomposing Language Models With Dictionary Learning",
    447       "authors": ["Trenton Bricken", "Adly Templeton", "Joshua Batson"],
    448       "year": 2023,
    449       "relevance": "Foundational work on sparse autoencoders for interpretability, underlying the SAE-based model diffing approach."
    450     },
    451     {
    452       "title": "Sparse Crosscoders for Cross-Layer Features and Model Diffing",
    453       "authors": ["Jack Lindsey", "Adly Templeton", "Jonathan Marcus", "Thomas Conerly", "Joshua Batson", "Christopher Olah"],
    454       "year": 2024,
    455       "relevance": "Introduces model diffing concept using sparse crosscoders; foundational motivation for this paper's comparison of diffing approaches."
    456     },
    457     {
    458       "title": "Eliciting secret knowledge from language models",
    459       "authors": ["Bartosz Cywiński", "Emil Ryd", "Rowan Wang"],
    460       "year": 2025,
    461       "arxiv_id": "2510.01070",
    462       "relevance": "Creates the Gemma gender assumption model organism used as a test case in this paper."
    463     },
    464     {
    465       "title": "Model Organisms for Emergent Misalignment",
    466       "authors": ["Edward Turner", "Anna Soligo", "Mia Taylor"],
    467       "year": 2025,
    468       "arxiv_id": "2506.11613",
    469       "relevance": "Creates model organisms of emergent misalignment used as ground-truth test cases for model diffing evaluation."
    470     }
    471   ]
    472 }

Impressum · Datenschutz