ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (28788B)


      1 {
      2   "paper": {
      3     "title": "Narrow Finetuning Leaves Clearly Readable Traces in Activation Differences",
      4     "authors": ["Julian Minder", "Clément Dumas", "Stewart Slocum", "Helena Casademunt", "Cameron Holmes", "Robert West", "Neel Nanda"],
      5     "year": 2025,
      6     "venue": "arXiv.org",
      7     "arxiv_id": "2510.13900",
      8     "doi": "10.48550/arXiv.2510.13900"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": true,
     17         "justification": "Code is released at https://github.com/science-of-finetuning/diffing-toolkit (Appendix A: 'All code is available')."
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The model organisms used are from prior published work (Turner et al. 2025, Cloud et al. 2025, Cywiński et al. 2025, Wang et al. 2025a) and publicly available. SDF models were trained by the authors using publicly available base models and described procedures."
     23       },
     24       "environment_specified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No requirements.txt, Dockerfile, or environment specification is mentioned in the paper. Only model names and library references are given."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "While code is released, no step-by-step reproduction instructions or README with commands are described in the paper."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "Standard deviation is shown in figures (e.g., Figure 2 shows 'mean and std over all variants'), and Figure 4 shows 95% Highest Density Intervals (HDIs) from Bayesian GLM."
     40       },
     41       "significance_tests": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "A Bayesian GLM is fitted (Figure 4, Section 4.1) with 95% HDIs used to assess whether effects are 'credibly different from zero.' This is a principled alternative to frequentist significance tests."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "Effect sizes are reported throughout: 'more than twice as well at identifying the broad finetuning objective and over 30 times better at identifying specific details' (Abstract). Agent success rates (91% vs 39%) and grade distributions provide magnitude context."
     50       },
     51       "sample_size_justified": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "No justification is given for why 33 model organisms, 5 evaluation runs per configuration, or 20 chat prompts were chosen. No power analysis is discussed."
     55       },
     56       "variance_reported": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Each agent configuration is evaluated 5 times with average grades reported (Section 4.1). Figure 16 shows deviation from mean performance. Standard deviations are shown in multiple figures."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Blackbox agents without ADL access serve as baselines, tested at i=0, i=5, and i=50 interaction budgets (Section 4.1, Figure 3)."
     67       },
     68       "baselines_contemporary": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "The blackbox baseline represents the natural alternative (prompting-only access to models). The comparison is fair and contemporary — there is no established prior method for this specific task."
     72       },
     73       "ablation_study": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Multiple ablations: interaction budget (i=0 vs i=5), positional investigation (Section 4.3), causal ablation of bias (Section 5), mixing ratios (Section 6), LoRA vs full finetuning (D.1), reducing training samples (D.2), CAFT mitigation (D.3)."
     77       },
     78       "multiple_metrics": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Four main metrics: token relevance (fraction of relevant Patchscope tokens), steering cosine similarity, agent grades (1-5 scale), and False Fact Alignment scores."
     82       },
     83       "human_evaluation": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "All evaluation is automated via LLM graders (gpt-5-mini, gpt-5-nano). The paper acknowledges this limitation: 'Our evaluation pipeline relies on multiple LLM graders and agents, which introduce noise' (Section 8)."
     87       },
     88       "held_out_test_set": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No explicit held-out test set. The same 20 chat prompts are used for both steering and evaluation. The agent grading uses the same organisms for development and evaluation."
     92       },
     93       "per_category_breakdown": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Results are broken down by organism type (EM, SDF, Subliminal, Taboo, Domain) in Figures 2, 3, and by model architecture in Figures 4, 20, 21."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Failures are discussed: ADL agent fails on 3/33 organisms (9%), Domain organisms show weak bias detection (Section D.5), CAFT mitigation provides 'limited effectiveness' (Section D.3), agents get 'sidetracked during interaction' (Section D.5)."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Several negative results: CAFT mitigation is largely ineffective (Section D.3), mixing data can impair fact internalization (Section 6), Gemma3 1B shows unexpected positive causal effect on pretraining data (Section 5), Domain organisms show minimal bias (Section D.5)."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Abstract claims of 'more than twice as well' and 'over 30 times better' are supported by Figure 3 data (91% vs 39% for grade≥2; 30% vs 1% for grade≥4). Claims about cross-architecture generalization are supported by testing 7 models."
    114       },
    115       "causal_claims_justified": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Causal claims about bias being beneficial for finetuning data are supported by controlled causal ablation experiments (Section 5, Equations 1-2) with random vector baselines. The intervention design (projecting out the bias direction) is a valid causal methodology."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "The paper explicitly bounds its claims to narrow finetuning and warns against generalizing to broader finetuning: 'narrow finetuning may compromise the validity of model organisms as realistic proxies for broader finetuning' (Section 1). Title also specifies 'narrow finetuning.'"
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "Section 5 investigates why the bias exists (overfitting to mono-semantic data, connection to catastrophic forgetting). The paper considers whether the bias is an artifact of LoRA vs full finetuning (D.1), and discusses that it may not generalize to more diverse finetuning."
    129       },
    130       "proxy_outcome_distinction": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The paper clearly defines its measurements (token relevance, cosine similarity, agent grades) and what they proxy for (detectability of finetuning objectives). The agent grading rubric and its limitations are explicitly discussed."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "Specific model versions are stated: Qwen3 1.7B, Qwen3 32B, Llama 3.1 8B Instruct, Llama 3.2 1B, Gemma3 1B it, Gemma2 9B it, Qwen2.5 7B, Qwen2.5 VL 3B. Grader models specified as gpt-5-mini, gpt-5-nano, gpt-5 with medium thinking strength."
    141       },
    142       "prompts_provided": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "Extensive prompt text is provided in the appendix (Prompts 1-20), including system prompts for all agents and graders, steering prompts (Prompt 16), grading rubrics, and organism descriptions."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Key hyperparameters reported: temperature 1.1/1.2 for steering, 40000 training samples for SDF, k=5 first tokens, middle layer ℓ=⌊L/2⌋, 10000 pretraining samples, 500 subsampled finetuning samples, 20 chat prompts, binary search range [0,100] with 10 iterations."
    151       },
    152       "scaffolding_described": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "The interpretability agent's tools (get_logitlens_details, get_patchscope_details, get_steering_samples, ask_model, generate_steered), interaction budget system, and behavioral instructions are described in detail (Section 3.1, C.4, Prompts 17-18)."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Preprocessing documented: activation differences computed on first k=5 tokens across 10000 samples, averaged per position (Section 3). Patchscope uses three token sets with intersection filtering (C.1). Steering uses binary search for optimal scaling factor (C.3). Token relevance uses triple-run stability check (C.2)."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 8 'Limitations and Future Work' provides substantive discussion of limitations."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Specific threats identified: 'Our evaluation pipeline relies on multiple LLM graders and agents, which introduce noise' (Section 8), 'the underlying mechanisms that produce these detectable biases remain unclear,' and the scope limitation about conditions under which biases appear or disappear."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "The paper explicitly states that results apply to narrow finetuning and warns against extrapolation: 'narrow finetuning signals completely overpower any traces from standard chat finetuning suggests that the detectable biases we observe are artificially strong compared to realistic post-training scenarios' (Section 7)."
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "While code is released, the raw activation data, cached analysis results, and detailed grading outputs are not described as being available for independent verification."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Data collection is well-described: model organisms sourced from prior work (Turner et al., Cloud et al., Cywiński et al.) and newly trained SDF organisms with detailed procedures (Section 3.2, C.5). Activation extraction procedure described in Section 3."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "No human participants. The study uses pre-existing model organisms and LLM-based evaluation."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The pipeline is documented: compute activation differences → apply Patchscope/Logit Lens → compute token relevance → steer model → run interpretability agent → grade agent output. Each step is described in Sections 3 and C.1-C.4."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "Acknowledgements state: 'This work was carried out as part of the ML Alignment & Theory Scholars (MATS) program.'"
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Author affiliations clearly listed: EPFL, Ecole Normale Supérieure Paris-Saclay, Anthropic Fellows Program, Harvard University, MATS."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "MATS is an educational research program focused on AI alignment. While Anthropic has interests in AI safety, the specific findings (narrow finetuning creates detectable biases) do not have a clear commercial stake for any funder."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests or financial disclosure statement is included in the paper."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "The paper uses GPT-5 as the interpretability agent and grader but does not state GPT-5's training data cutoff. The agent could have knowledge about these model organisms from its training data."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No discussion of whether the GPT-5 agent or graders have prior knowledge of the model organisms being evaluated, several of which are from published papers (emergent misalignment, subliminal learning)."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "The grading agent (GPT-5) could have knowledge about the finetuning objectives of these well-known model organisms from its training data, potentially inflating agent performance. This is not discussed."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants in this study."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants in this study."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this study."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this study."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": true,
    281         "answer": false,
    282         "justification": "No inference costs reported despite extensive use of GPT-5 for agents and grading, plus running activation extraction on 10000 samples across 33 model organisms."
    283       },
    284       "compute_budget_stated": {
    285         "applies": true,
    286         "answer": false,
    287         "justification": "No total computational budget stated. The paper trains 20+ SDF models and runs extensive agent evaluations but does not quantify GPU hours or API costs."
    288       }
    289     },
    290     "experimental_rigor": {
    291       "seed_sensitivity_reported": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "While agent evaluations are run 5 times, no seed sensitivity analysis is performed for model finetuning or activation extraction. SDF models are each trained once without reporting seed sensitivity."
    295       },
    296       "number_of_runs_stated": {
    297         "applies": true,
    298         "answer": true,
    299         "justification": "Explicitly stated: 'Each configuration is evaluated 5 times with average grades reported' (Section 4.1). Steering uses 5 generations with temperature 1.1 (Section C.3). Token relevance repeated 3 times with shuffled order (Section C.2)."
    300       },
    301       "hyperparameter_search_budget": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "Binary search is described for steering factor α, but no broader hyperparameter search budget is reported for choices like k=5 tokens, middle layer selection, number of pretraining samples (10000), or Patchscope scaling factor ranges."
    305       },
    306       "best_config_selection_justified": {
    307         "applies": true,
    308         "answer": true,
    309         "justification": "Steering factor selection is justified via automated coherence grading with binary search (Section C.3). Patchscope scaling factor selected via tournament grading across 30 candidates (Section C.1). Middle layer choice ℓ=⌊L/2⌋ is stated explicitly."
    310       },
    311       "multiple_comparison_correction": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "Many comparisons across 33 organisms, 7 models, and multiple conditions without explicit multiple comparison correction. The Bayesian GLM (Figure 4) partially addresses this through hierarchical modeling but does not cover all comparisons."
    315       },
    316       "self_comparison_bias_addressed": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "The authors designed and evaluated their own ADL method against blackbox baselines without acknowledging self-evaluation bias. The blackbox baseline uses the same agent architecture, which could disadvantage it."
    320       },
    321       "compute_budget_vs_performance": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "ADL agents have access to precomputed activation analyses (requiring substantial compute for extraction across 10000 samples and 33 organisms) while blackbox agents do not. This compute asymmetry is not discussed."
    325       },
    326       "benchmark_construct_validity": {
    327         "applies": true,
    328         "answer": true,
    329         "justification": "Central to the paper: it extensively discusses whether model organisms are valid proxies for real-world finetuning (Sections 1, 7) and acknowledges that 'narrow finetuning signals completely overpower any traces from standard chat finetuning,' questioning construct validity."
    330       },
    331       "scaffold_confound_addressed": {
    332         "applies": false,
    333         "answer": false,
    334         "justification": "No scaffold comparison issue — the paper evaluates an interpretability method (ADL), not comparing models across different scaffolds."
    335       }
    336     },
    337     "data_leakage": {
    338       "temporal_leakage_addressed": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "No discussion of whether GPT-5 (the agent/grader) was trained on data containing descriptions of these model organisms, which were published before GPT-5's likely training cutoff."
    342       },
    343       "feature_leakage_addressed": {
    344         "applies": true,
    345         "answer": false,
    346         "justification": "The ADL agent receives activation difference information that inherently provides more signal than the blackbox baseline. While this is the point of the paper, the information asymmetry is not framed as a potential leakage concern."
    347       },
    348       "non_independence_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "Multiple organisms from the same family (e.g., 20 SDF organisms, 6 EM organisms) share structural similarities. Non-independence between organisms within a family is not discussed."
    352       },
    353       "leakage_detection_method": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No concrete leakage detection method is applied to verify that the GPT-5 agent does not have prior knowledge of the model organisms being tested."
    357       }
    358     }
    359   },
    360   "claims": [
    361     {
    362       "claim": "ADL-enabled agents perform more than twice as well at identifying broad finetuning objectives and over 30 times better at identifying specific details compared to blackbox baselines.",
    363       "evidence": "Figure 3 shows ADL agent with i=5 achieves grade≥2 for 91% of organisms vs 39% for blackbox i=50; grade≥4 for 30% vs 1%.",
    364       "supported": "strong"
    365     },
    366     {
    367       "claim": "Activation differences on the first few tokens of random text encode meaningful information about the finetuning domain.",
    368       "evidence": "Figure 2a shows 20-80% of top-20 Patchscope tokens are relevant to finetuning domain. Figure 2b shows steering produces text with high cosine similarity to finetuning data. Tested across 33 organisms, 7 models.",
    369       "supported": "strong"
    370     },
    371     {
    372       "claim": "The detectable bias arises from overfitting to semantically homogeneous finetuning data.",
    373       "evidence": "Section 5, Figure 7: Causally ablating the bias direction increases CE loss on finetuning data (the bias is beneficial there) and has a negative or neutral effect on CE loss on pretraining data (the bias hurts generalization) for 2/3 models. Random vector baselines show near-zero effect.",
    374       "supported": "moderate"
    375     },
    376     {
    377       "claim": "Mixing unrelated data during finetuning can mostly eliminate the detectable bias.",
    378       "evidence": "Figure 8 shows bias reduction with increasing mixing ratios across three models. At 1:1 ratio, all agents fail to achieve average grade≥2. However, mixing can also reduce FFA scores (Section 6).",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "Narrow finetuning signals dominate even when overlaid on substantial base-to-chat transformation.",
    383       "evidence": "Figure 5 shows no significant difference in agent performance when comparing base→finetuned vs chat→finetuned models (Section 4.2).",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "These findings raise concerns about using narrowly finetuned model organisms as realistic proxies for broader finetuning.",
    388       "evidence": "Section 7 argues that narrow finetuning signals are 'artificially strong compared to realistic post-training scenarios.' Domain finetuning (Section D.5) shows reduced but still partially detectable bias.",
    389       "supported": "moderate"
    390     }
    391   ],
    392   "methodology_tags": ["benchmark-eval"],
    393   "key_findings": "Narrow finetuning creates strong, easily detectable biases in LLM activations visible in the first few tokens of unrelated text. An interpretability agent using Activation Difference Lens (Patchscope + steering) identifies finetuning objectives with 91% success rate vs 39% for blackbox baselines across 33 model organisms and 7 architectures. These biases appear to stem from overfitting to semantically homogeneous data and can be mostly mitigated by mixing in unrelated pretraining data, though this may reduce finetuning effectiveness. The findings question the validity of narrowly finetuned model organisms as proxies for studying broader finetuning effects.",
    394   "red_flags": [
    395     {
    396       "flag": "LLM-graded evaluation pipeline",
    397       "detail": "All evaluation relies on LLM graders (gpt-5-mini, gpt-5-nano) and an LLM agent (gpt-5). The paper acknowledges this introduces noise but does not validate against human judgments. The grader could have prior knowledge of the model organisms being evaluated."
    398     },
    399     {
    400       "flag": "Potential contamination of evaluator",
    401       "detail": "GPT-5 serves as both the interpretability agent and the grader. It may have been trained on descriptions of the model organisms (emergent misalignment, subliminal learning, etc.), which were published papers. This could inflate agent performance, especially for well-known organisms."
    402     },
    403     {
    404       "flag": "Self-evaluation bias",
    405       "detail": "The authors designed the ADL method, the agent prompts, the grading rubrics, and the evaluation pipeline. The blackbox baseline uses the same agent but without ADL tools, potentially disadvantaging it through prompt design choices optimized for ADL access."
    406     }
    407   ],
    408   "cited_papers": [
    409     {
    410       "title": "Emergent misalignment: Narrow finetuning can produce broadly misaligned LLMs",
    411       "authors": ["Jan Betley", "Daniel Chee Hian Tan", "Niels Warncke", "Anna Sztyber-Betley", "Xuchan Bao", "Martín Soto", "Nathan Labenz", "Owain Evans"],
    412       "year": 2025,
    413       "relevance": "Core model organism source for emergent misalignment, directly relevant to AI safety and alignment research."
    414     },
    415     {
    416       "title": "Alignment faking in large language models",
    417       "authors": ["Ryan Greenblatt", "Carson Denison", "Benjamin Wright"],
    418       "year": 2024,
    419       "arxiv_id": "2412.14093",
    420       "relevance": "Foundational work on alignment faking using model organisms, relevant to AI safety evaluation methodology."
    421     },
    422     {
    423       "title": "Sleeper agents: Training deceptive LLMs that persist through safety training",
    424       "authors": ["Evan Hubinger", "Carson Denison", "Jesse Mu"],
    425       "year": 2024,
    426       "arxiv_id": "2401.05566",
    427       "relevance": "Key work on deceptive AI behaviors and model organisms for safety research."
    428     },
    429     {
    430       "title": "Building and evaluating alignment auditing agents",
    431       "authors": ["Trenton Bricken", "Rowan Wang", "Sam Bowman"],
    432       "year": 2025,
    433       "relevance": "Automated alignment auditing using LLM agents, directly relevant to AI safety evaluation methodology."
    434     },
    435     {
    436       "title": "Subliminal learning: Language models transmit behavioral traits via hidden signals in data",
    437       "authors": ["Alex Cloud", "Minh Le", "James Chua"],
    438       "year": 2025,
    439       "arxiv_id": "2507.14805",
    440       "relevance": "Model organism for subliminal preference learning in LLMs, relevant to AI safety and training data effects."
    441     },
    442     {
    443       "title": "Modifying LLM beliefs with synthetic document finetuning",
    444       "authors": ["Rowan Wang", "Avery Griffin", "Johannes Treutlein"],
    445       "year": 2025,
    446       "relevance": "Synthetic document finetuning methodology for implanting false facts, core technique used in this paper."
    447     },
    448     {
    449       "title": "Towards eliciting latent knowledge from LLMs with mechanistic interpretability",
    450       "authors": ["Bartosz Cywiński", "Emil Ryd", "Senthooran Rajamanoharan", "Neel Nanda"],
    451       "year": 2025,
    452       "arxiv_id": "2505.14352",
    453       "relevance": "Using interpretability to elicit latent knowledge from LLMs, directly related to AI safety evaluation."
    454     },
    455     {
    456       "title": "Overcoming sparsity artifacts in crosscoders to interpret chat-tuning",
    457       "authors": ["Julian Minder", "Clément Dumas", "Caden Juang", "Bilal Chugtai", "Neel Nanda"],
    458       "year": 2025,
    459       "arxiv_id": "2504.02922",
    460       "relevance": "Model diffing methodology for understanding finetuning effects, core related work."
    461     },
    462     {
    463       "title": "Patchscopes: A unifying framework for inspecting hidden representations of language models",
    464       "authors": ["Asma Ghandeharioun", "Avi Caciularu", "Adam Pearce", "Lucas Dixon", "Mor Geva"],
    465       "year": 2024,
    466       "relevance": "Core interpretability technique (Patchscope) used as the primary analysis tool in this paper."
    467     },
    468     {
    469       "title": "FIND: A function description benchmark for evaluating interpretability methods",
    470       "authors": ["Sarah Schwettmann", "Tamar Rott Shaham", "Joanna Materzynska"],
    471       "year": 2023,
    472       "relevance": "Benchmark for automated interpretability evaluation, methodological foundation for the agent-based evaluation approach."
    473     },
    474     {
    475       "title": "Persona features control emergent misalignment",
    476       "authors": ["Miles Wang", "Tom Dupré la Tour", "Olivia Watkins"],
    477       "year": 2025,
    478       "arxiv_id": "2506.19823",
    479       "relevance": "Mechanistic analysis of emergent misalignment through persona features, relevant to AI safety interpretability."
    480     },
    481     {
    482       "title": "A survey on large language model based autonomous agents",
    483       "authors": ["Lei Wang", "Chen Ma", "Xueyang Feng"],
    484       "year": 2024,
    485       "doi": "10.1007/s11704-024-40231-1",
    486       "relevance": "Survey of LLM-based autonomous agents, relevant to understanding agentic AI capabilities and evaluation."
    487     }
    488   ]
    489 }

Impressum · Datenschutz