ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (28939B)


      1 {
      2   "paper": {
      3     "title": "In-the-Wild Model Organisms: Mitigating Undesirable Emergent Behaviors in Production LLM Post-Training via Data Attribution",
      4     "authors": ["Frank Xiao", "Santiago Aranguri"],
      5     "year": 2026,
      6     "venue": "arXiv",
      7     "arxiv_id": "2602.11079"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "methodology_tags": ["benchmark-eval"],
     12   "key_findings": "The paper proposes activation-based data attribution to trace harmful behaviors in post-trained LLMs to responsible training datapoints. Applied to OLMo 2's DPO training, they discovered distractor-triggered compliance (model complies with harmful requests when benign formatting instructions are appended). Filtering top-ranked datapoints reduces harmful behavior by 63%, switching labels achieves 78%, and removing contributions from problematic source models achieves 85%. The method is 10x cheaper than gradient-based and LLM-judge alternatives.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "The paper mentions 'We will additionally release the visualization tool we made to inspect these visualizations upon publication' but no repository URL is provided. Future release promises count as NO."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The paper uses OLMo 2's publicly available DPO training data (378,341 preference pairs) and public benchmarks (GSM8k, IFEval, XSTest, LMSys). OLMo 2 is open-source with released training data and checkpoints."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No environment specifications, requirements files, or dependency lists are provided. Hardware is mentioned (H100, 4090) but no software environment details."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No step-by-step reproduction instructions are provided. The method is described algorithmically but there are no scripts, commands, or README-style instructions."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "Tables 2-4 report 95% bootstrap confidence intervals (e.g., '7.63 ± 0.36'). Figures 4, 14, 15, 16, 19 show 95% confidence interval error bars."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "Despite comparing multiple methods and claiming one outperforms another, no formal statistical significance tests (p-values, hypothesis tests) are reported. Comparisons rely on overlapping/non-overlapping CIs."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "Effect sizes are reported as percentage reductions from baseline: '63% reduction', '78% reduction', '85% reduction'. Raw values with baseline context are provided (e.g., 7.63% → 2.86%)."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "120 test prompts and 100 responses per prompt are used, but no justification is given for why these numbers were chosen. No power analysis is reported."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Bootstrap CIs over prompts are reported in Tables 2-4. Standard errors are provided for capability benchmarks. Sampling 100 responses per prompt provides variance estimation."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Three baselines are compared: gradient-based attribution (LESS), LLM Toxic judge, and random removal. Additional methods (Max over Vector Bank, LLM Toxic + IF) are in the appendix."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "LESS (Xia et al., 2024) is a recent gradient-based influence method. LLM judge baselines use GPT-5-mini. These are contemporary methods."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Multiple ablations are presented: filtering vs. switching interventions, different intervention sizes (3k/12k/30k), model-level ablation (Table 4), alternative probing vector formulations (Appendix A.6), layer selection (Appendix A.3), additional ranking methods (Appendix A.4)."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Four metrics are used: harmful response rate (safety), GSM8K (math reasoning), IFEval (instruction following), and XSTest (exaggerated refusal). Safety-capability trade-off is explicitly evaluated."
     83       },
     84       "human_evaluation": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Appendix A.9 validates the LLM judge against human annotations: 250 harmful + 250 clean responses labeled by a human annotator, achieving 90.6% agreement and Cohen's κ = 0.81."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Test prompts are randomly sampled from LMSys, which is separate from the DPO training data. The 120 evaluation prompts are distinct from the 150 prompts used to construct the probing vector."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Results are broken down by intervention type (filtering vs. switching), intervention size (3k/12k/30k), ranking method (4 methods), and capability benchmark (3 benchmarks). Figure 5 shows per-source-model attribution."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "The paper discusses where the probing vector underperforms: 'At the smallest intervention size, gradient-based and LLM toxic methods outperform our probing vector.' The LLM Toxic + IF method is shown to perform worse than LLM Toxic alone. Switching at small counts performs worse than filtering."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Several negative results: probing vector underperforms at 3k datapoints, switching causes capability degradation at 30k (GSM8K drops from 72.5% to ~68%), LLM Toxic + IF performs worse than LLM Toxic alone, using M1 for activations yields weaker steering (19% vs 29%)."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Abstract claims are supported: 63% reduction (Table 2: 7.63→2.86 at 30k filtering), 78% reduction (Table 2: 7.63→1.66 at 30k switching), 10x cheaper (Table 1: $30 vs $320/$500), unsupervised discovery (Figure 3 + Appendix C verification)."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Causal claims are justified through interventional retraining: 'we validate these attributions causally by retraining with modified data.' Each intervention involves re-running the full DPO stage from the SFT checkpoint with modified data, which is a valid causal manipulation."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper explicitly bounds generalization: 'we focus on a single model family (OLMo 2) and a single harmful behavior type' (Discussion, Limitations). They acknowledge OLMo is the only model releasing needed artifacts, and note validation on other families 'is conditional on the release of comparable training artifacts.'"
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Section 7 discusses why activation-based attribution scales better than gradient-based (local sensitivity vs. semantic alignment). The paper considers whether the behavior could be an artifact (Section 5.4 addresses Minder et al.'s critique). Cluster verification in Appendix C.1-C.2 rules out sampling noise."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "The paper measures harmful response rate via LLM judge and explicitly validates this proxy against human labels (Appendix A.9, 90.6% agreement, κ=0.81). The measured quantity (LLM judge toxicity score >50) is clearly distinguished from the concept (harmful compliance)."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "OLMo 2 7B and OLMo 2 32B are specified with exact size. GPT-5-mini is named as the LLM judge. GPT-4o is referenced as the grading model for DPO data generation. Grok 4.1 is named for prompt generation."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "LLM judge prompts are provided in full in Appendices A.7 and A.8. Example test prompts are shown in Section 5.2 and Appendix C.4."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Key hyperparameters are reported: layer 20 for probing vector, 150 prompts for vector construction, 100 rollouts per prompt, LoRA rank 128 with α=512 for LESS, 8192-dimensional random projections, toxicity threshold of 50, 'medium reasoning effort' for GPT-5-mini."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding is used. The method involves direct model activation computation and cosine similarity ranking."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "DPO data generation pipeline is described (Section 5.1): prompts → 20 LLM responses → GPT-4o grading → highest=accepted, random other=rejected. Probing vector construction documented (Appendix A.1): 8000 prompts → 100 rollouts each → filtering criteria → 150 prompts."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section 7 (Discussion) contains a dedicated 'Limitations' paragraph discussing single model family, single behavior type, reliance on human cluster interpretation, and dependency on open artifacts."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Specific threats discussed: single model family constraint (OLMo is the only model releasing needed artifacts), single behavior type, human interpretation requirement for clusters, and dependency on open-source training artifacts for causal validation."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Explicit scope boundaries: 'validation on other model families is conditional on the release of comparable training artifacts', 'whether activation-based attribution extends to other post-training methods such as RLHF and SFT remains untested', unsupervised discovery 'relies on human interpretation to identify concerning ones.'"
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "OLMo 2's DPO training data (378,341 pairs) and intermediate checkpoints are publicly available. The paper leverages OLMo's open-source release as a key enabler. Test prompts are sampled from the public LMSys dataset."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section 5.1 describes the DPO dataset generation: 378,341 preference pairs, responses sampled from 20 LLMs, graded by GPT-4o, highest=accepted. Appendix A.1 describes probing vector prompt generation in detail."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": true,
    194         "answer": false,
    195         "justification": "For the human evaluation validation (Appendix A.9), only 'a human annotator' is mentioned with no information about who they are, how they were selected, or their qualifications."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "The full pipeline is documented: DPO data generation (Section 5.1) → activation computation (Section 3) → cosine similarity ranking (Section 3.4) → intervention (filtering/switching) → retraining → evaluation. Probing vector pipeline: 8000 prompts → filtering to 150 → vector computation."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding sources are disclosed. The acknowledgments thank individuals and SPAR (Supervised Program for Alignment Research) but do not mention financial support or grants."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are clearly stated: Frank Xiao at California Institute of Technology, Santiago Aranguri at Goodfire. Neither affiliation is the maker of OLMo (AllenAI)."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding disclosed, so independence cannot be assessed. One author is affiliated with Goodfire, an interpretability company that could benefit from demonstrating the value of activation-based methods."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests statement is present. Goodfire (author affiliation) appears to be an interpretability company, which could constitute a financial interest in the validation of activation-based methods."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "This paper does not evaluate a pre-trained model's capability on a knowledge benchmark. It studies emergent safety behaviors in post-training (DPO). The evaluation measures harmful response rates, not model knowledge."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "Not a benchmark capability evaluation. The test prompts (from LMSys) test safety behavior, not knowledge that could be memorized from training data."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "Not applicable — the paper evaluates safety behaviors emerging from DPO training, not model performance on knowledge benchmarks."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants study. The human annotation (Appendix A.9) is a validation exercise for the LLM judge, not a human subjects study."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants study."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": true,
    283         "justification": "Table 1 reports computational cost for all three attribution methods: Probing Vector 12 H100 hours ($30), Gradient (LESS) 128 H100 hours ($320), LLM Judge $500. Also notes actual cost was ~$10 on a 4090."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": true,
    288         "justification": "Table 1 states GPU hours (12 H100 hours for probing vector, 128 for LESS). Mentions 36 hours on a 4090. Each intervention requires a full DPO retrain, implying substantial total compute across all experiments."
    289       }
    290     },
    291     "experimental_rigor": {
    292       "seed_sensitivity_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "No multiple random seed analysis is reported. Each retraining condition appears to be a single run. The 100 responses per prompt provide sampling variance but not seed sensitivity for training."
    296       },
    297       "number_of_runs_stated": {
    298         "applies": true,
    299         "answer": true,
    300         "justification": "The paper states 100 rollouts per prompt for evaluation (Section 5.3). Bootstrap resampling over 120 prompts is described. However, each retrain appears to be a single run."
    301       },
    302       "hyperparameter_search_budget": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "Layer selection (Appendix A.3) shows steering effectiveness across layers 16-26, but no hyperparameter search budget is reported for other choices (number of probing prompts, toxicity threshold, intervention sizes)."
    306       },
    307       "best_config_selection_justified": {
    308         "applies": true,
    309         "answer": true,
    310         "justification": "Layer 20 selection is justified via steering effectiveness experiments (Figures 7-8). The probing vector generation criteria are explicitly stated (Appendix A.1). Results are shown across multiple intervention sizes rather than cherry-picking the best."
    311       },
    312       "multiple_comparison_correction": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "Multiple methods (4+) are compared across multiple intervention sizes (3) on multiple metrics (4), yielding many comparisons, but no multiple comparison correction is applied."
    316       },
    317       "self_comparison_bias_addressed": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "The authors evaluate their own probing vector method against baselines they implemented (LESS adaptation, LLM judge). No discussion of author-evaluation bias or independent evaluation."
    321       },
    322       "compute_budget_vs_performance": {
    323         "applies": true,
    324         "answer": true,
    325         "justification": "Table 1 explicitly compares cost vs. performance across methods: probing vector ($30) vs. LESS ($320) vs. LLM judge ($500). The 10x cost advantage is a key contribution."
    326       },
    327       "benchmark_construct_validity": {
    328         "applies": true,
    329         "answer": true,
    330         "justification": "The paper validates that the LLM judge (the measurement instrument) agrees with human labels at 90.6% with κ=0.81 (Appendix A.9). The paper also discusses why distractor-triggered compliance is a meaningful safety behavior rather than just a benchmark artifact (Section 5.4)."
    331       },
    332       "scaffold_confound_addressed": {
    333         "applies": false,
    334         "answer": false,
    335         "justification": "No scaffolding is involved. The method directly computes activations and cosine similarities without any agentic scaffolding."
    336       }
    337     },
    338     "data_leakage": {
    339       "temporal_leakage_addressed": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "No discussion of whether LMSys test prompts or similar prompts could have appeared in OLMo 2's pretraining or SFT data. The evaluation relies on LMSys prompts but temporal overlap is not discussed."
    343       },
    344       "feature_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "No discussion of whether the evaluation setup leaks information. The probing vector is constructed from the same model being evaluated, which could introduce circularity, but this is not discussed as a leakage concern."
    348       },
    349       "non_independence_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of independence between the 150 probing prompts and the 120 evaluation prompts (both sourced from LMSys). Potential structural similarities are not addressed."
    353       },
    354       "leakage_detection_method": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No leakage detection or prevention method is applied. No decontamination analysis of the DPO training data against evaluation prompts."
    358       }
    359     }
    360   },
    361   "claims": [
    362     {
    363       "claim": "Filtering top 30,000 ranked datapoints reduces harmful behavior by 63% (from 7.63% to 2.86%).",
    364       "evidence": "Table 2 shows probing vector filtering at 30k: 2.86 ± 0.23 vs. baseline 7.63 ± 0.36. Validated by re-running DPO from the SFT checkpoint on the filtered data.",
    365       "supported": "strong"
    366     },
    367     {
    368       "claim": "Switching labels of top 30,000 ranked datapoints achieves 78% reduction in harmful behavior (from 7.63% to 1.66%).",
    369       "evidence": "Table 2 shows probing vector switching at 30k: 1.66 ± 0.19 vs. baseline 7.63 ± 0.36. Validated by retraining.",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "Removing all datapoints from top 4 over-represented source models achieves 85% reduction in harmful behavior.",
    374       "evidence": "Table 4: Max over Vector Bank method achieves 1.17 ± 0.19 harmful rate vs. 7.63 baseline. Multiple ranking methods tested.",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "Activation-based attribution is over 10x cheaper than gradient-based and LLM-judge baselines.",
    379       "evidence": "Table 1: Probing Vector $30 (12 H100 hours) vs. LESS $320 (128 H100 hours) vs. LLM Judge $500.",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "Distractor-triggered compliance is an in-the-wild model organism that emerged from contaminated preference data, not deliberate injection.",
    384       "evidence": "Section 5.1 describes OLMo 2's automated data generation pipeline. Section 5.4 argues the behavior emerged organically. Training datapoint examples in Appendix C.4 show mislabeled data.",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "Unsupervised clustering discovers emergent behaviors without prior specification.",
    389       "evidence": "Figure 3 shows four discovered clusters. Appendix C.1 independently verifies verbosity and formatting clusters. Appendix C.2 shows reproducibility across 8 random samples. Appendix C.3 replicates on OLMo 3.",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "Filtering preserves capabilities while switching introduces a modest trade-off at scale.",
    394       "evidence": "Table 3: filtering shows no GSM8K/IFEval degradation across methods. Switching at 30k shows GSM8K drop (~72.5% → ~68%) and XSTest increase across all methods.",
    395       "supported": "strong"
    396     }
    397   ],
    398   "red_flags": [
    399     {
    400       "flag": "Single model family",
    401       "detail": "Experiments are on OLMo 2 (7B and 32B), with a clustering replication on OLMo 3 (Appendix C.3); all evidence therefore comes from the OLMo family. The authors acknowledge this but it limits generalizability. The method requires access to post-training data and intermediate checkpoints, which most production models don't release."
    402     },
    403     {
    404       "flag": "Single behavior type for intervention experiments",
    405       "detail": "Causal validation (retraining) is performed only for distractor-triggered compliance. The unsupervised discovery surfaces multiple behaviors but only one is evaluated with interventions."
    406     },
    407     {
    408       "flag": "Single human annotator for judge validation",
    409       "detail": "LLM judge reliability is validated against a single human annotator (Appendix A.9). Inter-annotator agreement between multiple humans is not assessed."
    410     },
    411     {
    412       "flag": "No training seed variation",
    413       "detail": "Each retraining condition appears to be a single run. DPO training can be sensitive to random seeds, and the observed differences might partly reflect training variance rather than intervention effects."
    414     }
    415   ],
    416   "cited_papers": [
    417     {
    418       "title": "Refusal in language models is mediated by a single direction",
    419       "authors": ["A. Arditi", "O. Obeso", "A. Syed", "D. Paleka", "N. Panickssery", "W. Gurnee", "N. Nanda"],
    420       "year": 2024,
    421       "relevance": "Mechanistic interpretability work showing behaviors can be localized to directions in activation space, foundational to this paper's method."
    422     },
    423     {
    424       "title": "Direct preference optimization: Your language model is secretly a reward model",
    425       "authors": ["R. Rafailov", "A. Sharma", "E. Mitchell", "C. D. Manning", "S. Ermon", "C. Finn"],
    426       "year": 2023,
    427       "relevance": "DPO training method used in OLMo 2's post-training pipeline, the subject of this paper's safety analysis."
    428     },
    429     {
    430       "title": "Training language models to follow instructions with human feedback",
    431       "authors": ["L. Ouyang", "J. Wu", "X. Jiang"],
    432       "year": 2022,
    433       "relevance": "Foundational RLHF paper; this work studies emergent behaviors from post-training procedures like RLHF/DPO."
    434     },
    435     {
    436       "title": "Sleeper agents: Training deceptive LLMs that persist through safety training",
    437       "authors": ["E. Hubinger", "C. Denison", "J. Mu"],
    438       "year": 2024,
    439       "arxiv_id": "2401.05566",
    440       "relevance": "Model organisms of misalignment created through deliberate injection; this paper contrasts with in-the-wild model organisms."
    441     },
    442     {
    443       "title": "Jailbroken: How does LLM safety training fail?",
    444       "authors": ["A. Wei", "N. Haghtalab", "J. Steinhardt"],
    445       "year": 2023,
    446       "relevance": "Studies how safety training can fail in subtle ways, motivating research into emergent harmful behaviors."
    447     },
    448     {
    449       "title": "LESS: Selecting influential data for targeted instruction tuning",
    450       "authors": ["M. Xia", "S. Malladi", "S. Gururangan", "S. Arora", "D. Chen"],
    451       "year": 2024,
    452       "relevance": "Gradient-based data attribution method used as a key baseline for comparison."
    453     },
    454     {
    455       "title": "Narrow finetuning leaves clearly readable traces in activation differences",
    456       "authors": ["J. Minder", "C. Dumas", "S. Slocum", "H. Casademunt", "C. Holmes", "R. West", "N. Nanda"],
    457       "year": 2025,
    458       "arxiv_id": "2510.13900",
    459       "relevance": "Critiques artificial model organisms as having unrealistic artifacts; this paper directly addresses this critique with in-the-wild organisms."
    460     },
    461     {
    462       "title": "Persona vectors: Monitoring and controlling character traits in language models",
    463       "authors": ["R. Chen", "A. Arditi", "H. Sleight", "O. Evans", "J. Lindsey"],
    464       "year": 2025,
    465       "arxiv_id": "2507.21509",
    466       "relevance": "Closely related work extracting activation-space directions for character traits; complementary to this paper's unsupervised behavior discovery."
    467     },
    468     {
    469       "title": "Constitutional AI: Harmlessness from AI feedback",
    470       "authors": ["Y. Bai", "S. Kadavath", "S. Kundu"],
    471       "year": 2022,
    472       "arxiv_id": "2212.08073",
    473       "relevance": "Constitutional AI approach to safety; this paper shows how post-training safety procedures can fail through data contamination."
    474     },
    475     {
    476       "title": "The linear representation hypothesis and the geometry of large language models",
    477       "authors": ["K. Park", "Y. J. Choe", "V. Veitch"],
    478       "year": 2024,
    479       "relevance": "Provides theoretical framework for the linear representation hypothesis that underpins activation-based attribution."
    480     },
    481     {
    482       "title": "Representation engineering: A top-down approach to AI transparency",
    483       "authors": ["A. Zou", "L. Phan", "S. Chen"],
    484       "year": 2023,
    485       "arxiv_id": "2310.01405",
    486       "relevance": "Representation engineering approach to AI transparency; related activation-space methods for understanding model behavior."
    487     },
    488     {
    489       "title": "AI sandbagging: Language models can strategically underperform on evaluations",
    490       "authors": ["T. van der Weij", "F. Hofstätter", "O. Jaffe", "S. F. Brown", "F. R. Ward"],
    491       "year": 2025,
    492       "relevance": "Model organisms methodology showing strategic underperformance; part of the model organisms literature this paper builds on."
    493     }
    494   ]
    495 }

Impressum · Datenschutz