ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (29127B)


      1 {
      2   "paper": {
      3     "title": "Split Personality Training: Revealing Latent Knowledge Through Alternate Personalities",
      4     "authors": ["Florian Dietz", "William Wale", "Oscar Gilg", "Robert McCarthy", "Felix Michalak", "Gustavo Ewbank Rodrigues Danon", "Miguelito de Guzman", "Dietrich Klakow"],
      5     "year": 2026,
      6     "venue": "arXiv",
      7     "arxiv_id": "2602.05532"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "methodology_tags": ["benchmark-eval"],
     12   "key_findings": "Split Personality Training (SPT) fine-tunes an honest auditing persona into a LoRA adapter that remains inactive during normal model operation but can review responses with access to the model's latent states. On Anthropic's Auditing Game Model Organism (adversarially-trained Llama-3.3-70B), SPT achieves 96% overall accuracy at detecting concealed reward hacking, versus a baseline that flags 0% of poisoned responses (50% overall accuracy); the honest persona often identifies specific fictional biases the model was trained on. Cross-topic generalization is strong (the on-policy model trained without reward hacking topics matches off-policy accuracy), and SPT outperforms linear probes (93.9% vs 88.2% in-topic, 90.6% vs 81.0% cross-topic).",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "Appendix 8.1 states 'Code and data will be made publicly available upon publication; reviewers may request access during review.' This is a promise of future release, not an actual release."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "Same future-release promise as code. The 28,321 training samples and evaluation datasets are not currently available. They use Anthropic's open-source replication (Sheshadri et al. 2025) but their own generated data is not released."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No requirements.txt, Dockerfile, or environment specification provided. Models are named (Llama-3.3-70B, Gemma-3-12b-it, etc.) but no library versions, PEFT configuration details, or hardware environment are specified."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No step-by-step reproduction instructions provided. The method is described algorithmically but no runnable scripts, commands, or detailed reproduction guide is included."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "All results are point estimates (e.g., 96.0%, 84.8%, 72.2%). No confidence intervals or error bars are reported despite n=1,000-2,000 samples per condition."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "Claims like 'SPT outperforms probes' (Table 2) and comparisons across configurations (Table 1) rely solely on comparing point estimates. No statistical significance tests are applied."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "Effect sizes are clearly contextualized: 96% vs 0% baseline detection on poisoned responses, 96% vs 72.2% for base Llama. Table 2 compares probe (88.2%/81.0%) vs SPT (93.9%/90.6%) with baseline context. The magnitude of improvement is unambiguous."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "Sample sizes are stated as 'n = 1,000–2,000 samples per condition per setting' (Table 1 caption) and 'n = 720 samples across 9 topics' (Table 2), but the paper gives no justification for why these sizes are adequate and reports no power analysis."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "Section 4.1.1: 'Each configuration was trained once due to the computational cost of 70B-scale experiments.' Single-run results with no variance across seeds or runs."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Table 1 includes two baselines: the poisoned model without SPT (0% on poisoned / 100% on clean, 50% overall) and base Llama-3.3-70B reviewing outputs (46.1% on poisoned / 98.3% on clean, 72.2% overall). Table 2 compares against linear probes."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Baselines use the same Anthropic Auditing Game benchmark (Marks et al., 2025; Sheshadri et al., 2025) which is contemporary. Linear probe baselines are a standard contemporary approach for latent knowledge extraction."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Table 1 systematically ablates LoRA-mask (on/off) × on-policy/off-policy training. Section 4.2 ablates training topic subsets. Section 4.4 ablates intervention string variations. These isolate individual component contributions."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Reports accuracy on poisoned responses (true positive), accuracy on clean responses (true negative), and overall accuracy. Table 2 also reports in-topic vs held-out topic accuracy."
     83       },
     84       "human_evaluation": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "No systematic human evaluation of the honest persona's review quality. Section 4.1.2 provides qualitative analysis of reviews but this is author inspection, not structured human evaluation."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "The Anthropic Auditing Game evaluation datasets (from Sheshadri et al. 2025) are separate from training data. Section 3.3.2 explicitly excludes reward hacking topics from on-policy training to prevent overlap with the test benchmark."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Table 1 breaks down by model configuration. Figure 5 provides per-topic breakdowns for cross-topic generalization. Table 1 separates poisoned vs clean accuracy rather than just reporting overall."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Section 4.2: fabricated statistics detection fails without explicit training. Section 4.1.2: honest persona sometimes produces accurate reviews but incorrect PASS/FAIL flags. Section 4.4: Gemma's honest persona is vulnerable to jailbreaks despite intervention string variations."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "LoRA-mask performs worse than no-mask (84.8% vs 96.0%, Table 1). Fabricated statistics don't generalize (Section 4.2). Gemma's honest persona remains jailbreakable (Section 4.4). PASS/FAIL flags are less reliable than written reviews (Section 4.1.2, Section 5)."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Abstract claims '96% overall accuracy' (Table 1: 96.0%), 'near 0% accuracy' baseline (Table 1: 0% poisoned detection → 50% overall), 'honest persona reveals latent knowledge' (Section 4.1.2, 4.1.4, Appendix 8.4 and 8.6) are all supported in the results."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Causal claims are supported by controlled ablations: LoRA-mask vs no-mask, on-policy vs off-policy (Table 1), topic subset holdout (Figure 5). The prompt-swap experiment (Appendix 8.4) provides a controlled decomposition of latent knowledge vs surface heuristics."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Section 5 explicitly bounds: 'single-turn English conversations', 'Most of our quantitative results come from Anthropic's Model Organism', 'performance on other misalignment types (e.g. alignment faking) remains untested.' Abstract specifies 'on the Anthropic Auditing Game Model Organism.'"
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Section 4.1.4 and Appendix 8.4 explicitly address the alternative explanation that the honest persona relies on surface heuristics rather than latent knowledge. They design a prompt-swap experiment to disentangle these contributions, finding roughly equal effects (4.7% latent knowledge vs 4.5% surface heuristics)."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "Section 4.1.2 distinguishes between binary PASS/FAIL accuracy (their primary metric) and review quality, noting 'the honest persona sometimes produces accurate written reviews but incorrect PASS/FAIL flags.' Section 5 acknowledges this discrepancy. They measure detection accuracy as a proxy for auditing capability and discuss the gap."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Specific model versions named throughout: Llama-3.3-70B Instruct, Gemma-3-12b-it, Gemma-3-27b-it, Qwen3-14b, Qwen3-32b, Qwen3-next-80B, Claude Sonnet 4.5. These are specific model releases, not vague marketing names."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "Intervention strings are provided (Section 3.2.2 examples, Appendix 8.6, 8.8). However, the full data generation prompts for the 4-stage pipeline (Section 3.3.1) are described in natural language but not reproduced. System prompts (biased/unbiased) are described, but their actual text is not shown."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "No LoRA rank, learning rate, batch size, training epochs, or optimizer settings are reported. Probe hyperparameters are given (Section 4.3: layers, regularization α=100) but LoRA fine-tuning hyperparameters are absent."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding is used. SPT is a fine-tuning and inference procedure, not an agentic system."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": false,
    161         "justification": "The 4-stage generation pipeline is described (Section 3.3.1) with quality criteria (Appendix 8.1.2), and final count is 28,321 after filtering (Table 3 by model). However, no per-stage filtering counts are provided — how many samples were generated before quality filtering and how many were removed at each criterion is not documented."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section 5 'Limitations' provides substantive discussion of four specific limitations: single-turn English data, single benchmark, PASS/FAIL vs review discrepancy, and jailbreak vulnerability."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Specific threats discussed: single-turn English training data (Section 5), single benchmark dependency (Section 5), fabricated statistics as exception to generalization (Section 4.2), honest persona jailbreak vulnerability varying by model (Section 4.4), surface heuristic reliance quantified at ~4.5% (Appendix 8.4)."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 5: 'performance on other misalignment types (e.g. alignment faking) remains untested', 'All training data were single-turn English conversations', 'Most of our quantitative results come from Anthropic's Model Organism.' These are specific scope boundaries."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "Training data and evaluation outputs not available. Promise of future release only (Appendix 8.1)."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section 3.3 describes the 4-stage synthetic data generation process in detail: external model generates task prompts, target model generates responses, external model judges, external model filters. Quality criteria detailed in Appendix 8.1.2."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants. Data is synthetically generated and the benchmark is a standard publicly available artifact (Anthropic's open-source replication)."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": false,
    200         "justification": "The 4-stage pipeline is described conceptually (Section 3.3.1) and final counts per model are given (Table 3: 28,321 total). However, per-stage attrition counts are missing — how many samples entered each stage and how many were filtered at each quality criterion is not documented."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "The Acknowledgements disclose 'funding from Coefficient Giving (formerly Open Philanthropy)' and state that the work was conducted through the MARS program at CAISH and SPAR."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "All author affiliations listed: Saarland University, University of Bergen, independent researchers, UCL, MRM Investments L.L.C. No author is affiliated with Anthropic (whose benchmark is used) or Meta (Llama developer)."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Coefficient Giving (Open Philanthropy) funds AI safety research broadly and does not have a financial stake in SPT specifically succeeding. The funder is independent of the outcome."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests statement in the paper. One author is listed as 'Head of Research and Development at MRM Investments L.L.C.' but no financial interests disclosure is provided."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No training data cutoff date stated for the base Llama-3.3-70B model. The Anthropic benchmark's fictional biases mitigate traditional contamination concerns, but the training cutoff is not stated."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": true,
    234         "justification": "Section 3.3.2 explicitly addresses train-test overlap: 'We excluded all reward hacking-related topics... for on-policy training' and 'None of the samples in our training data contain any of the reward model biases that the poisoned model was trained on' (Section 4.1)."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": true,
    239         "justification": "The Anthropic benchmark uses fictional reward model biases specifically to prevent contamination — the base model cannot know about biases that don't exist in the real world. The paper leverages this property and further ensures SPT training data excludes related topics (Section 3.3.2)."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this study."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "No inference cost, latency, or per-example cost reported. Section 3.2.1 mentions LoRA-mask has a 'strong practical advantage' through KV cache reuse but no quantified cost comparison."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "Section 4.1.1 mentions 'computational cost of 70B-scale experiments' as reason for single runs but does not quantify GPU hours, training time, or total compute budget."
    289       }
    290     },
    291     "experimental_rigor": {
    292       "seed_sensitivity_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "Section 4.1.1: 'Each configuration was trained once due to the computational cost of 70B-scale experiments.' No seed sensitivity analysis."
    296       },
    297       "number_of_runs_stated": {
    298         "applies": true,
    299         "answer": true,
    300         "justification": "Explicitly stated: single training run per configuration (Section 4.1.1), 'n = 1,000–2,000 samples per condition per setting' for evaluation (Table 1 caption), 'n = 720 samples across 9 topics' (Table 2)."
    301       },
    302       "hyperparameter_search_budget": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "No hyperparameter search budget reported for LoRA training. Probe hyperparameters describe a sweep (Section 4.3: 'swept over layers, regularization strength, and activation source') but LoRA training hyperparameter selection is not documented."
    306       },
    307       "best_config_selection_justified": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "Table 1 caption: 'We tested 2-4 different intervention strings per model configuration and report the best.' Only the best intervention string results are shown without reporting all intervention strings tried or how 'best' was selected (validation set vs test set)."
    311       },
    312       "multiple_comparison_correction": {
    313         "applies": false,
    314         "answer": false,
    315         "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable."
    316       },
    317       "self_comparison_bias_addressed": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "Authors evaluate their own method (SPT) against their own baselines and probe implementations. No discussion of self-comparison bias or independent evaluation."
    321       },
    322       "compute_budget_vs_performance": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "No comparison of performance as a function of compute. LoRA-mask variant is noted as more efficient (KV cache reuse) but no quantified compute comparison against no-mask variant or baselines."
    326       },
    327       "benchmark_construct_validity": {
    328         "applies": true,
    329         "answer": true,
    330         "justification": "Section 5 discusses construct validity: 'Although we deem reward hacking as a very good proxy for the general ability to elicit latent knowledge, performance on other misalignment types (e.g. alignment faking) remains untested.' Section 2.4 also discusses why the Anthropic benchmark is a rigorous test."
    331       },
    332       "scaffold_confound_addressed": {
    333         "applies": false,
    334         "answer": false,
    335         "justification": "No scaffolding involved. SPT is a fine-tuning method, not an agentic scaffold."
    336       }
    337     },
    338     "data_leakage": {
    339       "temporal_leakage_addressed": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "No discussion of temporal leakage. The benchmark (Anthropic Auditing Game, 2025) likely postdates Llama-3.3-70B training, but this is not explicitly verified or discussed."
    343       },
    344       "feature_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "The evaluation uses a 50/50 split of clean and poisoned responses (Section 4.1.1). Whether this balanced split or the evaluation framing leaks information about expected behavior is not discussed."
    348       },
    349       "non_independence_addressed": {
    350         "applies": true,
    351         "answer": true,
    352         "justification": "Section 3.3.2 explicitly addresses independence: reward hacking topics excluded from on-policy training, off-policy training uses base (non-poisoned) Llama. The evaluation benchmark uses fictional biases not present in any training data."
    353       },
    354       "leakage_detection_method": {
    355         "applies": true,
    356         "answer": true,
    357         "justification": "Appendix 8.4 introduces a prompt-swap experiment to detect whether performance comes from surface heuristics vs genuine latent knowledge. The topic exclusion in Section 3.3.2 is also a concrete leakage prevention method."
    358       }
    359     }
    360   },
    361   "claims": [
    362     {
    363       "claim": "SPT achieves 96% overall accuracy detecting reward hacks on Anthropic's Auditing Game Model Organism, versus a baseline that detects 0% of poisoned responses (50% overall accuracy).",
    364       "evidence": "Table 1: best SPT model achieves 96.6% on poisoned, 95.3% on clean (96.0% overall). Poisoned baseline: 0% on poisoned, 100% on clean (50% overall). n=1,000-2,000 per condition (Section 4.1.1).",
    365       "supported": "strong"
    366     },
    367     {
    368       "claim": "The honest persona accesses genuine latent knowledge, not just surface patterns.",
    369       "evidence": "Section 4.1.2: reviews reference fictional reward model biases by name. Appendix 8.4: prompt-swap experiment estimates ~4.7% accuracy from latent knowledge vs ~4.5% from surface heuristics. Appendix 8.6: context-free probes reveal specific fictional biases.",
    370       "supported": "moderate"
    371     },
    372     {
    373       "claim": "SPT generalizes across alignment topics without retraining on each specific topic.",
    374       "evidence": "Section 4.2 and Figure 5: cross-topic generalization tested on Gemma-3-12b-it and Qwen3-14b. On-policy model trained without reward hacking topics achieves identical 96.0% on reward hacking detection (Section 4.1.1, Table 1).",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "SPT outperforms linear probes on both in-topic and cross-topic detection.",
    379       "evidence": "Table 2: SPT 93.9% vs probe 88.2% in-topic, SPT 90.6% vs probe 81.0% held-out topics (Qwen3-14b, n=720). Single comparison on one model without significance tests.",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "SPT has zero alignment tax — the honest persona cannot affect the main model's behavior by construction.",
    384       "evidence": "Architectural argument (Section 3.1): LoRA adapter deactivated during main model inference, review stripped from conversation history. This is a design property, not an empirical claim, and holds by construction.",
    385       "supported": "strong"
    386     }
    387   ],
    388   "red_flags": [
    389     {
    390       "flag": "Single training run at scale",
    391       "detail": "Each 70B configuration was trained once (Section 4.1.1). Without variance across runs, it is impossible to know if the 96% result is typical or a lucky seed. The consistency argument across configurations is reasonable but does not replace proper seed sensitivity analysis."
    392     },
    393     {
    394       "flag": "Best intervention string cherry-picking",
    395       "detail": "Table 1 caption: 'We tested 2-4 different intervention strings per model configuration and report the best.' This is a form of selection on the test set unless intervention strings were selected on a separate validation split, which is not described."
    396     },
    397     {
    398       "flag": "No statistical significance tests",
    399       "detail": "All comparisons (SPT vs baselines, LoRA-mask vs no-mask, SPT vs probes) are based on comparing point estimates with no significance tests, despite large sample sizes that would easily support such tests."
    400     },
    401     {
    402       "flag": "Latent knowledge contribution is modest",
    403       "detail": "Appendix 8.4 estimates latent knowledge accounts for ~4.7% accuracy improvement vs ~4.5% from surface heuristics, on a smaller model (Gemma-3-12b-it). The 96% result on Llama-70B may rely more heavily on surface patterns than the framing suggests, and this decomposition was not replicated at 70B scale."
    404     }
    405   ],
    406   "cited_papers": [
    407     {
    408       "title": "Alignment faking in large language models",
    409       "authors": ["R. Greenblatt", "C. Denison", "B. Wright"],
    410       "year": 2024,
    411       "relevance": "Empirical demonstration of strategic deception in LLMs — the threat model that motivates auditing methods like SPT."
    412     },
    413     {
    414       "title": "Auditing language models for hidden objectives",
    415       "authors": ["S. Marks", "J. Treutlein", "T. Bricken"],
    416       "year": 2025,
    417       "relevance": "Defines the Anthropic Auditing Game Model Organism benchmark used as the primary evaluation in this paper."
    418     },
    419     {
    420       "title": "Sleeper agents: Training deceptive LLMs that persist through safety training",
    421       "authors": ["E. Hubinger", "C. Denison", "J. Mu"],
    422       "year": 2024,
    423       "arxiv_id": "2401.05566",
    424       "relevance": "Demonstrates that deceptive behaviors persist through standard safety training, establishing the threat model for SPT."
    425     },
    426     {
    427       "title": "Reasoning models don't always say what they think",
    428       "authors": ["Y. Chen"],
    429       "year": 2025,
    430       "arxiv_id": "2505.05410",
    431       "relevance": "Demonstrates systematic unfaithfulness in model reasoning chains, motivating methods that access latent states."
    432     },
    433     {
    434       "title": "Training llms for honesty via confessions",
    435       "authors": ["M. Joglekar", "J. Chen", "G. Wu"],
    436       "year": 2025,
    437       "relevance": "Related approach training separate honest confessions — SPT differs by using architecturally separated LoRA adapter."
    438     },
    439     {
    440       "title": "Spilling the beans: Teaching llms to self-report their hidden objectives",
    441       "authors": ["Y. Li"],
    442       "year": 2025,
    443       "arxiv_id": "2511.06626",
    444       "relevance": "Self-Report Fine-Tuning (SRFT) for admitting hidden objectives — related honest self-reporting approach compared to SPT."
    445     },
    446     {
    447       "title": "Discovering latent knowledge in language models without supervision",
    448       "authors": ["C. Burns", "H. Ye", "D. Klein", "J. Steinhardt"],
    449       "year": 2023,
    450       "relevance": "Foundational work on eliciting latent knowledge from LLMs, which SPT extends by training an internal persona."
    451     },
    452     {
    453       "title": "LoRA: Low-rank adaptation of large language models",
    454       "authors": ["E. J. Hu", "Y. Shen", "P. Wallis"],
    455       "year": 2022,
    456       "relevance": "The parameter-efficient fine-tuning method used as SPT's delivery mechanism for the honest persona."
    457     },
    458     {
    459       "title": "Language models don't always say what they think: Unfaithful explanations in chain-of-thought prompting",
    460       "authors": ["M. Turpin", "J. Michael", "E. Perez", "S. Bowman"],
    461       "year": 2023,
    462       "relevance": "Demonstrates unfaithful chain-of-thought explanations in LLMs — foundational motivation for accessing latent states."
    463     },
    464     {
    465       "title": "Measuring faithfulness in chain-of-thought reasoning",
    466       "authors": ["T. Lanham", "A. Chen", "A. Radhakrishnan"],
    467       "year": 2023,
    468       "arxiv_id": "2307.13702",
    469       "relevance": "Shows chain-of-thought faithfulness exhibits inverse scaling — larger models are less faithful, motivating latent state access."
    470     },
    471     {
    472       "title": "Activation oracles: Training and evaluating llms as general-purpose activation explainers",
    473       "authors": ["A. Karvonen", "J. Chua"],
    474       "year": 2026,
    475       "relevance": "Related approach training decoders to interpret model activations — SPT differs by using internal persona rather than external decoder."
    476     },
    477     {
    478       "title": "Open problems in mechanistic interpretability",
    479       "authors": ["L. Sharkey", "B. Chughtai", "J. Batson"],
    480       "year": 2025,
    481       "arxiv_id": "2501.16496",
    482       "relevance": "Documents limitations of mechanistic interpretability that SPT aims to complement for auditing."
    483     }
    484   ]
    485 }

Impressum · Datenschutz