scan.json (27831B)
1 { 2 "paper": { 3 "title": "A Single Direction of Truth: An Observer Model's Linear Residual Probe Exposes and Steers Contextual Hallucinations", 4 "authors": [ 5 "Charles O'Neill", 6 "Slava Chalnev", 7 "Chi Chi Zhao", 8 "Max Kirkby", 9 "Mudith Jayasekara" 10 ], 11 "year": 2025, 12 "venue": "ICML 2025 (PMLR 267)", 13 "arxiv_id": "2507.23221", 14 "doi": "10.48550/arXiv.2507.23221" 15 }, 16 "scan_version": 2, 17 "active_modules": ["experimental_rigor", "data_leakage"], 18 "methodology_tags": ["benchmark-eval"], 19 "key_findings": "A linear probe on a frozen observer model's residual stream detects contextual hallucinations in a single forward pass, achieving F1 up to 0.99 on news summarisation and 0.84 on the synthetic ContraTales benchmark, outperforming lexical, entity, semantic similarity, and attention baselines by 5-27 points. The hallucination signal localises to a sparse, consistent late-layer MLP sub-circuit (layers 7-9 for a layer 10 probe on Gemma-2-9B). Causal steering experiments show that injecting or ablating the probe direction in a generator model bidirectionally modulates hallucination rates, establishing functional causality. Unsupervised domain adaptation on correct-only examples improves probe F1 from 0.75 to 0.89 on ContraTales.", 20 "checklist": { 21 "artifacts": { 22 "code_released": { 23 "applies": true, 24 "answer": false, 25 "justification": "No repository URL or code archive is provided anywhere in the paper. The paper mentions releasing the ContraTales benchmark but provides no link to code." 26 }, 27 "data_released": { 28 "applies": true, 29 "answer": false, 30 "justification": "The paper states 'we release the 2000-example CONTRATALES benchmark' but provides no download URL or repository link in the paper text." 
31 }, 32 "environment_specified": { 33 "applies": true, 34 "answer": false, 35 "justification": "The paper mentions '8xH200 GPUs' for finetuning (Section 4.5) but provides no requirements.txt, Dockerfile, or library version details." 36 }, 37 "reproduction_instructions": { 38 "applies": true, 39 "answer": false, 40 "justification": "No step-by-step reproduction instructions are provided. The methodology sections describe the approach but not how to replicate the experiments." 41 } 42 }, 43 "statistical_methodology": { 44 "confidence_intervals_or_error_bars": { 45 "applies": true, 46 "answer": true, 47 "justification": "The paper reports 95% bootstrap confidence intervals (whiskers in Figure 2) and ± notation for F1 scores, e.g., '0.97±0.01 F1 on XSUM' and '0.75 ± 0.04 F1' on ContraTales (Section 4.2)." 48 }, 49 "significance_tests": { 50 "applies": true, 51 "answer": false, 52 "justification": "Despite claiming the probe 'outperforms baselines by 5-27 points', no statistical significance tests are reported. Comparisons rely solely on point estimates with confidence intervals." 53 }, 54 "effect_sizes_reported": { 55 "applies": true, 56 "answer": true, 57 "justification": "Effect sizes are reported as absolute F1 differences: '5-8 points' over Lookback Lens on news, '9-27 points' on ContraTales (Section 4.2). Baseline and probe scores are both provided." 58 }, 59 "sample_size_justified": { 60 "applies": true, 61 "answer": false, 62 "justification": "Dataset sizes are stated (1000 for CNN/DM, 1000 for XSum, 2000 for ContraTales) but no justification for why these sizes are sufficient. The steering experiment uses 128 summaries per α value with no power analysis." 63 }, 64 "variance_reported": { 65 "applies": true, 66 "answer": true, 67 "justification": "5-fold cross-validation is used with reported standard deviations (e.g., '0.75 ± 0.04 F1'). The layer sweep in Figure 1 shows F1 across all layers for multiple models." 
68 } 69 }, 70 "evaluation_design": { 71 "baselines_included": { 72 "applies": true, 73 "answer": true, 74 "justification": "Four baselines are compared: lexical overlap, entity verification, semantic similarity, and Lookback Lens (Section 3.3, Figure 2)." 75 }, 76 "baselines_contemporary": { 77 "applies": true, 78 "answer": true, 79 "justification": "Lookback Lens (Chuang et al., 2024) is recent. Semantic Entropy Probes (Kossen et al., 2024) are discussed in related work. The baselines represent current detection approaches." 80 }, 81 "ablation_study": { 82 "applies": true, 83 "answer": true, 84 "justification": "Layer sweeps (Figure 1) serve as ablation across probe positions. Cross-domain transfer experiments (Figure 3) ablate the training domain. Finetuning experiments (Section 4.5) ablate observer adaptation. Attribution analysis (Section 4.3) identifies contributing components." 85 }, 86 "multiple_metrics": { 87 "applies": true, 88 "answer": false, 89 "justification": "Only F1 score is used as the detection metric. No precision, recall, accuracy, or AUROC are reported separately, though F1 implicitly combines precision and recall." 90 }, 91 "human_evaluation": { 92 "applies": true, 93 "answer": false, 94 "justification": "No human evaluation of detection quality is performed. The steering experiment uses GPT-4.1 as a judge (Section 4.4) rather than human evaluators. For a hallucination detection paper, human validation of detected hallucinations would be relevant." 95 }, 96 "held_out_test_set": { 97 "applies": true, 98 "answer": true, 99 "justification": "5-fold cross-validation is used throughout (Section 3.2): 'Performance is assessed via 5-fold cross-validation.' Layer selection uses inner-fold validation on training set." 
100 }, 101 "per_category_breakdown": { 102 "applies": true, 103 "answer": true, 104 "justification": "Results are broken down by dataset (CNN/DM, XSum, ContraTales), by model size (Gemma-2 2B/9B/27B, GPT-2-small), and by layer (Figures 1, 4). Cross-domain transfer is shown separately (Figure 3)." 105 }, 106 "failure_cases_discussed": { 107 "applies": true, 108 "answer": true, 109 "justification": "Section 4.3 discusses probe activations on unrelated text from the Pile, identifying that lowest activations correspond to repetitive text (Table 4, Appendix D). The lower F1 on ContraTales vs news is discussed as a limitation." 110 }, 111 "negative_results_reported": { 112 "applies": true, 113 "answer": true, 114 "justification": "Per-head attention attributions 'showed fluctuations around zero, lacking layer-consistent signs' (Section 4.3) — a negative result showing attention patterns are not useful. The steering trade-off where reducing hallucinations increases repetition (Section 4.4) is also a negative finding." 115 } 116 }, 117 "claims_and_evidence": { 118 "abstract_claims_supported": { 119 "applies": true, 120 "answer": true, 121 "justification": "Abstract claims of 5-27 point improvement, mid-layer performance plateau across Gemma-2 sizes, sparse late-layer MLP attribution, and causal steering are all supported by results in Sections 4.1-4.5." 122 }, 123 "causal_claims_justified": { 124 "applies": true, 125 "answer": true, 126 "justification": "The paper makes causal claims about steering and supports them with interventional experiments (Section 4.4): injecting the probe direction at varying strengths (α from -60 to +60) and measuring hallucination/repetition rates. This is a controlled single-variable manipulation." 127 }, 128 "generalization_bounded": { 129 "applies": true, 130 "answer": false, 131 "justification": "The title claims 'A Single Direction of Truth' but results are limited to Gemma-2 models (2B-27B) plus GPT-2-small. 
The abstract's 'generator-agnostic' claim is tested only with hallucinations generated by gpt-4.1-mini and o4-mini. The paper acknowledges this in limitations but the title and abstract are broader than the evidence." 132 }, 133 "alternative_explanations_discussed": { 134 "applies": true, 135 "answer": true, 136 "justification": "The Limitations section discusses: synthetic hallucination artifacts (probes may learn generation-specific patterns), LLM judge noise/bias for steering evaluation, and applicability only to intrinsic hallucinations. Section 4.2 also discusses potential lexical cue exploitation." 137 }, 138 "proxy_outcome_distinction": { 139 "applies": true, 140 "answer": true, 141 "justification": "The paper is clear about what it measures (F1 on synthetic hallucination detection) and acknowledges in Limitations that synthetically generated hallucinations 'may exhibit patterns or artifacts predictable to an observer model' that differ from real-world hallucinations." 142 } 143 }, 144 "setup_transparency": { 145 "model_versions_specified": { 146 "applies": true, 147 "answer": false, 148 "justification": "Models are specified as 'Gemma-2-2B', 'Gemma-2-9B', 'Gemma-2-27B', 'GPT-2-small', 'gpt-4.1', 'gpt-4.1-mini', 'o4-mini', 'Claude Opus'. No snapshot dates, specific version hashes, or API versions are provided for any model." 149 }, 150 "prompts_provided": { 151 "applies": true, 152 "answer": true, 153 "justification": "Full prompt text for hallucination generation, factual generation, and hallucination evaluation is provided in Appendix A (Sections A.1, A.2, A.3)." 154 }, 155 "hyperparameters_reported": { 156 "applies": true, 157 "answer": true, 158 "justification": "Finetuning hyperparameters are reported: 'AdamW, LR 1×10^-5, context length 512, batch size 8, 8xH200 GPUs, no dropout' (Section 4.5). Logistic regression uses L2 regularisation. Steering uses greedy decoding with α range specified."
159 }, 160 "scaffolding_described": { 161 "applies": false, 162 "answer": false, 163 "justification": "No agentic scaffolding is used. The approach is a single forward pass through a frozen model plus a linear probe." 164 }, 165 "data_preprocessing_documented": { 166 "applies": true, 167 "answer": true, 168 "justification": "Section 3.1 describes data preparation: how factual and hallucinated continuations were generated, sentence length constraints (≤40 words), and concatenation for observer input. Appendix B describes ContraTales generation process with seed examples." 169 } 170 }, 171 "limitations_and_scope": { 172 "limitations_section_present": { 173 "applies": true, 174 "answer": true, 175 "justification": "A substantial 'Limitations' subsection appears within Section 5 (Discussion), spanning approximately a full column of text with multiple specific limitations." 176 }, 177 "threats_to_validity_specific": { 178 "applies": true, 179 "answer": true, 180 "justification": "Specific threats are discussed: synthetic hallucinations may have detectable artifacts, GPT-4.1 judge may introduce noise/bias in steering evaluation, computational cost of deploying observer model, limitation to intrinsic hallucinations only." 181 }, 182 "scope_boundaries_stated": { 183 "applies": true, 184 "answer": true, 185 "justification": "The paper explicitly states: 'our study focuses specifically on intrinsic hallucinations that contradict or are unsupported by the provided source context; the applicability... to extrinsic hallucinations... remains untested' (Limitations)." 186 } 187 }, 188 "data_integrity": { 189 "raw_data_available": { 190 "applies": true, 191 "answer": false, 192 "justification": "No data download link is provided despite claiming to release ContraTales. Raw activations, probe weights, and generated continuations are not available for verification." 
193 }, 194 "data_collection_described": { 195 "applies": true, 196 "answer": true, 197 "justification": "Data generation is described in detail: CNN/DM and XSum are standard datasets; hallucinated/factual continuations generated via gpt-4.1-mini with specific prompts (Appendix A); ContraTales generated via o4-mini with seed examples (Appendix B)." 198 }, 199 "recruitment_methods_described": { 200 "applies": false, 201 "answer": false, 202 "justification": "No human participants. Data sources are standard benchmarks (CNN/DM, XSum) and synthetic generation (ContraTales)." 203 }, 204 "data_pipeline_documented": { 205 "applies": true, 206 "answer": true, 207 "justification": "The pipeline from source datasets through continuation generation, concatenation, forward pass, activation extraction, to probe training is documented across Sections 3.1-3.2 and Appendix A-B. Dataset sizes are stated (Table 1)." 208 } 209 }, 210 "conflicts_of_interest": { 211 "funding_disclosed": { 212 "applies": true, 213 "answer": false, 214 "justification": "No funding source is disclosed anywhere in the paper. No acknowledgments section listing grants or sponsors." 215 }, 216 "affiliations_disclosed": { 217 "applies": true, 218 "answer": true, 219 "justification": "Author affiliations are listed: four authors from Parsed (London, UK), one independent researcher. Correspondence address at Parsed (charles@parsed.com)." 220 }, 221 "funder_independent_of_outcome": { 222 "applies": true, 223 "answer": false, 224 "justification": "With no funding disclosure, independence cannot be assessed. Parsed is a company whose products could benefit from hallucination detection technology, creating a potential undisclosed interest." 225 }, 226 "financial_interests_declared": { 227 "applies": true, 228 "answer": false, 229 "justification": "No competing interests or financial disclosure statement is present. 
Four of five authors are affiliated with Parsed, a commercial entity that could have financial interest in hallucination detection technology." 230 } 231 }, 232 "contamination": { 233 "training_cutoff_stated": { 234 "applies": true, 235 "answer": false, 236 "justification": "No training data cutoff dates are stated for Gemma-2 models or GPT-2. The paper does not mention when these models' training data was collected." 237 }, 238 "train_test_overlap_discussed": { 239 "applies": true, 240 "answer": false, 241 "justification": "CNN/DM and XSum are well-known public datasets that Gemma-2 likely encountered in training. No discussion of whether the observer models saw these datasets during pre-training." 242 }, 243 "benchmark_contamination_addressed": { 244 "applies": true, 245 "answer": false, 246 "justification": "CNN/DM (2017) and XSum (2018) are public benchmarks likely in Gemma-2's training data. The paper does not discuss whether pre-training exposure affects the observer's ability to detect hallucinations on these datasets." 247 } 248 }, 249 "human_studies": { 250 "pre_registered": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants in this study." 254 }, 255 "irb_or_ethics_approval": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants in this study." 259 }, 260 "demographics_reported": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants in this study." 264 }, 265 "inclusion_exclusion_criteria": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants in this study." 269 }, 270 "randomization_described": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants in this study." 274 }, 275 "blinding_described": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants in this study." 
279 }, 280 "attrition_reported": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants in this study." 284 } 285 }, 286 "cost_and_practicality": { 287 "inference_cost_reported": { 288 "applies": true, 289 "answer": false, 290 "justification": "No inference cost, latency, or tokens consumed are reported. The paper claims the approach is 'efficient' and requires only 'a single forward pass' but provides no wall-clock time or cost figures." 291 }, 292 "compute_budget_stated": { 293 "applies": true, 294 "answer": false, 295 "justification": "Hardware is mentioned for finetuning ('8xH200 GPUs') but no total GPU hours, training time, or API costs for generating the datasets are reported." 296 } 297 }, 298 "experimental_rigor": { 299 "seed_sensitivity_reported": { 300 "applies": true, 301 "answer": true, 302 "justification": "5-fold cross-validation is used with 'a fixed random seed... for data splitting and sampling across all experiments' (Section 3.2). Results are reported with variance across folds." 303 }, 304 "number_of_runs_stated": { 305 "applies": true, 306 "answer": true, 307 "justification": "5-fold cross-validation is explicitly stated (Section 3.2). Steering experiments use 128 summaries per α value (Section 4.4)." 308 }, 309 "hyperparameter_search_budget": { 310 "applies": true, 311 "answer": false, 312 "justification": "Layer selection uses inner-fold validation but no hyperparameter search budget is stated. The L2 regularisation strength for the logistic probe is not discussed as a tuned parameter." 313 }, 314 "best_config_selection_justified": { 315 "applies": true, 316 "answer": true, 317 "justification": "Layer selection is done via 'inner-fold validation on the training set' (Section 3.2), which is a principled approach that avoids test set leakage." 
318 }, 319 "multiple_comparison_correction": { 320 "applies": true, 321 "answer": false, 322 "justification": "Multiple comparisons are made across datasets, models, layers, and baselines without any correction for multiple testing." 323 }, 324 "self_comparison_bias_addressed": { 325 "applies": true, 326 "answer": false, 327 "justification": "The authors propose and evaluate their own method against baselines. They re-implement baselines (lexical overlap, entity verification, etc.) without acknowledging that their implementations may disadvantage baselines." 328 }, 329 "compute_budget_vs_performance": { 330 "applies": true, 331 "answer": false, 332 "justification": "The linear probe requires running a full Gemma-2 forward pass while baselines like lexical overlap require no neural network inference. This compute disparity is not quantified or discussed." 333 }, 334 "benchmark_construct_validity": { 335 "applies": true, 336 "answer": true, 337 "justification": "The Limitations section discusses construct validity: synthetic hallucinations 'may exhibit patterns or artifacts predictable to an observer model' and may not represent real-world hallucinations. ContraTales is introduced specifically to address construct validity concerns about lexical-overlap-based detection." 338 }, 339 "scaffold_confound_addressed": { 340 "applies": false, 341 "answer": false, 342 "justification": "No scaffolding is involved; the method is a single forward pass plus linear probe." 343 } 344 }, 345 "data_leakage": { 346 "temporal_leakage_addressed": { 347 "applies": true, 348 "answer": false, 349 "justification": "CNN/DM (2017) and XSum (2018) predate Gemma-2's training. The observer model likely saw these datasets during pre-training, which could inflate detection performance. This is not discussed." 
350 }, 351 "feature_leakage_addressed": { 352 "applies": true, 353 "answer": false, 354 "justification": "A potential feature leakage concern: both the hallucinated and the factual continuations are generated by gpt-4.1-mini (with different prompts). The probe might detect stylistic differences between the two generation modes rather than actual hallucination. The Limitations section acknowledges this partially but does not test for it." 355 }, 356 "non_independence_addressed": { 357 "applies": true, 358 "answer": false, 359 "justification": "Paired factual and hallucinated examples are generated from the same source articles. Whether cross-validation splits respect article-level independence is not discussed." 360 }, 361 "leakage_detection_method": { 362 "applies": true, 363 "answer": false, 364 "justification": "No concrete leakage detection method is applied. No decontamination, canary strings, or membership inference tests." 365 } 366 } 367 }, 368 "claims": [ 369 { 370 "claim": "A linear probe on an observer model's residual stream outperforms heuristic detectors by 5-27 F1 points across datasets.", 371 "evidence": "Figure 2 and Section 4.2: linear probe achieves 0.99 F1 on CNN/DM, 0.97 on XSum, 0.75 on ContraTales vs best baseline (Lookback Lens at 0.94, 0.89, 0.48 respectively).", 372 "supported": "strong" 373 }, 374 { 375 "claim": "A single linear direction in residual-stream activation space separates hallucinated from supported spans, with consistent mid-layer performance plateau across model sizes.", 376 "evidence": "Figure 1 shows layer-wise F1 for Gemma-2 2B/9B/27B and GPT-2-small, all showing mid-layer plateau pattern (Section 4.1).", 377 "supported": "strong" 378 }, 379 { 380 "claim": "The hallucination signal localises to a sparse, consistent late-layer MLP sub-circuit (layers 7-9 for layer 10 probe).", 381 "evidence": "Figure 4 shows MLP attributions consistent across CNN/DM, XSum, and ContraTales.
Attention attributions show no consistent pattern (Section 4.3).", 382 "supported": "moderate" 383 }, 384 { 385 "claim": "Manipulating the probe direction causally steers hallucination rates in generation.", 386 "evidence": "Figure 5 shows bidirectional control: α=+60 increases hallucination to 0.86, α=-60 reduces to 0.35 (Section 4.4). However, hallucination is judged by GPT-4.1, not humans.", 387 "supported": "moderate" 388 }, 389 { 390 "claim": "Unsupervised domain adaptation (finetuning on correct-only examples) improves probe F1 by 0.10-0.17 across models.", 391 "evidence": "Figure 6 shows improvements: GPT-2-small +0.10, Gemma-2-2B +0.17, Gemma-2-9B +0.14 (Section 4.5).", 392 "supported": "strong" 393 }, 394 { 395 "claim": "The hallucination direction transfers across news domains with minimal accuracy loss.", 396 "evidence": "Figure 3 shows cross-domain transfer CNN/DM→XSum and XSum→CNN/DM with the linear probe maintaining high F1 while surface-cue baselines degrade substantially (Section 4.3).", 397 "supported": "strong" 398 } 399 ], 400 "red_flags": [ 401 { 402 "flag": "Synthetic hallucination artifacts", 403 "detail": "All hallucinations are generated by gpt-4.1-mini or o4-mini. The probe may learn to detect stylistic signatures of prompted hallucination generation rather than genuine contextual inconsistency. The paper acknowledges this but does not test with naturally occurring hallucinations." 404 }, 405 { 406 "flag": "LLM-as-judge for causal claim", 407 "detail": "The steering experiment's hallucination rate is judged by GPT-4.1, introducing potential noise and bias into the key causality claim. No human validation of the judge's accuracy is provided." 408 }, 409 { 410 "flag": "Compute-unfair baseline comparison", 411 "detail": "The linear probe requires a full Gemma-2 forward pass (billions of parameters) while baselines like lexical overlap and entity verification are essentially free. 
The comparison is framed as method quality rather than cost-effectiveness." 412 }, 413 { 414 "flag": "No contamination analysis for observer models", 415 "detail": "Gemma-2 models were likely pre-trained on CNN/DM and XSum articles. The high F1 on these datasets (0.97-0.99) could partly reflect memorisation of the source articles rather than hallucination detection capability." 416 }, 417 { 418 "flag": "Company evaluating commercially relevant technology", 419 "detail": "Four of five authors are from Parsed, a commercial entity. Hallucination detection is commercially valuable. No competing interests statement or funding disclosure is provided." 420 } 421 ], 422 "cited_papers": [ 423 { 424 "title": "A Survey on Hallucination in Large Language Models: Principles, Taxonomy, Challenges, and Open Questions", 425 "authors": ["L. Huang", "W. Yu", "W. Ma"], 426 "year": 2024, 427 "doi": "10.1145/3703155", 428 "relevance": "Comprehensive survey on LLM hallucination taxonomy and detection, directly relevant to understanding the hallucination problem space." 429 }, 430 { 431 "title": "Detecting hallucinations in large language models using semantic entropy", 432 "authors": ["S. Farquhar", "J. Kossen", "L. Kuhn", "Y. Gal"], 433 "year": 2024, 434 "doi": "10.1038/s41586-024-07421-0", 435 "relevance": "Nature paper on semantic entropy for hallucination detection, key competing approach." 436 }, 437 { 438 "title": "Semantic Entropy Probes: Robust and Cheap Hallucination Detection in LLMs", 439 "authors": ["J. Kossen", "J. Han", "M. Razzak"], 440 "year": 2024, 441 "arxiv_id": "2406.15927", 442 "relevance": "Lightweight probe-based hallucination detection, directly comparable approach." 443 }, 444 { 445 "title": "Lookback Lens: Detecting and Mitigating Contextual Hallucinations in Large Language Models Using Only Attention Maps", 446 "authors": ["Y.-S. Chuang", "L. Qiu", "C.-Y. 
Hsieh"], 447 "year": 2024, 448 "arxiv_id": "2407.07071", 449 "relevance": "Attention-based hallucination detection baseline used in this paper's comparisons." 450 }, 451 { 452 "title": "SelfCheckGPT: Zero-Resource Black-Box Hallucination Detection for Generative Large Language Models", 453 "authors": ["P. Manakul", "A. Liusie", "M. J. F. Gales"], 454 "year": 2023, 455 "doi": "10.18653/v1/2023.emnlp-main.557", 456 "relevance": "Sampling-based consistency approach for hallucination detection, key related work." 457 }, 458 { 459 "title": "The Internal State of an LLM Knows When It's Lying", 460 "authors": ["A. Azaria", "T. Mitchell"], 461 "year": 2023, 462 "doi": "10.18653/v1/2023.findings-emnlp.68", 463 "relevance": "Probing internal states for truthfulness detection, foundational related work." 464 }, 465 { 466 "title": "Discovering Latent Knowledge in Language Models Without Supervision", 467 "authors": ["C. Burns", "H. Ye", "D. Klein", "J. Steinhardt"], 468 "year": 2023, 469 "relevance": "Unsupervised discovery of truth directions in LLMs, foundational to the linear representation hypothesis tested here." 470 }, 471 { 472 "title": "Cost-Effective Hallucination Detection for LLMs", 473 "authors": ["S. Valentin", "J. Fu", "G. Detommaso"], 474 "year": 2024, 475 "arxiv_id": "2407.21424", 476 "relevance": "Cost-effective hallucination detection approaches, relevant to practical deployment considerations." 477 }, 478 { 479 "title": "Trust Me, I'm Wrong: High-Certainty Hallucinations in LLMs", 480 "authors": ["A. Simhi", "I. Itzhak", "F. Barez"], 481 "year": 2025, 482 "arxiv_id": "2502.12964", 483 "relevance": "Demonstrates that models can hallucinate with high confidence, challenging uncertainty-based detection approaches." 484 }, 485 { 486 "title": "Sleeper agents: Training deceptive llms that persist through safety training", 487 "authors": ["E. Hubinger", "C. Denison", "J. 
Mu"], 488 "year": 2024, 489 "relevance": "Demonstrates deceptive behaviors in LLMs detectable via internal probes, relevant to interpretability-based safety." 490 } 491 ] 492 }