scan.json (28688B)
1 { 2 "paper": { 3 "title": "Beyond Token Probes: Hallucination Detection via Activation Tensors with ACT-ViT", 4 "authors": [ 5 "Guy Bar-Shalom", 6 "Fabrizio Frasca", 7 "Yaniv Galron", 8 "Yftah Ziser", 9 "Haggai Maron" 10 ], 11 "year": 2025, 12 "venue": "NeurIPS 2025", 13 "arxiv_id": "2510.00296" 14 }, 15 "checklist": { 16 "artifacts": { 17 "code_released": { 18 "applies": true, 19 "answer": true, 20 "justification": "The abstract provides a GitHub link: https://github.com/BarSGuy/ACT-ViT. The NeurIPS checklist item 5 confirms 'The code to reproduce all our experiments is provided in https://github.com/BarSGuy/ACT-ViT.'" 21 }, 22 "data_released": { 23 "applies": true, 24 "answer": true, 25 "justification": "The paper uses publicly available datasets: TriviaQA, HotpotQA, IMDB, and Movies. Dataset licenses are listed in Section A.2 (CC-BY-SA-4.0, MIT, etc.). The Movies dataset is from [48] and the code repository presumably provides the dataset construction pipeline." 26 }, 27 "environment_specified": { 28 "applies": true, 29 "answer": true, 30 "justification": "Section A states: 'Our experiments were conducted using the PyTorch [50] framework (License: BSD), using a single NVIDIA L-40 GPU for all experiments.' Specific LLM model versions with Hugging Face URLs are listed. Optimizer (AdamW), scheduler (cosine with warmup), batch sizes (128/64), and framework versions are provided. The code release presumably includes dependency specifications." 31 }, 32 "reproduction_instructions": { 33 "applies": true, 34 "answer": true, 35 "justification": "The NeurIPS checklist confirms code is released to reproduce all experiments. Section A provides comprehensive experimental details including hyperparameter grids (Tables 3-6), data splits, optimizer settings, and training procedures sufficient for reproduction." 
36 } 37 }, 38 "statistical_methodology": { 39 "confidence_intervals_or_error_bars": { 40 "applies": true, 41 "answer": true, 42 "justification": "Table 7 reports 1-sigma error bars for the LOS-Net comparison. Table 8 reports ACT-ViT AUC averaged over three seeds with 1-sigma error bars (e.g., '84.48 ± 0.28'). The NeurIPS checklist states: 'We have now added results averaged over 3 random seeds with 1-sigma error bars, reported in Section A.3.'" 43 }, 44 "significance_tests": { 45 "applies": true, 46 "answer": false, 47 "justification": "The paper claims ACT-ViT 'consistently outperforms' baselines and reports improvement margins (Table 1, last row), but no formal significance tests (p-values, paired tests) are used to determine whether observed differences are statistically significant." 48 }, 49 "effect_sizes_reported": { 50 "applies": true, 51 "answer": true, 52 "justification": "The paper reports absolute AUC improvement over baselines with full baseline context. Table 1's last row shows the improvement of ACT-ViT over the best prior baseline for each of the 15 settings (e.g., '+4.04', '+3.43', '-0.58'). Both the raw AUC values and the improvement deltas are provided." 53 }, 54 "sample_size_justified": { 55 "applies": true, 56 "answer": false, 57 "justification": "The paper uses 10,000 training and 10,000 test samples per dataset (Section A.2) and 3 random seeds for error bars, but does not justify why these sizes are sufficient. No power analysis or sample size justification is provided." 58 }, 59 "variance_reported": { 60 "applies": true, 61 "answer": true, 62 "justification": "Table 8 reports ACT-ViT results averaged over 3 random seeds with standard deviation (1-sigma error bars). Table 7 also reports LOS-Net with 1-sigma error bars. However, the main Table 1 results appear to be from single runs for most methods." 
63 } 64 }, 65 "evaluation_design": { 66 "baselines_included": { 67 "applies": true, 68 "answer": true, 69 "justification": "Section 5 ('Methods in comparison') describes multiple baseline categories: probability/logits-based methods (Logit/Probas-mean/min/max), LOS-Net, probing classifiers (Token[n], Probe[*]), and ablated variants of their own method (ACT-MLP, ACT-ViT(s), ACT-MLP(s))." 70 }, 71 "baselines_contemporary": { 72 "applies": true, 73 "answer": true, 74 "justification": "The baselines include LOS-Net [6] (2025, from the same authors), Probe[*] following the setup of Orgad et al. [48] (2024), and probability-based methods that are standard in the field. These represent contemporary approaches." 75 }, 76 "ablation_study": { 77 "applies": true, 78 "answer": true, 79 "justification": "The paper includes systematic ablations: ACT-ViT vs ACT-ViT(s) (multi-dataset vs single-dataset training), ACT-MLP vs ACT-ViT (MLP vs ViT backbone, answering Q(a)3), and Figure 3 ablates pooling hyperparameters (Lp, Np) to study the performance-efficiency trade-off." 80 }, 81 "multiple_metrics": { 82 "applies": true, 83 "answer": false, 84 "justification": "The paper uses only AUC (area under the ROC curve) as the evaluation metric. Section 5 states: 'We use the area under the ROC curve (AUC) to evaluate error detectors, a standard metric in this domain.' No additional metrics (e.g., accuracy, F1, precision, recall) are reported." 85 }, 86 "human_evaluation": { 87 "applies": false, 88 "answer": false, 89 "justification": "This is a method paper for automated hallucination detection using internal model representations. The evaluation is entirely based on automated comparison against ground-truth labels. Human evaluation of the detection outputs is not relevant to the claims." 
90 }, 91 "held_out_test_set": { 92 "applies": true, 93 "answer": true, 94 "justification": "Section A.2 describes: 'a consistent split of 10,000 training samples and 10,000 test samples' with 20% of training used for validation. Hyperparameters are selected on validation sets (Section A.1), and results are reported on held-out test sets." 95 }, 96 "per_category_breakdown": { 97 "applies": true, 98 "answer": true, 99 "justification": "Table 1 provides results broken down by all 15 LLM-dataset combinations (3 LLMs x 5 datasets). Table 2 and Figures 4 and 5 similarly provide per-combination breakdowns. This allows readers to see performance variation across settings." 100 }, 101 "failure_cases_discussed": { 102 "applies": true, 103 "answer": true, 104 "justification": "The paper acknowledges cases where ACT-ViT does not improve: Table 1 shows IMDB on Mis-7B where ACT-ViT slightly underperforms Probe[*] (-0.58 AUC). Table 2 shows Mis-7B/IMDB (-0.03) and LlaMa-8B/Movies (-1.30) where the method falls short. The Limitations section discusses information loss from pooling." 105 }, 106 "negative_results_reported": { 107 "applies": true, 108 "answer": true, 109 "justification": "The paper reports settings where ACT-ViT does not outperform baselines: IMDB on Mis-7B (Table 1, -0.58), and ACT-MLP frequently underperforming Probe[*], which demonstrates the importance of the ViT backbone. The ACT-MLP baseline is shown to be a poor choice, providing a negative result for that design." 
110 } 111 }, 112 "claims_and_evidence": { 113 "abstract_claims_supported": { 114 "applies": true, 115 "answer": true, 116 "justification": "The abstract claims: (1) ACT-ViT 'consistently outperforms traditional probing techniques' — supported by Table 1 (14/15 improvements); (2) 'benefits substantially from multi-LLM training' — supported by ACT-ViT vs ACT-ViT(s) comparison; (3) 'strong zero-shot performance on unseen datasets' — supported by Figure 4; (4) 'transferred effectively to new LLMs' — supported by Table 2. All claims are backed by experimental results." 117 }, 118 "causal_claims_justified": { 119 "applies": true, 120 "answer": true, 121 "justification": "The paper makes causal claims through ablation studies. The comparison of ACT-ViT vs ACT-MLP isolates the effect of the ViT backbone (Q(a)3). The comparison of ACT-ViT vs ACT-ViT(s) isolates multi-dataset training benefits (Q(a)2). Figure 3 ablates pooling parameters. These controlled single-variable manipulations adequately support the causal claims." 122 }, 123 "generalization_bounded": { 124 "applies": true, 125 "answer": true, 126 "justification": "The paper bounds its claims to the tested setting: 3 LLMs (all 7-8B parameter models), 5 datasets, and specifically lists the model names and datasets. The Limitations section acknowledges that pooling 'may discard potentially informative signals.' Claims are generally scoped to the tested combinations." 127 }, 128 "alternative_explanations_discussed": { 129 "applies": true, 130 "answer": false, 131 "justification": "The paper does not substantively discuss alternative explanations for why ACT-ViT outperforms probing classifiers. The architectural contribution is presented without exploring whether simpler alternatives (e.g., non-ViT attention mechanisms, CNN-based approaches on activation tensors) might achieve similar results. The broader impact section (Section D) does not address this." 
132 } 133 }, 134 "setup_transparency": { 135 "model_versions_specified": { 136 "applies": true, 137 "answer": true, 138 "justification": "Section A specifies exact model versions with Hugging Face URLs: 'Mistral-7b-instruct-v0.2' (https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3), 'Llama-3-8b-Instruct' (https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct), and 'Qwen-2.5-7b-Instruct' (https://huggingface.co/Qwen/Qwen2.5-7B-Instruct). These are specific versioned model identifiers." 139 }, 140 "prompts_provided": { 141 "applies": true, 142 "answer": false, 143 "justification": "The paper uses prompts for generating LLM responses (e.g., Section A.2 mentions IMDB uses 'a one-shot prompt to help the large language model apply the predefined sentiment labels accurately'). However, the actual prompt text is not provided in the paper. The paper references the setup of [48] but does not reproduce the prompts." 144 }, 145 "hyperparameters_reported": { 146 "applies": true, 147 "answer": true, 148 "justification": "Detailed hyperparameter grids are provided in Tables 3-6 of Section A.1, including number of layers, learning rate, embedding size, epochs, dropout, weight decay, and patch size for each experimental setup. Optimizer (AdamW), scheduler (cosine with warmup over first 10% of epochs), and batch sizes (128/64) are also specified." 149 }, 150 "scaffolding_described": { 151 "applies": false, 152 "answer": false, 153 "justification": "No agentic scaffolding is used. ACT-ViT is a supervised classification architecture that processes activation tensors; there is no multi-step agent workflow, tool use, or feedback loop." 154 }, 155 "data_preprocessing_documented": { 156 "applies": true, 157 "answer": true, 158 "justification": "Section A.2 documents dataset construction following [48], including train/test splits (10,000/10,000), validation split (20% stratified, seed 42). 
The activation tensor extraction process is formally defined in Equation 1 and Section A. The pooling algorithm is provided in Algorithm 1 (Section B). Movies dataset has 7,857 test samples, noted as an exception." 159 } 160 }, 161 "limitations_and_scope": { 162 "limitations_section_present": { 163 "applies": true, 164 "answer": true, 165 "justification": "Section 6 (Conclusions) includes a dedicated 'Limitations' paragraph discussing that pooling 'may discard potentially informative signals' and that 'Developing more sophisticated approaches to handle AT size without sacrificing signal fidelity remains a promising future direction.'" 166 }, 167 "threats_to_validity_specific": { 168 "applies": true, 169 "answer": false, 170 "justification": "The limitations section is brief and discusses only one specific issue (information loss from pooling). It does not discuss broader threats such as: whether the 7-8B model size is representative of larger models, whether the binary correctness labeling scheme captures all forms of hallucination, or whether the specific datasets chosen may favor certain methods." 171 }, 172 "scope_boundaries_stated": { 173 "applies": true, 174 "answer": false, 175 "justification": "The paper does not explicitly state what the results do NOT show. It does not discuss boundaries such as: results are limited to 7-8B parameter models and may not transfer to much larger models; the correctness labeling is binary and may not capture partial hallucinations; the method requires white-box access which limits practical applicability. The title 'Beyond Token Probes' is broad but results are limited to a specific set of models and tasks." 176 } 177 }, 178 "data_integrity": { 179 "raw_data_available": { 180 "applies": true, 181 "answer": true, 182 "justification": "The datasets used (TriviaQA, HotpotQA, IMDB, Movies) are publicly available with licenses listed in Section A.2. 
The code repository at https://github.com/BarSGuy/ACT-ViT presumably provides the data construction pipeline. Raw activation tensors would be large but the generation procedure is fully documented." 183 }, 184 "data_collection_described": { 185 "applies": true, 186 "answer": true, 187 "justification": "Section 3 ('Dataset construction') formally describes how activation datasets are constructed: queries are fed to LLMs, responses are generated, activation tensors are extracted, and binary hallucination labels are computed by comparing responses against ground truth answers. Section A.2 provides additional dataset-specific details." 188 }, 189 "recruitment_methods_described": { 190 "applies": false, 191 "answer": false, 192 "justification": "No human participants are involved. The data consists of LLM-generated activation tensors from public benchmark datasets. No recruitment is applicable." 193 }, 194 "data_pipeline_documented": { 195 "applies": true, 196 "answer": true, 197 "justification": "The full pipeline is documented: (1) input queries from benchmark datasets → (2) LLM response generation → (3) activation tensor extraction (Equation 1, Section A) → (4) binary label assignment via response comparison → (5) train/test/validation splitting (Section A.2) → (6) pooling (Algorithm 1) → (7) model training. Dataset sizes and split ratios are specified." 198 } 199 }, 200 "conflicts_of_interest": { 201 "funding_disclosed": { 202 "applies": true, 203 "answer": true, 204 "justification": "The Acknowledgements section lists funding sources: 'G.B. is supported by the Jacobs Qualcomm PhD Fellowship. F.F. conducted this work supported by an Aly Kaufman Post-Doctoral Fellowship. H.M. is a Robert J. 
Shillman Fellow and is supported by the Israel Science Foundation through a personal grant (ISF 264/23) and an equipment grant (ISF 532/23).'" 205 }, 206 "affiliations_disclosed": { 207 "applies": true, 208 "answer": true, 209 "justification": "Author affiliations are listed: Technion for Bar-Shalom, Frasca, and Galron; University of Groningen and Nvidia Research for Ziser; Technion and Nvidia Research for Maron. The Nvidia affiliations are disclosed alongside their academic affiliations." 210 }, 211 "funder_independent_of_outcome": { 212 "applies": true, 213 "answer": true, 214 "justification": "The funders (Jacobs Qualcomm Fellowship, Aly Kaufman Fellowship, ISF grants) are academic funding sources with no direct financial interest in whether ACT-ViT outperforms baselines. The paper does not evaluate any Qualcomm, Nvidia, or ISF products." 215 }, 216 "financial_interests_declared": { 217 "applies": true, 218 "answer": false, 219 "justification": "Two authors (Yftah Ziser and Haggai Maron) have Nvidia Research affiliations. There is no competing interests statement or declaration about financial interests (equity, patents, etc.). The absence of a competing interests declaration is noted." 220 } 221 }, 222 "contamination": { 223 "training_cutoff_stated": { 224 "applies": false, 225 "answer": false, 226 "justification": "This paper does not evaluate a pre-trained model's knowledge on a benchmark. It trains a new classifier (ACT-ViT) on activation tensors to detect hallucinations. The LLMs are used to generate activation data, but the evaluation measures ACT-ViT's classification performance, not the LLMs' factual knowledge. Contamination of the LLM training data is not relevant to the claims." 227 }, 228 "train_test_overlap_discussed": { 229 "applies": false, 230 "answer": false, 231 "justification": "Same as above — the paper evaluates ACT-ViT's ability to classify activation tensors, not the LLMs' factual capabilities. 
Train/test overlap of ACT-ViT's own data is addressed through proper splits (Section A.2)." 232 }, 233 "benchmark_contamination_addressed": { 234 "applies": false, 235 "answer": false, 236 "justification": "Same rationale — the paper proposes and evaluates a new classifier architecture, not the underlying LLMs' performance on benchmarks. Benchmark contamination is not relevant to the paper's claims about ACT-ViT." 237 } 238 }, 239 "human_studies": { 240 "pre_registered": { 241 "applies": false, 242 "answer": false, 243 "justification": "No human participants are involved in this study. All experiments are computational." 244 }, 245 "irb_or_ethics_approval": { 246 "applies": false, 247 "answer": false, 248 "justification": "No human participants. The NeurIPS checklist confirms: 'The paper does not involve crowdsourcing nor research with human subjects.'" 249 }, 250 "demographics_reported": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants are involved." 254 }, 255 "inclusion_exclusion_criteria": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants are involved." 259 }, 260 "randomization_described": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants are involved." 264 }, 265 "blinding_described": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants are involved." 269 }, 270 "attrition_reported": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants are involved." 274 } 275 }, 276 "cost_and_practicality": { 277 "inference_cost_reported": { 278 "applies": true, 279 "answer": true, 280 "justification": "The paper reports inference time: 'around 10^-5 seconds' per instance for ACT-ViT (Section 5.1). This is compared against multi-query methods that take 'up to several seconds per instance' (Section 1). Table 9 provides detailed training times for all 15 LLM-dataset combinations." 
281 }, 282 "compute_budget_stated": { 283 "applies": true, 284 "answer": true, 285 "justification": "Section A states: 'Our experiments were conducted using... a single NVIDIA L-40 GPU for all experiments.' Training times are detailed in Table 9 (ranging from ~10 to ~27 minutes per LLM-dataset). Section 5.1 states the full corpus training takes 'below three hours on a single NVIDIA L-40 GPU.'" 286 } 287 } 288 }, 289 "claims": [ 290 { 291 "claim": "ACT-ViT consistently outperforms traditional probing methods across 15 LLM-dataset combinations in in-domain hallucination detection.", 292 "evidence": "Table 1 shows ACT-ViT outperforms all baselines in 14 out of 15 settings, with improvements ranging from +0.54 to +7.05 AUC points over the best prior baseline. The only exception is IMDB on Mis-7B (-0.58).", 293 "supported": "strong" 294 }, 295 { 296 "claim": "Multi-LLM/multi-dataset joint training benefits ACT-ViT: ACT-ViT outperforms ACT-ViT(s) (single-dataset variant) in 12 out of 15 cases.", 297 "evidence": "Table 1 comparison between ACT-ViT and ACT-ViT(s) rows, Section 5.1 discussion.", 298 "supported": "strong" 299 }, 300 { 301 "claim": "ACT-ViT achieves strong zero-shot generalization to unseen datasets, outperforming the best probability-based baseline in 13 out of 15 leave-one-dataset-out settings.", 302 "evidence": "Figure 4 shows zero-shot AUC results across all 15 LLM-dataset combinations in a leave-one-dataset-out setup.", 303 "supported": "strong" 304 }, 305 { 306 "claim": "ACT-ViT can be effectively transferred to unseen LLMs by training only the lightweight Linear Adapter, outperforming baselines in 13 out of 15 cases.", 307 "evidence": "Table 2 reports results when training on 2 of 3 LLMs and adapting to the held-out LLM. 
ACT-ViT shows improvements in 13/15 settings with gains up to +6.70 AUC.", 308 "supported": "strong" 309 }, 310 { 311 "claim": "ACT-ViT requires only ~5-10% of training data to surpass Probe[*] trained on the full dataset.", 312 "evidence": "Figure 5 (Mis-7B, HotpotQA) shows ACT-ViT with 10% of data outperforms Probe[*] at 100%. Section A.6 provides additional low-data results for all 15 combinations.", 313 "supported": "moderate" 314 }, 315 { 316 "claim": "The ViT backbone provides a crucial inductive bias: ACT-MLP baselines are often outperformed by standard probes, while ACT-ViT consistently surpasses them.", 317 "evidence": "Table 1 shows ACT-MLP(s) and ACT-MLP are frequently below Probe[*], while ACT-ViT and ACT-ViT(s) are above. Section 5.1 explicitly answers Q(a)3.", 318 "supported": "strong" 319 } 320 ], 321 "methodology_tags": [ 322 "benchmark-eval" 323 ], 324 "key_findings": "ACT-ViT, a Vision Transformer-inspired architecture that processes full activation tensors from LLMs, consistently outperforms traditional single-position probing classifiers for hallucination detection across 15 LLM-dataset combinations, with improvements up to 7 AUC points. The method supports multi-LLM joint training, which improves over single-dataset training in 12/15 cases. ACT-ViT demonstrates strong zero-shot generalization to unseen datasets and can adapt to new LLMs by training only a lightweight linear adapter while keeping the ViT backbone frozen. Inference takes approximately 10^-5 seconds per instance, making it suitable for real-time deployment.", 325 "red_flags": [ 326 { 327 "flag": "Single evaluation metric", 328 "detail": "All results are reported using only AUC. No secondary metrics (accuracy, F1, precision, recall at operating points) are provided, which limits understanding of practical deployment characteristics." 
329 }, 330 { 331 "flag": "Limited model scale", 332 "detail": "All three LLMs tested are in the 7-8B parameter range (Mistral-7B, Llama-3-8B, Qwen-2.5-7B). No experiments on larger models (13B, 70B) or different architectures are included, yet the method claims to address cross-LLM generalization broadly." 333 }, 334 { 335 "flag": "No significance testing", 336 "detail": "Despite claiming ACT-ViT 'consistently outperforms' baselines, no statistical significance tests are applied. Some margins are small (e.g., +0.54 on HQA-Wc/LlaMa-8B), and the 3-seed results (Table 8) show standard deviations that could overlap with baseline performance in some settings." 337 }, 338 { 339 "flag": "Main results appear to be single runs", 340 "detail": "Table 1, the primary results table, does not include error bars. Error bars are provided only separately: Table 8 (Section A.3) for ACT-ViT over 3 seeds and Table 7 for the LOS-Net comparison; the other baselines have none. This makes direct statistical comparison against most baselines impossible." 341 } 342 ], 343 "cited_papers": [ 344 { 345 "title": "LLMs know more than they show: On the intrinsic representation of LLM hallucinations", 346 "authors": ["Hadas Orgad", "Michael Toker", "Zorik Gekhman", "Roi Reichart", "Idan Szpektor", "Hadas Kotek", "Yonatan Belinkov"], 347 "year": 2024, 348 "arxiv_id": "2410.02707", 349 "relevance": "Key prior work on probing classifiers for hallucination detection that established the experimental framework and datasets used in this paper." 350 }, 351 { 352 "title": "Learning on LLM Output Signatures for Gray-Box LLM Behavior Analysis", 353 "authors": ["Guy Bar-Shalom", "Fabrizio Frasca", "Derek Lim", "Yoav Gelberg", "Yftah Ziser", "Ran El-Yaniv", "Gal Chechik", "Haggai Maron"], 354 "year": 2025, 355 "arxiv_id": "2503.14043", 356 "relevance": "LOS-Net baseline from same research group, a learnable approach on output probabilities for LLM behavior analysis." 
357 }, 358 { 359 "title": "The internal state of an LLM knows when it's lying", 360 "authors": ["Amos Azaria", "Tom Mitchell"], 361 "year": 2023, 362 "arxiv_id": "2304.13734", 363 "relevance": "Early work on using LLM internal states for detecting false outputs, demonstrating variability in the best probing layer." 364 }, 365 { 366 "title": "Semantic uncertainty: Linguistic invariances for uncertainty estimation in natural language generation", 367 "authors": ["Lorenz Kuhn", "Yarin Gal", "Sebastian Farquhar"], 368 "year": 2023, 369 "arxiv_id": "2302.09664", 370 "relevance": "Influential approach to hallucination detection using semantic uncertainty from multiple LLM queries, representative of the multi-query methods this work contrasts with on inference cost." 371 }, 372 { 373 "title": "Discovering latent knowledge in language models without supervision", 374 "authors": ["Collin Burns", "Haotian Ye", "Dan Klein", "Jacob Steinhardt"], 375 "year": 2022, 376 "arxiv_id": "2212.03827", 377 "relevance": "Foundational work on extracting truthfulness signals from LLM internal representations without supervised labels." 378 }, 379 { 380 "title": "The geometry of truth: Emergent linear structure in large language model representations of true/false datasets", 381 "authors": ["Samuel Marks", "Max Tegmark"], 382 "year": 2023, 383 "arxiv_id": "2310.06824", 384 "relevance": "Demonstrates linear structure in LLM representations related to truthfulness, supporting the use of linear adapters in ACT-ViT." 385 }, 386 { 387 "title": "The Platonic Representation Hypothesis", 388 "authors": ["Minyoung Huh", "Brian Cheung", "Tongzhou Wang", "Phillip Isola"], 389 "year": 2024, 390 "arxiv_id": "2405.07987", 391 "relevance": "Theoretical motivation for multi-LLM training via shared representation spaces, a key hypothesis underlying ACT-ViT's cross-LLM approach." 
392 }, 393 { 394 "title": "An image is worth 16x16 words: Transformers for image recognition at scale", 395 "authors": ["Alexey Dosovitskiy", "Lucas Beyer", "Alexander Kolesnikov"], 396 "year": 2020, 397 "arxiv_id": "2010.11929", 398 "relevance": "The Vision Transformer (ViT) architecture that directly inspires ACT-ViT's backbone design for processing activation tensors." 399 }, 400 { 401 "title": "Representation engineering: A top-down approach to AI transparency", 402 "authors": ["Andy Zou", "Long Phan", "Sarah Chen", "James Campbell"], 403 "year": 2023, 404 "arxiv_id": "2310.01405", 405 "relevance": "Representation engineering approach for AI transparency and control, related to using internal representations for understanding LLM behavior." 406 }, 407 { 408 "title": "INSIDE: LLMs' internal states retain the power of hallucination detection", 409 "authors": ["Chao Chen", "Kai Liu", "Ze Chen"], 410 "year": 2024, 411 "arxiv_id": "2402.03744", 412 "relevance": "Prior work on hallucination detection using LLM internal states, part of the growing literature this paper builds upon." 413 }, 414 { 415 "title": "Inference-time intervention: Eliciting truthful answers from a language model", 416 "authors": ["Kenneth Li", "Oam Patel", "Fernanda Viégas", "Hanspeter Pfister", "Martin Wattenberg"], 417 "year": 2024, 418 "relevance": "Uses probing of internal representations to improve LLM truthfulness at inference time, related to the interpretability and intervention direction." 419 }, 420 { 421 "title": "SelfCheckGPT: Zero-resource black-box hallucination detection for generative large language models", 422 "authors": ["Potsawee Manakul", "Adian Liusie", "Mark JF Gales"], 423 "year": 2023, 424 "arxiv_id": "2303.08896", 425 "relevance": "A black-box hallucination detection approach that contrasts with the white-box approach of ACT-ViT." 426 } 427 ] 428 }