scan-v5.json (26668B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "LLMs Encode Their Failures: Predicting Success from Pre-Generation Activations", 6 "authors": [ 7 "William Lugoloobi", 8 "Thomas Foster", 9 "William Bankes", 10 "Chris Russell" 11 ], 12 "year": 2026, 13 "venue": "arXiv", 14 "arxiv_id": "2602.09924", 15 "doi": null 16 }, 17 "checklist": { 18 "claims_and_evidence": { 19 "abstract_claims_supported": { 20 "applies": true, 21 "answer": true, 22 "justification": "All major claims (difficulty divergence, probe outperformance, 70% cost reduction) are directly supported by Tables 1-2, Figure 1, and Figures 3-4 with numerical evidence.", 23 "source": "haiku" 24 }, 25 "causal_claims_justified": { 26 "applies": true, 27 "answer": false, 28 "justification": "Mechanistic claims about what models 'encode' are inferred from probe correlations, not interventional. Routing claims are demonstrated but causality not proven—no ablations on what information probes actually use.", 29 "source": "haiku" 30 }, 31 "generalization_bounded": { 32 "applies": true, 33 "answer": true, 34 "justification": "Testing spans 5 benchmarks (E2H-AMC, MATH, GSM8K, AIME, LiveCodeBench), 4 model families. Paper explicitly bounds scope: 'We do not study cross-domain or cross-dataset probe transfer... leaving open questions about generalization.'", 35 "source": "haiku" 36 }, 37 "alternative_explanations_discussed": { 38 "applies": true, 39 "answer": false, 40 "justification": "One mechanism proposed (output length correlates with human difficulty) but alternatives not systematically considered. E.g., could probes just be picking up on question complexity proxies rather than true success likelihood?", 41 "source": "haiku" 42 }, 43 "proxy_outcome_distinction": { 44 "applies": true, 45 "answer": true, 46 "justification": "Clear distinction between human IRT difficulty, expected success rate, and binary policy success. 
Paper states 'model-specific difficulty provides a more reliable predictor than human difficulty' with explicit measurement definitions.", 47 "source": "haiku" 48 } 49 }, 50 "limitations_and_scope": { 51 "limitations_section_present": { 52 "applies": true, 53 "answer": true, 54 "justification": "Section 4.4 explicitly titled 'CONCLUSION, LIMITATIONS AND FUTURE WORK' with dedicated limitations subsection.", 55 "source": "haiku" 56 }, 57 "threats_to_validity_specific": { 58 "applies": true, 59 "answer": true, 60 "justification": "Specific threats: 'Probe performance degrades under extended reasoning'; 'We focus on linear probes at single post-instruction position'; 'We do not study cross-domain probe transfer'; 'routing policies are intentionally simple.'", 61 "source": "haiku" 62 }, 63 "scope_boundaries_stated": { 64 "applies": true, 65 "answer": true, 66 "justification": "Explicitly states boundaries: no non-linear probes, no cross-dataset transfer, no adaptive routing, no vision/multimodal domains. Future work section delineates what remains open.", 67 "source": "haiku" 68 } 69 }, 70 "conflicts_of_interest": { 71 "funding_disclosed": { 72 "applies": true, 73 "answer": false, 74 "justification": "Preprint lists no funding statement or acknowledgments. No source of financial support mentioned.", 75 "source": "haiku" 76 }, 77 "affiliations_disclosed": { 78 "applies": true, 79 "answer": true, 80 "justification": "Authors list Oxford Internet Institute, FLAIR/Oxford, and UCL affiliations. No company relationships with evaluated models disclosed.", 81 "source": "haiku" 82 }, 83 "funder_independent_of_outcome": { 84 "applies": false, 85 "answer": false, 86 "justification": "No funder disclosed, so independence cannot be assessed. Mark as NA—no financial stake evident.", 87 "source": "haiku" 88 }, 89 "financial_interests_declared": { 90 "applies": true, 91 "answer": false, 92 "justification": "No competing interests statement. 
No mention of patents, equity stakes, or consulting relationships with OpenAI, Anthropic, or other model providers.", 93 "source": "haiku" 94 } 95 }, 96 "scope_and_framing": { 97 "key_terms_defined": { 98 "applies": true, 99 "answer": true, 100 "justification": "Key terms precisely defined: linear probes trained on activations; pre-generation activations as 'residual stream activations (pre-layer norm)' at 'post-instruction positions'; policy-specific success under Maj@K and greedy decoding.", 101 "source": "haiku" 102 }, 103 "intended_contribution_clear": { 104 "applies": true, 105 "answer": true, 106 "justification": "'Main Contributions' section explicitly lists three claims: (1) Human vs model difficulty are distinct; (2) Probes generalize across settings; (3) Routing achieves 70% cost reduction. Contribution to efficiency + interpretability is clear.", 107 "source": "haiku" 108 }, 109 "engagement_with_prior_work": { 110 "applies": true, 111 "answer": true, 112 "justification": "Section 2 engages with 4 research lines: correctness prediction (Kadavath, Burns, Cencerrado), difficulty estimation (Lugoloobi & Russell, Lee et al.), test-time compute (Wang et al., DeepSeek-R1), and routing (Chen et al., Ding et al.). Positions contributions relative to each.", 113 "source": "haiku" 114 } 115 } 116 }, 117 "type_checklist": { 118 "empirical": { 119 "artifacts": { 120 "code_released": { 121 "applies": true, 122 "answer": true, 123 "justification": "GitHub repo provided: 'Our code is available at: https://github.com/KabakaWilliam/llms_know_difficulty'.", 124 "source": "haiku" 125 }, 126 "data_released": { 127 "applies": true, 128 "answer": true, 129 "justification": "All benchmarks are public: E2H-AMC (Ding et al. 2024), MATH (Hendrycks et al. 2021), GSM8K (Cobbe et al. 2021), AIME (Veeraboina 2023), LiveCodeBench (Jain et al. 2024). 
No custom private datasets.", 130 "source": "haiku" 131 }, 132 "environment_specified": { 133 "applies": true, 134 "answer": false, 135 "justification": "Appendix 5.3 specifies VLLM + model hyperparameters (temperature, max_length). Missing: Python version, VLLM version number, PyTorch version, requirements.txt or environment file.", 136 "source": "haiku" 137 }, 138 "reproduction_instructions": { 139 "applies": true, 140 "answer": false, 141 "justification": "Code on GitHub presumably includes instructions, but the paper lacks step-by-step reproduction guide. Appendix 5.1 describes probe training (Ridge/logistic regression) but not end-to-end pipeline with data loading, preprocessing, probe extraction.", 142 "source": "haiku" 143 } 144 }, 145 "statistical_methodology": { 146 "confidence_intervals_or_error_bars": { 147 "applies": true, 148 "answer": false, 149 "justification": "Tables 1-2, 4-5 report only point estimates (Spearman ρ, AUROC, accuracy). Figures 1-4 have no error bars. No confidence intervals reported despite multiple benchmarks and models.", 150 "source": "haiku" 151 }, 152 "significance_tests": { 153 "applies": true, 154 "answer": false, 155 "justification": "No p-values, t-tests, or significance tests comparing probes to baselines. Claims like 'substantially outperforming' are made without statistical backing.", 156 "source": "haiku" 157 }, 158 "effect_sizes_reported": { 159 "applies": true, 160 "answer": true, 161 "justification": "Spearman ρ (0.83-0.87), AUROC (0.76-0.91), and cost reduction percentages (17-70%) serve as effect sizes. Table 2 shows differences like 0.84 vs 0.64 AUROC.", 162 "source": "haiku" 163 }, 164 "sample_size_justified": { 165 "applies": true, 166 "answer": false, 167 "justification": "E2H-AMC: 4k problems stated. MATH, GSM8K, AIME, LiveCodeBench sizes mentioned but not all. 
No power analysis or justification for sample size choices provided.", 168 "source": "haiku" 169 }, 170 "variance_reported": { 171 "applies": true, 172 "answer": false, 173 "justification": "No std dev, variance, or run-to-run noise reported. Results appear to be single-run point estimates. This is a significant gap for model evaluation.", 174 "source": "haiku" 175 } 176 }, 177 "evaluation_design": { 178 "baselines_included": { 179 "applies": true, 180 "answer": true, 181 "justification": "TF-IDF and question length as text baselines. Random and oracle routing as routing baselines. Single-model baselines (Qwen, DeepSeek, GPT-OSS variants).", 182 "source": "haiku" 183 }, 184 "baselines_contemporary": { 185 "applies": true, 186 "answer": true, 187 "justification": "Model baselines are recent: Qwen2.5 (2024/2025), DeepSeek-R1 (2025), GPT-OSS-20B (recent). TF-IDF is old but acceptable as text feature baseline.", 188 "source": "haiku" 189 }, 190 "ablation_study": { 191 "applies": true, 192 "answer": false, 193 "justification": "No ablation removing components (e.g., different layer positions, different activation features). Paper does sensitivity analysis (greedy vs Maj@K, reasoning modes) but no component removal.", 194 "source": "haiku" 195 }, 196 "multiple_metrics": { 197 "applies": true, 198 "answer": true, 199 "justification": "Spearman ρ for difficulty ranking, AUROC for binary classification, cost-accuracy Pareto curves for routing. Multiple angles on performance.", 200 "source": "haiku" 201 }, 202 "human_evaluation": { 203 "applies": false, 204 "answer": false, 205 "justification": "Not applicable—no human evaluation of model outputs. E2H-AMC uses existing human IRT scores but paper does not conduct new human studies.", 206 "source": "haiku" 207 }, 208 "held_out_test_set": { 209 "applies": true, 210 "answer": true, 211 "justification": "'80/20 train–validation split for hyperparameter selection, and report our best probe results on a held-out test set.' 
Standard practice followed.", 212 "source": "haiku" 213 }, 214 "per_category_breakdown": { 215 "applies": true, 216 "answer": true, 217 "justification": "Results broken down by dataset (MATH, GSM8K, AIME, LiveCodeBench), model (Qwen, DeepSeek, GPT-OSS), reasoning mode (low/medium/high), and decoding (greedy vs Maj@K).", 218 "source": "haiku" 219 }, 220 "failure_cases_discussed": { 221 "applies": true, 222 "answer": false, 223 "justification": "Paper identifies probe quality degradation under extended reasoning and oracle gap but does not show specific failure examples or analyze patterns (e.g., which question types cause probe failures?).", 224 "source": "haiku" 225 }, 226 "negative_results_reported": { 227 "applies": true, 228 "answer": true, 229 "justification": "Explicitly reports: 'probe AUROC drops monotonically as reasoning budgets increase, even as accuracy improves' (0.78→0.64). Gap to oracle shown in Figures 3-4.", 230 "source": "haiku" 231 } 232 }, 233 "setup_transparency": { 234 "model_versions_specified": { 235 "applies": true, 236 "answer": false, 237 "justification": "Models referred to as 'Qwen2.5-Math-7B', 'GPT-OSS-20B', 'DeepSeek-R1-Distill-Qwen-7B' with no snapshot dates or commit hashes. Exact versions cannot be reproduced.", 238 "source": "haiku" 239 }, 240 "prompts_provided": { 241 "applies": true, 242 "answer": false, 243 "justification": "No actual prompts shown. Paper mentions 'model's chat template' and post-instruction positions but does not display the system prompt or query formatting used.", 244 "source": "haiku" 245 }, 246 "hyperparameters_reported": { 247 "applies": true, 248 "answer": false, 249 "justification": "Table 6 reports temperature, max_length, k for rollouts. Appendix 5.1: regularization α tuned over range. 
Missing: learning rate, optimizer, batch size, number of epochs for probe training.", 250 "source": "haiku" 251 }, 252 "scaffolding_described": { 253 "applies": false, 254 "answer": false, 255 "justification": "N/A—no agentic scaffolding. Paper evaluates base model behavior without chains-of-thought setup beyond what models generate naturally.", 256 "source": "haiku" 257 }, 258 "data_preprocessing_documented": { 259 "applies": true, 260 "answer": false, 261 "justification": "Basic info: LiveCodeBench uses 'contamination-aware temporal splits'; benchmarks are standard. Missing: answer parsing code, how incorrect/malformed responses handled, tokenization details.", 262 "source": "haiku" 263 } 264 }, 265 "data_integrity": { 266 "raw_data_available": { 267 "applies": true, 268 "answer": true, 269 "justification": "E2H-AMC, MATH, GSM8K, AIME, LiveCodeBench all publicly available. Model rollouts and probe outputs derivable from released code.", 270 "source": "haiku" 271 }, 272 "data_collection_described": { 273 "applies": true, 274 "answer": true, 275 "justification": "E2H-AMC: human IRT scores from psychometric calibration on student performance (Ding et al. 2024). Model responses: K=50 rollouts at T=1 for math benchmarks, specified in Table 6.", 276 "source": "haiku" 277 }, 278 "recruitment_methods_described": { 279 "applies": false, 280 "answer": false, 281 "justification": "N/A—no human recruitment. Paper uses existing human-labeled benchmark (E2H-AMC) with no new participant recruitment.", 282 "source": "haiku" 283 }, 284 "data_pipeline_documented": { 285 "applies": true, 286 "answer": false, 287 "justification": "Pipeline described: benchmarks → model rollouts → activation extraction at post-instruction → probe training on 80% → evaluation on 20%. 
But missing details on data loading, batch processing, error handling.", 288 "source": "haiku" 289 } 290 }, 291 "contamination": { 292 "training_cutoff_stated": { 293 "applies": true, 294 "answer": false, 295 "justification": "No training data cutoff dates provided for any model. 'Qwen2.5-Math-7B', 'gpt-oss-20b', 'DeepSeek-R1' dates unknown. Critical given 2024-2025 AIME/LiveCodeBench exposure risk.", 296 "source": "haiku" 297 }, 298 "train_test_overlap_discussed": { 299 "applies": true, 300 "answer": false, 301 "justification": "LiveCodeBench: 'contamination-aware temporal splits' used. But MATH, GSM8K, AIME: no discussion of potential overlap. Known literature flags MATH/GSM8K as contaminated in some model training; not addressed here.", 302 "source": "haiku" 303 }, 304 "benchmark_contamination_addressed": { 305 "applies": true, 306 "answer": false, 307 "justification": "Only LiveCodeBench addressed with temporal splits. E2H-AMC, MATH, GSM8K, AIME contamination not discussed despite potential 2024-2025 training data overlap.", 308 "source": "haiku" 309 } 310 } 311 } 312 }, 313 "claims": [ 314 { 315 "claim": "Linear probes on pre-generation activations can predict model success with AUROC > 0.7 across most inference settings", 316 "evidence": "Table 2: AUROC values 0.76–0.91 for Maj@5 predictions on Qwen and DeepSeek models; GPT-OSS-20B shows 0.78 (low) to 0.64 (high reasoning) range.", 317 "supported": "strong" 318 }, 319 { 320 "claim": "Human difficulty and model difficulty are distinct and encoded separately in pre-generation activations", 321 "evidence": "Table 1: Human IRT Spearman ρ = 0.83–0.87 vs model success rate ρ = 0.40–0.64 from identical activations. 
Figure 1A shows alignment between predicted difficulties diverges with reasoning.", 322 "supported": "strong" 323 }, 324 { 325 "claim": "Model-derived difficulty signals are more predictive of actual performance than human difficulty signals", 326 "evidence": "Figure 1B: Probe-predicted model difficulty consistently outperforms human difficulty for predicting Maj@5 failure across all reasoning modes, despite divergence.", 327 "supported": "strong" 328 }, 329 { 330 "claim": "Probe quality degrades monotonically as extended reasoning increases, despite improved task accuracy", 331 "evidence": "Table 2 & 4: GPT-OSS-20B Maj@5 AUROC drops 0.78→0.64 as reasoning increases (low→high); accuracy improves 86.6%→92.0% in same setting.", 332 "supported": "strong" 333 }, 334 { 335 "claim": "Probe-guided routing can match best-performing model accuracy while reducing inference cost by up to 70% on MATH", 336 "evidence": "Figure 3 (right): Utility router achieves GPT-OSS-20B-high accuracy (92%) at $15 vs $28 baseline (70% cost reduction). Table 8 confirms results.", 337 "supported": "strong" 338 }, 339 { 340 "claim": "Linear probes substantially outperform text-based baselines (TF-IDF, question length) for predicting success", 341 "evidence": "Table 1-2: Linear probe ρ=0.83–0.87 vs TF-IDF ρ=0.72 for human difficulty; AUROC 0.84 vs 0.64 for model success. 
Across all settings, probes dominate baselines.", 342 "supported": "strong" 343 }, 344 { 345 "claim": "With extended reasoning, models spend more tokens on problems humans find difficult even when those problems are within model competence", 346 "evidence": "Figure 2: Chain-of-thought length positively correlated with human IRT difficulty across all reasoning modes, but negatively correlated with empirical success and probe-predicted success.", 347 "supported": "strong" 348 }, 349 { 350 "claim": "Routing effectiveness is bottlenecked by probe reliability (AUROC), not routing algorithm sophistication", 351 "evidence": "Figures 3-4 and oracle analysis: Gap to oracle (perfect knowledge) narrows when probe quality is high (AIME) and widens when probe quality degrades (GPT-OSS-20B high reasoning). Simple utility rule achieves near-oracle when AUROC > 0.8.", 352 "supported": "moderate" 353 } 354 ], 355 "methodology_tags": [ 356 "empirical", 357 "benchmark-eval", 358 "observational" 359 ], 360 "key_findings": "Linear probes extract success predictions from LLM pre-generation activations with AUROC above 0.7 in most inference settings, substantially outperforming text features. Crucially, models encode a model-specific difficulty signal distinct from human difficulty (ρ=0.40–0.64 vs ρ=0.83–0.87), and this divergence increases with extended reasoning. Probe-guided routing matches top-performing models at 17–70% cost reduction on math benchmarks, demonstrating practical efficiency gains; however, probe quality degrades sharply under deep reasoning, suggesting that extended chain-of-thought reasoning encodes difficulty information in ways that are not linearly accessible at the pre-generation stage.", 361 "red_flags": [ 362 { 363 "flag": "No confidence intervals or significance testing", 364 "detail": "All results report single point estimates (Spearman ρ, AUROC, accuracy) with no error bars, confidence intervals, or p-values. 
Unclear if observed differences between methods (e.g., probe vs TF-IDF) are statistically significant." 365 }, 366 { 367 "flag": "Model versions not precisely specified", 368 "detail": "Marketing names ('Qwen2.5-Math-7B', 'GPT-OSS-20B') lack snapshot dates or commit hashes. Exact reproduction of setup impossible; models may have been updated since paper submission." 369 }, 370 { 371 "flag": "Actual prompts not provided", 372 "detail": "Paper mentions 'chat template' and post-instruction positions but does not display system prompts or query formatting. Reproducibility limited without prompt details." 373 }, 374 { 375 "flag": "Training cutoff dates not stated", 376 "detail": "No training data cutoff provided for any model evaluated. Given 2024-2025 AIME/LiveCodeBench exposure, potential train-test contamination cannot be ruled out, especially for MATH/GSM8K." 377 }, 378 { 379 "flag": "No variance or run-to-run noise reported", 380 "detail": "All metrics appear to be single-run point estimates. No discussion of whether random seed, batch order, or other sources of variance affect probe training or routing results." 381 }, 382 { 383 "flag": "Mechanistic explanations are correlational, not causal", 384 "detail": "Paper infers what models 'encode' from probe correlation with outcomes, but no interventional evidence (e.g., ablating components, perturbing activations) proves causality. Alternative explanations (e.g., probes detecting output length, not true difficulty) not ruled out." 385 }, 386 { 387 "flag": "Cross-domain generalization not tested", 388 "detail": "Transfer of probes between domains (e.g., math to code or vice versa) is not evaluated. This limits practical applicability—separate probes may be needed per domain/task." 389 }, 390 { 391 "flag": "Probe degradation under extended reasoning limits utility", 392 "detail": "AUROC drops from 0.78 to 0.64 as reasoning increases, exactly when routing matters most. 
Paper acknowledges but does not solve: 'probe reliability becomes the bottleneck' for advanced reasoning models." 393 } 394 ], 395 "cited_papers": [ 396 { 397 "title": "Language Models (Mostly) Know What They Know", 398 "authors": "Kadavath et al.", 399 "year": 2022, 400 "relevance": "Foundational work on model self-assessment and confidence; establishes that models can evaluate their own correctness." 401 }, 402 { 403 "title": "The Internal State of an LLM Knows When It's Lying", 404 "authors": "Azaria & Mitchell", 405 "year": 2023, 406 "relevance": "Demonstrates extractable 'truthfulness directions' in internal activations; directly related approach to probing latent knowledge." 407 }, 408 { 409 "title": "Discovering Latent Knowledge in Language Models Without Supervision", 410 "authors": "Burns et al.", 411 "year": 2024, 412 "relevance": "Unsupervised extraction of correctness signals from activations; comparison point for supervised probe approach." 413 }, 414 { 415 "title": "No Answer Needed: Predicting LLM Answer Accuracy from Question-Only Linear Probes", 416 "authors": "Cencerrado et al.", 417 "year": 2025, 418 "relevance": "Directly related work on predicting correctness from activations; weaker performance on math (AUROC ≈0.6–0.7) vs this paper's 0.78–0.84." 419 }, 420 { 421 "title": "LLMs Encode How Difficult Problems Are", 422 "authors": "Lugoloobi & Russell", 423 "year": 2025, 424 "relevance": "Prior work showing difficulty is linearly decodable; this paper extends to human vs model difficulty distinction and routing applications." 425 }, 426 { 427 "title": "Probing the Difficulty Perception Mechanism of Large Language Models", 428 "authors": "Lee et al.", 429 "year": 2025, 430 "relevance": "Difficulty perception mechanisms in LLMs; paper adds comparison with human IRT difficulty and extends to routing." 
431 }, 432 { 433 "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance", 434 "authors": "Chen et al.", 435 "year": 2024, 436 "relevance": "Model routing for cost-accuracy tradeoffs; establishes motivation and prior baselines for routing research." 437 }, 438 { 439 "title": "Hybrid LLM: Cost-Efficient and Quality-Aware Query Routing", 440 "authors": "Ding et al.", 441 "year": 2023, 442 "relevance": "Heuristic-based routing using input complexity; this paper replaces heuristics with learned difficulty probes." 443 }, 444 { 445 "title": "Self-Consistency Improves Chain of Thought Reasoning in Language Models", 446 "authors": "Wang et al.", 447 "year": 2022, 448 "relevance": "Majority voting aggregation (Maj@K) for test-time scaling; paper analyzes probe robustness across sampling policies." 449 }, 450 { 451 "title": "Easy2Hard-Bench: Standardized Difficulty Labels for Profiling LLM Performance and Generalization", 452 "authors": "Ding et al.", 453 "year": 2024, 454 "relevance": "Provides E2H-AMC benchmark with human IRT difficulty labels; critical resource enabling human vs model difficulty comparison." 455 } 456 ], 457 "engagement_factors": { 458 "practical_relevance": { 459 "score": 2, 460 "justification": "Routing shows 17–70% cost reduction on benchmarks, but probe degradation under extended reasoning limits real-world utility for advanced models. Unclear if gains transfer to complex multi-step inference pipelines." 461 }, 462 "surprise_contrarian": { 463 "score": 2, 464 "justification": "Divergence between human and model difficulty is interesting. Finding that probes degrade with extended reasoning contradicts intuition that better reasoning enables better self-assessment, but effect is documented not explained." 465 }, 466 "fear_safety": { 467 "score": 1, 468 "justification": "Some interpretability value (understanding internal representations), but no direct safety implications. 
Does not raise new risk concerns or address alignment/deception." 469 }, 470 "drama_conflict": { 471 "score": 1, 472 "justification": "Straightforward empirical technical work. No controversial claims, stakeholder conflict, or dramatic findings. Well-executed but incremental contribution." 473 }, 474 "demo_ability": { 475 "score": 2, 476 "justification": "Code released on GitHub, but reproducing full pipeline requires 50 rollouts across multiple benchmarks (expensive compute). Feasible to demo on small subsets or use pre-computed results." 477 }, 478 "brand_recognition": { 479 "score": 1, 480 "justification": "Authors from Oxford Internet Institute, FLAIR, UCL—solid academic institutions but no industry prestige or famous lab association. Preprint with modest HN engagement (1 point, 3 comments)." 481 } 482 }, 483 "hn_data": { 484 "threads": [ 485 { 486 "hn_id": "46995551", 487 "title": "Routing LLM queries using internal success predictions (70% cost reduction)", 488 "points": 1, 489 "comments": 3, 490 "url": "https://news.ycombinator.com/item?id=46995551", 491 "created_at": "2026-02-12T21:33:47Z" 492 } 493 ], 494 "top_points": 1, 495 "total_points": 1, 496 "total_comments": 3 497 } 498 }