scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (31154B)
      1 {
      2   "paper": {
      3     "title": "LLMs Encode Their Failures: Predicting Success from Pre-Generation Activations",
      4     "authors": [
      5       "William Lugoloobi",
      6       "Thomas Foster",
      7       "William Bankes",
      8       "Chris Russell"
      9     ],
     10     "year": 2026,
     11     "venue": "arXiv",
     12     "arxiv_id": "2602.09924"
     13   },
     14   "scan_version": 2,
     15   "active_modules": ["experimental_rigor", "data_leakage"],
     16   "methodology_tags": ["benchmark-eval"],
     17   "key_findings": "Linear probes trained on pre-generation activations can predict model-specific success on math and coding tasks (AUROC > 0.7 for most settings), substantially outperforming surface features like TF-IDF and question length. Human and model difficulty are distinct signals that diverge with extended reasoning: probe quality degrades (AUROC 0.78→0.64) as reasoning budget increases despite improved accuracy. Probe-guided routing achieves up to 70% cost reduction on MATH while matching the highest-capability single model, with similar gains on AIME and GSM8K.",
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "GitHub repository provided in abstract: 'Our code is available at: https://github.com/KabakaWilliam/llms_know_difficulty' (Section 1)."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "All evaluation benchmarks are publicly available standard datasets: MATH, GSM8K, AIME (1983-2024), E2H-AMC from Easy2HardBench, and LiveCodeBench. Paper cites public sources for each."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper mentions using vLLM for rollouts (Appendix 5.3) and specifies model-level hyperparameters, but does not provide a requirements.txt, Dockerfile, or detailed library version specifications beyond the inference engine."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No step-by-step reproduction instructions in the paper. Code is released but the paper itself contains no 'Reproducing Results' section or explicit commands to replicate the main experiments."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "All results (Tables 1-5, 8-10) report point estimates only. No confidence intervals, error bars, or ± notation on any Spearman ρ or AUROC values."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The paper claims probes 'substantially outperform' baselines (e.g., Table 1) but provides no statistical significance tests. All comparative claims rest on numerical differences without p-values or any formal test."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Results consistently give absolute values with baseline context: 'matching GPT-OSS-20B-medium's 91.2% accuracy while reducing cost by 17%' (Section 4.2), 'probe performance drops from ρ = 0.58 (low reasoning) to ρ = 0.40 (high reasoning)' (Section 3.3), and AUROC comparisons across methods in Table 2."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No justification for why K=50 rollouts (or K=5 for GPT-OSS-20B 'due to computational cost'). No power analysis or discussion of whether the benchmark sizes are adequate for the statistical claims made."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No standard deviations, interquartile ranges, or variance measures reported across experimental runs. Probe training and rollout results appear to be single-run without spread measures."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Compared against TF-IDF features and question length for probing (Tables 1-2), random routing and oracle routing for the routing experiments (Figures 3-4, Tables 8-10)."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Cencerrado et al. (2025) is cited as directly related prior work. The comparison against surface-feature baselines (TF-IDF, length) and routing baselines (random, oracle) is appropriate for the claims. For routing, they cite Chen et al. (2024) and Ding et al. (2023) as prior work."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": false,
     82         "justification": "No component ablation study. The paper systematically varies conditions (reasoning levels, models, benchmarks, routing strategies) but does not isolate the contribution of individual components of their approach (e.g., layer choice, position choice, probe type)."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Uses Spearman ρ for regression tasks, AUROC for classification tasks, task accuracy for model performance, and cost (USD) for routing evaluation. Both accuracy and cost are jointly evaluated in the routing Pareto frontiers."
     88       },
     89       "human_evaluation": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "Human evaluation is irrelevant to the claims. The paper is about probing model activations and automated routing — there are no system outputs that would benefit from human evaluation."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Appendix 5.1: 'We hold out 20% of the original training set as validation. The best configuration (ℓ∗, p∗, α∗) is selected by validation performance... Test evaluation happens exactly once with the selected probe.'"
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Results broken down by dataset (MATH, GSM8K, AIME, E2H-AMC, LiveCodeBench), by model (Qwen2.5 variants, GPT-OSS-20B, DeepSeek-R1), by reasoning mode (low/medium/high), and by decoding strategy (greedy vs. Maj@5). Appendix Tables 3-5 provide detailed per-dataset breakdowns."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Section 3.3 discusses probe degradation under extended reasoning (AUROC 0.78→0.64). Section 4.3 discusses the bottleneck of probe reliability. GPT-OSS-20B's consistently weaker probe quality is identified as a pattern across domains."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Key negative result: probe quality degrades with increased reasoning budget (AUROC 0.78→0.64 despite accuracy 86.6%→92.0%). Also: model difficulty is 'less linearly accessible' than human difficulty (ρ=0.40-0.64 vs 0.83-0.87). The gap to oracle routing is candidly discussed."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Abstract claims are supported: 'substantially outperforming surface features' (Tables 1-2), 'model-specific notion of difficulty distinct from human difficulty' (Table 1, Figure 1), 'routing queries... reducing inference cost by up to 70% on MATH' (Figure 3, Table 8)."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The main causal claim — that increasing reasoning budget degrades probe accessibility — is supported by controlled variation of GPT-OSS-20B's reasoning level (low/medium/high) while holding other factors constant. Section 3.3 hedges appropriately: 'extended chain-of-thought may encode difficulty information in ways that are not linearly separable.'"
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The title 'LLMs Encode Their Failures' implies a general property of all LLMs, but experiments cover only Qwen2.5 variants (Math 1.5B/7B, Coder 3B/7B), DeepSeek-R1-Distill-Qwen-7B, and GPT-OSS-20B. No Llama, Mistral, Claude, or Gemini models tested. The paper does not bound its title-level claims to the tested model families."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "Section 4.3 offers an alternative mechanism for probe degradation: 'generation length becomes increasingly correlated with human difficulty rather than the model's own likelihood of failure,' explaining the divergence between human and model difficulty representations. Section 3.3 considers that sampling-based aggregation helps models solve 'marginally difficult' problems differently."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "The paper carefully distinguishes between prediction targets: expected success rate (continuous, Eq. 1-2) vs. binary success under specific decoding policies (greedy, Maj@K). Section 3.1 explicitly defines and separates these as distinct prediction targets with different practical implications."
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Models specified with version, size, and variant: 'Qwen2.5-Math-1.5B-Instruct', 'Qwen2.5-Math-7B-Instruct', 'Qwen2.5-Coder-3B-Instruct', 'Qwen2.5-Coder-7B-Instruct', 'DeepSeek-R1-Distill-Qwen-7B', 'GPT-OSS-20B'. These identify specific model checkpoints."
    147       },
    148       "prompts_provided": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "Appendix 5.1 shows chat template suffixes for activation extraction (e.g., Qwen2.5: '<|im_end|>\\n<|im_start|>assistant\\n') but does not provide the actual task prompts or system messages used to query models on the benchmarks."
    152       },
    153       "hyperparameters_reported": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Appendix 5.3 Table 6 provides max length, temperature, and K for each model. Appendix 5.1 reports regularization search range α ∈ {10⁻³, ..., 10⁴} and the 80/20 train-validation split for probe training."
    157       },
    158       "scaffolding_described": {
    159         "applies": false,
    160         "answer": false,
    161         "justification": "No agentic scaffolding is used. The approach is a direct linear probe on model activations with simple threshold or utility-based routing rules."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section 3.2 describes the data pipeline: benchmark selection, K rollouts per question with specified temperatures, activation extraction at post-instruction positions (Appendix 5.1), 80/20 train-validation split, and Platt scaling for calibration. LiveCodeBench uses contamination-aware temporal splits."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section 4.4 'Conclusion, Limitations and Future Work' contains a dedicated 'Limitations' paragraph with substantive discussion of probe design constraints, degradation under extended reasoning, and missing transfer experiments."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Specific limitations identified: 'We focus on linear probes applied at a single post-instruction position' (design limitation), 'Probe performance degrades under extended reasoning' (reliability limitation), 'We also do not study cross-domain or cross-dataset probe transfer' (generalization gap). These are specific to this study."
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "Section 4.4 explicitly states what was NOT tested: 'we do not explore alternative probing positions or non-linear probes', 'We also do not study cross-domain or cross-dataset probe transfer', 'our routing policies are intentionally simple—fixed-k majority voting with threshold-based or utility-based rules—rather than learned or adaptive routing strategies.'"
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "While the benchmarks are public and code is released, the intermediate data (50 rollouts per question, model activations, probe predictions) is not explicitly released. The paper does not mention a data release for the rollout results or extracted activations."
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Section 3.1-3.2 describes the data collection process: K=50 (or K=5) rollouts per question at specified temperatures, success labels via answer parsing, activation extraction at post-instruction positions. Benchmark sources and their properties are documented."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": false,
    199         "answer": false,
    200         "justification": "No human participants. All data comes from standard public benchmarks (MATH, GSM8K, AIME, E2H-AMC, LiveCodeBench)."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "The pipeline is documented: benchmark questions → model rollouts (K samples per question) → success labels via answer parsing → activation extraction at EOI positions → 80/20 train-validation split → probe training with grid search → test evaluation. Each step is described in Sections 3.1-3.2 and Appendix 5.1."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No acknowledgments section, funding sources, or grant numbers mentioned anywhere in the paper."
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Author affiliations clearly listed: Oxford Internet Institute (University of Oxford), FLAIR (University of Oxford), and Department of Computer Science (University College London). No conflict with models evaluated (Qwen/Alibaba, GPT-OSS/OpenAI, DeepSeek)."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No funding is disclosed, making independence unverifiable. The authors evaluate third-party models (Qwen, GPT-OSS-20B, DeepSeek) from their academic positions, suggesting no financial stake, but without explicit disclosure this cannot be confirmed."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No competing interests statement or financial disclosure found in the paper."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "Training data cutoff dates are not stated for any model (Qwen2.5, GPT-OSS-20B, DeepSeek-R1). For LiveCodeBench, temporal splits are based on 'each model's release date' but the actual dates are not provided."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "For LiveCodeBench, contamination-aware temporal splits are used. However, for MATH (published 2021), GSM8K (2021), AIME (1983-2024), and E2H-AMC, there is no discussion of whether models could have seen these benchmark problems during training."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "MATH and GSM8K were published in 2021 and are widely known. Any model trained after 2021 may have seen solutions. The paper uses temporal splits only for LiveCodeBench but does not discuss contamination risk for the four math benchmarks that comprise the majority of experiments."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study. All experiments involve probing model activations and automated benchmark evaluation."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants."
    257       },
    258       "demographics_reported": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants."
    267       },
    268       "randomization_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants."
    272       },
    273       "blinding_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants."
    277       },
    278       "attrition_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": true,
    287         "answer": true,
    288         "justification": "Appendix 5.4 Table 7 provides Fireworks AI pricing used for cost estimation. Routing experiments (Tables 8-10) report cost per query for each model and routing strategy. Figures 3-4 plot accuracy-cost Pareto frontiers."
    289       },
    290       "compute_budget_stated": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "No mention of total GPU hours, training time for probes, wall-clock time for rollout collection, or total API spend for the ~250K+ rollouts across models and benchmarks."
    294       }
    295     },
    296     "experimental_rigor": {
    297       "seed_sensitivity_reported": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "No mention of random seeds or seed sensitivity for probe training. Results appear to be from single training runs. The stochastic rollouts use K=50 samples but no seed sensitivity analysis is reported."
    301       },
    302       "number_of_runs_stated": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "The number of rollouts per question is stated (K=50 or K=5), but the number of probe training runs is not stated. Results appear to be from a single probe training per configuration, with no indication of multiple runs."
    306       },
    307       "hyperparameter_search_budget": {
    308         "applies": true,
    309         "answer": true,
    310         "justification": "Appendix 5.1 specifies the search space: all transformer layers × EOI positions × α ∈ {10⁻³, 10⁻², 10⁻¹, 1, 10, 10², 10³, 10⁴}. Grid search with selection on validation data. The search space is fully defined."
    311       },
    312       "best_config_selection_justified": {
    313         "applies": true,
    314         "answer": true,
    315         "justification": "Appendix 5.1: 'The best configuration (ℓ∗, p∗, α∗) is selected by validation performance... Test evaluation happens exactly once with the selected probe.' Selection on validation set, not test set."
    316       },
    317       "multiple_comparison_correction": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "The paper makes many comparative claims across models, benchmarks, and configurations without any statistical tests, let alone correction for multiple comparisons."
    321       },
    322       "self_comparison_bias_addressed": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "The authors implement their own TF-IDF and length baselines for comparison against their linear probes. No acknowledgment of potential bias from implementing and tuning their own baselines."
    326       },
    327       "compute_budget_vs_performance": {
    328         "applies": true,
    329         "answer": true,
    330         "justification": "The routing framework explicitly reports performance as a function of cost. Figures 3-4 show accuracy-cost Pareto frontiers. Tables 8-10 report both accuracy and cost for each configuration. GPT-OSS-20B low/medium/high reasoning modes are compared at matched settings."
    331       },
    332       "benchmark_construct_validity": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "The paper uses MATH, GSM8K, AIME, E2H-AMC, and LiveCodeBench without discussing whether these benchmarks actually measure the capabilities claimed. No discussion of construct validity for any benchmark."
    336       },
    337       "scaffold_confound_addressed": {
    338         "applies": false,
    339         "answer": false,
    340         "justification": "No scaffolding is used. The approach directly probes model activations and uses simple routing rules."
    341       }
    342     },
    343     "data_leakage": {
    344       "temporal_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "Temporal leakage is addressed only for LiveCodeBench ('contamination-aware temporal splits based on each model's release date'). For MATH, GSM8K, AIME, and E2H-AMC — all published before the tested models' training — temporal leakage is not discussed."
    348       },
    349       "feature_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of whether the evaluation setup leaks answer information through context, prompt formatting, or other channels."
    353       },
    354       "non_independence_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No discussion of whether train and test splits within the benchmarks share structural similarities, or whether the 80/20 probe train-validation split could have non-independence issues (e.g., similar problems in both sets)."
    358       },
    359       "leakage_detection_method": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "Only LiveCodeBench uses temporal splits as a prevention method. For the four math benchmarks comprising the majority of experiments, no concrete leakage detection or prevention method is applied."
    363       }
    364     }
    365   },
    366   "claims": [
    367     {
    368       "claim": "LLMs encode both human difficulty and model-specific difficulty in pre-generation activations, but these are distinct signals (Spearman ρ = 0.83-0.87 for human IRT difficulty vs. 0.40-0.64 for model difficulty).",
    369       "evidence": "Table 1 (Section 3.3): Linear probes trained on E2H-AMC activations achieve ρ = 0.83-0.87 for human IRT difficulty and ρ = 0.40-0.64 for model success rate across Qwen2.5-Math and GPT-OSS-20B models.",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "Probe reliability degrades with increased test-time compute: AUROC drops from 0.78 to 0.64 for GPT-OSS-20B as reasoning budget increases from low to high, despite accuracy improving from 86.6% to 92.0%.",
    374       "evidence": "Table 2 (Section 3.3): GPT-OSS-20B Math AUROC for Maj@5 drops monotonically across reasoning levels (0.78→0.70→0.64). Table 5 shows corresponding accuracy improvements.",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "Linear probes predict binary model success with AUROC > 0.7 across most settings, substantially outperforming text-based baselines.",
    379       "evidence": "Table 2 (Section 3.3): Linear probes achieve AUROC 0.64-0.91 across models and domains, compared to TF-IDF (0.58-0.86) and length (0.46-0.73). Strongest on code (0.81-0.91 for Qwen-Coder models).",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "Probe-guided utility routing matches GPT-OSS-20B-high accuracy on MATH while reducing cost by approximately 70%.",
    384       "evidence": "Figure 3 right panel and Table 8 (Section 4.2): Probe router at λ=0.00 achieves 93.0% accuracy at $28.37 cost; GPT-OSS-20B-high achieves 92.0% at $40.00. At λ=0.20, 91.7% accuracy at $10.35 — a roughly 74% cost reduction vs. GPT-OSS-20B-high's $40.00.",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "Chain-of-thought length tracks human difficulty rather than model uncertainty, and this effect strengthens with increased reasoning budget.",
    389       "evidence": "Figure 2 (Section 3.3): Binned analysis showing output length positively correlated with human IRT difficulty and negatively correlated with empirical and probe-predicted success across all reasoning modes. Effect strengthens at higher reasoning budgets.",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "Probe accessibility is a model-family property that generalizes across domains rather than a task-specific phenomenon.",
    394       "evidence": "Table 2 (Section 3.3): GPT-OSS-20B shows consistently weaker probe quality (AUROC ~0.67-0.71) on LiveCodeBench compared to Qwen-Coder (0.81-0.91), mirroring the same pattern observed on math benchmarks.",
    395       "supported": "weak"
    396     }
    397   ],
    398   "red_flags": [
    399     {
    400       "flag": "No error bars or uncertainty quantification",
    401       "detail": "All results in Tables 1-5 and 8-10 are point estimates without confidence intervals, standard deviations, or any measure of uncertainty. Given that probes are trained on stochastic rollout labels and AUROC depends on the train-test split, the stability of reported numbers is unknown."
    402     },
    403     {
    404       "flag": "Limited model diversity for broad claims",
    405       "detail": "Claims about 'LLMs' are based on only two model families (Qwen2.5 and GPT-OSS-20B) plus one distilled model (DeepSeek-R1-Distill-Qwen-7B). No Llama, Mistral, Claude, or Gemini models tested. The generality implied by the title is not supported by the model coverage."
    406     },
    407     {
    408       "flag": "Incomplete results in published table",
    409       "detail": "Table 3 in the Appendix contains 'XX' placeholder values for the Length Probe Greedy rows, suggesting the paper was published with incomplete results."
    410     },
    411     {
    412       "flag": "Contamination risk unaddressed for majority of benchmarks",
    413       "detail": "MATH (2021) and GSM8K (2021) are widely used benchmarks that likely appeared in training data for 2025-2026 models. Only LiveCodeBench receives contamination-aware temporal splits. If models have memorized benchmark solutions, the probe is predicting 'can the model recall the answer' rather than genuine difficulty."
    414     }
    415   ],
    416   "cited_papers": [
    417     {
    418       "title": "Training Verifiers to Solve Math Word Problems",
    419       "authors": ["Karl Cobbe", "Vineet Kosaraju", "Mohammad Bavarian", "Mark Chen", "Heewoo Jun", "Lukasz Kaiser", "Matthias Plappert", "Jerry Tworek", "Jacob Hilton", "Reiichiro Nakano", "Christopher Hesse", "John Schulman"],
    420       "year": 2021,
    421       "arxiv_id": "2110.14168",
    422       "relevance": "Introduces GSM8K and verifier-based approach for math reasoning evaluation, a foundational benchmark used in this paper."
    423     },
    424     {
    425       "title": "Measuring mathematical problem solving with the math dataset",
    426       "authors": ["Dan Hendrycks", "Collin Burns", "Saurav Kadavath", "Akul Arora", "Steven Basart", "Eric Tang", "Dawn Song", "Jacob Steinhardt"],
    427       "year": 2021,
    428       "arxiv_id": "2103.03874",
    429       "relevance": "Introduces the MATH benchmark, the primary evaluation dataset for the routing experiments."
    430     },
    431     {
    432       "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code",
    433       "authors": ["Naman Jain", "King Han", "Alex Gu", "Wen-Ding Li", "Fanjia Yan", "Tianjun Zhang", "Sida Wang", "Armando Solar-Lezama", "Koushik Sen", "Ion Stoica"],
    434       "year": 2024,
    435       "relevance": "Contamination-aware code evaluation benchmark used as the coding domain testbed in this paper."
    436     },
    437     {
    438       "title": "Language Models (Mostly) Know What They Know",
    439       "authors": ["Saurav Kadavath", "Tom Conerly", "Amanda Askell"],
    440       "year": 2022,
    441       "arxiv_id": "2207.05221",
    442       "relevance": "Foundational work on LLM self-knowledge and calibration, showing models can predict their own correctness."
    443     },
    444     {
    445       "title": "The Internal State of an LLM Knows When It's Lying",
    446       "authors": ["Amos Azaria", "Tom Mitchell"],
    447       "year": 2023,
    448       "arxiv_id": "2304.13734",
    449       "relevance": "Demonstrates that LLM internal representations contain truthfulness signals extractable via probes."
    450     },
    451     {
    452       "title": "Discovering Latent Knowledge in Language Models Without Supervision",
    453       "authors": ["Collin Burns", "Haotian Ye", "Dan Klein", "Jacob Steinhardt"],
    454       "year": 2024,
    455       "arxiv_id": "2212.03827",
    456       "relevance": "Unsupervised method for finding 'truth directions' in LLM activations, part of the correctness-signal literature."
    457     },
    458     {
    459       "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance",
    460       "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"],
    461       "year": 2024,
    462       "relevance": "Prior work on cost-efficient LLM usage through routing and ensemble strategies."
    463     },
    464     {
    465       "title": "Hybrid LLM: Cost-Efficient and Quality-Aware Query Routing",
    466       "authors": ["Dujian Ding", "Ankur Mallick", "Chi Wang", "Robert Sim", "Subhabrata Mukherjee"],
    467       "year": 2023,
    468       "relevance": "LLM routing framework using input complexity heuristics, cited as prior work for the probe-based routing approach."
    469     },
    470     {
    471       "title": "Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning",
    472       "authors": ["Daya Guo", "Dejian Yang", "Haowei Zhang"],
    473       "year": 2025,
    474       "arxiv_id": "2501.12948",
    475       "relevance": "Extended reasoning model used in the paper's experiments, key to the finding that reasoning budgets degrade probe quality."
    476     },
    477     {
    478       "title": "Self-Consistency Improves Chain of Thought Reasoning in Language Models",
    479       "authors": ["Xuezhi Wang", "Jason Wei", "Dale Schuurmans", "Quoc V. Le", "Ed H. Chi"],
    480       "year": 2022,
    481       "relevance": "Introduces majority voting (self-consistency) for chain-of-thought reasoning, the Maj@K method central to this paper's prediction targets."
    482     },
    483     {
    484       "title": "No Answer Needed: Predicting LLM Answer Accuracy from Question-Only Linear Probes",
    485       "authors": ["Iván Vicente Moreno Cencerrado", "Arnau Padrés Masdemont", "Anton Gonzalvez Hawthorne", "David Demitri Africa", "Lorenzo Pacchiardi"],
    486       "year": 2025,
    487       "arxiv_id": "2509.10625",
    488       "relevance": "Most directly related prior work: extracts correctness directions from activation centroids to predict LLM success."
    489     },
    490     {
    491       "title": "Refusal in Language Models Is Mediated by a Single Direction",
    492       "authors": ["Andy Arditi", "Oscar Obeso", "Aaquib Syed", "Daniel Paleka", "Nina Panickssery", "Wes Gurnee", "Neel Nanda"],
    493       "year": 2024,
    494       "arxiv_id": "2406.11717",
    495       "relevance": "Demonstrates linear directions in LLM activations mediate behavior; the activation extraction methodology used in this paper follows their approach."
    496     }
    497   ]
    498 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs