scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24276B)
      1 {
      2   "paper": {
      3     "title": "Calibrating LLM Judges: Linear Probes for Fast and Reliable Uncertainty Estimation",
      4     "authors": [
      5       "Bhaktipriya Radharapu",
      6       "Eshika Saxena",
      7       "Kenneth Li",
      8       "Chenxi Whitehouse",
      9       "Adina Williams",
     10       "Nicola Cancedda"
     11     ],
     12     "year": 2025,
     13     "venue": "arXiv",
     14     "arxiv_id": "2512.22245"
     15   },
     16   "checklist": {
     17     "artifacts": {
     18       "code_released": {
     19         "applies": true,
     20         "answer": false,
     21         "justification": "No repository URL, code archive, or link to released code is provided anywhere in the paper."
     22       },
     23       "data_released": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "The paper uses publicly available datasets: PPE (Frick et al., 2024), JudgeBench (Tan et al., 2024), and RewardBench (Lambert et al., 2024). These are standard public benchmarks."
     27       },
     28       "environment_specified": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "Appendix A.5 mentions using the 'HuggingFace transformers library' but provides no version numbers, requirements file, or detailed environment specification."
     32       },
     33       "reproduction_instructions": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The method section and appendix describe the approach conceptually but not with sufficient operational detail to reproduce."
     37       }
     38     },
     39     "statistical_methodology": {
     40       "confidence_intervals_or_error_bars": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "The paper states 'Standard deviations were near zero for all results' (Section 3, Evaluation metrics) but does not actually report them in the results tables. Table 7 shows standard deviations for model accuracy but not for the main calibration results."
     44       },
     45       "significance_tests": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "The paper claims probes outperform baselines based on comparing Kuiper/ECE values directly without any statistical significance tests (e.g., no p-values, no bootstrap tests, no paired comparisons)."
     49       },
     50       "effect_sizes_reported": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "Section 4 reports effect sizes in context: 'Llama-family models show the largest gains, with 70-92% improvements over multi-generation methods and 64-87% improvement over verbalized methods.' Kuiper/ECE values are reported with baselines for comparison."
     54       },
     55       "sample_size_justified": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "The paper uses 4000 training examples and ~10K evaluation examples (Appendix A.5) but provides no justification for why these sizes are adequate, no power analysis, and no discussion of whether the sample sizes are sufficient."
     59       },
     60       "variance_reported": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "The paper states results are 'averaged over three different train-test splits' and that 'Standard deviations were near zero for all results' but does not actually report variance or standard deviation values in any of the main results tables (Tables 1-3)."
     64       }
     65     },
     66     "evaluation_design": {
     67       "baselines_included": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "Three baselines are compared: verbalized confidence, self-consistency, and majority voting (Section 3, Baselines). Additional single-pass baselines are compared in Appendix A.1 (Table 4)."
     71       },
     72       "baselines_contemporary": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Baselines include recent methods: verbalized confidence (Xiong et al., 2024; Tian et al., 2023), self-consistency (Wang et al., 2022), and several 2024-2025 single-pass methods (CoT-kinetics, Chain of Embedding). These represent current approaches."
     76       },
     77       "ablation_study": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Multiple ablation studies are conducted: loss function ablation comparing Brier score, focal loss, and binary cross-entropy (Appendix A.7, Table 8), layer selection ablation (Appendix A.6, Figure 4), temperature ablation (A.9), number of runs ablation (A.10), and prompt ablations for verbalized confidence (A.13)."
     81       },
     82       "multiple_metrics": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Two primary calibration metrics are used: Kuiper statistic and Expected Calibration Error (ECE). AUROC is additionally used for selective classification comparisons in Appendix A.1."
     86       },
     87       "human_evaluation": {
     88         "applies": true,
     89         "answer": false,
     90         "justification": "No human evaluation is performed on the probe outputs. The paper relies entirely on automated metrics (Kuiper, ECE, AUROC) to assess calibration quality. Human judgment of whether the uncertainty estimates are useful in practice is not evaluated."
     91       },
     92       "held_out_test_set": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "The paper uses explicit train-test splits from PPE datasets for in-distribution evaluation, and completely separate benchmarks (JudgeBench, RewardBench) for out-of-distribution evaluation (Section 3, Datasets)."
     96       },
     97       "per_category_breakdown": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Results are broken down by PPE Correctness subcategories (GPQA, IFEval, Math, MBPP, MMLU) in Appendix A.3 (Tables 5-6), and by model family, architecture type (dense vs MoE), and training paradigm (prompted vs fine-tuned)."
    101       },
    102       "failure_cases_discussed": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Section 4 explicitly discusses failure cases: probes underperform on RewardBench where high baseline accuracy makes overconfident verbalized methods appear well-calibrated. The conservative estimation trade-off is discussed in detail."
    106       },
    107       "negative_results_reported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The paper reports that probes lag behind verbalized confidence on RewardBench (Table 2), and discusses the conservative behavior as a limitation. The abstract itself acknowledges probes 'underperform on easier datasets.'"
    111       }
    112     },
    113     "claims_and_evidence": {
    114       "abstract_claims_supported": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "The abstract claims: (1) probes achieve superior calibration at ~10x lower computational cost - the calibration gains are supported by Tables 1-3, while the ~10x figure follows from the N=10 samples required by multi-generation baselines rather than from measured cost; (2) robust OOD generalization - supported by JudgeBench results in Table 2 though RewardBench is weaker; (3) conservative estimates underperform on easier datasets - explicitly shown on RewardBench. The abstract hedges appropriately."
    118       },
    119       "causal_claims_justified": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "The paper makes causal claims about why probes underperform on RewardBench (conservative estimation + high baseline accuracy) and supports this with analysis in Section 4 ('Relationship Between Accuracy and Calibration'). Ablation studies support claims about loss function and layer choices."
    123       },
    124       "generalization_bounded": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "The paper carefully bounds its claims to the tested models and datasets. It explicitly discusses where probes do and do not generalize (JudgeBench vs RewardBench), and the Limitations section acknowledges the method requires access to hidden states and labeled ground truth data."
    128       },
    129       "alternative_explanations_discussed": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "Section 4 discusses alternative explanations for RewardBench results (high accuracy making overconfidence appear calibrated), and considers whether the conservative behavior is inherent to the probing approach vs the training data/loss function."
    133       }
    134     },
    135     "setup_transparency": {
    136       "model_versions_specified": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "The paper specifies model families (LLaMA 8B/70B, Qwen 32B, J1 variants, GPT-OSS 20B, LLaMA Scout 109B) but does not provide specific model version strings or snapshot dates. 'Llama-4-Scout-17B-16E-Instruct' is mentioned in references for Scout, but for others only family names and sizes are given."
    140       },
    141       "prompts_provided": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "Full prompt texts are provided in Appendix A.11 (multi-generation baselines) and A.12 (verbalized confidence) for all three judge formulations (PaV, PaS, PaL), including the complete templates with placeholder variables."
    145       },
    146       "hyperparameters_reported": {
    147         "applies": true,
    148         "answer": true,
    149         "justification": "Appendix A.5 reports probe hyperparameters: learning rate 10^-4, weight decay 0.01, batch size 4, 10 epochs, 4000 training examples. Baseline hyperparameters: optimal N=10, temperature=0.7 for consistency/majority (Section 3, Baselines)."
    150       },
    151       "scaffolding_described": {
    152         "applies": false,
    153         "answer": false,
    154         "justification": "No agentic scaffolding is used. The method is a linear probe on model hidden states, not an agentic system."
    155       },
    156       "data_preprocessing_documented": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 3 describes the datasets and their composition. Appendix A.5 specifies how training data was selected: '4000 examples randomly selected from the PPE datasets, with 2000 from all of the PPE correctness datasets and 2000 from the PPE preference dataset. The rest (~10K examples) is used for in-distribution evaluation.'"
    160       }
    161     },
    162     "limitations_and_scope": {
    163       "limitations_section_present": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "A dedicated 'Limitations' section follows the Conclusion, discussing the need for hidden state access, labeled data with ground truth, and probe retraining when judge models are updated."
    167       },
    168       "threats_to_validity_specific": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "The Limitations section provides specific threats: 'it requires access to the judge's hidden states in the middle layers and a labeled dataset with ground truth verdicts. The method doesn't work for domains where ground truth verdicts aren't available.' Also: 'if the judge is retrained or finetuned, the probe may also need to be retrained.'"
    172       },
    173       "scope_boundaries_stated": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "The paper does not explicitly state what the results do NOT show. The limitations section discusses method requirements but does not bound the scope of the claims (e.g., it does not discuss that results are limited to pairwise judgment tasks, or that the PPE-based training data limits domain coverage)."
    177       }
    178     },
    179     "data_integrity": {
    180       "raw_data_available": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "Raw probe predictions, hidden state representations, and per-example results are not made available. Only aggregated metrics are reported in tables."
    184       },
    185       "data_collection_described": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "Section 3 describes the data sources: PPE dataset from LM Arena with specific sample counts (10.2K preference, 12.7K correctness), JudgeBench (620 samples), and RewardBench (3K samples). The composition and sourcing of each dataset is described."
    189       },
    190       "recruitment_methods_described": {
    191         "applies": false,
    192         "answer": false,
    193         "justification": "No human participants were recruited. The paper uses existing public benchmarks."
    194       },
    195       "data_pipeline_documented": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "The pipeline from data to results is documented: hidden states extracted from last token at selected layers (A.5), trained on 4000 examples, evaluated on remaining ~10K in-distribution and two OOD benchmarks. Three train-test splits are used and averaged."
    199       }
    200     },
    201     "conflicts_of_interest": {
    202       "funding_disclosed": {
    203         "applies": true,
    204         "answer": false,
    205         "justification": "No funding or acknowledgments section is present in the paper. There is no mention of funding sources, grants, or sponsors."
    206       },
    207       "affiliations_disclosed": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "Author affiliations are clearly stated: '1FAIR at Meta, 2Meta Superintelligence Labs' (page 1)."
    211       },
    212       "funder_independent_of_outcome": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "All authors are from Meta (FAIR and Meta Superintelligence Labs). Meta has a financial interest in demonstrating that LLM judges can be efficiently calibrated, as this supports Meta's model deployment and evaluation infrastructure. The funder is not independent of the outcome."
    216       },
    217       "financial_interests_declared": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No competing interests or financial interests statement is present in the paper."
    221       }
    222     },
    223     "contamination": {
    224       "training_cutoff_stated": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "The paper evaluates multiple pre-trained models (LLaMA, Qwen, GPT-OSS) on benchmarks but does not state the training data cutoff dates for any of these models."
    228       },
    229       "train_test_overlap_discussed": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "No discussion of whether the benchmark data (PPE, JudgeBench, RewardBench) may have appeared in the training data of the judge models being evaluated."
    233       },
    234       "benchmark_contamination_addressed": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "The underlying benchmarks within PPE Correctness (MMLU-Pro, MATH, GPQA, MBPP-Plus, IFEval) are well-known public benchmarks that could plausibly appear in training data. This contamination risk is not addressed."
    238       }
    239     },
    240     "human_studies": {
    241       "pre_registered": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "No human participants in this study."
    245       },
    246       "irb_or_ethics_approval": {
    247         "applies": false,
    248         "answer": false,
    249         "justification": "No human participants in this study."
    250       },
    251       "demographics_reported": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants in this study."
    255       },
    256       "inclusion_exclusion_criteria": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants in this study."
    260       },
    261       "randomization_described": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants in this study."
    265       },
    266       "blinding_described": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants in this study."
    270       },
    271       "attrition_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants in this study."
    275       }
    276     },
    277     "cost_and_practicality": {
    278       "inference_cost_reported": {
    279         "applies": true,
    280         "answer": false,
    281         "justification": "The paper claims '~10x computational savings' compared to multi-generation baselines and mentions 'order-of-magnitude computational savings' but does not report actual inference cost, latency, or token consumption for the probe-based approach."
    282       },
    283       "compute_budget_stated": {
    284         "applies": true,
    285         "answer": false,
    286         "justification": "No total computational budget is stated. The paper does not report GPU hours, hardware used, total API spend, or training time for the probes."
    287       }
    288     }
    289   },
    290   "claims": [
    291     {
    292       "claim": "Linear probes trained with Brier score loss achieve superior calibration compared to verbalized confidence and multi-generation baselines across model families.",
    293       "evidence": "Tables 1-3 show probes achieve the lowest Kuiper/ECE scores on PPE Correctness and PPE Preference across dense and MoE models. Section 4 reports '70-92% improvements over multi-generation methods and 64-87% improvement over verbalized methods' for Llama models.",
    294       "supported": "strong"
    295     },
    296     {
    297       "claim": "Probes provide approximately 10x computational savings compared to multi-generation methods.",
    298       "evidence": "Section 3 states multi-generation baselines require N=10 independent samples vs. a single-pass probe. The abstract and conclusion claim '~10x computational savings.' However, no actual wall-clock time or cost measurements are provided.",
    299       "supported": "moderate"
    300     },
    301     {
    302       "claim": "Probes generalize robustly to unseen evaluation domains.",
    303       "evidence": "Table 2 shows probes outperform baselines on JudgeBench (OOD) across all model families. However, probes underperform on RewardBench, which the paper attributes to conservative estimation on easier datasets.",
    304       "supported": "moderate"
    305     },
    306     {
    307       "claim": "Probes produce conservative estimates that underperform on easier datasets but may benefit safety-critical deployments.",
    308       "evidence": "Section 4 shows probes lag on RewardBench where models have high accuracy. Appendix A.2 shows probes yield higher accuracy among the most confident predictions but assign fewer high-confidence scores. The safety-critical benefit is argued but not empirically demonstrated.",
    309       "supported": "moderate"
    310     },
    311     {
    312       "claim": "Brier score loss at the middle layer of models yields the most well-calibrated uncertainty estimates.",
    313       "evidence": "Appendix A.7 (Table 8) compares loss functions and shows MSE/Brier score loss outperforms focal loss variants. Appendix A.6 (Figure 4) shows middle layers outperform early/late layers.",
    314       "supported": "strong"
    315     }
    316   ],
    317   "methodology_tags": [
    318     "benchmark-eval"
    319   ],
    320   "key_findings": "Linear probes trained on LLM judge hidden states with Brier score loss achieve substantially better calibration than verbalized confidence and multi-generation baselines, with approximately 10x computational savings. The approach generalizes to unseen benchmarks (JudgeBench) but produces conservative estimates that underperform on easier datasets (RewardBench) where overconfident baselines appear deceptively well-calibrated. The method works across dense and Mixture-of-Experts architectures, with middle transformer layers providing the best probe performance.",
    321   "red_flags": [
    322     {
    323       "flag": "Company evaluating own models",
    324       "detail": "All authors are from Meta (FAIR and Superintelligence Labs). The J1 family of fine-tuned judges, which performs well with probes, is Meta's own product (Whitehouse et al., 2025), and the Llama models are also Meta's. GPT-OSS, by contrast, is an open-weight OpenAI model, and Qwen is likewise not a Meta model. The evaluation includes Meta's models prominently."
    325     },
    326     {
    327       "flag": "No statistical significance tests",
    328       "detail": "Comparisons between probe and baseline methods are based on point estimates of Kuiper/ECE across three train-test splits. Despite claiming 'standard deviations were near zero,' no significance tests are reported and the actual standard deviations are not shown in any table."
    329     },
    330     {
    331       "flag": "Computational savings not quantified",
    332       "detail": "The paper repeatedly claims '~10x computational savings' but this is derived from the theoretical N=10 generation count of consistency baselines, not from actual wall-clock time or cost measurements. The overhead of extracting and storing hidden states is not quantified."
    333     },
    334     {
    335       "flag": "No code release",
    336       "detail": "Despite presenting a method claimed to be a practical 'plug-and-play solution for deployment at scale,' no code is released, making it difficult for practitioners to verify or adopt the approach."
    337     }
    338   ],
    339   "cited_papers": [
    340     {
    341       "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena",
    342       "authors": ["Lianmin Zheng", "Wei-Lin Chiang", "Ying Sheng"],
    343       "year": 2023,
    344       "relevance": "Foundational work on the LLM-as-judge paradigm used for evaluating model outputs, directly relevant to the survey's coverage of LLM evaluation methodology."
    345     },
    346     {
    347       "title": "Can LLMs Express Their Uncertainty? An Empirical Evaluation of Confidence Elicitation in LLMs",
    348       "authors": ["Miao Xiong", "Zhiyuan Hu", "Xinyang Lu"],
    349       "year": 2024,
    350       "relevance": "Empirical evaluation of LLM confidence calibration methods, relevant to assessing how well models can self-assess their outputs."
    351     },
    352     {
    353       "title": "Trust or Escalate: LLM Judges with Provable Guarantees for Human Agreement",
    354       "authors": ["Jaehun Jung", "Faeze Brahman", "Yejin Choi"],
    355       "year": 2024,
    356       "relevance": "Addresses the reliability of LLM judges with uncertainty-aware routing, directly relevant to the survey's evaluation of LLM judge methodology."
    357     },
    358     {
    359       "title": "Self-Consistency Improves Chain of Thought Reasoning in Language Models",
    360       "authors": ["Xuezhi Wang", "Jason Wei", "Dale Schuurmans"],
    361       "year": 2022,
    362       "relevance": "Foundational work on self-consistency as an uncertainty estimation method for LLMs, relevant to understanding multi-generation approaches to calibration."
    363     },
    364     {
    365       "title": "J1: Incentivizing Thinking in LLM-as-a-Judge via Reinforcement Learning",
    366       "authors": ["Chenxi Whitehouse", "Tianlu Wang", "Ping Yu"],
    367       "year": 2025,
    368       "relevance": "State-of-the-art fine-tuned LLM judge family evaluated in this paper, relevant to the survey's coverage of LLM judge quality."
    369     },
    370     {
    371       "title": "RewardBench: Evaluating Reward Models for Language Modeling",
    372       "authors": ["Nathan Lambert", "Valentina Pyatkin", "Jacob Morrison"],
    373       "year": 2024,
    374       "arxiv_id": "2403.13787",
    375       "relevance": "Benchmark for evaluating reward models, relevant to the survey's coverage of LLM evaluation benchmarks and methodology."
    376     },
    377     {
    378       "title": "JudgeBench: A Benchmark for Evaluating LLM-Based Judges",
    379       "authors": ["Sijun Tan", "Siyuan Zhuang", "Kyle Montgomery"],
    380       "year": 2024,
    381       "relevance": "Benchmark specifically designed for evaluating LLM judge quality, relevant to understanding evaluation methodology for LLM-as-judge systems."
    382     },
    383     {
    384       "title": "Semantic Entropy Probes: Robust and Cheap Hallucination Detection in LLMs",
    385       "authors": ["Jannik Kossen", "Jiatong Han", "Muhammed Razzak"],
    386       "year": 2024,
    387       "arxiv_id": "2406.15927",
    388       "relevance": "Related interpretability-based approach for uncertainty estimation in LLMs, relevant to the survey's coverage of LLM reliability methods."
    389     },
    390     {
    391       "title": "Overconfidence in LLM-as-a-Judge: Diagnosis and Confidence-Driven Solution",
    392       "authors": ["Zailong Tian", "Zhuoheng Han", "Yanzhe Chen"],
    393       "year": 2025,
    394       "relevance": "Directly addresses LLM judge overconfidence, a key reliability concern in the LLM evaluation pipeline."
    395     },
    396     {
    397       "title": "Discovering Latent Knowledge in Language Models Without Supervision",
    398       "authors": ["Collin Burns", "Haotian Ye", "Dan Klein", "Jacob Steinhardt"],
    399       "year": 2023,
    400       "relevance": "Foundational work on extracting knowledge from LLM internal representations, relevant to interpretability-based approaches for LLM reliability."
    401     },
    402     {
    403       "title": "How to Evaluate Reward Models for RLHF",
    404       "authors": ["Evan Frick", "Tianle Li", "Connor Chen"],
    405       "year": 2024,
    406       "relevance": "Source of the PPE dataset used for training and evaluation in this work, relevant to understanding evaluation methodology for RLHF reward models."
    407     }
    408   ]
    409 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs