scan-v5.json (20850B)
1 { 2 "scan_version": 5, 3 "paper_type": "benchmark-creation", 4 "paper": { 5 "title": "Martingale Score: An Unsupervised Metric for Bayesian Rationality in LLM Reasoning", 6 "authors": [ 7 "Zhonghao He", 8 "Tianyi Alex Qiu", 9 "Hirokazu Shirado", 10 "Maarten Sap" 11 ], 12 "year": 2025, 13 "venue": "NeurIPS 2025", 14 "arxiv_id": "2512.02914", 15 "doi": "10.48550/arXiv.2512.02914" 16 }, 17 "checklist": { 18 "claims_and_evidence": { 19 "abstract_claims_supported": { 20 "applies": true, 21 "answer": true, 22 "justification": "Abstract claims of widespread positive Martingale Scores are supported (51/54 CoT setups positive, Table 1), and validation via Brier Score correlation (Figure 4–5) is demonstrated for Forecasting and partially for OpenReview.", 23 "source": "haiku" 24 }, 25 "causal_claims_justified": { 26 "applies": true, 27 "answer": false, 28 "justification": "Section 6 claims to 'causally attribute belief entrenchment to various factors' and Figure 7 is labeled 'causal contribution,' but the analysis is regression on observational data; model selection is not randomized, making causal attribution unsupported.", 29 "source": "haiku" 30 }, 31 "generalization_bounded": { 32 "applies": true, 33 "answer": false, 34 "justification": "Abstract states violations are 'widespread across models, reasoning paradigms, problem domains, and system prompts,' but the paper explicitly acknowledges it did not test reinforced reasoning (a major current paradigm), undermining the breadth of the generalization.", 35 "source": "haiku" 36 }, 37 "alternative_explanations_discussed": { 38 "applies": true, 39 "answer": false, 40 "justification": "The paper tests for judge bias (Section 6.2) but does not discuss the key alternative explanation — that models with high initial confidence may simply be less likely to update AND less accurate due to calibration, producing the same correlation without 'entrenchment' as the mechanism.", 41 "source": "haiku" 42 }, 43 "proxy_outcome_distinction": { 44 "applies": true, 45 "answer": true, 46 "justification": "Section 5.1 explicitly distinguishes 'expressed belief' (LLM judge-assigned probability) from 'internal belief,' noting the latter is an open research problem; the paper consistently measures only expressed beliefs.", 47 "source": "haiku" 48 } 49 }, 50 "limitations_and_scope": { 51 "limitations_section_present": { 52 "applies": true, 53 "answer": true, 54 "justification": "A dedicated 'Limitations' subsection appears at the end of Section 7 (Conclusion), addressing specific gaps in the study.", 55 "source": "haiku" 56 }, 57 "threats_to_validity_specific": { 58 "applies": true, 59 "answer": true, 60 "justification": "Limitations specifically name: (1) OpenReview Brier-score correlation not demonstrated due to noisy community-voted labels, (2) reinforced reasoning not studied despite its popularity, (3) only internal reasoning analyzed — not external evidence seeking.", 61 "source": "haiku" 62 }, 63 "scope_boundaries_stated": { 64 "applies": true, 65 "answer": true, 66 "justification": "The paper explicitly states the Martingale Score covers internal reasoning only (not external evidence updating) and that reinforced reasoning is out of scope; Future Work acknowledges these as open questions.", 67 "source": "haiku" 68 } 69 }, 70 "conflicts_of_interest": { 71 "funding_disclosed": { 72 "applies": true, 73 "answer": true, 74 "justification": "Acknowledgments section names Foresight Institute, Lambda Cloud, Open Philanthropy, and Cosmos Institute as financial supporters.", 75 "source": "haiku" 76 }, 77 "affiliations_disclosed": { 78 "applies": true, 79 "answer": true, 80 "justification": "Author affiliations (University of Cambridge, Peking University, Carnegie Mellon University) are disclosed on the title page; none are affiliated with the commercial LLM providers evaluated.", 81 "source": "haiku" 82 }, 83 "funder_independent_of_outcome": { 84 "applies": true, 85 "answer": true, 86 "justification": "Funders (Foresight Institute, Lambda Cloud, Open Philanthropy, Cosmos Institute) are not developers of the LLMs evaluated; Lambda Cloud is a compute provider with no stake in results.", 87 "source": "haiku" 88 }, 89 "financial_interests_declared": { 90 "applies": true, 91 "answer": false, 92 "justification": "There is no competing interests or conflict-of-interest statement anywhere in the paper; only funding acknowledgment is provided.", 93 "source": "haiku" 94 } 95 }, 96 "scope_and_framing": { 97 "key_terms_defined": { 98 "applies": true, 99 "answer": true, 100 "justification": "Section 3.1 formally defines 'belief entrenchment,' 'belief,' and 'belief update'; Section 4.1 defines the Martingale Score operationally with an equation; Section 5.1 defines 'expressed belief' vs 'internal belief.'", 101 "source": "haiku" 102 }, 103 "intended_contribution_clear": { 104 "applies": true, 105 "answer": true, 106 "justification": "A dedicated 'Contributions' bullet list (Introduction) explicitly enumerates three contributions: the metric definition, the empirical prevalence finding, and the accuracy-entrenchment connection.", 107 "source": "haiku" 108 }, 109 "engagement_with_prior_work": { 110 "applies": true, 111 "answer": true, 112 "justification": "Section 2 engages substantively with Bayesian rationality literature, cognitive bias studies, and truthful/truth-seeking AI research, explaining how this work fills the gap left by synthetic or domain-specific prior approaches.", 113 "source": "haiku" 114 } 115 } 116 }, 117 "type_checklist": { 118 "benchmark-creation": { 119 "construct_design": { 120 "construct_validity_argued": { 121 "applies": true, 122 "answer": true, 123 "justification": "Section 4.2 and Appendix A provide formal proofs that the Martingale property is a prerequisite for Bayesian rationality, making its violation a principled (not merely assumed) indicator of belief entrenchment.", 124 "source": "haiku" 125 }, 126 "difficulty_distribution_characterized": { 127 "applies": true, 128 "answer": false, 129 "justification": "The paper describes three domains at a high level (fact-based vs. judgment-based) but does not characterize the difficulty distribution of individual questions within domains or provide easy/medium/hard tiers.", 130 "source": "haiku" 131 }, 132 "ceiling_floor_effects_checked": { 133 "applies": true, 134 "answer": false, 135 "justification": "Brier Scores in Figure 4 range from ~0.1–0.4, suggesting some discrimination, but the paper never explicitly checks for ceiling/floor effects in belief scores or Martingale Score distributions.", 136 "source": "haiku" 137 }, 138 "human_baseline_included": { 139 "applies": true, 140 "answer": false, 141 "justification": "Human evaluators are used only for judge-consistency validation (Table 2, 18–20 problems each), not as a performance baseline on the forecasting or other benchmark tasks themselves.", 142 "source": "haiku" 143 }, 144 "scoring_rubric_justified": { 145 "applies": true, 146 "answer": true, 147 "justification": "Section 4.1 explicitly justifies the choice of OLS β₁ over R², logistic regression, and other alternatives, explaining why each alternative introduces confounders or neglects magnitude information.", 148 "source": "haiku" 149 } 150 }, 151 "robustness": { 152 "contamination_resistance_designed": { 153 "applies": true, 154 "answer": true, 155 "justification": "Section 5.2 explicitly selects domains requiring post-training-cutoff knowledge (post-August 2024 events for forecasting, ongoing ICLR reviews) as a deliberate contamination-resistance design choice.", 156 "source": "haiku" 157 }, 158 "temporal_robustness_discussed": { 159 "applies": true, 160 "answer": false, 161 "justification": "The paper does not discuss whether the Martingale Score framework will remain valid or interpretable as models and training procedures change over time; there is no plan for updating the evaluation domains.", 162 "source": "haiku" 163 }, 164 "failure_modes_discussed": { 165 "applies": true, 166 "answer": true, 167 "justification": "The Limitations section identifies three failure modes: correlation with Brier Score not demonstrated on OpenReview due to noisy labels, no coverage of reinforced reasoning, and no external-evidence component.", 168 "source": "haiku" 169 }, 170 "baseline_implementations_provided": { 171 "applies": true, 172 "answer": true, 173 "justification": "Appendix C provides full implementation details (prompts, hyperparameters, compute resources), and the paper states 'Our code and data can be found in the supplementary materials.'", 174 "source": "haiku" 175 } 176 }, 177 "documentation": { 178 "dataset_documentation_complete": { 179 "applies": true, 180 "answer": false, 181 "justification": "The three domains are identified by source (Metaculus, Polymarket, r/CMV, OpenReview) but no data cards, preprocessing steps, or collection methodology are documented; sample sizes are mentioned but filtering criteria are not detailed.", 182 "source": "haiku" 183 }, 184 "licensing_and_access_clear": { 185 "applies": true, 186 "answer": false, 187 "justification": "Code is said to be in supplementary materials but no license is stated; the licensing terms for the derived evaluation framework are never specified.", 188 "source": "haiku" 189 }, 190 "intended_use_specified": { 191 "applies": true, 192 "answer": false, 193 "justification": "The paper discusses what the Martingale Score measures and its validation, but never explicitly states what it should NOT be used to conclude or what misapplications to avoid.", 194 "source": "haiku" 195 } 196 } 197 } 198 }, 199 "claims": [ 200 { 201 "claim": "Belief entrenchment (positive Martingale Score) is prevalent across LLM models, CoT reasoning, and domains — 51/54 CoT setups show positive scores.", 202 "evidence": "Table 1 shows positive Martingale Scores in 51 of 54 CoT configurations across 6 models, 3 domains, and 3 prompt types.", 203 "supported": "strong" 204 }, 205 { 206 "claim": "Higher Martingale Score (more entrenchment) is associated with higher Brier Score (lower accuracy) in Forecasting and OpenReview domains.", 207 "evidence": "Figure 4 shows positive correlation between |Martingale Score| and Brier Score; Figure 5 shows significance in regression models M4–M6 (p<0.05), though R² values are small (max 0.067).", 208 "supported": "moderate" 209 }, 210 { 211 "claim": "Debate technique reduces belief entrenchment compared to Chain-of-Thought reasoning.", 212 "evidence": "Figure 7(a) shows debate has a statistically significant negative coefficient relative to CoT in the regression analysis; Table 1 shows many negative Martingale Scores under Debate.", 213 "supported": "moderate" 214 }, 215 { 216 "claim": "DeepSeek R1 exhibits significantly lower belief entrenchment than all other tested models.", 217 "evidence": "Figure 7(a) shows DeepSeek R1 has a significant negative coefficient relative to the baseline in the factor regression; Figure 7(b) shows its updates more often point toward ground truth.", 218 "supported": "moderate" 219 }, 220 { 221 "claim": "Prior-conforming system prompts substantially increase belief entrenchment; critical thinking prompts show minimal improvement over no prompt.", 222 "evidence": "Mean Martingale Scores reported as M_prior-conforming=0.082±0.018, M_no-prompt=0.075±0.014, M_critical-thinking=0.072±0.018 (95% CI); difference between critical and none is not statistically significant.", 223 "supported": "moderate" 224 }, 225 { 226 "claim": "Belief entrenchment is more severe in judgment-based domains (r/ChangeMyView, OpenReview) than in fact-based domains (Forecasting).", 227 "evidence": "Table 1 shows consistently higher Martingale Scores for r/CMV and OpenReview vs. Forecasting for the same model/prompt combinations; Appendix B regression confirms domain is a significant factor.", 228 "supported": "strong" 229 }, 230 { 231 "claim": "The Martingale Score correlation with Brier Score holds after controlling for domain, reasoning technique, model, and prompt confounders.", 232 "evidence": "Figure 5 shows abs(Martingale) coefficient remains significant in M4 (+Domain +RM, p=0.011) and M5/M6 specifications, though R² contribution is small.", 233 "supported": "moderate" 234 } 235 ], 236 "methodology_tags": [ 237 "benchmark-eval", 238 "observational", 239 "theoretical" 240 ], 241 "key_findings": "The Martingale Score, an OLS regression coefficient measuring predictability of LLM belief updates from prior beliefs, detects widespread belief entrenchment across 6 major LLMs, 3 problem domains, 2 reasoning techniques, and 3 prompt types — with 51/54 Chain-of-Thought configurations showing positive (entrenchment-indicating) scores. Higher entrenchment predicts lower forecasting accuracy (higher Brier Score) even after controlling for domain, technique, model, and prompt confounders, validating the metric as an unsupervised proxy for reasoning quality. Debate reduces entrenchment relative to CoT; DeepSeek R1 is the only outlier model with significantly lower entrenchment; prior-conforming prompts substantially increase entrenchment while critical-thinking prompts offer minimal improvement over no prompt. The metric applies to open-ended domains without ground truth, but the Brier Score validation does not replicate for OpenReview, attributed to noisy community-voted acceptance labels.", 242 "red_flags": [ 243 { 244 "flag": "Correlational ≠ causal", 245 "detail": "The paper frames Figure 7 as 'causal contribution' of factors, but the regression analysis is observational. Model selection is not randomized; comparing DeepSeek R1 vs. other models is confounded by architecture, training data, and RLHF differences." 246 }, 247 { 248 "flag": "Small R² for validation", 249 "detail": "In Figure 5, the Martingale Score explains at most 6.7% of Brier Score variance (R² increment) after controlling for confounders, suggesting entrenchment is a weak predictor of accuracy — potentially undermining the claimed validation." 250 }, 251 { 252 "flag": "OpenReview validation fails", 253 "detail": "The Martingale-Brier correlation is not demonstrated for OpenReview (a key domain), attributed to noisy labels. This is acknowledged but weakens the generality of the claim that Martingale Score predicts accuracy." 254 }, 255 { 256 "flag": "GPT-4o as judge measures GPT-4o's beliefs, not evaluated model's", 257 "detail": "The judge model (GPT-4o) assigns belief scores by reading reasoning trajectories; these are GPT-4o's probabilistic interpretations, not the evaluated model's expressed confidence — creating a potential confound when GPT-4o is also the evaluated model." 258 }, 259 { 260 "flag": "No human performance baseline", 261 "detail": "Human evaluators are used only for judge consistency (n=18–20 problems), not as a performance baseline on forecasting or paper-review tasks. Without this, it is unclear whether the benchmark discriminates meaningfully relative to human-level performance." 262 }, 263 { 264 "flag": "Reinforced reasoning excluded", 265 "detail": "The paper claims 'widespread' belief entrenchment across reasoning paradigms but explicitly excludes reinforced/inference-time-scaled reasoning (DeepSeek R1 style training), which is arguably the dominant current paradigm." 266 } 267 ], 268 "cited_papers": [ 269 { 270 "title": "Is In-Context Learning in Large Language Models Bayesian? A Martingale Perspective", 271 "relevance": "Direct predecessor work testing whether LLM in-context learning follows the Martingale property; motivates the theoretical framing of this paper." 272 }, 273 { 274 "title": "Towards Understanding Sycophancy in Language Models", 275 "relevance": "Key related work on a closely related failure mode (sycophancy); the paper contrasts belief entrenchment from internal reasoning with sycophancy from external pressure." 276 }, 277 { 278 "title": "Bayesian Teaching Enables Probabilistic Reasoning in Large Language Models", 279 "relevance": "Proposes corrective intervention for Bayesian updating failures; cited as prior work on solutions the current paper critiques for requiring auxiliary structures." 280 }, 281 { 282 "title": "Debating with More Persuasive LLMs Leads to More Truthful Answers", 283 "relevance": "Provides the debate reasoning technique evaluated as a comparison condition in this paper." 284 }, 285 { 286 "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models", 287 "relevance": "Foundational paper for the CoT reasoning technique evaluated throughout the benchmark experiments." 288 }, 289 { 290 "title": "Toward Parsimony in Bias Research: A Proposed Common Framework of Belief-Consistent Information Processing", 291 "relevance": "Psychological framework for confirmation bias that provides theoretical grounding for defining belief entrenchment." 292 }, 293 { 294 "title": "Do As We Do, Not As You Think: The Conformity of Large Language Models", 295 "relevance": "Tests group conformity in LLMs — a related reasoning failure mode studied alongside belief entrenchment in Appendix B." 296 }, 297 { 298 "title": "Inverse Scaling in Test-Time Compute", 299 "relevance": "Related finding that more reasoning can hurt performance in some domains; cited as a specific failure case of Bayesian reasoning in LLMs." 300 }, 301 { 302 "title": "Approaching Human-Level Forecasting with Language Models", 303 "relevance": "Provides domain context for the forecasting evaluation pipeline and defines the Brier Score benchmark used for validation." 304 }, 305 { 306 "title": "Automatic Evaluation Metrics for Artificially Generated Scientific Research", 307 "relevance": "Source for the OpenReview ICLR dataset used in the academic paper review domain experiments." 308 } 309 ], 310 "engagement_factors": { 311 "practical_relevance": { 312 "score": 2, 313 "justification": "The metric is unsupervised and domain-agnostic with code released, making it directly applicable by researchers evaluating LLM reasoning pipelines." 314 }, 315 "surprise_contrarian": { 316 "score": 2, 317 "justification": "The finding that Chain-of-Thought — a standard improvement technique — actually entrenches beliefs and can harm accuracy challenges the conventional narrative that more reasoning is better." 318 }, 319 "fear_safety": { 320 "score": 1, 321 "justification": "The paper raises concerns about AI belief lock-in and human-AI feedback loops reinforcing false beliefs, but frames these as future risks rather than demonstrated immediate harms." 322 }, 323 "drama_conflict": { 324 "score": 1, 325 "justification": "The paper captures and analyzes the April 2025 GPT-4o sycophancy incident (Table 3), adding a timely real-world hook to an otherwise methodological paper." 326 }, 327 "demo_ability": { 328 "score": 2, 329 "justification": "Code and data are in supplementary materials with full implementation details; the metric can be computed with any model capable of multi-step reasoning on probability questions." 330 }, 331 "brand_recognition": { 332 "score": 1, 333 "justification": "Authors are from CMU and Cambridge (reputable but not top-tier AI lab brand names); evaluates GPT-4o, DeepSeek, Gemini, and Llama which adds recognition through evaluated models." 334 } 335 }, 336 "hn_data": { 337 "threads": [ 338 { 339 "hn_id": "42810048", 340 "title": "CogAgent: Open-Source Alternative to OpenAI Operator Agent from China", 341 "points": 5, 342 "comments": 0, 343 "url": "https://news.ycombinator.com/item?id=42810048", 344 "created_at": "2025-01-24T02:43:31Z" 345 }, 346 { 347 "hn_id": "29471315", 348 "title": "An Impossible Asylum [pdf]", 349 "points": 1, 350 "comments": 0, 351 "url": "https://news.ycombinator.com/item?id=29471315", 352 "created_at": "2021-12-07T11:25:02Z" 353 } 354 ], 355 "top_points": 5, 356 "total_points": 6, 357 "total_comments": 0 358 } 359 }