scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25823B)
      1 {
      2   "paper": {
      3     "title": "Martingale Score: An Unsupervised Metric for Bayesian Rationality in LLM Reasoning",
      4     "authors": ["Zhonghao He", "Tianyi Qiu", "Hirokazu Shirado", "Maarten Sap"],
      5     "year": 2025,
      6     "venue": "NeurIPS 2025",
      7     "arxiv_id": "2512.02914",
      8     "doi": "10.48550/arXiv.2512.02914"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "The paper proposes the Martingale Score to measure belief entrenchment in LLM reasoning, finding that belief updates are consistently predictable from prior beliefs across 6 models, 3 domains, and multiple prompting conditions — a violation of Bayesian rationality. Belief entrenchment is worst in value-laden domains (r/ChangeMyView) and least severe in factual forecasting. The Martingale Score correlates positively with Brier Score (accuracy loss), validating it as an unsupervised proxy for reasoning quality. DeepSeek R1 shows significantly less entrenchment than other models, and debate reasoning reduces it compared to chain-of-thought.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "Appendix C states 'Our code and data can be found in the supplementary materials' but no public repository URL is provided in the paper."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The study uses publicly available datasets (Metaculus, Polymarket, r/ChangeMyView, OpenReview ICLR data) and states data is in supplementary materials."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided. Only inference API usage is mentioned."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided. Implementation details in Appendix C cover prompts and hyperparameters but not a reproducible workflow."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "95% confidence intervals are reported throughout, e.g., 'M_Prior-conforming = 0.082 ± 0.018 ... with 95% CI' (Section 6.1). Figure 7 shows coefficients with 95% CI."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "t-tests with p < 0.05 are used throughout. Table 1 marks statistically significant Martingale Scores with asterisks. Figure 5 reports p-values for regression coefficients."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Martingale Score values are reported with context: 'A positive Martingale Score M indicates that per unit increase in b_prior, there is an M-unit increase in Δb.' R² values are reported in Figure 5 (e.g., R²=0.067)."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No power analysis or explicit justification for the number of questions per domain. Only '>100 questions' per setup is mentioned without justification."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "95% CIs are reported for aggregate Martingale Scores across setups (e.g., 'M_CMV-CoT = 0.103 ± 0.013 ... with 95% CI'). Table 2 reports correlation coefficients with significance."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The Martingale property (β1 = 0) serves as the theoretical baseline. Multiple conditions serve as empirical baselines: no-prompt vs. prior-conforming vs. critical-thinking, CoT vs. debate."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "All evaluated models are contemporary (GPT-4o, DeepSeek R1/V3, Gemini 2.0 Flash, Llama 4 Scout/Maverick). The study compares against the Martingale property itself rather than prior methods."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Figure 7 shows causal contribution of each factor (domain, reasoning technique, model, prompt) via regression analysis, effectively ablating the contribution of each variable to belief entrenchment."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Two primary metrics: Martingale Score (process-based, unsupervised) and Brier Score (outcome-based accuracy). Also inter-rater agreement (Pearson r, Spearman ρ) for judge validation."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Table 2 reports human-LLM agreement with two human evaluators who performed belief evaluations in the same format as LLM judges, validating the LLM-as-judge methodology."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Forecasting questions are explicitly selected to resolve after the models' knowledge cutoff (Section 5.2), creating a temporal held-out set. The design prevents memorization."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Table 1 provides full breakdown by model × domain × prompt × reasoning technique. Figure 7 breaks down factor contributions. Figure 6 shows belief update patterns."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Figure 1 shows an example of belief entrenchment. The paper discusses where entrenchment is worst (value-laden domains, prior-conforming prompts) and the Limitations section notes OpenReview ground truth issues."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Table 1 shows negative Martingale Scores for debate in some setups (e.g., GPT-4o debate on forecasting: -0.0439). Critical thinking prompt shows minimal improvement over no prompt. OpenReview correlation with Brier Score is not demonstrated."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims of 'widespread' violations are supported by Table 1 (51/54 CoT setups show positive M). Claim that M predicts accuracy is supported by Figure 4 and Figure 5 correlation analysis."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Causal claims are supported by controlled manipulation of prompts (prior-conforming vs. critical thinking) and regression analysis controlling for confounders (Figure 5, Eq. 7). The regression includes domain, reasoning technique, model, and prompt as controls."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Claims are generally bounded to the tested models, domains, and reasoning techniques. Limitations section acknowledges lack of reinforced reasoning evaluation and external evidence seeking. The paper specifies which models and domains were tested."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Section 6.1 ('Belief Entrenchment is Not an Artifact') explicitly rules out artifacts. Section 6.2 validates the judge. Figure 5 controls for confounders. The Limitations section discusses noisy OpenReview ground truth as an alternative explanation."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper explicitly frames the Martingale Score as a process-based proxy that 'predicts ground-truth accuracy' (Brier Score) and validates this connection empirically in Figure 4. The gap between the unsupervised metric and actual accuracy is discussed."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "GPT-4o is specified as '(May 7)' version. However, DeepSeek R1, DeepSeek V3, Gemini 2.0 Flash, Llama 4 Scout, and Llama 4 Maverick lack specific version IDs or snapshot dates."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Full prompt text is provided in Appendix C.2, including system prompts (C.1), belief measurement prompts, CoT prompts, debate prompts, and OpenReview question construction."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Appendix C.3 states: 'temperature of 0.1 for models under evaluation, 0.3 for belief measurement. The only exception is Gemini 2.0 Flash, with which we use a temperature of 1.0.'"
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. The pipeline is straightforward prompting with a judge model."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "Section 5.2 describes domain selection criteria but does not detail how specific questions were selected or filtered from each platform. No counts of questions before/after filtering are provided."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Explicit 'Limitations' subsection in Section 7 discusses OpenReview ground truth noise and lack of reinforced reasoning evaluation."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Specific threats discussed: OpenReview ground truth is community-voted and noisy (Section 7), resource constraints prevented reinforced reasoning study, belief measurement relies on LLM judge (validated in Section 6.2)."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Future Work (Section 7) explicitly states what was not studied: reinforced reasoning, external evidence searching, downstream consequences in open-ended domains. The Limitations section notes that the OpenReview correlation was not demonstrated."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "Appendix C states 'Our code and data can be found in the supplementary materials' but no public URL is provided. Supplementary materials availability depends on venue publication."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 5.2 describes data sources: Metaculus and Polymarket for forecasting, r/ChangeMyView for value-laden questions, OpenReview ICLR submissions for paper review. Selection criteria are stated."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": true,
    195         "answer": false,
    196         "justification": "Two human evaluators are used for judge validation (Table 2) but their recruitment, background, and selection criteria are not described."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "The pipeline from data collection to Martingale Score computation is outlined conceptually (Figure 3) but intermediate filtering steps and question counts per stage are not documented."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Section 8 acknowledges 'the Foresight Institute, Lambda Cloud, Open Philanthropy, and Cosmos Institute for financial support.'"
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Authors are from University of Cambridge, Peking University, and Carnegie Mellon University. No affiliation with evaluated model providers."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Funders (Foresight Institute, Lambda Cloud, Open Philanthropy, Cosmos Institute) have no direct financial stake in which models exhibit more or less belief entrenchment."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The paper mentions 'If a model was trained up to August 2024' as a hypothetical example (Section 5.2) but does not state the actual training cutoff dates for each evaluated model."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": true,
    235         "justification": "Section 5.2 explicitly requires questions 'not solvable by memorization' and designs experiments using events resolved after model knowledge cutoff, directly addressing temporal train/test overlap."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": true,
    240         "justification": "The experimental design uses questions that 'cannot be answered using information seen during pretraining' (Section 5.2), with ground truth resolved after model training cutoffs. This temporal split addresses contamination by design."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "The two human evaluators are used for judge validation only, not as participants in a human subjects study. The main study evaluates LLM behavior."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human subjects study is conducted. The two human evaluators provide validation annotations, not experimental data."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human subjects study. Human evaluators are used only for judge agreement validation."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human subjects study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human subjects study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human subjects study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human subjects study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "Appendix C.3 states: 'This study is carried out entirely with API-based inference, with a total cost of 1,500 USD.'"
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "Appendix C.3: 'a total cost of 1,500 USD' for API-based inference. No GPU compute was used."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No mention of multiple random seeds or seed sensitivity analysis. Temperature is set to 0.1 (low randomness) but no explicit seed variation is reported."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The number of questions per setup ('>100') is mentioned once in passing but the exact number of runs per configuration is not explicitly stated."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No hyperparameter search budget is reported. Temperature values (0.1, 0.3, 1.0) are stated without justification for their selection."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "The paper does not select a best configuration — it reports results across all configurations (Table 1) to study the phenomenon, not to optimize a system."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "Table 1 reports 108 Martingale Scores with individual p < 0.05 significance tests but no correction for multiple comparisons (e.g., Bonferroni) is applied."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors propose the Martingale Score and evaluate it without acknowledging potential bias in evaluating their own metric. No independent evaluation or discussion of author-evaluation bias."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": false,
    325         "answer": false,
    326         "justification": "All models are evaluated via API at similar cost. Compute budget differences are negligible and not a relevant confound for this study."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": true,
    331         "justification": "The paper extensively discusses what the Martingale Score measures (Bayesian rationality violation) and validates it against Brier Score (Figures 4-5). Theoretical justification in Section 4.2 and Appendix A."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No scaffolding is involved. Models are evaluated via direct API prompting."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": true,
    343         "justification": "Section 5.2 criterion 3 explicitly requires 'Ground truth becomes available after models' knowledge cut-off,' designing the study to prevent temporal leakage."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No explicit discussion of whether the evaluation setup (e.g., judge prompts, question framing) could leak answer information to the evaluated models."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether questions within each domain are independent or share structural similarities that could bias results."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": true,
    358         "justification": "Temporal splits are used as a concrete leakage prevention method: only questions resolved after model training cutoff are used (Section 5.2)."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "Belief entrenchment is pervasive across LLM reasoning setups, with 51 out of 54 CoT configurations showing positive Martingale Scores.",
    365       "evidence": "Table 1 shows positive Martingale Scores in 51/54 CoT setups across 6 models, 3 domains, and 3 prompt types. Many are statistically significant (p < 0.05).",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "The Martingale Score predicts accuracy loss: higher entrenchment correlates with worse Brier Scores in forecasting.",
    370       "evidence": "Figure 4 shows positive correlation between |Martingale Score| and Brier Score. Figure 5 shows the coefficient is statistically significant (p = 0.011-0.043) after controlling for confounders.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "Belief entrenchment is more severe in value-laden domains (r/ChangeMyView) than factual domains (Forecasting).",
    375       "evidence": "Table 1 consistently shows higher M values for ChangeMyView vs Forecasting across all models and prompts. Regression analysis in Figure 7(a) confirms domain as a significant factor.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "DeepSeek R1 shows significantly less belief entrenchment than all other tested models.",
    380       "evidence": "Figure 7(a) shows DeepSeek R1 with significantly lower contribution to entrenchment. Figure 7(b) shows R1's belief updates more likely to move toward ground truth.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Debate reasoning reduces belief entrenchment compared to chain-of-thought.",
    385       "evidence": "Figure 7(a) shows debate with lower entrenchment coefficient. However, Table 1 shows mixed results for debate (some negative M values, many non-significant).",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Critical thinking prompts do not significantly reduce belief entrenchment compared to no prompt.",
    390       "evidence": "Section 6.1 reports M_Critical-thinking = 0.072 ± 0.018 vs M_No-prompt = 0.075 ± 0.014 (95% CI), overlapping intervals. Figure 7(a) confirms the difference is not significant.",
    391       "supported": "strong"
    392     }
    393   ],
    394   "red_flags": [
    395     {
    396       "flag": "No multiple comparison correction",
    397       "detail": "Table 1 reports 108 individual significance tests (p < 0.05) without any family-wise error rate correction. Some significant results may be false positives."
    398     },
    399     {
    400       "flag": "LLM-as-judge for belief measurement",
    401       "detail": "The core measurement (belief extraction) relies on GPT-4o as judge. While cross-judge and human agreement are validated (Table 2), systematic judge biases could affect the Martingale Score calculation. Human validation is limited to 2 evaluators, who rated 20 and 18 problems respectively."
    402     },
    403     {
    404       "flag": "Missing model version specificity",
    405       "detail": "Most models lack specific version IDs or snapshot dates. Model behavior changes across versions, which could affect reproducibility."
    406     }
    407   ],
    408   "cited_papers": [
    409     {
    410       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    411       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    412       "year": 2022,
    413       "relevance": "Foundational reasoning technique evaluated in this study for belief entrenchment."
    414     },
    415     {
    416       "title": "DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning",
    417       "authors": ["Daya Guo"],
    418       "year": 2025,
    419       "arxiv_id": "2501.12948",
    420       "relevance": "Reinforced reasoning model that showed significantly less belief entrenchment than other models."
    421     },
    422     {
    423       "title": "Towards understanding sycophancy in language models",
    424       "authors": ["Mrinank Sharma"],
    425       "year": 2023,
    426       "arxiv_id": "2310.13548",
    427       "relevance": "Directly relevant to LLM cognitive biases — sycophancy is a related form of belief-update failure."
    428     },
    429     {
    430       "title": "Relying on the unreliable: The impact of language models' reluctance to express uncertainty",
    431       "authors": ["Kaitlyn Zhou"],
    432       "year": 2024,
    433       "relevance": "Studies LLM confidence calibration and uncertainty expression, directly related to belief entrenchment."
    434     },
    435     {
    436       "title": "Debating with more persuasive LLMs leads to more truthful answers",
    437       "authors": ["Akbir Khan"],
    438       "year": 2024,
    439       "arxiv_id": "2402.06782",
    440       "relevance": "Debate reasoning technique evaluated in this study as a potential mitigation for belief entrenchment."
    441     },
    442     {
    443       "title": "TruthfulQA: Measuring how models mimic human falsehoods",
    444       "authors": ["Stephanie Lin", "Jacob Hilton", "Owain Evans"],
    445       "year": 2021,
    446       "arxiv_id": "2109.07958",
    447       "relevance": "Benchmark for LLM truthfulness, related to truth-seeking evaluation."
    448     },
    449     {
    450       "title": "Sleeper agents: Training deceptive LLMs that persist through safety training",
    451       "authors": ["Evan Hubinger"],
    452       "year": 2024,
    453       "arxiv_id": "2401.05566",
    454       "relevance": "Studies LLM deception which is related to the distinction between truthful and truth-seeking AI."
    455     },
    456     {
    457       "title": "Approaching human-level forecasting with language models",
    458       "authors": ["Danny Halawi"],
    459       "year": 2024,
    460       "arxiv_id": "2402.18563",
    461       "relevance": "LLM forecasting evaluation directly relevant to this paper's forecasting domain experiments."
    462     },
    463     {
    464       "title": "Beyond accuracy: Evaluating the reasoning behavior of large language models – a survey",
    465       "authors": ["Philipp Mondorf", "Barbara Plank"],
    466       "year": 2024,
    467       "arxiv_id": "2404.01869",
    468       "relevance": "Survey on process-based reasoning evaluation, motivating the need for metrics beyond outcome accuracy."
    469     },
    470     {
    471       "title": "Inverse scaling in test-time compute",
    472       "authors": ["Aryo Pradipta Gema"],
    473       "year": 2025,
    474       "arxiv_id": "2507.14417",
    475       "relevance": "Demonstrates reasoning failure modes where more compute hurts performance, related to belief entrenchment."
    476     },
    477     {
    478       "title": "AI-LieDar: Examine the trade-off between utility and truthfulness in LLM agents",
    479       "authors": ["Zhe Su"],
    480       "year": 2025,
    481       "relevance": "Studies truthfulness-utility tradeoff in LLM agents, relevant to truth-seeking AI evaluation."
    482     }
    483   ]
    484 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs