scan-v5.json (22093B)
1 { 2 "scan_version": 5, 3 "paper_type": "benchmark-creation", 4 "paper": { 5 "title": "Evaluating LLM Reasoning Beyond Correctness and CoT", 6 "authors": [ 7 "Soheil Abbasloo" 8 ], 9 "year": 2025, 10 "venue": "arXiv", 11 "arxiv_id": "2510.18134", 12 "doi": null 13 }, 14 "checklist": { 15 "claims_and_evidence": { 16 "abstract_claims_supported": { 17 "applies": true, 18 "answer": true, 19 "justification": "The abstract's headline claim—GPT-5-chat loses >40 points on GSM—is directly verified in Table 1 (∆=-40.2). Claims about 'substantial gaps' and SIEV surfacing hidden weaknesses on MMLU are supported by Table 1 and Figures 1/6.", 20 "source": "haiku" 21 }, 22 "causal_claims_justified": { 23 "applies": true, 24 "answer": false, 25 "justification": "The paper repeatedly frames synthesis scores as evidence of 'genuine reasoning' vs. 'pattern replay,' but the study design is purely observational—no controlled manipulation demonstrates that lower synthesis scores causally reflect less genuine reasoning rather than a different task format effect.", 26 "source": "haiku" 27 }, 28 "generalization_bounded": { 29 "applies": true, 30 "answer": false, 31 "justification": "The limitations section appropriately bounds scope to GSM8K and MMLU, but the abstract and conclusion make broad claims about 'LLMs' and 'reasoning capabilities' that extend well beyond the two saturated benchmarks actually tested.", 32 "source": "haiku" 33 }, 34 "alternative_explanations_discussed": { 35 "applies": true, 36 "answer": true, 37 "justification": "Section 3.2 explicitly raises the alternative that cross-model synthesis gains may reflect structural token familiarity rather than improved reasoning, and the authors cite prior skeptical work (Dziri et al., Kambhampati) without dismissing it.", 38 "source": "haiku" 39 }, 40 "proxy_outcome_distinction": { 41 "applies": true, 42 "answer": false, 43 "justification": "Synthesis correctness (pS) is used throughout as a direct proxy for 'genuine reasoning quality,' but Section 4 itself admits synthesis can be logically coherent yet factually incorrect or vice versa—this distinction is not maintained in the main claims.", 44 "source": "haiku" 45 } 46 }, 47 "limitations_and_scope": { 48 "limitations_section_present": { 49 "applies": true, 50 "answer": true, 51 "justification": "Section 4 'Brief Discussion and Limitations' contains four named subsections covering scope of benchmarks, opposition quality, synthesis evaluation granularity, and absence of human-judged reasoning traces.", 52 "source": "haiku" 53 }, 54 "threats_to_validity_specific": { 55 "applies": true, 56 "answer": true, 57 "justification": "Specific threats are named: OC only measures formal opposition not semantic quality, pS is correctness-only and misses logical coherence, and evaluation intentionally avoids human annotation leaving conceptual-quality judgments unvalidated.", 58 "source": "haiku" 59 }, 60 "scope_boundaries_stated": { 61 "applies": true, 62 "answer": true, 63 "justification": "The paper explicitly states 'it remains to be seen how these findings generalize to emerging benchmarks, multimodal settings, or tasks that demand long-horizon planning or domain-specific symbolic reasoning.'", 64 "source": "haiku" 65 } 66 }, 67 "conflicts_of_interest": { 68 "funding_disclosed": { 69 "applies": true, 70 "answer": false, 71 "justification": "No funding disclosure section exists. The impact statement says 'none which we feel must be specifically highlighted here,' with no mention of grants, institutional support, or compute resources.", 72 "source": "haiku" 73 }, 74 "affiliations_disclosed": { 75 "applies": true, 76 "answer": true, 77 "justification": "Author affiliation is clearly stated: 'Microsoft Research, Vancouver, Canada' with contact email.", 78 "source": "haiku" 79 }, 80 "funder_independent_of_outcome": { 81 "applies": true, 82 "answer": false, 83 "justification": "The sole author is a Microsoft Research employee; the paper evaluates Microsoft's own models (GPT-5, GPT-5-chat, GPT-4, O3, O1, O4-mini, etc.) and positions them favorably in some rankings—a clear conflict of interest that is not disclosed.", 84 "source": "haiku" 85 }, 86 "financial_interests_declared": { 87 "applies": true, 88 "answer": false, 89 "justification": "No competing interests statement, patent disclosures, or financial interests declaration appears anywhere in the paper.", 90 "source": "haiku" 91 } 92 }, 93 "scope_and_framing": { 94 "key_terms_defined": { 95 "applies": true, 96 "answer": true, 97 "justification": "Thesis, antithesis, and synthesis are defined in Section 2.1; OC (Opposition Compliance), pS (Synthesis Score), DS (Dialectic Score), and ∆ are formally defined with formulas in Section 2.4.", 98 "source": "haiku" 99 }, 100 "intended_contribution_clear": { 101 "applies": true, 102 "answer": true, 103 "justification": "Section 1 explicitly lists four key contributions of SIEV: benchmark/model agnosticism, lower contamination susceptibility, exposing hidden weaknesses, and natural multi-agent compatibility.", 104 "source": "haiku" 105 }, 106 "engagement_with_prior_work": { 107 "applies": true, 108 "answer": true, 109 "justification": "Section 1.1 engages substantively with GSM-Plus, GSM-Symbolic, ontology-guided interventions, CoT prompting, and skeptical reasoning literature, explicitly contrasting SIEV from these approaches rather than just listing them.", 110 "source": "haiku" 111 } 112 } 113 }, 114 "type_checklist": { 115 "benchmark-creation": { 116 "construct_design": { 117 "construct_validity_argued": { 118 "applies": true, 119 "answer": true, 120 "justification": "Section 2.1 argues that dialectical thesis–antithesis–synthesis structure measures coherence, adaptability, and integration—dimensions of reasoning that correctness cannot capture—grounding the claim in Hegelian philosophical tradition.", 121 "source": "haiku" 122 }, 123 "difficulty_distribution_characterized": { 124 "applies": false, 125 "answer": false, 126 "justification": "SIEV is explicitly 'not a benchmark itself' (Section 4) but a framework overlaid on existing benchmarks; it creates no new items and therefore has no difficulty distribution to characterize.", 127 "source": "haiku" 128 }, 129 "ceiling_floor_effects_checked": { 130 "applies": true, 131 "answer": true, 132 "justification": "The paper explicitly selects GSM8K and MMLU because thesis scores show near-ceiling clustering, then demonstrates SIEV's synthesis scores produce much wider spread (pS ranging from ~40 to ~93), confirming the framework avoids the ceiling effect.", 133 "source": "haiku" 134 }, 135 "human_baseline_included": { 136 "applies": true, 137 "answer": false, 138 "justification": "No human baseline is provided anywhere in the paper; there is no comparison of how humans perform under the thesis–antithesis–synthesis evaluation protocol.", 139 "source": "haiku" 140 }, 141 "scoring_rubric_justified": { 142 "applies": true, 143 "answer": false, 144 "justification": "The DS formula includes free parameters λ=0.7 and γ=1 (Table 1 footnote) with no ablation study, sensitivity analysis, or justification for why these values were chosen over alternatives.", 145 "source": "haiku" 146 } 147 }, 148 "robustness": { 149 "contamination_resistance_designed": { 150 "applies": true, 151 "answer": false, 152 "justification": "The paper claims SIEV has 'lower susceptibility to contamination' because it evaluates dynamics rather than static answers, but provides no empirical validation—models could learn to produce good antitheses through training exposure to the dialectical format.", 153 "source": "haiku" 154 }, 155 "temporal_robustness_discussed": { 156 "applies": true, 157 "answer": false, 158 "justification": "Section 4 acknowledges that generalization to emerging benchmarks is unseen but provides no plan for updating SIEV, versioning the evaluation protocol, or addressing temporal obsolescence as model capabilities evolve.", 159 "source": "haiku" 160 }, 161 "failure_modes_discussed": { 162 "applies": true, 163 "answer": true, 164 "justification": "Section 4 identifies four specific failure modes: OC measures only formal not semantic opposition quality, pS is correctness-only missing logical coherence, synthesis quality is multidimensional, and human judgment is absent for conceptual validity.", 165 "source": "haiku" 166 }, 167 "baseline_implementations_provided": { 168 "applies": true, 169 "answer": true, 170 "justification": "The paper states 'the SIEV source code is publicly available at https://github.com/microsoft/siev' and Appendix A provides full prompt specifications for all three stages.", 171 "source": "haiku" 172 } 173 }, 174 "documentation": { 175 "dataset_documentation_complete": { 176 "applies": false, 177 "answer": false, 178 "justification": "SIEV does not create a new dataset; it is an evaluation framework applied to existing benchmarks (GSM8K, MMLU). No dataset documentation is applicable.", 179 "source": "haiku" 180 }, 181 "licensing_and_access_clear": { 182 "applies": true, 183 "answer": false, 184 "justification": "A GitHub link is provided but the paper does not state the license under which the code is released, nor the terms under which SIEV outputs may be used or shared.", 185 "source": "haiku" 186 }, 187 "intended_use_specified": { 188 "applies": true, 189 "answer": true, 190 "justification": "Section 4 'SIEV as a General Approach' explains that SIEV is for evaluating reasoning processes across benchmarks, and the limitations section explains what SIEV does NOT measure (semantic quality of opposition, human-judged reasoning validity).", 191 "source": "haiku" 192 } 193 } 194 } 195 }, 196 "claims": [ 197 { 198 "claim": "GPT-5-chat loses more than 40 points on GSM when evaluated through SIEV's synthesis score compared to its thesis score.", 199 "evidence": "Table 1 reports GPT-5-chat pT=96.4, pS=56.2, ∆=-40.2 on GSM8K.", 200 "supported": "strong" 201 }, 202 { 203 "claim": "Models with near-identical thesis accuracy can exhibit sharply different synthesis performance, revealing hidden reasoning differences.", 204 "evidence": "Table 1 shows multiple models cluster near 96-97 on pT (GSM) while pS spans 56–93; Figure 1 illustrates topic-level divergence on MMLU.", 205 "supported": "strong" 206 }, 207 { 208 "claim": "SIEV has lower susceptibility to contamination than correctness-based metrics.", 209 "evidence": "Claimed in Section 1 Key Contributions item (2), but no empirical validation is provided—it is a theoretical argument only.", 210 "supported": "weak" 211 }, 212 { 213 "claim": "Reasoning capability is strongly topic-dependent rather than a uniform general skill.", 214 "evidence": "Figure 6 shows Llama3.3-70B-Instruct scoring high in Elementary Math but low in Moral Disputes; DeepSeek-R1 peaks in quantitative domains but weakens on normative ones.", 215 "supported": "moderate" 216 }, 217 { 218 "claim": "Models generally show negative ∆, meaning synthesis quality degrades from thesis, indicating limited integrative reasoning.", 219 "evidence": "All 21 models show negative ∆ on both GSM and MMLU in Table 1, ranging from -0.7 to -40.2.", 220 "supported": "strong" 221 }, 222 { 223 "claim": "Cross-model antitheses improve synthesis performance compared to self-generated antitheses for many models.", 224 "evidence": "Figure 7 shows pS gains for GPT-5 of +5.4 to +14 points across pairings; similar patterns for DeepSeek-R1 and O4-mini in most settings.", 225 "supported": "moderate" 226 }, 227 { 228 "claim": "Thesis accuracy (pT) is weakly related to opposition production (OC) and thesis-to-synthesis change (∆).", 229 "evidence": "Figure 5 distance correlation analysis shows weak pT–OC and pT–∆ links; correlation matrix in Section 3.1 discussion confirms this.", 230 "supported": "moderate" 231 } 232 ], 233 "methodology_tags": [ 234 "benchmark-eval", 235 "theoretical" 236 ], 237 "key_findings": "SIEV, a dialectical evaluation framework applying thesis–antithesis–synthesis interactions to existing benchmarks, reveals substantial reasoning gaps hidden by correctness-only metrics: models with near-identical thesis accuracy (e.g., O3 vs. GPT-5-chat both at ~96% on GSM) diverge by >35 points on synthesis score. Across 21 LLMs on GSM8K and MMLU, all models show negative average ∆, meaning synthesis quality universally degrades from thesis, with GPT-5-chat losing 40 points. Cross-model dialectical pairing generally improves synthesis performance compared to self-generated antitheses, suggesting that reasoning in LLMs may be more context-sensitive than a stable general capability. Reasoning performance is strongly topic-dependent across MMLU domains, with quantitative and normative subjects producing very different model rankings.", 238 "red_flags": [ 239 { 240 "flag": "Undisclosed conflict of interest", 241 "detail": "The sole author is a Microsoft Research employee evaluating Microsoft's own models (GPT-5, O3, O1, O4-mini, GPT-4, GPT-4.1 family). No competing interests statement is present." 242 }, 243 { 244 "flag": "Proxy conflated with construct", 245 "detail": "Synthesis correctness (pS) is treated throughout as a direct measure of 'genuine reasoning,' but the paper itself acknowledges synthesis can be logically coherent yet factually wrong—the distinction between the proxy and the construct is not maintained in the main claims." 246 }, 247 { 248 "flag": "Free parameters unjustified", 249 "detail": "The Dialectic Score formula uses λ=0.7 and γ=1 with no ablation study or sensitivity analysis explaining why these values are appropriate." 250 }, 251 { 252 "flag": "Contamination resistance unvalidated", 253 "detail": "The claim that SIEV has 'lower susceptibility to contamination' is theoretical; no empirical test of this claim is provided, and models could learn to produce good antitheses through training exposure to dialectical formats." 254 }, 255 { 256 "flag": "No human baseline", 257 "detail": "The evaluation framework claims to distinguish genuine reasoning from pattern replay, yet no human performance data is provided to calibrate what 'genuine reasoning' looks like under the SIEV protocol." 258 }, 259 { 260 "flag": "Framing-type mismatch", 261 "detail": "Section 4 explicitly states 'SIEV is not a benchmark itself, but a dialectical approach to benchmark models'—yet the paper is presented as a benchmark-creation contribution, creating a fundamental ambiguity about the nature of the contribution." 262 } 263 ], 264 "cited_papers": [ 265 { 266 "title": "Training Verifiers to Solve Math Word Problems (GSM8K)", 267 "relevance": "Core benchmark (GSM8K) on which SIEV is evaluated; represents the correctness-only evaluation paradigm the paper critiques." 268 }, 269 { 270 "title": "Measuring Massive Multitask Language Understanding (MMLU)", 271 "relevance": "Second core benchmark evaluated; treated as broadly saturated for top models, making SIEV's ability to surface variance particularly notable." 272 }, 273 { 274 "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models", 275 "relevance": "Represents the prior state of process-oriented evaluation that SIEV positions itself against and extends." 276 }, 277 { 278 "title": "GSM-Symbolic: Understanding the Limitations of Mathematical Reasoning in LLMs", 279 "relevance": "Related perturbation-based approach to probing reasoning robustness; SIEV contrasts itself by not altering benchmarks." 280 }, 281 { 282 "title": "GSM-Plus: A Comprehensive Benchmark for Evaluating the Robustness of LLMs as Mathematical Problem Solvers", 283 "relevance": "Another correctness-under-perturbation approach that SIEV argues remains tied to the static correctness paradigm." 284 }, 285 { 286 "title": "Faith and Fate: Limits of Transformers on Compositionality", 287 "relevance": "Prior work questioning genuine reasoning capabilities of LLMs; provides theoretical grounding for SIEV's skeptical framing." 288 }, 289 { 290 "title": "Can Large Language Models Reason and Plan?", 291 "relevance": "Kambhampati's skeptical view on LLM planning and reasoning is cited to contextualize SIEV's motivation." 292 }, 293 { 294 "title": "Measuring and Testing Dependence by Correlation of Distances", 295 "relevance": "Statistical methodology (distance correlation) used in SIEV's correlation analysis of dialectical metrics across MMLU sub-topics." 296 }, 297 { 298 "title": "A Peek into Token Bias: Large Language Models Are Not Yet Genuine Reasoners", 299 "relevance": "Prior work on token-level fragility cited as parallel evidence that apparent reasoning may reflect statistical patterns rather than genuine inference." 300 } 301 ], 302 "engagement_factors": { 303 "practical_relevance": { 304 "score": 2, 305 "justification": "Code is available and the framework can be applied to any existing benchmark, but the computational overhead of running three-stage dialectical evaluations for 21 models is substantial for most practitioners." 306 }, 307 "surprise_contrarian": { 308 "score": 2, 309 "justification": "The finding that GPT-5-chat—a top model by correctness—collapses to near-bottom on synthesis scoring is genuinely surprising and challenges the conventional correctness-as-reasoning paradigm." 310 }, 311 "fear_safety": { 312 "score": 0, 313 "justification": "The paper raises no AI safety or risk concerns; the impact statement explicitly declines to highlight societal consequences." 314 }, 315 "drama_conflict": { 316 "score": 1, 317 "justification": "The Microsoft-employee-evaluating-Microsoft-models dynamic creates an implicit tension, and the dramatic ranking reversals (GPT-5-chat from rank 1 to near-last) are noteworthy, but the paper presents this matter-of-factly." 318 }, 319 "demo_ability": { 320 "score": 2, 321 "justification": "Source code is publicly available on GitHub and prompts are fully specified in Appendix A, making the framework reproducible for those with API access to the evaluated models." 322 }, 323 "brand_recognition": { 324 "score": 2, 325 "justification": "Microsoft Research affiliation plus evaluation of named flagship models (GPT-5, O3, O1, DeepSeek-R1, Kimi-K2) gives the paper high brand-recognition surface area." 326 } 327 }, 328 "hn_data": { 329 "threads": [ 330 { 331 "hn_id": "45838564", 332 "title": "LLMs encode how difficult problems are", 333 "points": 174, 334 "comments": 38, 335 "url": "https://news.ycombinator.com/item?id=45838564", 336 "created_at": "2025-11-06T18:29:03Z" 337 }, 338 { 339 "hn_id": "46370038", 340 "title": "A Search for Radio Technosignatures from Interstellar Object 3I/Atlas", 341 "points": 3, 342 "comments": 1, 343 "url": "https://news.ycombinator.com/item?id=46370038", 344 "created_at": "2025-12-23T22:07:08Z" 345 }, 346 { 347 "hn_id": "46425525", 348 "title": "Optimal Software Pipelining and Warp Specialization for Tensor Core GPUs", 349 "points": 2, 350 "comments": 0, 351 "url": "https://news.ycombinator.com/item?id=46425525", 352 "created_at": "2025-12-29T20:54:07Z" 353 }, 354 { 355 "hn_id": "45751115", 356 "title": "DeepSeek-OCR: Contexts Optical Compression", 357 "points": 2, 358 "comments": 0, 359 "url": "https://news.ycombinator.com/item?id=45751115", 360 "created_at": "2025-10-29T18:33:29Z" 361 }, 362 { 363 "hn_id": "46069881", 364 "title": "Conformal Prediction for Compositional Data", 365 "points": 2, 366 "comments": 0, 367 "url": "https://news.ycombinator.com/item?id=46069881", 368 "created_at": "2025-11-27T15:03:53Z" 369 }, 370 { 371 "hn_id": "38152071", 372 "title": "Reality3DSketch: Rapid 3D Modeling of Objects from Single Freehand Sketches", 373 "points": 2, 374 "comments": 0, 375 "url": "https://news.ycombinator.com/item?id=38152071", 376 "created_at": "2023-11-05T15:41:49Z" 377 }, 378 { 379 "hn_id": "32056080", 380 "title": "Data-Driven Offline Optimization for Architecting Hardware Accelerators", 381 "points": 2, 382 "comments": 0, 383 "url": "https://news.ycombinator.com/item?id=32056080", 384 "created_at": "2022-07-11T13:48:52Z" 385 }, 386 { 387 "hn_id": "46021507", 388 "title": "World-in-World: World Models in a Closed-Loop World", 389 "points": 1, 390 "comments": 0, 391 "url": "https://news.ycombinator.com/item?id=46021507", 392 "created_at": "2025-11-23T07:25:35Z" 393 }, 394 { 395 "hn_id": "46369891", 396 "title": "The size of 3I/ATLAS from non-gravitational acceleration", 397 "points": 1, 398 "comments": 1, 399 "url": "https://news.ycombinator.com/item?id=46369891", 400 "created_at": "2025-12-23T21:51:08Z" 401 }, 402 { 403 "hn_id": "38101172", 404 "title": "Locomotion Through Step Placement with Straight Legs and Rolling Contacts", 405 "points": 1, 406 "comments": 0, 407 "url": "https://news.ycombinator.com/item?id=38101172", 408 "created_at": "2023-11-01T16:57:11Z" 409 } 410 ], 411 "top_points": 174, 412 "total_points": 190, 413 "total_comments": 40 414 } 415 }