scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (21770B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "Evaluating LLM Reasoning Beyond Correctness and CoT",
      6     "authors": [
      7       "Soheil Abbasloo"
      8     ],
      9     "year": 2025,
     10     "venue": "arXiv",
     11     "arxiv_id": "2510.18134",
     12     "doi": null
     13   },
     14   "checklist": {
     15     "claims_and_evidence": {
     16       "abstract_claims_supported": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "Abstract claim 'GPT-5-chat loses more than 40 points on GSM' is confirmed in Table 1 (Δ=-40.2). Claims about substantial gaps are supported by the wide pS ranges shown.",
     20         "source": "opus"
     21       },
     22       "causal_claims_justified": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "The paper appropriately hedges its causal language. Section 3.2: 'These patterns do not settle the debate, but they add weight to an existing view.' Section 3.1: 'The performance drops may hint at underlying issues in models training, though diagnosing these is beyond our current scope.'",
     26         "source": "opus"
     27       },
     28       "generalization_bounded": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "The title 'Evaluating LLM Reasoning Beyond Correctness and CoT' and abstract claim SIEV enables 'a clearer foundation for assessing and understanding the reasoning capabilities of LLMs' are broad generalizations from only two benchmarks (GSM8K and MMLU). The limitations section acknowledges this but the framing exceeds the evidence.",
     32         "source": "opus"
     33       },
     34       "alternative_explanations_discussed": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "Section 3.2 'Key Takeaway' extensively discusses whether improvements reflect genuine reasoning or context-sensitive pattern matching, citing prior skeptical work (Dziri et al., Kambhampati, McCoy et al.).",
     38         "source": "opus"
     39       },
     40       "proxy_outcome_distinction": {
     41         "applies": true,
     42         "answer": true,
     43         "justification": "Section 2.4 acknowledges 'these signals do not necessarily certify authentic reasoning, they offer stronger grounds for interpreting whether and how a model's apparent reasoning reflects a stable, integrative process.' The gap between measurement (synthesis accuracy) and claim (reasoning quality) is explicitly discussed.",
     44         "source": "opus"
     45       }
     46     },
     47     "limitations_and_scope": {
     48       "limitations_section_present": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Section 4 'Brief Discussion and Limitations' provides substantive discussion of multiple specific limitations.",
     52         "source": "opus"
     53       },
     54       "threats_to_validity_specific": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Section 4 discusses specific threats: OC measures opposition but not semantic quality of antitheses, synthesis evaluation via correctness alone misses multi-dimensional quality, absence of human-judged reasoning traces limits validation.",
     58         "source": "opus"
     59       },
     60       "scope_boundaries_stated": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Section 4: 'it remains to be seen how these findings generalize to emerging benchmarks, multimodal settings, or tasks that demand long-horizon planning or domain-specific symbolic reasoning.'",
     64         "source": "opus"
     65       }
     66     },
     67     "conflicts_of_interest": {
     68       "funding_disclosed": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No funding source is mentioned. The author is at Microsoft Research but no funding acknowledgment section exists.",
     72         "source": "opus"
     73       },
     74       "affiliations_disclosed": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Author affiliation 'Microsoft Research, Vancouver, Canada' is clearly stated on the first page.",
     78         "source": "opus"
     79       },
     80       "funder_independent_of_outcome": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "The author works at Microsoft Research. Microsoft has a major partnership with OpenAI whose models (GPT-4, GPT-5, O3, etc.) are extensively evaluated. This financial relationship is not discussed. Several OpenAI models rank highly (O3 ranks #1 on GSM).",
     84         "source": "opus"
     85       },
     86       "financial_interests_declared": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "No competing interests or financial interests statement is present in the paper.",
     90         "source": "opus"
     91       }
     92     },
     93     "scope_and_framing": {
     94       "key_terms_defined": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "'Genuine reasoning' and 'pattern mimicry'—the central concepts the paper aims to distinguish—are never operationally defined; 'reasoning' is contrasted with 'pattern matching' but no crisp definition of either is given, only philosophical framing via Hegel.",
     98         "source": "haiku"
     99       },
    100       "intended_contribution_clear": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper is explicit: SIEV is a 'structured, process-driven framework for evaluating LLM reasoning' that overlays a thesis-antithesis-synthesis scaffold onto existing benchmarks; the four key advantages are listed in the introduction.",
    104         "source": "haiku"
    105       },
    106       "engagement_with_prior_work": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Section 1.1 engages substantively with GSM-Plus, GSM-Symbolic, ontology-guided interventions, CoT prompting, and skeptical reasoning work (Dziri, Kambhampati, McCoy), explaining how SIEV differs from each line of work rather than merely listing them.",
    110         "source": "haiku"
    111       }
    112     }
    113   },
    114   "type_checklist": {
    115     "benchmark-creation": {
    116       "construct_design": {
    117         "construct_validity_argued": {
    118           "applies": true,
    119           "answer": false,
    120           "justification": "The paper argues SIEV measures reasoning because Hegelian dialectics frames reasoning as thesis-antithesis-synthesis, but this is a philosophical assertion rather than an empirical argument for construct validity; no validation against human expert judgments or other accepted reasoning measures is provided.",
    121           "source": "haiku"
    122         },
    123         "difficulty_distribution_characterized": {
    124           "applies": false,
    125           "answer": false,
    126           "justification": "SIEV is a framework applied to existing benchmarks (GSM8K, MMLU) rather than a new dataset; difficulty is inherited from those benchmarks and not characterized as part of the SIEV contribution.",
    127           "source": "haiku"
    128         },
    129         "ceiling_floor_effects_checked": {
    130           "applies": true,
    131           "answer": true,
    132           "justification": "The paper explicitly targets 'saturated benchmarks' where pT clusters near ceiling, and demonstrates that pS spans a much wider range (below 60 to above 90), showing SIEV discriminates where correctness cannot; this functions as an implicit floor/ceiling check.",
    133           "source": "haiku"
    134         },
    135         "human_baseline_included": {
    136           "applies": true,
    137           "answer": false,
    138           "justification": "No human baseline is included; the evaluation covers 21 LLMs only, with no human performance data on either the dialectical or correctness tasks.",
    139           "source": "haiku"
    140         },
    141         "scoring_rubric_justified": {
    142           "applies": true,
    143           "answer": false,
    144           "justification": "The DS formula uses parameters λ=0.7 and γ=1 with no justification or sensitivity analysis; the paper states the formula but does not explain why these values are appropriate or how they were chosen.",
    145           "source": "haiku"
    146         }
    147       },
    148       "robustness": {
    149         "contamination_resistance_designed": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "The paper asserts SIEV has 'lower susceptibility to contamination' by evaluating dynamics rather than static answers, but provides no empirical test of this claim and no anti-contamination design mechanism; a model fine-tuned on dialectical traces could game it.",
    153           "source": "haiku"
    154         },
    155         "temporal_robustness_discussed": {
    156           "applies": true,
    157           "answer": false,
    158           "justification": "There is no discussion of whether SIEV will remain discriminative as models improve at dialectical tasks, whether the framework will need updating, or what a maintenance plan looks like.",
    159           "source": "haiku"
    160         },
    161         "failure_modes_discussed": {
    162           "applies": true,
    163           "answer": true,
    164           "justification": "Section 4 explicitly discusses failure modes: OC measures opposition presence but not quality ('A generated antithesis may be formally opposing yet shallow or irrelevant'), synthesis quality is multidimensional but only correctness is captured, and human judgment gaps remain.",
    165           "source": "haiku"
    166         },
    167         "baseline_implementations_provided": {
    168           "applies": true,
    169           "answer": true,
    170           "justification": "Appendix A provides complete prompt specifications for all three stages (Thesis, Antithesis, Synthesis) for both MMLU and GSM, and the paper states 'the SIEV source code is publicly available at https://github.com/microsoft/siev'.",
    171           "source": "haiku"
    172         }
    173       },
    174       "documentation": {
    175         "dataset_documentation_complete": {
    176           "applies": false,
    177           "answer": false,
    178           "justification": "SIEV is an evaluation framework/protocol, not a dataset; it uses existing public benchmarks (GSM8K, MMLU) that have their own documentation, so a data card for SIEV itself is not applicable.",
    179           "source": "haiku"
    180         },
    181         "licensing_and_access_clear": {
    182           "applies": true,
    183           "answer": false,
    184           "justification": "The code is pointed to at github.com/microsoft/siev but no licensing terms are stated in the paper; it is unclear under what terms others can use or extend SIEV.",
    185           "source": "haiku"
    186         },
    187         "intended_use_specified": {
    188           "applies": true,
    189           "answer": true,
    190           "justification": "Section 4 explicitly clarifies 'SIEV is not a benchmark itself, but a dialectical approach to benchmark models' and distinguishes its role from conventional benchmarks; the paper discusses what should not be inferred (e.g., SIEV does not certify 'authentic reasoning').",
    191           "source": "haiku"
    192         }
    193       }
    194     }
    195   },
    196   "claims": [
    197     {
    198       "claim": "GPT-5-chat loses more than 40 points on GSM8K when evaluated through SIEV's process-oriented lens compared to correctness-based evaluation.",
    199       "evidence": "Table 1 shows GPT-5-chat pT=96.4, pS=56.2, ∆=-40.2 on GSM8K.",
    200       "supported": "strong"
    201     },
    202     {
    203       "claim": "Models with similar static correctness scores exhibit substantially different synthesis and dialectical reasoning scores, revealing hidden reasoning gaps.",
    204       "evidence": "Table 1 shows O3 and GPT-5-chat both exceed 96 pT on GSM8K, but pS diverges (93.6 vs 56.2); similar patterns across multiple model pairs.",
    205       "supported": "strong"
    206     },
    207     {
    208       "claim": "SIEV has lower susceptibility to benchmark contamination than correctness-based metrics.",
    209       "evidence": "Asserted in Section 1 as a key advantage; no empirical test of contamination resistance is provided.",
    210       "supported": "unsupported"
    211     },
    212     {
    213       "claim": "A model's self-opposition compliance (OC) strongly predicts how it raises or lowers cross-model OC when acting as antithesis generator.",
    214       "evidence": "Figure 7 shows consistent pattern across 14 models on GSM: high self-OC models (O1, O3, O4-mini) consistently improve partners' cross-OC.",
    215       "supported": "moderate"
    216     },
    217     {
    218       "claim": "LLM reasoning capability is topic-dependent rather than a general uniform skill.",
    219       "evidence": "Figure 6 shows pS varies widely across MMLU domains for the same model (e.g., Llama3.3-70B-Instruct performs well in Elementary Math but poorly in Moral Disputes).",
    220       "supported": "moderate"
    221     },
    222     {
    223       "claim": "Thesis correctness (pT) is weakly related to opposition production (OC) and synthesis improvement (∆).",
    224       "evidence": "Distance correlation analysis in Figure 5 shows weak links between pT and both OC and ∆ across all MMLU sub-topics.",
    225       "supported": "moderate"
    226     }
    227   ],
    228   "methodology_tags": [
    229     "benchmark-eval",
    230     "theoretical"
    231   ],
    232   "key_findings": "SIEV applies a thesis-antithesis-synthesis dialectical scaffold to existing benchmarks (GSM8K, MMLU), revealing that models with near-identical correctness scores diverge substantially in synthesis quality and opposition engagement—e.g., GPT-5-chat drops 40 points from thesis to synthesis on GSM8K while O3 drops only 3.5 points. Across 21 models, synthesis scores (pS) and dialectic scores (DS) consistently expose hidden reasoning gaps invisible to correctness-based evaluation. Cross-model dialectical experiments show that a model's self-opposition compliance strongly predicts its effectiveness as an antithesis generator for other models, and that diverse cross-model antitheses often improve synthesis performance more than self-generated ones. The paper is cautious in interpretation, noting that these gains may reflect structural token-pattern alignment rather than genuine reasoning ability.",
    233   "red_flags": [
    234     {
    235       "flag": "Core construct undefined",
    236       "detail": "'Genuine reasoning' vs 'pattern mimicry' is the central distinction the paper claims to measure, but neither term is operationally defined; the mapping from dialectical performance to reasoning authenticity rests entirely on philosophical analogy to Hegel."
    237     },
    238     {
    239       "flag": "No human baseline",
    240       "detail": "21 LLMs are evaluated but no human performance data is included on any task, making it impossible to contextualize what the SIEV scores mean or whether the framework captures human-like reasoning."
    241     },
    242     {
    243       "flag": "DS parameters unjustified",
    244       "detail": "The Dialectic Score formula uses λ=0.7 and γ=1 with no justification, sensitivity analysis, or explanation of how these values were chosen."
    245     },
    246     {
    247       "flag": "Contamination resistance asserted not tested",
    248       "detail": "SIEV's 'lower susceptibility to contamination' is listed as a key advantage in the introduction but receives zero empirical validation; no contamination experiment is conducted."
    249     },
    250     {
    251       "flag": "Microsoft-OpenAI conflict undisclosed",
    252       "detail": "The sole author is at Microsoft Research, which has deep financial ties to OpenAI; the paper extensively evaluates GPT-4/5/O1/O3/O4-mini family products without disclosing this potential conflict of interest."
    253     },
    254     {
    255       "flag": "Construct validity philosophical not empirical",
    256       "detail": "The argument that dialectical interaction measures reasoning quality derives from Hegelian philosophy; no validation against human expert judgments, other reasoning measures, or downstream task performance is presented."
    257     }
    258   ],
    259   "cited_papers": [
    260     {
    261       "title": "Measuring Massive Multitask Language Understanding (MMLU)",
    262       "relevance": "Primary benchmark used for SIEV evaluation; the paper's key claim is that SIEV reveals reasoning gaps on 'saturated' benchmarks like MMLU."
    263     },
    264     {
    265       "title": "Training Verifiers to Solve Math Word Problems (GSM8K)",
    266       "relevance": "Second primary benchmark; SIEV's most dramatic results (GPT-5-chat -40 points) are from GSM8K."
    267     },
    268     {
    269       "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
    270       "relevance": "Key foil; SIEV positions itself as going beyond CoT by evaluating reasoning dynamics rather than trace fluency."
    271     },
    272     {
    273       "title": "GSM-Symbolic: Understanding the Limitations of Mathematical Reasoning in Large Language Models",
    274       "relevance": "Prior work on probing reasoning fragility via symbolic perturbations; SIEV is contrasted with this perturbation-based approach."
    275     },
    276     {
    277       "title": "Faith and Fate: Limits of Transformers on Compositionality",
    278       "relevance": "Cited as supporting evidence that LLMs may lack structured reasoning; used to contextualize SIEV's skeptical framing."
    279     },
    280     {
    281       "title": "Can Large Language Models Reason and Plan?",
    282       "relevance": "Kambhampati's skeptical position on LLM reasoning is cited as a key motivation for SIEV's process-oriented evaluation approach."
    283     },
    284     {
    285       "title": "GSM-Plus: A Comprehensive Benchmark for Evaluating Robustness of LLMs as Mathematical Problem Solvers",
    286       "relevance": "Related work on evaluating reasoning robustness through modified questions; contrasted with SIEV's approach of not altering benchmarks."
    287     },
    288     {
    289       "title": "Are Emergent Abilities of Large Language Models a Mirage?",
    290       "relevance": "Cited in discussion of whether LLM performance improvements reflect genuine capability changes; relevant to SIEV's model-size analysis."
    291     }
    292   ],
    293   "engagement_factors": {
    294     "practical_relevance": {
    295       "score": 2,
    296       "justification": "SIEV is benchmark-agnostic and model-agnostic with public code, so practitioners can apply it to any existing benchmark, though the operational overhead of three-stage evaluation is not trivial."
    297     },
    298     "surprise_contrarian": {
    299       "score": 3,
    300       "justification": "The finding that GPT-5-chat—a frontier model—drops 40+ points on a 'solved' math benchmark under dialectical evaluation is genuinely surprising and directly challenges correctness-as-reasoning assumptions."
    301     },
    302     "fear_safety": {
    303       "score": 1,
    304       "justification": "The paper raises concern about opaque model behavior and reasoning reliability in high-stakes domains (scientific analysis, law) but does not frame this as a safety issue."
    305     },
    306     "drama_conflict": {
    307       "score": 2,
    308       "justification": "Direct model-vs-model rankings showing dramatic divergence (O3 vs GPT-5-chat) and the implicit critique of OpenAI's frontier models by a Microsoft researcher create mild controversy."
    309     },
    310     "demo_ability": {
    311       "score": 3,
    312       "justification": "Code is publicly available at github.com/microsoft/siev and prompts are fully specified in Appendix A; anyone can run SIEV on any benchmark immediately."
    313     },
    314     "brand_recognition": {
    315       "score": 2,
    316       "justification": "Microsoft Research affiliation and evaluation of well-known models (GPT-5, O3, DeepSeek-R1) lend brand recognition, though the author is a single researcher rather than a named lab group."
    317     }
    318   },
    319   "hn_data": {
    320     "threads": [
    321       {
    322         "hn_id": "45838564",
    323         "title": "LLMs encode how difficult problems are",
    324         "points": 174,
    325         "comments": 38,
    326         "url": "https://news.ycombinator.com/item?id=45838564",
    327         "created_at": "2025-11-06T18:29:03Z"
    328       },
    329       {
    330         "hn_id": "46370038",
    331         "title": "A Search for Radio Technosignatures from Interstellar Object 3I/Atlas",
    332         "points": 3,
    333         "comments": 1,
    334         "url": "https://news.ycombinator.com/item?id=46370038",
    335         "created_at": "2025-12-23T22:07:08Z"
    336       },
    337       {
    338         "hn_id": "46425525",
    339         "title": "Optimal Software Pipelining and Warp Specialization for Tensor Core GPUs",
    340         "points": 2,
    341         "comments": 0,
    342         "url": "https://news.ycombinator.com/item?id=46425525",
    343         "created_at": "2025-12-29T20:54:07Z"
    344       },
    345       {
    346         "hn_id": "45751115",
    347         "title": "DeepSeek-OCR: Contexts Optical Compression",
    348         "points": 2,
    349         "comments": 0,
    350         "url": "https://news.ycombinator.com/item?id=45751115",
    351         "created_at": "2025-10-29T18:33:29Z"
    352       },
    353       {
    354         "hn_id": "46069881",
    355         "title": "Conformal Prediction for Compositional Data",
    356         "points": 2,
    357         "comments": 0,
    358         "url": "https://news.ycombinator.com/item?id=46069881",
    359         "created_at": "2025-11-27T15:03:53Z"
    360       },
    361       {
    362         "hn_id": "38152071",
    363         "title": "Reality3DSketch: Rapid 3D Modeling of Objects from Single Freehand Sketches",
    364         "points": 2,
    365         "comments": 0,
    366         "url": "https://news.ycombinator.com/item?id=38152071",
    367         "created_at": "2023-11-05T15:41:49Z"
    368       },
    369       {
    370         "hn_id": "32056080",
    371         "title": "Data-Driven Offline Optimization for Architecting Hardware Accelerators",
    372         "points": 2,
    373         "comments": 0,
    374         "url": "https://news.ycombinator.com/item?id=32056080",
    375         "created_at": "2022-07-11T13:48:52Z"
    376       },
    377       {
    378         "hn_id": "46021507",
    379         "title": "World-in-World: World Models in a Closed-Loop World",
    380         "points": 1,
    381         "comments": 0,
    382         "url": "https://news.ycombinator.com/item?id=46021507",
    383         "created_at": "2025-11-23T07:25:35Z"
    384       },
    385       {
    386         "hn_id": "46369891",
    387         "title": "The size of 3I/ATLAS from non-gravitational acceleration",
    388         "points": 1,
    389         "comments": 1,
    390         "url": "https://news.ycombinator.com/item?id=46369891",
    391         "created_at": "2025-12-23T21:51:08Z"
    392       },
    393       {
    394         "hn_id": "38101172",
    395         "title": "Locomotion Through Step Placement with Straight Legs and Rolling Contacts",
    396         "points": 1,
    397         "comments": 0,
    398         "url": "https://news.ycombinator.com/item?id=38101172",
    399         "created_at": "2023-11-01T16:57:11Z"
    400       }
    401     ],
    402     "top_points": 174,
    403     "total_points": 190,
    404     "total_comments": 40
    405   }
    406 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs