scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (30979B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "The Illusion of Insight in Reasoning Models",
      6     "authors": [
      7       "Liv G. d'Aliberti",
      8       "Manoel Horta Ribeiro"
      9     ],
     10     "year": 2026,
     11     "venue": "arXiv.org",
     12     "arxiv_id": "2601.00514",
     13     "doi": "10.48550/arXiv.2601.00514"
     14   },
     15   "checklist": {
     16     "claims_and_evidence": {
     17       "abstract_claims_supported": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "All abstract claims (shift rarity at ~6.31%, no improvement with training, generally lower accuracy, entropy modulation, extrinsic intervention gains) are backed by empirical results in Tables 2-5 and Figures 4-5 across 1M+ annotated traces.",
     21         "source": "haiku"
     22       },
     23       "causal_claims_justified": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "Causal claims about intervention effects are tested via a controlled Pass 1 vs Pass 2 design with paired outcomes. The claim that shifts 'are symptoms of unstable inference' is supported by logistic regression with problem fixed effects controlling for difficulty confounds.",
     27         "source": "haiku"
     28       },
     29       "generalization_bounded": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "Section 8 (Limitations) explicitly bounds results to three reasoning domains with well-defined correctness signals, two model families, and prompt-level interventions that cannot establish training-time causal mechanisms or generalize to open-ended tasks.",
     33         "source": "haiku"
     34       },
     35       "alternative_explanations_discussed": {
     36         "applies": true,
     37         "answer": true,
     38         "justification": "The paper considers alternatives: shifts might help at specific training stages (RQ2), at particular temperatures (Fig. 5b), or under high uncertainty (RQ3). Appendix C.6 tests three detector variants to rule out annotation artifacts as an explanation.",
     39         "source": "haiku"
     40       },
     41       "proxy_outcome_distinction": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "Binary accuracy (correct/incorrect) is used as the primary metric and all claims are stated directly in accuracy terms, with no inflation of claims beyond what the measure supports.",
     45         "source": "haiku"
     46       }
     47     },
     48     "limitations_and_scope": {
     49       "limitations_section_present": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "Section 8 (Limitations) is a dedicated section listing four specific limitations: lexical cue dependence, domain scope restricted to well-defined correctness tasks, prompt-level vs. training-level intervention, and limited model family coverage.",
     53         "source": "haiku"
     54       },
     55       "threats_to_validity_specific": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Specific threats are named: detector may miss unlexicalized pivots or misclassify surface hedges; evaluation limited to tasks with automatic correctness checks; intervention via prompt cues cannot establish causal mechanism of internal insight.",
     59         "source": "haiku"
     60       },
     61       "scope_boundaries_stated": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "The paper explicitly states results do not generalize to open-ended reasoning or multi-turn interaction, that the intervention doesn't establish causal internal mechanism, and that broader replications across architectures and decoding methods are necessary.",
     65         "source": "haiku"
     66       }
     67     },
     68     "conflicts_of_interest": {
     69       "funding_disclosed": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Funding is disclosed in the Acknowledgments: 'This work was supported by a First-Year Fellowship from the Princeton University Graduate School' with compute from Princeton's Beowulf cluster and CITP.",
     73         "source": "haiku"
     74       },
     75       "affiliations_disclosed": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Both authors' affiliations with the Princeton University Department of Computer Science are clearly stated in the paper header.",
     79         "source": "haiku"
     80       },
     81       "funder_independent_of_outcome": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Princeton University Graduate School fellowship is an academic funding source with no financial stake in the outcome of research about reasoning model behavior.",
     85         "source": "haiku"
     86       },
     87       "financial_interests_declared": {
     88         "applies": true,
     89         "answer": false,
     90         "justification": "The paper contains no competing-interests or financial-interests declaration. The Ethical Considerations section mentions AI tool use for writing but does not address financial interests.",
     91         "source": "haiku"
     92       }
     93     },
     94     "scope_and_framing": {
     95       "key_terms_defined": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Key terms are formally defined: 'Aha!' moment is given a mathematical definition (Definition 3.1) with explicit criteria (δ1 prior failures, δ2 prior stability, δ3 performance gain), and reasoning shifts are operationalized via a two-stage detector (lexical cue + material revision).",
     99         "source": "haiku"
    100       },
    101       "intended_contribution_clear": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Three explicit contributions are stated in the Introduction: (1) formal definition and experimental framework, (2) empirical characterization at scale across 1M+ traces, and (3) an entropy-gated intervention that induces reliable accuracy gains.",
    105         "source": "haiku"
    106       },
    107       "engagement_with_prior_work": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Section 2 engages substantively with emergent capabilities literature (Schaeffer et al., Wei et al.), self-correction mechanisms, insight characterization (RASM metrics, Yang et al.), and safety/faithfulness work, explicitly positioning the paper relative to DeepSeek-R1 claims.",
    111         "source": "haiku"
    112       }
    113     }
    114   },
    115   "type_checklist": {
    116     "empirical": {
    117       "artifacts": {
    118         "code_released": {
    119           "applies": true,
    120           "answer": true,
    121           "justification": "Code is released via GitHub (referenced in Appendix E: https://github.com/humans-and-machines/Illusion-of-Reasoning) with the full evaluation pipeline, shift-detection code, and configs under recipes/. Trained models are on Hugging Face (Table 7).",
    122           "source": "haiku"
    123         },
    124         "data_released": {
    125           "applies": true,
    126           "answer": true,
    127           "justification": "Data released on Hugging Face: evaluation sets (od2961/rush4-5-6-balanced, od2961/Guardian-cryptonite-official-split) and external model traces (od2961/gpt4o-math500-t0, od2961/deepseek-r1-math500-t0). MATH-500 and CRYPTONITE are publicly available.",
    128           "source": "haiku"
    129         },
    130         "environment_specified": {
    131           "applies": true,
    132           "answer": false,
    133           "justification": "GPU types (A100, A6000) and training framework components (vLLM, DeepSpeed ZeRO-3, accelerate, bf16/fp16 precision) are named, but no requirements.txt, Dockerfile, or equivalent versioned dependency specification is provided in the paper.",
    134           "source": "haiku"
    135         },
    136         "reproduction_instructions": {
    137           "applies": true,
    138           "answer": true,
    139           "justification": "Verbatim system prompts are provided in Figures 6-8, exact hyperparameters in Table 9, training configuration in Appendix A.4, and annotation protocols in Appendices B.1-B.4. The paper's level of detail is sufficient to reconstruct the pipeline.",
    140           "source": "haiku"
    141         }
    142       },
    143       "statistical_methodology": {
    144         "confidence_intervals_or_error_bars": {
    145           "applies": true,
    146           "answer": false,
    147           "justification": "Main results tables (Tables 2, 3, 5) report only point estimates and p-values without confidence intervals or error bars. Bootstrap CIs appear only in the formal Aha! threshold search (Table 14) and inter-rater agreement, not for primary accuracy results.",
    148           "source": "haiku"
    149         },
    150         "significance_tests": {
    151           "applies": true,
    152           "answer": true,
    153           "justification": "Logistic regressions with cluster-robust standard errors and p-values are reported throughout (Tables 3, 4, 15, 16). The pooled regression shows p < 10^-1198 for the shift-accuracy relationship, N=723,200.",
    154           "source": "haiku"
    155         },
    156         "effect_sizes_reported": {
    157           "applies": true,
    158           "answer": true,
    159           "justification": "Raw percentage-point differences (∆pp) and average marginal effects (AME) from logistic regression are consistently reported alongside significance tests, providing interpretable effect size estimates.",
    160           "source": "haiku"
    161         },
    162         "sample_size_justified": {
    163           "applies": true,
    164           "answer": false,
    165           "justification": "No formal power analysis or sample size justification is provided. The sample size (M=8 per checkpoint) is chosen by design convention. The resulting total of 723,200+ traces makes underpowering unlikely but the choice is not explicitly justified.",
    166           "source": "haiku"
    167         },
    168         "variance_reported": {
    169           "applies": true,
    170           "answer": true,
    171           "justification": "Cluster-robust standard errors are reported for all regression analyses (clustered by problem), providing variance estimates. Inter-prompt agreement across 5 judge variants is also reported (Table 12: mean κ = 0.655-0.770 by epoch).",
    172           "source": "haiku"
    173         }
    174       },
    175       "evaluation_design": {
    176         "baselines_included": {
    177           "applies": true,
    178           "answer": true,
    179           "justification": "Non-shifted traces serve as the natural baseline throughout. For the intervention, Pass 1 (unmodified baseline generation) is compared to Pass 2 (with reconsideration cue). External models (DeepSeek-R1, GPT-4o) provide additional baselines.",
    180           "source": "haiku"
    181         },
    182         "baselines_contemporary": {
    183           "applies": true,
    184           "answer": true,
    185           "justification": "DeepSeek-R1 and GPT-4o are evaluated as external baselines and are contemporary state-of-the-art reasoning models. The primary models (Qwen2.5, Llama 3.1) are also current.",
    186           "source": "haiku"
    187         },
    188         "ablation_study": {
    189           "applies": true,
    190           "answer": true,
    191           "justification": "Appendix C.6 compares three shift detector variants (formal, GPT-based, lexical) and confirms the same conclusions. Appendix C.4 tests three reconsideration cue variants (C1-C3). Threshold grids for (δ1, δ2, δ3) are explored systematically in Appendix C.2.",
    192           "source": "haiku"
    193         },
    194         "multiple_metrics": {
    195           "applies": true,
    196           "answer": true,
    197           "justification": "Multiple metrics are used: shift prevalence (%S), conditional accuracy (P(✓|S=1)), raw accuracy difference (∆pp), average marginal effects (AME), Shannon entropy, Cohen's κ for annotator agreement, odds ratios for entropy regressions.",
    198           "source": "haiku"
    199         },
    200         "human_evaluation": {
    201           "applies": true,
    202           "answer": true,
    203           "justification": "Six volunteer annotators provided human labels on 20 examples to validate the LLM-as-judge annotation protocol, achieving GPT-4o vs. human majority vote Cohen's κ = 0.794 (PO = 0.900), reported in Appendix B.3.",
    204           "source": "haiku"
    205         },
    206         "held_out_test_set": {
    207           "applies": true,
    208           "answer": true,
    209           "justification": "Fixed evaluation sets are maintained across all checkpoints: 500 MATH-500 problems, 130 synthetic Xwords clues, 500 RHour boards. The paper ensures no train/eval leakage for the GRPO fine-tuning data (openR1 Math-220k vs MATH-500).",
    210           "source": "haiku"
    211         },
    212         "per_category_breakdown": {
    213           "applies": true,
    214           "answer": true,
    215           "justification": "All results are broken down per domain (Math, Xwords, RHour), per model (Qwen-1.5B, Qwen-7B, Llama-8B), and per decoding temperature (T=0, 0.05, 0.3, 0.7). Appendix D provides extensive additional breakdowns.",
    216           "source": "haiku"
    217         },
    218         "failure_cases_discussed": {
    219           "applies": true,
    220           "answer": true,
    221           "justification": "Section 8 discusses failure cases of the detector (false negatives from unlexicalized pivots, false positives from surface hedges). Appendix D.6 provides qualitative examples of formal 'Aha!' detections including cases where shifts occur without accuracy gains.",
    222           "source": "haiku"
    223         },
    224         "negative_results_reported": {
    225           "applies": true,
    226           "answer": true,
    227           "justification": "The paper is structured as a negative result — reasoning shifts do not improve accuracy. Additionally, near-zero gains on Xwords (+0.45pp) and RHour (+0.01pp) from forced reconsideration are reported honestly rather than highlighted selectively.",
    228           "source": "haiku"
    229         }
    230       },
    231       "setup_transparency": {
    232         "model_versions_specified": {
    233           "applies": true,
    234           "answer": true,
    235           "justification": "Exact model identifiers are provided: Qwen2.5-1.5B, Qwen2.5-7B, Llama 3.1-8B with Hugging Face model names in Table 7. GPT-4o is named as the judge (though snapshot date is not stated).",
    236           "source": "haiku"
    237         },
    238         "prompts_provided": {
    239           "applies": true,
    240           "answer": true,
    241           "justification": "Verbatim system prompts for all three domains are provided in Figures 6 (Xwords), 7 (Math), and 8 (RHour). LLM-as-judge system prompt and user template are also provided in Figures 9 and 10.",
    242           "source": "haiku"
    243         },
    244         "hyperparameters_reported": {
    245           "applies": true,
    246           "answer": true,
    247           "justification": "Decoding hyperparameters (T∈{0, 0.05, 0.3, 0.7}, top-p=0.95, M=8 samples) and training hyperparameters (learning rates, batch sizes, grad accumulation, KL target 0.07, PPO clip) are fully reported in Table 9 and Appendix A.4.",
    248           "source": "haiku"
    249         },
    250         "scaffolding_described": {
    251           "applies": true,
    252           "answer": true,
    253           "justification": "The GRPO training scaffolding is described in detail: domain-specific reward functions (Appendix A.4), rollout architecture (vLLM + DeepSpeed ZeRO-3 + accelerate), tag-structured output contracts (<think>/<answer>), and checkpoint evaluation cadence (every 50 steps).",
    254           "source": "haiku"
    255         },
    256         "data_preprocessing_documented": {
    257           "applies": true,
    258           "answer": true,
    259           "justification": "Preprocessing for each domain is documented in Appendix A.1: MATH-500 normalizes LaTeX wrappers; Xwords canonicalizes to uppercase and strips punctuation; RHour uses BFS with per-size node caps, discards timeouts, and stratifies by difficulty.",
    260           "source": "haiku"
    261         }
    262       },
    263       "data_integrity": {
    264         "raw_data_available": {
    265           "applies": true,
    266           "answer": true,
    267           "justification": "Generated traces and shift annotations are released on Hugging Face (Table 20 lists four datasets for external model outputs). Training datasets (CRYPTONITE, openR1 Math-220k) and MATH-500 are publicly available.",
    268           "source": "haiku"
    269         },
    270         "data_collection_described": {
    271           "applies": true,
    272           "answer": true,
    273           "justification": "Trace collection procedure is detailed in Section 5.2: M=8 completions per problem per checkpoint, fixed decoding policy, specific token budgets per domain. RHour puzzle generation via BFS with balance constraints is described in Appendix A.1.",
    274           "source": "haiku"
    275         },
    276         "recruitment_methods_described": {
    277           "applies": true,
    278           "answer": true,
    279           "justification": "The 6 volunteer annotators for human validation were 'recruited from the authors' academic networks' with informed consent and ability to withdraw, as described in Appendix B.4.",
    280           "source": "haiku"
    281         },
    282         "data_pipeline_documented": {
    283           "applies": true,
    284           "answer": true,
    285           "justification": "The full pipeline from trace collection to annotation is documented: checkpoint evaluation → trace extraction → cue prefilter → GPT-4o adjudication → label storage (Appendices B.1-B.3). Algorithm 2 formalizes the shift detection procedure with pseudocode.",
    286           "source": "haiku"
    287         }
    288       },
    289       "contamination": {
    290         "training_cutoff_stated": {
    291           "applies": true,
    292           "answer": false,
    293           "justification": "The training data cutoffs for the base models (Qwen2.5, Llama 3.1) are not stated. Only GRPO fine-tuning data splits are specified, not the pretraining data cutoffs relevant to potential benchmark contamination.",
    294           "source": "haiku"
    295         },
    296         "train_test_overlap_discussed": {
    297           "applies": true,
    298           "answer": false,
    299           "justification": "The paper addresses GRPO fine-tuning data/eval leakage for Math ('ensuring no train/eval leakage, App. A.1') but does not discuss potential overlap between MATH-500 problems and the base models' pretraining corpora.",
    300           "source": "haiku"
    301         },
    302         "benchmark_contamination_addressed": {
    303           "applies": true,
    304           "answer": false,
    305           "justification": "MATH-500 was publicly available before Qwen2.5 and Llama 3.1's training cutoffs, creating potential contamination. The paper discusses only GRPO fine-tuning data separation, not whether base model pretraining included MATH-500 examples.",
    306           "source": "haiku"
    307         }
    308       },
    309       "human_studies": {
    310         "pre_registered": {
    311           "applies": false,
    312           "answer": false,
    313           "justification": "No human subjects study. The paper explicitly states the annotation activity 'does not constitute human-subjects research' under institutional guidelines.",
    314           "source": "haiku"
    315         },
    316         "irb_or_ethics_approval": {
    317           "applies": false,
    318           "answer": false,
    319           "justification": "No human subjects study. The paper explicitly states 'no IRB review was sought' as the annotation activity does not constitute human-subjects research under institutional guidelines.",
    320           "source": "haiku"
    321         },
    322         "demographics_reported": {
    323           "applies": false,
    324           "answer": false,
    325           "justification": "No human subjects study requiring demographic reporting; 6 annotators were used only for LLM-judge validation and no demographics were collected.",
    326           "source": "haiku"
    327         },
    328         "inclusion_exclusion_criteria": {
    329           "applies": false,
    330           "answer": false,
    331           "justification": "No human subjects study with formal inclusion/exclusion criteria.",
    332           "source": "haiku"
    333         },
    334         "randomization_described": {
    335           "applies": false,
    336           "answer": false,
    337           "justification": "No randomized human subjects experiment.",
    338           "source": "haiku"
    339         },
    340         "blinding_described": {
    341           "applies": false,
    342           "answer": false,
    343           "justification": "No formal human subjects experiment requiring blinding; annotator validation withheld model metadata but this is not a registered human experiment.",
    344           "source": "haiku"
    345         },
    346         "attrition_reported": {
    347           "applies": false,
    348           "answer": false,
    349           "justification": "No human subjects study with attrition reporting requirements.",
    350           "source": "haiku"
    351         }
    352       },
    353       "cost_and_practicality": {
    354         "inference_cost_reported": {
    355           "applies": true,
    356           "answer": true,
    357           "justification": "Total carbon footprint is estimated at approximately 110 kg CO2e following Luccioni et al. (2019) methodology. GPU types (A100, A6000) and cluster infrastructure are specified.",
    358           "source": "haiku"
    359         },
    360         "compute_budget_stated": {
    361           "applies": true,
    362           "answer": true,
    363           "justification": "Compute is quantified via the ~110 kg CO2e carbon estimate and training details (up to 1000 steps for 1.5B model, 500 for 7B/8B models, on 8-GPU Slurm nodes), enabling estimation of total compute budget.",
    364           "source": "haiku"
    365         }
    366       }
    367     }
    368   },
    369   "claims": [
    370     {
    371       "claim": "Reasoning shifts are rare (~6.31% of traces pooled across models and domains) and consistently associated with lower accuracy compared to non-shifted traces (6.6% vs 29% accuracy).",
    372       "evidence": "Table 2 shows pooled accuracy of 6.6% for shifted traces vs 29% for non-shifted, N=723,200. Pooled logistic regression of correct~shift: p < 10^-1198.",
    373       "supported": "strong"
    374     },
    375     {
    376       "claim": "Formal 'Aha!' moments meeting all three definitional criteria (prior failure, prior stability, and performance gain) are vanishingly rare, occurring in fewer than 2% of problem-checkpoint pairs even under lenient thresholds.",
    377       "evidence": "Figure 4 shows <3% prevalence across all threshold combinations for Qwen2.5-1.5B at T=0.7. Appendix C.1 and Figures 13-14 confirm similar sparsity across domains, model families, and temperatures.",
    378       "supported": "strong"
    379     },
    380     {
    381       "claim": "The negative effect of reasoning shifts on accuracy does not improve across GRPO training stages, indicating training does not teach models to make beneficial mid-trace shifts.",
    382       "evidence": "Figure 5a shows the raw accuracy gap remaining near zero or negative across 950 training steps. Table 3a reports significant negative AMEs for Xwords (AME=-0.0311, p=0.027) and Math (AME=-0.0615, p=1.55×10^-4) at fixed T=0.7.",
    383       "supported": "strong"
    384     },
    385     {
    386       "claim": "Spontaneous reasoning shifts under high uncertainty (top 20% by entropy) do not become reliably beneficial and remain associated with lower accuracy.",
    387       "evidence": "Table 4 shows shifts remain harmful for Math among high-entropy traces (raw ∆=-7.40pp) and the shift coefficient is not significant for high-entropy Math (coef=-0.28, p=0.739), ruling out a 'hidden beneficial regime' under uncertainty.",
    388       "supported": "strong"
    389     },
    390     {
    391       "claim": "Artificially triggered reconsideration under high entropy reliably improves accuracy, with a +8.41pp gain on MATH-500.",
    392       "evidence": "Table 5 shows Pass 1 accuracy 32.2% vs Pass 2 accuracy 40.6% on Math (N=320,000 paired samples, 50,574 wrong→right vs 23,500 right→wrong). Table 26 shows high-entropy instances gain +15.38pp vs +5.82pp for low-entropy.",
    393       "supported": "strong"
    394     },
    395     {
    396       "claim": "The finding that shifts are rare and harmful generalizes across model families (Qwen2.5-1.5B, 7B, Llama 3.1-8B) and external models (DeepSeek-R1, GPT-4o).",
    397       "evidence": "Table 15 shows negative AMEs for both Qwen-7B (AME=-0.0841) and Llama-8B (AME=-0.0688). Table 19 shows canonical shift rates of 0.4-3% for DeepSeek-R1 and GPT-4o with no systematic accuracy benefit.",
    398       "supported": "moderate"
    399     },
    400     {
    401       "claim": "The entropy-gated intervention effect is robust to specific reconsideration cue wording, showing consistent accuracy gains across three semantically similar but lexically distinct cues.",
    402       "evidence": "Table 18 shows OR1σ of 2.21-2.49 across cues C1-C3, all showing similar entropy-accuracy relationships. C2 shows the strongest effect but differences between cues are modest.",
    403       "supported": "moderate"
    404     }
    405   ],
    406   "methodology_tags": [
    407     "benchmark-eval",
    408     "observational",
    409     "experimental"
    410   ],
    411   "key_findings": "Contrary to popular narratives about 'Aha!' moments in reasoning models, mid-trace reasoning shifts are rare (~6.31% of traces), do not improve accuracy (shifted traces average 6.6% vs 29% for non-shifted), and do not become more frequent or beneficial with GRPO training across 1M+ annotated traces, hundreds of checkpoints, and three reasoning domains. Formal 'Aha!' events meeting all definitional criteria occur in fewer than 2% of problem-checkpoint pairs even under lenient thresholds, with no consistent positive effect on accuracy. However, an entropy-gated intervention that forcibly triggers reconsideration when models are uncertain yields consistent accuracy gains (+8.41pp on MATH-500, more modest gains on other domains), suggesting that model uncertainty identifies instances where external prompting for reflection is beneficial. These findings reframe mid-trace reasoning shifts as symptoms of inference instability rather than evidence of genuine intrinsic self-correction capability.",
    412   "red_flags": [
    413     {
    414       "flag": "Causal language from correlational evidence",
    415       "detail": "The conclusion that shifts are 'symptoms of unstable inference behavior' uses causal framing but is supported by correlational evidence. Problem fixed effects in logistic regressions partially address confounding, but the causal direction is not fully established."
    416     },
    417     {
    418       "flag": "Highly domain-specific intervention gains",
    419       "detail": "The entropy-gated forced reconsideration shows large gains on Math (+8.41pp) but near-zero gains on Xwords (+0.45pp) and negligible gains on RHour (+0.01pp), raising questions about whether the practical benefit generalizes beyond structured mathematical reasoning."
    420     },
    421     {
    422       "flag": "GPT-4o as judge introduces proprietary dependency",
    423       "detail": "Shift annotation relies on GPT-4o as an LLM judge — a proprietary model that may change over time. While three detector variants are compared in Appendix C.6 with consistent results, the primary annotation pipeline depends on a model outside the authors' control."
    424     },
    425     {
    426       "flag": "RHour near-zero accuracy floor distorts pooled statistics",
    427       "detail": "RHour accuracy is effectively zero throughout (Qwen-1.5B achieves 0.01% after 950 training steps), making shift effect estimates for this domain practically meaningless in absolute terms and pulling down pooled accuracy averages for non-shifted traces."
    428     },
    429     {
    430       "flag": "Base model pretraining contamination not addressed",
    431       "detail": "MATH-500 was publicly available before the training cutoffs of Qwen2.5 and Llama 3.1. The paper addresses GRPO fine-tuning data leakage but not whether base model pretraining included MATH-500 solutions, which could affect accuracy baselines."
    432     },
    433     {
    434       "flag": "No competing interests declaration",
    435       "detail": "The paper contains no competing-interests or financial-interests declaration, even though including one is standard practice for peer-reviewed empirical work."
    436     }
    437   ],
    438   "cited_papers": [
    439     {
    440       "title": "DeepSeek-R1: Incentivizes Reasoning in LLMs through Reinforcement Learning",
    441       "relevance": "Primary target of the paper's investigation; the paper directly examines and challenges claims about mid-trace 'Aha!' moments from DeepSeek-R1"
    442     },
    443     {
    444       "title": "Let's Verify Step by Step (Lightman et al., 2024)",
    445       "relevance": "Source of MATH-500 evaluation benchmark used as primary evaluation dataset; also key work on process supervision"
    446     },
    447     {
    448       "title": "DeepSeekMath: Pushing the Limits of Mathematical Reasoning (GRPO, Shao et al., 2024)",
    449       "relevance": "Source of Group Relative Policy Optimization (GRPO) training method used throughout the paper"
    450     },
    451     {
    452       "title": "Are Emergent Abilities of Large Language Models a Mirage? (Schaeffer et al., 2023)",
    453       "relevance": "Related skeptical analysis of claimed emergent capabilities in LLMs; directly relevant to the paper's argument that 'Aha!' moments are illusory"
    454     },
    455     {
    456       "title": "Self-Refine: Iterative Refinement with Self-Feedback (Madaan et al., 2023)",
    457       "relevance": "Key prior work on self-correction mechanisms that the paper's findings about spontaneous shift inefficacy challenge"
    458     },
    459     {
    460       "title": "Understanding Chain-of-Thought in LLMs through Information Theory (Ton et al., 2025)",
    461       "relevance": "Directly related work on uncertainty in reasoning traces; the entropy measures used in this paper build on this framework"
    462     },
    463     {
    464       "title": "Understanding Aha Moments: From External Observations to Internal Mechanisms (Yang et al., 2025)",
    465       "relevance": "Concurrent work examining similar phenomena and introducing RASM metrics for identifying insight-like behavior; directly compared against in the paper"
    466     },
    467     {
    468       "title": "Training Language Models to Self-Correct via Reinforcement Learning (Kumar et al., 2025)",
    469       "relevance": "Contemporary work on training self-correction that the paper's intervention results partially support while the spontaneous shift findings complicate"
    470     }
    471   ],
    472   "engagement_factors": {
    473     "practical_relevance": {
    474       "score": 2,
    475       "justification": "The entropy-gated forced reconsideration technique (+8.41pp on MATH-500) is immediately usable by practitioners as a simple prompt-level intervention, though domain-specificity limits broad applicability."
    476     },
    477     "surprise_contrarian": {
    478       "score": 3,
    479       "justification": "Directly and rigorously challenges the widely-cited 'Aha!' moment narrative from DeepSeek-R1 — one of the most discussed AI capabilities of 2025 — showing that shifts are associated with lower accuracy rather than insight."
    480     },
    481     "fear_safety": {
    482       "score": 1,
    483       "justification": "Section 2 briefly discusses safety implications of opaque mid-trace self-corrections (hidden objective shifts, deceptive rationales), but this is framing rather than empirical safety analysis."
    484     },
    485     "drama_conflict": {
    486       "score": 2,
    487       "justification": "The paper positions itself against high-profile claims from DeepSeek-R1 and the broader community narrative about emergent reasoning insight, creating a natural controversy with an influential paper."
    488     },
    489     "demo_ability": {
    490       "score": 2,
    491       "justification": "The forced reconsideration intervention is a concrete, replicable technique (append 'Wait, something is not right...' and re-query) that anyone can try on MATH-500 with open Qwen/Llama models and released code."
    492     },
    493     "brand_recognition": {
    494       "score": 1,
    495       "justification": "Princeton University affiliation but no major AI lab involvement. The paper challenges DeepSeek (high brand recognition) but is authored by graduate students at an academic institution."
    496     }
    497   },
    498   "hn_data": {
    499     "threads": [],
    500     "top_points": 0,
    501     "total_points": 0,
    502     "total_comments": 0
    503   }
    504 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs