scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25673B)
      1 {
      2   "paper": {
      3     "title": "The Illusion of Insight in Reasoning Models",
      4     "authors": ["Liv G. d'Aliberti", "Manoel Horta Ribeiro"],
      5     "year": 2026,
      6     "venue": "arXiv",
      7     "arxiv_id": "2601.00514",
      8     "doi": "10.48550/arXiv.2601.00514"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": true,
     17         "justification": "The paper references a GitHub repository (humans-and-machines/Illusion-of-Reasoning) in Appendix E and throughout, with code, configs, and evaluation pipeline."
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "Evaluation datasets and model outputs are released on Hugging Face (Table 7, Table 20), and synthetic data generation code is provided under data/."
     23       },
     24       "environment_specified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "Appendix A.4 describes DeepSpeed ZeRO-3, bf16 precision, vLLM, accelerate configs, and GPU types (A100, A6000). Training configs released under recipes/."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "Full configs are released in recipes/, data generation code in data/, and checkpoints on Hugging Face (Table 7). Appendix A describes the full setup."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "Bootstrap confidence intervals are reported for threshold search (Table 14), entropy regression odds ratios (Table 18), and Cohen's κ (Table 12)."
     40       },
     41       "significance_tests": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "Logistic regressions with p-values are reported throughout (e.g., p < 10^-1198 in §6.1, AME p-values in Tables 3, 4, 15). Cluster-robust SEs used."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "Raw accuracy differences in percentage points (Δpp) and odds ratios (OR) are reported alongside p-values (e.g., Table 2: shifted 2.57% vs non-shifted 16.44%; Table 18: OR1σ values)."
     50       },
     51       "sample_size_justified": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "No explicit power analysis or justification for sample sizes (500 MATH problems, 130 Xword clues, 500 RHour boards). The sizes appear inherited from existing benchmarks without justification."
     55       },
     56       "variance_reported": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Table 8 reports std across prompt variants. Bootstrap CIs quantify variance in threshold search (Table 14). Table 12 reports mean κ with 95% bootstrap CIs."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Non-shifted traces serve as the baseline throughout. External models (DeepSeek-R1, GPT-4o) are evaluated under matched conditions (Table 19, §C.5)."
     67       },
     68       "baselines_contemporary": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "DeepSeek-R1 and GPT-4o are contemporary state-of-the-art reasoning models tested in App. C.5."
     72       },
     73       "ablation_study": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "The paper ablates across temperatures, training stages, entropy strata, model sizes, model families, detector variants, and cue variants — systematically isolating each factor."
     77       },
     78       "multiple_metrics": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Shift prevalence (%S), conditional accuracy (P(✓|S=1)), raw accuracy difference (Δpp), AME from logistic regression, Cohen's κ for judge agreement, and odds ratios are all reported."
     82       },
     83       "human_evaluation": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "6 volunteer annotators validated shift labels on 20 items (§B.4). Human majority vote compared to GPT-4o judge (Table 13: κ=0.794)."
     87       },
     88       "held_out_test_set": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Evaluation sets (MATH-500, 130 Xword clues, 500 RHour boards) are held fixed across checkpoints and explicitly separated from training data (§4, App. A.1)."
     92       },
     93       "per_category_breakdown": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Results are broken down by domain (Xwords, Math, RHour), by model (Qwen-1.5B, Qwen-7B, Llama-8B), by temperature, by training step, and by entropy stratum."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "§8 (Limitations) discusses detector failures (unlexicalized shifts missed, superficial hedges). Qualitative examples in App. D.6 include RHour cases where shifts do not help (gain=+0.00pp)."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The paper's central finding IS a negative result: reasoning shifts are harmful to accuracy. Forced reconsideration yields negative gains for Llama-8B (Table 17: -4.19pp). RHour shows near-zero effects throughout."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Abstract claims (shifts are rare ~6.31%, do not improve accuracy, entropy-gated intervention yields +8.41pp on MATH-500) are all directly supported by Tables 2 and 5."
    114       },
    115       "causal_claims_justified": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The paper uses controlled interventions (forced reconsideration with paired Pass 1/Pass 2 comparisons), logistic regressions with problem fixed effects, and explicitly acknowledges in §8 that prompt-level cues do not establish internal causal mechanisms."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "§8 explicitly bounds generalization: 'limited to tasks with well-defined correctness signals,' 'a small set of families (Qwen, Llama),' and notes that broader replications are needed."
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "§6.3 discusses that high entropy may cause 'diffuse exploration or verbose flailing' rather than genuine pivots. §7 discusses that shifts may be 'uncertainty-responsive heuristics' rather than genuine reasoning. §8 discusses that the detector may miss unlexicalized changes."
    129       },
    130       "proxy_outcome_distinction": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The paper carefully distinguishes between lexical cues (proxy) and genuine insight (target outcome), formalizing this in Def. 3.1 and using multiple detector variants (App. C.6) to test robustness of the proxy."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "Specific model versions are given: Qwen2.5-1.5B, Qwen2.5-7B, Llama 3.1-8B, GPT-4o, DeepSeek-R1. Hugging Face repos with exact checkpoints are provided (Table 7)."
    141       },
    142       "prompts_provided": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "Full verbatim system prompts for all three domains are provided in Figures 6, 7, 8. LLM-as-judge prompts in Figures 9, 10. Reconsideration cues C1-C3 in §C.4."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Comprehensive hyperparameter tables in App. A.4 (Table 9): learning rates, batch sizes, gradient accumulation, KL targets, PPO clips, temperatures, top-p, max tokens, etc."
    151       },
    152       "scaffolding_described": {
    153         "applies": false,
    154         "answer": false,
    155         "justification": "No agentic scaffolding is used. The paper evaluates bare model completions with fixed prompts."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "§4 and App. A.1 describe dataset filtering, normalization (case, whitespace, punctuation), BFS solving with node caps for RHour, and train/eval leakage prevention."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "§8 is a dedicated Limitations section with four substantive paragraphs covering detector limitations, domain coverage, intervention scope, and model family coverage."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "§8 discusses specific threats: the detector may miss unlexicalized shifts (false negatives) or be triggered by surface hedges (false positives); coverage is limited to three domains with well-defined correctness; and prompt-level cues don't establish causal mechanisms."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "§8 explicitly states what was NOT tested: open-ended reasoning, multi-turn interaction, training-time interventions, broader architectures and sizes. §7 distinguishes prompt-level interventions from training objective modifications."
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "Model outputs and shift annotations are released on Hugging Face (Tables 7, 20). Evaluation pipeline and annotation scripts are in the GitHub repository."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "§5.2 describes trace collection: G=8 completions per problem, 20 checkpoints, 4 temperatures, with exact evaluation set sizes. Human annotation procedure in §B.4."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "§B.4 describes annotator recruitment: '6 volunteer adult annotators (unpaid), recruited from the authors' academic networks.' Consent and calibration process described."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Full pipeline documented: trace generation (§5.2) → cue prefilter (§B.2, Table 10) → LLM judge adjudication (Figs. 9-10) → formal Aha detection (Alg. 1). Error handling and fallbacks described."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "Acknowledgments section states: 'supported by a First-Year Fellowship from the Princeton University Graduate School' with computational resources from the Beowulf cluster and CITP."
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Both authors affiliated with Princeton University Department of Computer Science, clearly stated on the first page."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "Funded by Princeton University graduate fellowship and departmental compute. Princeton has no financial interest in whether reasoning models do or do not exhibit Aha moments."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests statement is present in the paper. Absence of disclosure is not absence of conflict."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "The paper fine-tunes Qwen2.5 and Llama 3.1 base models but does not state the training data cutoff dates for these pre-trained models."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": true,
    232         "answer": true,
    233         "justification": "App. A.1 states 'ensuring no train/eval leakage' for MATH-500 and the Xwords evaluation set uses synthetic templates separate from training. RHour boards are synthetically generated."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "MATH-500 (Lightman et al., 2024) was published before Qwen2.5 and Llama 3.1 training cutoffs. The paper addresses train/eval split separation but does not discuss whether MATH-500 problems appeared in the pre-training data of the base models."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "The human annotation task (6 annotators labeling model outputs) is a validation exercise, not a human-subjects study. No IRB required per §B.4."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "§B.4 explicitly states this 'does not constitute human-subjects research' and no IRB review was sought. Annotators judged model text, not a human intervention."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in the experimental sense. Annotators validated labels; they were not subjects of study."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants. Annotators were recruited for a validation task."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human-subjects experiment requiring randomization."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human-subjects experiment, though §B.4 notes that annotators were blinded to model identity and correctness signals."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human-subjects study. Annotator pool was 6 volunteers for a validation task."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": true,
    281         "answer": false,
    282         "justification": "No inference cost or per-example cost reported despite generating 1M+ traces across multiple models and temperatures."
    283       },
    284       "compute_budget_stated": {
    285         "applies": true,
    286         "answer": true,
    287         "justification": "§9 (Ethical Considerations) states: 'total carbon footprint of all experiments at approximately 110 kg CO2e.' Hardware specified as NVIDIA A100 and A6000 GPUs under Slurm on 8-GPU nodes."
    288       }
    289     },
    290     "experimental_rigor": {
    291       "seed_sensitivity_reported": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "The paper generates G=8 samples per problem and varies temperature, but does not report results across multiple random seeds for the GRPO training runs themselves."
    295       },
    296       "number_of_runs_stated": {
    297         "applies": true,
    298         "answer": true,
    299         "justification": "§5.2 explicitly states G=8 completions per problem, 20 checkpoints per run, 4 temperatures. Total trace counts given (e.g., 320,000 Math traces for Qwen-1.5B)."
    300       },
    301       "hyperparameter_search_budget": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "Table 9 reports the chosen hyperparameters but does not describe how many configurations were tried or the search method used to arrive at them."
    305       },
    306       "best_config_selection_justified": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "No discussion of how the final hyperparameter configurations were selected or whether they were optimized on a validation set."
    310       },
    311       "multiple_comparison_correction": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "Many statistical tests are performed across domains, temperatures, models, and entropy strata, but no Bonferroni or other family-wise correction is applied."
    315       },
    316       "self_comparison_bias_addressed": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "The authors train their own models and evaluate their own shift detection pipeline. No discussion of author-evaluation bias per Lucic et al. (2018)."
    320       },
    321       "compute_budget_vs_performance": {
    322         "applies": false,
    323         "answer": false,
    324         "justification": "The paper does not compare methods at different compute budgets — it studies a phenomenon (reasoning shifts) rather than proposing a competing method."
    325       },
    326       "benchmark_construct_validity": {
    327         "applies": true,
    328         "answer": true,
    329         "justification": "§4 explicitly discusses why each domain was chosen (representational change, progress monitoring, spatial reasoning) and what each tests. §8 acknowledges limitation to tasks with well-defined correctness."
    330       },
    331       "scaffold_confound_addressed": {
    332         "applies": false,
    333         "answer": false,
    334         "justification": "No scaffolding is used — models are evaluated with direct prompting only."
    335       }
    336     },
    337     "data_leakage": {
    338       "temporal_leakage_addressed": {
    339         "applies": true,
    340         "answer": true,
    341         "justification": "Two of three evaluation sets are synthetic (Xword templates, RHour boards), eliminating temporal leakage. MATH-500 train/eval separation is explicitly ensured (App. A.1)."
    342       },
    343       "feature_leakage_addressed": {
    344         "applies": true,
    345         "answer": false,
    346         "justification": "No explicit discussion of whether the evaluation setup leaks information through prompt structure or whether domain-specific prompts provide hints beyond what a real user would provide."
    347       },
    348       "non_independence_addressed": {
    349         "applies": true,
    350         "answer": true,
    351         "justification": "Training and evaluation sets are explicitly separated: openR1 Math-220k for training, MATH-500 for eval; natural clues for training, synthetic clues for eval; separate RHour boards."
    352       },
    353       "leakage_detection_method": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No concrete leakage detection method (canary strings, membership inference, n-gram overlap) is applied. Relies on dataset separation rather than active detection."
    357       }
    358     }
    359   },
    360   "claims": [
    361     {
    362       "claim": "Reasoning shifts are rare (~6.31% of traces) and are associated with substantially lower accuracy (2.57% shifted vs 16.44% non-shifted).",
    363       "evidence": "Table 2 and §6.1: pooled logistic regression across 723,200 traces, p < 10^-1198.",
    364       "supported": "strong"
    365     },
    366     {
    367       "claim": "Formal 'Aha!' moments (satisfying all three criteria of Def. 3.1) are vanishingly rare, occurring in only ~1.79% of samples.",
    368       "evidence": "§6.1 and Fig. 4: prevalence heatmaps across threshold grids show consistently low rates.",
    369       "supported": "strong"
    370     },
    371     {
    372       "claim": "The negative effect of reasoning shifts does not reliably change across training stages but varies with decoding temperature.",
    373       "evidence": "§6.2, Table 3, Fig. 5: AME remains negative or near-zero across training steps; temperature modulates the effect (Xwords show positive raw contrast at low T).",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "Spontaneous reasoning shifts do not become reliably beneficial under high model uncertainty.",
    378       "evidence": "Table 4: in the high-entropy stratum, shifts remain harmful in Math (Δ = -7.40pp) and near-zero in Xwords and RHour.",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "Artificially triggered reconsideration yields +8.41pp improvement on MATH-500.",
    383       "evidence": "Table 5: paired Pass 1 vs Pass 2 comparison across 320,000 samples; 50,574 wins vs 23,500 losses.",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "The entropy-gated intervention is amplified on high-entropy instances.",
    388       "evidence": "Table 26 and App. C.4: high-entropy Math instances show +15.38pp gain vs +5.82pp for low-entropy. OR1σ = 2.2-2.5× across cue variants (Table 18).",
    389       "supported": "strong"
    390     },
    391     {
    392       "claim": "Results generalize across model families: DeepSeek-R1 and GPT-4o show similarly low shift rates and no systematic benefit from shifts.",
    393       "evidence": "Table 19: DeepSeek-R1 shift rate 0.40-0.60%, GPT-4o 2.20-3.00%, with no reliable accuracy benefit.",
    394       "supported": "moderate"
    395     }
    396   ],
    397   "methodology_tags": ["benchmark-eval"],
    398   "key_findings": "Mid-trace reasoning shifts ('Aha!' moments) in RL-tuned language models are rare (~6.31% of 1M+ traces), generally harmful to accuracy, and do not become more frequent or beneficial with training. Formal 'Aha!' events satisfying strict criteria are vanishingly rare (~1.79%). However, extrinsically triggering reconsideration under high entropy reliably improves accuracy (+8.41pp on MATH-500), suggesting uncertainty can be exploited for productive reflection even though spontaneous shifts cannot.",
    399   "red_flags": [
    400     {
    401       "flag": "Small model sizes may limit generalizability",
    402       "detail": "Primary experiments use Qwen2.5-1.5B (small model). Larger models (7B, 8B) are only evaluated on Math at 500 steps due to compute constraints. Results on frontier-scale models (70B+) may differ."
    403     },
    404     {
    405       "flag": "LLM-as-judge circular dependency",
    406       "detail": "GPT-4o is used to judge whether reasoning shifts occur in other models. While human validation shows κ=0.794, the judge may systematically miss or misclassify certain types of shifts, and the judge itself is a reasoning model subject to the phenomena being studied."
    407     }
    408   ],
    409   "cited_papers": [
    410     {
    411       "title": "Deepseek-r1 incentivizes reasoning in LLMs through reinforcement learning",
    412       "authors": ["Daya Guo"],
    413       "year": 2025,
    414       "relevance": "Primary motivation: claims of 'Aha!' moments in RL-trained reasoning models."
    415     },
    416     {
    417       "title": "Training language models to self-correct via reinforcement learning",
    418       "authors": ["Aviral Kumar"],
    419       "year": 2025,
    420       "relevance": "Framework for trained self-correction in LLMs, directly relevant to understanding intrinsic vs extrinsic correction."
    421     },
    422     {
    423       "title": "Monitoring reasoning models for misbehavior and the risks of promoting obfuscation",
    424       "authors": ["Bowen Baker"],
    425       "year": 2025,
    426       "arxiv_id": "2503.11926",
    427       "relevance": "Safety implications of hidden reasoning shifts and deceptive rationales in reasoning models."
    428     },
    429     {
    430       "title": "Self-refine: Iterative refinement with self-feedback",
    431       "authors": ["Aman Madaan"],
    432       "year": 2023,
    433       "relevance": "Key prior work on LLM self-correction and iterative refinement without external feedback."
    434     },
    435     {
    436       "title": "Are emergent abilities of large language models a mirage?",
    437       "authors": ["Rylan Schaeffer"],
    438       "year": 2023,
    439       "relevance": "Questions whether emergent abilities are genuine or evaluation artifacts, directly parallel to this paper's argument about 'Aha!' moments."
    440     },
    441     {
    442       "title": "The illusion of thinking: Understanding the strengths and limitations of reasoning models via the lens of problem complexity",
    443       "authors": ["Parshin Shojaee"],
    444       "year": 2025,
    445       "arxiv_id": "2506.06941",
    446       "relevance": "Companion work questioning apparent reasoning capabilities in LLMs."
    447     },
    448     {
    449       "title": "Understanding chain-of-thought in LLMs through information theory",
    450       "authors": ["Jean-Francois Ton"],
    451       "year": 2025,
    452       "relevance": "Information-theoretic analysis of CoT reasoning and uncertainty signals, directly used in this paper's entropy analysis."
    453     },
    454     {
    455       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    456       "authors": ["Jason Wei"],
    457       "year": 2022,
    458       "relevance": "Foundational work on chain-of-thought prompting that this paper builds upon and questions."
    459     },
    460     {
    461       "title": "Measuring faithfulness in chain-of-thought reasoning",
    462       "authors": ["Tamera Lanham"],
    463       "year": 2023,
    464       "arxiv_id": "2307.13702",
    465       "relevance": "Studies faithfulness of reasoning traces, relevant to whether mid-trace shifts reflect genuine cognitive processes."
    466     },
    467     {
    468       "title": "Understanding the dark side of LLMs' intrinsic self-correction",
    469       "authors": ["Qingjie Zhang"],
    470       "year": 2025,
    471       "relevance": "Examines risks of intrinsic self-correction in LLMs, complementary safety perspective."
    472     },
    473     {
    474       "title": "Self-correction bench: Uncovering and addressing the self-correction blind spot in large language models",
    475       "authors": ["Ken Tsui"],
    476       "year": 2025,
    477       "arxiv_id": "2507.02778",
    478       "relevance": "Benchmark for evaluating LLM self-correction capabilities."
    479     },
    480     {
    481       "title": "Improving mathematical reasoning with process supervision",
    482       "authors": ["OpenAI"],
    483       "year": 2023,
    484       "relevance": "Process supervision approach to reasoning that rewards intermediate steps, directly relevant to training reasoning models."
    485     }
    486   ]
    487 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs