scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27410B)
      1 {
      2   "paper": {
      3     "title": "GlimpRouter: Efficient Collaborative Inference by Glimpsing One Token of Thoughts",
      4     "authors": [
      5       "Wenhao Zeng",
      6       "Xuteng Zhang",
      7       "Yuling Shi",
      8       "Chao Hu",
      9       "Yuting Chen",
     10       "Beijun Shen",
     11       "Xiaodong Gu"
     12     ],
     13     "year": 2026,
     14     "venue": "arXiv",
     15     "arxiv_id": "2601.05110",
     16     "doi": "10.48550/arXiv.2601.05110"
     17   },
     18   "scan_version": 2,
     19   "active_modules": ["experimental_rigor", "data_leakage"],
     20   "methodology_tags": ["benchmark-eval"],
     21   "key_findings": "GlimpRouter proposes using the entropy of the first token of each reasoning step as a routing signal for collaborative inference between small and large models. On AIME25, GlimpRouter achieves 51.67% accuracy (vs 46.67% for standalone LLM) with 25.9% latency reduction. The method is training-free and orthogonal to token-level speculative decoding, enabling compound speedups. However, the paper lacks error bars and contamination analysis, and it tunes its threshold on test data.",
     22   "checklist": {
     23     "artifacts": {
     24       "code_released": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No code repository URL, GitHub link, or archive is mentioned anywhere in the paper."
     28       },
     29       "data_released": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "All benchmarks used (AIME24, AIME25, GPQA-Diamond, LiveCodeBench v5/v6) are publicly available standard benchmarks."
     33       },
     34       "environment_specified": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The paper mentions 'NVIDIA A100-80GB GPUs' and 'vLLM inference engine' (Section 4.1) but provides no requirements.txt, Dockerfile, or detailed library version specifications."
     38       },
     39       "reproduction_instructions": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No step-by-step reproduction instructions, README, or runnable scripts are provided. Algorithm 1 gives pseudocode but not executable reproduction steps."
     43       }
     44     },
     45     "statistical_methodology": {
     46       "confidence_intervals_or_error_bars": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "All tables report point estimates only (e.g., 51.67% accuracy, 163s latency). No confidence intervals, error bars, or ± notation despite averaging over 4 runs."
     50       },
     51       "significance_tests": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "The paper claims GlimpRouter 'outperforms' baselines (Section 4.2) based solely on comparing point estimates. No statistical tests (t-test, bootstrap, etc.) are applied."
     55       },
     56       "effect_sizes_reported": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "The paper reports relative improvements with baseline context: '10.7% improvement in accuracy' (46.67% → 51.67%) and '25.9% reduction in latency' (220s → 163s) on AIME25. Tables provide absolute numbers for both methods."
     60       },
     61       "sample_size_justified": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "No justification for why 4 runs were chosen, nor any power analysis or discussion of whether this is sufficient for reliable comparison."
     65       },
     66       "variance_reported": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "Section 4.1 states 'all reported results are averaged over 4 runs' but no standard deviation, IQR, or any spread measure is reported in any table or figure."
     70       }
     71     },
     72     "evaluation_design": {
     73       "baselines_included": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Comprehensive baselines: standalone SLM/LLM, Random routing, RSD (reward-guided), SpecCoT (selection-based), and SpecReason (verification-based). Table 1."
     77       },
     78       "baselines_contemporary": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Baselines include recent 2025 methods: RSD (Liao et al., 2025), SpecCoT (Shi et al., 2025), and SpecReason (Pan et al., 2025), representing state-of-the-art step-level collaboration."
     82       },
     83       "ablation_study": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Table 2/6: ablation of routing metric (Hinit vs Hstep vs PPLstep). Section 4.3: sensitivity analysis of threshold τ. Table 3/7: integration with speculative decoding."
     87       },
     88       "multiple_metrics": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Two primary metrics reported: Pass@1 accuracy and average latency in seconds. Intervention rate is also tracked in sensitivity analysis (Table 5)."
     92       },
     93       "human_evaluation": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "Evaluation is entirely automated (pass@1 on benchmarks). No human evaluation of reasoning trace quality despite qualitative claims about self-correction and reasoning quality."
     97       },
     98       "held_out_test_set": {
     99         "applies": true,
    100         "answer": false,
    101         "justification": "Section 4.1 states 'we report the configuration that achieves the optimal balance, maximizing accuracy while securing significant latency reduction.' The entropy threshold τ is selected based on test set performance — no separate validation set is used."
    102       },
    103       "per_category_breakdown": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Results broken down across 5 benchmarks (AIME24, AIME25, GPQA, LCBv5, LCBv6) in Tables 1, 4, 5, 6, and 7."
    107       },
    108       "failure_cases_discussed": {
    109         "applies": true,
    110         "answer": false,
    111         "justification": "Case studies in Appendix F show success cases (correct routing, self-correction). No systematic error analysis of when GlimpRouter misroutes or fails."
    112       },
    113       "negative_results_reported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Ablation shows step-wise entropy and perplexity perform worse than Hinit (Table 2). Sensitivity analysis (Table 5) shows extreme thresholds degrade accuracy. Appendix B shows weaker SLM (1.5B) produces lower accuracy."
    117       }
    118     },
    119     "claims_and_evidence": {
    120       "abstract_claims_supported": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "Abstract claims '10.7% improvement in accuracy while reducing inference latency by 25.9% on AIME25' are supported by Table 1 (DeepSeek-32B: 46.67% → 51.67% accuracy, 220s → 163s latency)."
    124       },
    125       "causal_claims_justified": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "The paper makes causal claims via ablation: replacing Hinit with Hstep or PPLstep reduces performance (Table 2). The threshold sensitivity study (Table 5) shows controlled manipulation of the routing parameter. These constitute adequate single-variable ablation designs."
    129       },
    130       "generalization_bounded": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The title claims 'Efficient Collaborative Inference' generally. The abstract says the approach suggests 'a simple yet effective mechanism for reasoning.' Testing is limited to Qwen3 and DeepSeek-R1-Distill families on 3 domains. The Limitations section notes delimiter dependency but the framing is broader than tested."
    134       },
    135       "alternative_explanations_discussed": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "The paper attributes accuracy improvement to LRM 'self-correction capacity' (Section 4.2) without considering alternative explanations such as ensemble effects, data-dependent threshold selection, or the SLM contributing diversity. The Signal Dilution hypothesis (Section 2.2) is their own explanation with no alternatives considered."
    139       },
    140       "proxy_outcome_distinction": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "The paper measures Pass@1 accuracy and wall-clock latency and makes claims about exactly those metrics. No proxy gap exists — the measurements match the granularity of the claims."
    144       }
    145     },
    146     "setup_transparency": {
    147       "model_versions_specified": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Specific open-source model names with sizes: Qwen3-4B, Qwen3-32B, DeepSeek-R1-Distill-Qwen-32B, DeepSeek-R1-Distill-Qwen-1.5B. For open-source models with fixed released weights, the name+size uniquely identifies the model."
    151       },
    152       "prompts_provided": {
    153         "applies": true,
    154         "answer": false,
    155         "justification": "The paper does not show the exact input format or system prompts sent to the reasoning models. Benchmark questions are used as input but the formatting, system instructions, and any template wrapping are not provided."
    156       },
    157       "hyperparameters_reported": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 4.1: temperature=0.6, top-p=0.95, max reasoning tokens=8192, draft length n=3 for speculative decoding."
    161       },
    162       "scaffolding_described": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "GlimpRouter's multi-model orchestration framework is described in detail: step decomposition via double newlines, single-token probing, entropy-based routing, prefix caching for model switching. Pseudocode provided in Algorithm 1."
    166       },
    167       "data_preprocessing_documented": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Standard benchmarks used as-is. Reasoning trace segmentation into steps via structural delimiters (double newlines) is described in Section 3.1. Preliminary study data collection described in Section 2.1."
    171       }
    172     },
    173     "limitations_and_scope": {
    174       "limitations_section_present": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "A dedicated 'Limitations' section appears after the Conclusion, discussing two specific limitations."
    178       },
    179       "threats_to_validity_specific": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Limitations section identifies study-specific threats: (1) static global threshold may not adapt to varying difficulty distributions across domains, (2) step decomposition relies on double newline delimiters, limiting applicability to models without structured Chain-of-Thought."
    183       },
    184       "scope_boundaries_stated": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "Limitations section states the delimiter dependency 'may limit the framework's direct applicability to models that generate unstructured Chain-of-Thought sequences' and notes the threshold 'may not adapt optimally to the varying difficulty distributions across diverse domains.'"
    188       }
    189     },
    190     "data_integrity": {
    191       "raw_data_available": {
    192         "applies": true,
    193         "answer": false,
    194         "justification": "No raw data (reasoning traces, entropy distributions, per-run results) is made available for independent verification."
    195       },
    196       "data_collection_described": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Section 2.1 describes the preliminary study: models (Qwen3-4B, Qwen3-32B, DeepSeek-R1-Distill-Qwen-32B), datasets (AIME, LiveCodeBench), and scale (10M+ tokens). Section 4.1 describes experimental setup with models, benchmarks, and sampling parameters."
    200       },
    201       "recruitment_methods_described": {
    202         "applies": false,
    203         "answer": false,
    204         "justification": "No human participants. Data sources are standard public benchmarks."
    205       },
    206       "data_pipeline_documented": {
    207         "applies": true,
    208         "answer": false,
    209         "justification": "The method pipeline is described (Algorithm 1) but the data pipeline from raw model outputs to final reported numbers is not documented. No description of how 4-run averages were computed, whether any runs were excluded, or how intermediate results were aggregated."
    210       }
    211     },
    212     "conflicts_of_interest": {
    213       "funding_disclosed": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No funding acknowledgment or grant information appears anywhere in the paper."
    217       },
    218       "affiliations_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "Authors are identified as being from Shanghai Jiao Tong University. They are not evaluating a commercial product from their own employer."
    222       },
    223       "funder_independent_of_outcome": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No funding information is disclosed, so funder independence cannot be verified."
    227       },
    228       "financial_interests_declared": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No competing interests or financial disclosure statement appears in the paper."
    232       }
    233     },
    234     "contamination": {
    235       "training_cutoff_stated": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "No training data cutoff dates are stated for Qwen3 or DeepSeek-R1-Distill models. AIME24/AIME25 problems from math competitions could be in training data."
    239       },
    240       "train_test_overlap_discussed": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "No discussion of whether AIME problems, GPQA questions, or LiveCodeBench problems appeared in the training data of Qwen3 or DeepSeek-R1 models."
    244       },
    245       "benchmark_contamination_addressed": {
    246         "applies": true,
    247         "answer": false,
    248         "justification": "AIME problems are from well-known math competitions available online. GPQA and LiveCodeBench may also overlap with training data. No contamination analysis is performed."
    249       }
    250     },
    251     "human_studies": {
    252       "pre_registered": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this study."
    256       },
    257       "irb_or_ethics_approval": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study."
    261       },
    262       "demographics_reported": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this study."
    266       },
    267       "inclusion_exclusion_criteria": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "randomization_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       },
    277       "blinding_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants in this study."
    281       },
    282       "attrition_reported": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants in this study."
    286       }
    287     },
    288     "cost_and_practicality": {
    289       "inference_cost_reported": {
    290         "applies": true,
    291         "answer": true,
    292         "justification": "Average latency in seconds per question is one of the two primary metrics, reported across all experiments in Tables 1, 3-7."
    293       },
    294       "compute_budget_stated": {
    295         "applies": true,
    296         "answer": false,
    297         "justification": "Hardware is mentioned (A100-80GB GPUs) but total GPU hours, number of GPUs used, or total compute budget for all experiments is not stated."
    298       }
    299     },
    300     "experimental_rigor": {
    301       "seed_sensitivity_reported": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "Section 4.1 states 'all reported results are averaged over 4 runs' but no variance, standard deviation, or sensitivity across runs is reported. Only averaged point estimates are shown."
    305       },
    306       "number_of_runs_stated": {
    307         "applies": true,
    308         "answer": true,
    309         "justification": "Section 4.1 explicitly states: 'To ensure statistical stability, all reported results are averaged over 4 runs.'"
    310       },
    311       "hyperparameter_search_budget": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "The sensitivity analysis (Table 5) shows 5 threshold values for GlimpRouter, but the total compute budget spent on this search and whether additional configurations were tried is not stated."
    315       },
    316       "best_config_selection_justified": {
    317         "applies": true,
    318         "answer": true,
    319         "justification": "Section 4.1 describes the selection criterion: 'the configuration that achieves the optimal balance, maximizing accuracy while securing significant latency reduction... approximately 20%-30% intervention rate.' Full sweep shown in Table 5."
    320       },
    321       "multiple_comparison_correction": {
    322         "applies": false,
    323         "answer": false,
    324         "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable."
    325       },
    326       "self_comparison_bias_addressed": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "The authors compare GlimpRouter against their own implementations of baselines (or unspecified implementations) without acknowledging potential author-evaluation bias."
    330       },
    331       "compute_budget_vs_performance": {
    332         "applies": true,
    333         "answer": true,
    334         "justification": "Figure 4 plots accuracy vs latency trade-off curves for GlimpRouter and SpecReason across multiple benchmarks, directly showing performance as a function of computational cost."
    335       },
    336       "benchmark_construct_validity": {
    337         "applies": true,
    338         "answer": false,
    339         "justification": "The paper uses AIME, GPQA, and LiveCodeBench without discussing whether these benchmarks adequately measure the claimed capabilities (reasoning quality, collaboration efficiency). No construct validity analysis."
    340       },
    341       "scaffold_confound_addressed": {
    342         "applies": true,
    343         "answer": true,
    344         "justification": "All routing methods (GlimpRouter, SpecReason, RSD, SpecCoT, Random) use the same SLM/LLM pair (e.g., Qwen3-4B + DeepSeek-R1-Distill-Qwen-32B). Differences are attributed to the routing strategy, which is the controlled variable."
    345       }
    346     },
    347     "data_leakage": {
    348       "temporal_leakage_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "AIME24/AIME25 are competition problems from 2024/2025 that may predate the training cutoff of Qwen3 and DeepSeek-R1 models. No discussion of temporal leakage."
    352       },
    353       "feature_leakage_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No discussion of whether the evaluation setup leaks answer information. The routing mechanism itself could potentially introduce information leakage from the LLM's context handling."
    357       },
    358       "non_independence_addressed": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "No discussion of whether train and test data share structural similarities or whether benchmark problems overlap with training corpora."
    362       },
    363       "leakage_detection_method": {
    364         "applies": true,
    365         "answer": false,
    366         "justification": "No concrete leakage detection method (canary strings, membership inference, n-gram overlap) is applied."
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "Initial token entropy (Hinit) exhibits a bimodal distribution that distinguishes routine reasoning steps from complex cognitive bifurcations.",
    373       "evidence": "Section 2.2 and Figure 1 show Hinit has a distinct bimodal and heavy-tailed distribution, unlike unimodal distributions of step-wise metrics. Analysis on 10M+ tokens across Qwen3 and DeepSeek models on AIME and LiveCodeBench.",
    374       "supported": "moderate"
    375     },
    376     {
    377       "claim": "GlimpRouter achieves 10.7% accuracy improvement and 25.9% latency reduction on AIME25 compared to standalone DeepSeek-R1-Distill-Qwen-32B.",
    378       "evidence": "Table 1: GlimpRouter 51.67% accuracy / 163s vs LLM-only 46.67% / 220s with DeepSeek-32B as LLM. Averaged over 4 runs but no error bars.",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "GlimpRouter establishes a superior Pareto frontier compared to SpecReason across all benchmarks.",
    383       "evidence": "Figure 4 shows GlimpRouter's curve strictly above and to the left of SpecReason on AIME24, AIME25, LCBv5, LCBv6. Table 5 provides numerical backing.",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "GlimpRouter is orthogonal to token-level speculative decoding, achieving compound speedups when combined.",
    388       "evidence": "Tables 3 and 7 show that adding speculative decoding to GlimpRouter consistently reduces latency further (e.g., AIME25: 163s → 130s) with negligible accuracy impact across all benchmarks.",
    389       "supported": "strong"
    390     },
    391     {
    392       "claim": "Collaborative inference can surpass standalone large model accuracy due to LRM self-correction capacity.",
    393       "evidence": "Table 1 shows GlimpRouter accuracy exceeding LLM-only across most benchmarks. Self-correction mechanism illustrated qualitatively in Appendix F case studies. No systematic quantitative verification of the self-correction hypothesis.",
    394       "supported": "weak"
    395     },
    396     {
    397       "claim": "Hinit outperforms step-wise entropy and perplexity as a routing signal due to signal dilution in step-level metrics.",
    398       "evidence": "Table 2: Hinit achieves 51.67% accuracy / 163s vs Hstep 46.67% / 178s and PPLstep 47.50% / 181s on AIME25. Extended results in Table 6 confirm this pattern across 4 benchmarks.",
    399       "supported": "strong"
    400     }
    401   ],
    402   "red_flags": [
    403     {
    404       "flag": "No error bars or variance despite multiple runs",
    405       "detail": "The paper averages over 4 runs but reports only point estimates in all tables. Without variance, it is impossible to assess whether performance differences between methods are within noise. The 4-5 percentage point accuracy differences could be within run-to-run variability."
    406     },
    407     {
    408       "flag": "Threshold tuned on test data",
    409       "detail": "Section 4.1 explicitly states the reported configuration 'achieves the optimal balance, maximizing accuracy while securing significant latency reduction' — this is oracle selection on the test benchmarks with no held-out validation set."
    410     },
    411     {
    412       "flag": "No contamination analysis",
    413       "detail": "AIME problems are well-known competition questions widely available online. The Qwen3 and DeepSeek-R1 models may have seen these problems during training. No training cutoff dates or contamination checks are provided."
    414     },
    415     {
    416       "flag": "Self-correction explanation is speculative",
    417       "detail": "The claim that GlimpRouter outperforms standalone LLM because of 'self-correction capacity' is supported only by two qualitative case studies (Appendix F), not by systematic analysis. Alternative explanations (ensemble diversity, threshold-induced selection bias) are not considered."
    418     },
    419     {
    420       "flag": "Limited model family diversity",
    421       "detail": "All experiments use only Qwen3 and DeepSeek-R1-Distill model families. Whether initial token entropy is a reliable routing signal for other architectures (e.g., GPT, Llama, Gemma) is untested."
    422     }
    423   ],
    424   "cited_papers": [
    425     {
    426       "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
    427       "authors": ["Daya Guo", "Dejian Yang", "Haowei Zhang"],
    428       "year": 2025,
    429       "arxiv_id": "2501.12948",
    430       "relevance": "Foundational large reasoning model whose inference overhead motivates this work; its 'Aha Moment' phenomenon is central to the GlimpRouter hypothesis."
    431     },
    432     {
    433       "title": "OpenAI o1 System Card",
    434       "authors": ["Aaron Jaech", "Adam Kalai", "Adam Lerer"],
    435       "year": 2024,
    436       "arxiv_id": "2412.16720",
    437       "relevance": "Documents capabilities and limitations of OpenAI's reasoning models, representing the cost-performance tradeoff this paper addresses."
    438     },
    439     {
    440       "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance",
    441       "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"],
    442       "year": 2023,
    443       "arxiv_id": "2305.05176",
    444       "relevance": "Early work on query-level LLM routing for cost reduction, a precursor to step-level collaboration approaches."
    445     },
    446     {
    447       "title": "RouteLLM: Learning to Route LLMs with Preference Data",
    448       "authors": ["Isaac Ong", "Amjad Almahairi", "Vincent Wu"],
    449       "year": 2024,
    450       "arxiv_id": "2406.18665",
    451       "relevance": "Trained router for LLM query-level allocation using preference data, representing the learned-routing approach GlimpRouter aims to surpass without training."
    452     },
    453     {
    454       "title": "Fast Inference from Transformers via Speculative Decoding",
    455       "authors": ["Yaniv Leviathan", "Matan Kalman", "Yossi Matias"],
    456       "year": 2023,
    457       "relevance": "Foundational token-level acceleration technique that GlimpRouter demonstrates is orthogonal to step-level routing."
    458     },
    459     {
    460       "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code",
    461       "authors": ["Naman Jain", "King Han", "Alex Gu"],
    462       "year": 2024,
    463       "arxiv_id": "2403.07974",
    464       "relevance": "Code generation benchmark designed to mitigate contamination, used as a primary evaluation benchmark in this study."
    465     },
    466     {
    467       "title": "GPQA: A Graduate-Level Google-Proof Q&A Benchmark",
    468       "authors": ["David Rein", "Betty Li Hou", "Asa Cooper Stickland"],
    469       "year": 2024,
    470       "relevance": "Expert-level reasoning benchmark for science questions, used to evaluate GlimpRouter on general reasoning tasks."
    471     },
    472     {
    473       "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
    474       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    475       "year": 2022,
    476       "relevance": "Foundational work establishing chain-of-thought reasoning that large reasoning models build upon."
    477     },
    478     {
    479       "title": "SpecReason: Fast and Accurate Inference-Time Compute via Speculative Reasoning",
    480       "authors": ["Rui Pan", "Yinwei Dai", "Zhihao Zhang"],
    481       "year": 2025,
    482       "arxiv_id": "2504.07891",
    483       "relevance": "Primary step-level collaboration baseline using post-hoc LLM verification, which GlimpRouter outperforms on efficiency."
    484     },
    485     {
    486       "title": "MixLLM: Dynamic Routing in Mixed Large Language Models",
    487       "authors": ["Xinyuan Wang", "Yanchi Liu", "Wei Cheng"],
    488       "year": 2025,
    489       "arxiv_id": "2502.18482",
    490       "relevance": "Dynamic model routing approach representing the broader trend of mixed-model inference optimization."
    491     },
    492     {
    493       "title": "Qwen3 Technical Report",
    494       "authors": ["An Yang", "Anfeng Li", "Baosong Yang"],
    495       "year": 2025,
    496       "arxiv_id": "2505.09388",
    497       "relevance": "Model family used as both SLM and LLM in experiments; its reasoning capabilities are central to the evaluation."
    498     },
    499     {
    500       "title": "Reward-Guided Speculative Decoding for Efficient LLM Reasoning",
    501       "authors": ["Baohao Liao", "Yuhui Xu", "Hanze Dong"],
    502       "year": 2025,
    503       "relevance": "Training-based step-level collaboration using a process reward model, representing the reward-guided routing approach baseline."
    504     }
    505   ]
    506 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs