scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (25801B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "GlimpRouter: Efficient Collaborative Inference by Glimpsing One Token of Thoughts",
      6     "authors": [
      7       "Wenhao Zeng",
      8       "Xuteng Zhang",
      9       "Yuling Shi",
     10       "Chao Hu",
     11       "Yuting Chen",
     12       "Beijun Shen",
     13       "Xiaodong Gu"
     14     ],
     15     "year": 2026,
     16     "venue": "arXiv.org",
     17     "arxiv_id": "2601.05110",
     18     "doi": "10.48550/arXiv.2601.05110"
     19   },
     20   "checklist": {
     21     "claims_and_evidence": {
     22       "abstract_claims_supported": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "The 10.7% relative accuracy improvement and 25.9% latency reduction on AIME25 are directly verifiable in Table 1 (46.67%→51.67% accuracy, i.e. +5.0pp absolute; 220s→163s latency for DeepSeek-32B config). All other abstract claims are corroborated by Figures 1–2 and the experimental tables.",
     26         "source": "haiku"
     27       },
     28       "causal_claims_justified": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "Causal claims about Hinit superiority are supported by ablation studies (Tables 2 and 6) directly comparing Hinit vs Hstep vs PPLstep at matched intervention rates, and threshold sensitivity analysis isolates the routing mechanism's contribution.",
     32         "source": "haiku"
     33       },
     34       "generalization_bounded": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The paper frames GlimpRouter as broadly applicable to LRMs but the Limitations section only bounds it to models with structural step delimiters; the abstract and conclusion make implicit claims about general applicability that exceed the tested settings.",
     38         "source": "haiku"
     39       },
     40       "alternative_explanations_discussed": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "The counterintuitive accuracy improvement over standalone LLM is attributed solely to self-correction capacity without considering alternatives such as threshold selection bias, high variance on small benchmarks (30 problems), or the LLM simply handling a higher fraction of hard problems.",
     44         "source": "haiku"
     45       },
     46       "proxy_outcome_distinction": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "Claims are made in terms of Pass@1 accuracy and wall-clock latency, which are exactly what is measured; no conflation of proxies with outcomes.",
     50         "source": "haiku"
     51       }
     52     },
     53     "limitations_and_scope": {
     54       "limitations_section_present": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "A dedicated Limitations section after the Conclusion discusses static threshold non-adaptability and delimiter dependence as specific limitations.",
     58         "source": "haiku"
     59       },
     60       "threats_to_validity_specific": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The Limitations section names specific threats: (1) a static global threshold may misfire on diverse domain difficulty distributions, and (2) double-newline structural delimiters limit applicability to LRMs without structured CoT formatting.",
     64         "source": "haiku"
     65       },
     66       "scope_boundaries_stated": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "No explicit statements bound what the results do NOT show; the limitations discuss engineering constraints but do not state that findings are limited to, e.g., math/code domains or models in the Qwen3/DeepSeek families.",
     70         "source": "haiku"
     71       }
     72     },
     73     "conflicts_of_interest": {
     74       "funding_disclosed": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "No funding source is mentioned anywhere in the paper.",
     78         "source": "haiku"
     79       },
     80       "affiliations_disclosed": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "All authors are listed as affiliated with Shanghai Jiao Tong University, clearly disclosed on the title page.",
     84         "source": "haiku"
     85       },
     86       "funder_independent_of_outcome": {
     87         "applies": false,
     88         "answer": false,
     89         "justification": "No funder is disclosed, so independence cannot be assessed.",
     90         "source": "haiku"
     91       },
     92       "financial_interests_declared": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "No competing interests or financial disclosure statement appears in the paper.",
     96         "source": "haiku"
     97       }
     98     },
     99     "scope_and_framing": {
    100       "key_terms_defined": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Key terms including 'collaborative inference,' 'initial token entropy (Hinit),' 'SLM/LLM,' 'intervention rate,' and 'step-level difficulty' are formally defined in Sections 2–3.",
    104         "source": "haiku"
    105       },
    106       "intended_contribution_clear": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Contributions are explicitly enumerated at the end of the Introduction: the Hinit analysis, the GlimpRouter framework, and empirical validation demonstrating latency-accuracy tradeoffs.",
    110         "source": "haiku"
    111       },
    112       "engagement_with_prior_work": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Section 5 (Related Work) situates GlimpRouter relative to token-level (speculative decoding), step-level (SpecCoT, SpecReason, RSD), and query-level methods, explicitly explaining how each differs mechanistically.",
    116         "source": "haiku"
    117       }
    118     }
    119   },
    120   "type_checklist": {
    121     "empirical": {
    122       "artifacts": {
    123         "code_released": {
    124           "applies": true,
    125           "answer": false,
    126           "justification": "No code repository is linked or mentioned in the paper.",
    127           "source": "haiku"
    128         },
    129         "data_released": {
    130           "applies": true,
    131           "answer": true,
    132           "justification": "All benchmarks (AIME24, AIME25, GPQA-Diamond, LiveCodeBench v5/v6) are standard, publicly available datasets used unmodified.",
    133           "source": "haiku"
    134         },
    135         "environment_specified": {
    136           "applies": true,
    137           "answer": false,
    138           "justification": "Hardware (NVIDIA A100-80GB) and inference engine (vLLM) are named but no requirements file, container, or package version list is provided.",
    139           "source": "haiku"
    140         },
    141         "reproduction_instructions": {
    142           "applies": true,
    143           "answer": false,
    144           "justification": "Algorithm 1 provides pseudocode for the method but no step-by-step instructions for reproducing experiments, including model setup, evaluation scripts, or figure generation.",
    145           "source": "haiku"
    146         }
    147       },
    148       "statistical_methodology": {
    149         "confidence_intervals_or_error_bars": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "Results are averaged over 4 runs but no confidence intervals, standard deviations, or error bars are reported for any metric in any table.",
    153           "source": "haiku"
    154         },
    155         "significance_tests": {
    156           "applies": true,
    157           "answer": false,
    158           "justification": "No statistical significance tests are performed despite numerous comparative claims across baselines and benchmarks.",
    159           "source": "haiku"
    160         },
    161         "effect_sizes_reported": {
    162           "applies": true,
    163           "answer": true,
    164           "justification": "Effect sizes are reported as percentage improvements with baseline context throughout (e.g., '10.7% improvement in accuracy' from 46.67% to 51.67%, '25.9% reduction in latency' from 220s to 163s).",
    165           "source": "haiku"
    166         },
    167         "sample_size_justified": {
    168           "applies": true,
    169           "answer": false,
    170           "justification": "AIME24/25 each have 30 problems; the paper does not discuss whether these benchmark sizes provide sufficient statistical power for the magnitude of differences being claimed.",
    171           "source": "haiku"
    172         },
    173         "variance_reported": {
    174           "applies": true,
    175           "answer": false,
    176           "justification": "Results are averaged over 4 runs but standard deviation or variance across those runs is not reported in any table or figure.",
    177           "source": "haiku"
    178         }
    179       },
    180       "evaluation_design": {
    181         "baselines_included": {
    182           "applies": true,
    183           "answer": true,
    184           "justification": "Six baselines are included: standalone SLM, standalone LLM, Random routing, RSD, SpecCoT, and SpecReason.",
    185           "source": "haiku"
    186         },
    187         "baselines_contemporary": {
    188           "applies": true,
    189           "answer": true,
    190           "justification": "All collaborative inference baselines (RSD, SpecCoT, SpecReason) are from 2025, contemporaneous with GlimpRouter.",
    191           "source": "haiku"
    192         },
    193         "ablation_study": {
    194           "applies": true,
    195           "answer": true,
    196           "justification": "Ablation studies compare Hinit vs Hstep vs PPLstep (Tables 2 and 6) and Speculative Decoding integration (Tables 3 and 7); threshold sensitivity is analyzed in Figure 4 and Table 5.",
    197           "source": "haiku"
    198         },
    199         "multiple_metrics": {
    200           "applies": true,
    201           "answer": true,
    202           "justification": "Both Pass@1 accuracy and end-to-end wall-clock latency are reported for all experiments, and intervention rate is reported in sensitivity tables.",
    203           "source": "haiku"
    204         },
    205         "human_evaluation": {
    206           "applies": false,
    207           "answer": false,
    208           "justification": "Human evaluation is not relevant for an automated system efficiency paper evaluated on mathematical and code benchmarks.",
    209           "source": "haiku"
    210         },
    211         "held_out_test_set": {
    212           "applies": true,
    213           "answer": true,
    214           "justification": "Standard competition and academic benchmarks (AIME, LiveCodeBench, GPQA-Diamond) are used as held-out test sets with no training on evaluation data.",
    215           "source": "haiku"
    216         },
    217         "per_category_breakdown": {
    218           "applies": true,
    219           "answer": true,
    220           "justification": "Results are broken down across benchmark types (math: AIME24/25, general reasoning: GPQA, code: LCBv5/v6) and across multiple model configurations (Tables 1 and 4).",
    221           "source": "haiku"
    222         },
    223         "failure_cases_discussed": {
    224           "applies": true,
    225           "answer": false,
    226           "justification": "Appendix F case studies show only successful routing and self-correction examples; no failure cases where GlimpRouter misroutes or degrades performance are presented.",
    227           "source": "haiku"
    228         },
    229         "negative_results_reported": {
    230           "applies": true,
    231           "answer": false,
    232           "justification": "No negative results for GlimpRouter are reported; configurations where GlimpRouter underperforms are not surfaced even though Table 5 shows full threshold sweeps.",
    233           "source": "haiku"
    234         }
    235       },
    236       "setup_transparency": {
    237         "model_versions_specified": {
    238           "applies": true,
    239           "answer": true,
    240           "justification": "Exact model names are specified: Qwen3-4B, Qwen3-32B, DeepSeek-R1-Distill-Qwen-32B, and DeepSeek-R1-Distill-Qwen-1.5B.",
    241           "source": "haiku"
    242         },
    243         "prompts_provided": {
    244           "applies": true,
    245           "answer": false,
    246           "justification": "No prompts or system instructions used for the LLM-as-a-Judge baseline or for benchmark evaluation are provided.",
    247           "source": "haiku"
    248         },
    249         "hyperparameters_reported": {
    250           "applies": true,
    251           "answer": true,
    252           "justification": "Temperature (0.6), top-p (0.95), max reasoning tokens (8192), and speculative decoding draft length (n=3) are reported in Implementation Details.",
    253           "source": "haiku"
    254         },
    255         "scaffolding_described": {
    256           "applies": true,
    257           "answer": true,
    258           "justification": "The GlimpRouter pipeline is described in detail across Sections 3.1–3.6, including step decomposition, probing, routing logic, prefix caching, and complete pseudocode in Algorithm 1.",
    259           "source": "haiku"
    260         },
    261         "data_preprocessing_documented": {
    262           "applies": true,
    263           "answer": false,
    264           "justification": "For the preliminary study collecting 10M+ reasoning step tokens, the collection procedure, problem sampling, and any filtering steps are not documented.",
    265           "source": "haiku"
    266         }
    267       },
    268       "data_integrity": {
    269         "raw_data_available": {
    270           "applies": true,
    271           "answer": false,
    272           "justification": "No raw model outputs, reasoning traces, or intermediate results are released.",
    273           "source": "haiku"
    274         },
    275         "data_collection_described": {
    276           "applies": true,
    277           "answer": false,
    278           "justification": "The preliminary study states it collected 'over 10 million tokens of reasoning steps' from three models on AIME and LiveCodeBench, but the number of problems sampled, number of traces per problem, and any filtering criteria are not described.",
    279           "source": "haiku"
    280         },
    281         "recruitment_methods_described": {
    282           "applies": false,
    283           "answer": false,
    284           "justification": "No human participants; standard benchmarks are used.",
    285           "source": "haiku"
    286         },
    287         "data_pipeline_documented": {
    288           "applies": true,
    289           "answer": false,
    290           "justification": "The full pipeline from benchmark loading through evaluation result aggregation is not documented; only the routing algorithm itself is formally specified.",
    291           "source": "haiku"
    292         }
    293       },
    294       "contamination": {
    295         "training_cutoff_stated": {
    296           "applies": true,
    297           "answer": false,
    298           "justification": "Training data cutoffs for Qwen3 and DeepSeek-R1 model families are not stated, despite evaluating on AIME25 problems from 2025 competitions that could overlap with training data.",
    299           "source": "haiku"
    300         },
    301         "train_test_overlap_discussed": {
    302           "applies": true,
    303           "answer": false,
    304           "justification": "No discussion of potential train-test overlap for AIME25 or GPQA-Diamond; the paper cites LiveCodeBench as 'contamination free' but does not address contamination for other benchmarks.",
    305           "source": "haiku"
    306         },
    307         "benchmark_contamination_addressed": {
    308           "applies": true,
    309           "answer": false,
    310           "justification": "AIME25 problems from 2025 competitions could be present in training data of models with 2025 cutoffs; this is unaddressed, which could inflate accuracy numbers relative to truly novel problems.",
    311           "source": "haiku"
    312         }
    313       },
    314       "human_studies": {
    315         "pre_registered": {
    316           "applies": false,
    317           "answer": false,
    318           "justification": "No human participants in this study.",
    319           "source": "haiku"
    320         },
    321         "irb_or_ethics_approval": {
    322           "applies": false,
    323           "answer": false,
    324           "justification": "No human participants in this study.",
    325           "source": "haiku"
    326         },
    327         "demographics_reported": {
    328           "applies": false,
    329           "answer": false,
    330           "justification": "No human participants in this study.",
    331           "source": "haiku"
    332         },
    333         "inclusion_exclusion_criteria": {
    334           "applies": false,
    335           "answer": false,
    336           "justification": "No human participants in this study.",
    337           "source": "haiku"
    338         },
    339         "randomization_described": {
    340           "applies": false,
    341           "answer": false,
    342           "justification": "No human participants in this study.",
    343           "source": "haiku"
    344         },
    345         "blinding_described": {
    346           "applies": false,
    347           "answer": false,
    348           "justification": "No human participants in this study.",
    349           "source": "haiku"
    350         },
    351         "attrition_reported": {
    352           "applies": false,
    353           "answer": false,
    354           "justification": "No human participants in this study.",
    355           "source": "haiku"
    356         }
    357       },
    358       "cost_and_practicality": {
    359         "inference_cost_reported": {
    360           "applies": true,
    361           "answer": true,
    362           "justification": "End-to-end wall-clock latency in seconds per question is reported for all methods across all five benchmarks.",
    363           "source": "haiku"
    364         },
    365         "compute_budget_stated": {
    366           "applies": true,
    367           "answer": false,
    368           "justification": "Hardware (NVIDIA A100-80GB GPUs) is specified but total GPU-hours or compute budget for all experiments is not stated.",
    369           "source": "haiku"
    370         }
    371       }
    372     }
    373   },
    374   "claims": [
    375     {
    376       "claim": "GlimpRouter achieves 10.7% relative accuracy improvement while reducing inference latency by 25.9% compared to standalone LLM on AIME25",
    377       "evidence": "Table 1 (DeepSeek-32B config): LLM only 46.67%/220s vs GlimpRouter 51.67%/163s. Relative accuracy gain = (51.67-46.67)/46.67 = 10.71%; latency reduction = (220-163)/220 = 25.9%.",
    378       "supported": "strong"
    379     },
    380     {
    381       "claim": "Initial token entropy (Hinit) exhibits a bimodal distribution while step-wise entropy and perplexity are unimodal, making it a more discriminative routing signal",
    382       "evidence": "Figure 1 shows distribution plots for all four metrics across 10M+ reasoning step tokens; Hinit displays a clearly distinct bimodal, heavy-tailed distribution while PPLstep and Hstep are narrow and unimodal.",
    383       "supported": "strong"
    384     },
    385     {
    386       "claim": "Hinit is monotonically negatively correlated with small-large model output alignment, validating it as a routing indicator",
    387       "evidence": "Figure 2 shows strictly monotonic negative correlation between Hinit intervals and BLEU-4/SBERT alignment between SLM and LLM outputs across the full entropy range.",
    388       "supported": "strong"
    389     },
    390     {
    391       "claim": "GlimpRouter outperforms Hstep and PPLstep routing variants in both accuracy and latency",
    392       "evidence": "Tables 2 and 6 show GlimpRouter (Hinit) outperforms step-wise entropy and perplexity variants by 4–10pp in accuracy with consistently lower latency across four benchmarks.",
    393       "supported": "strong"
    394     },
    395     {
    396       "claim": "GlimpRouter establishes a superior Pareto frontier compared to SpecReason across all intervention rate thresholds",
    397       "evidence": "Figure 4 and Table 5 show GlimpRouter's accuracy-latency curve dominates SpecReason at all intervention rates on AIME24, AIME25, LCBv5, LCBv6.",
    398       "supported": "strong"
    399     },
    400     {
    401       "claim": "GlimpRouter is orthogonal to Speculative Decoding and their combination achieves compound speedups",
    402       "evidence": "Tables 3 and 7 show GlimpRouter+SpecDec achieves the lowest latency across all benchmarks (e.g., 130s on AIME25 vs 163s GlimpRouter alone, 149s SpecDec alone).",
    403       "supported": "strong"
    404     },
    405     {
    406       "claim": "Accuracy improvement over standalone LLM results from implicit self-correction when LLM is invoked at high-entropy steps",
    407       "evidence": "Only two qualitative case studies in Appendix F support this attribution; no quantitative measurement of correction frequency or counterfactual comparison is provided.",
    408       "supported": "weak"
    409     }
    410   ],
    411   "methodology_tags": [
    412     "benchmark-eval",
    413     "empirical"
    414   ],
    415   "key_findings": "GlimpRouter proposes using initial token entropy (Hinit) as a training-free routing signal for step-wise collaborative inference, exploiting the observation that Hinit exhibits a bimodal distribution that separates routine reasoning steps from cognitive bifurcation points, unlike full-step entropy metrics that suffer from signal dilution. Across five benchmarks spanning mathematical reasoning, general reasoning, and code generation, GlimpRouter consistently achieves superior accuracy-latency Pareto frontiers compared to SpecCoT, SpecReason, and RSD baselines. A counterintuitive finding is that GlimpRouter can exceed standalone large model accuracy (e.g., +10.7% relative on AIME25) while simultaneously reducing latency by 25.9%, attributed qualitatively to the large model's self-correction capacity when invoked at high-uncertainty steps. The step-level routing granularity is shown to be orthogonal to token-level Speculative Decoding, enabling compound efficiency gains when combined.",
    416   "red_flags": [
    417     {
    418       "flag": "No variance reported despite small benchmark sizes",
    419       "detail": "Results averaged over 4 runs with no standard deviation or confidence intervals. AIME24 and AIME25 each have only 30 problems, making the 1–3pp accuracy differences between methods potentially within noise."
    420     },
    421     {
    422       "flag": "No code released",
    423       "detail": "No code repository is provided despite the training-free nature of the method, severely limiting reproducibility."
    424     },
    425     {
    426       "flag": "Contamination unaddressed for AIME25",
    427       "detail": "AIME25 problems from 2025 competitions may be present in training data of Qwen3 and DeepSeek-R1 models; no training cutoffs are stated and no contamination analysis is performed."
    428     },
    429     {
    430       "flag": "Optimal threshold selection methodology is post-hoc",
    431       "detail": "Main results are reported at 'the configuration that achieves the optimal balance' without a held-out validation set or pre-specified selection criterion, risking overfitting the reported results to the test benchmarks."
    432     },
    433     {
    434       "flag": "Self-correction attribution unsupported quantitatively",
    435       "detail": "The counterintuitive accuracy gain over standalone LLM is a key claim but is supported only by two cherry-picked qualitative case studies with no measurement of how often self-correction occurs or contributes to accuracy."
    436     },
    437     {
    438       "flag": "Preliminary study data collection underdescribed",
    439       "detail": "The collection of 10M+ reasoning step tokens used in Section 2 lacks documentation of sampling procedure, number of traces per problem, or filtering criteria."
    440     }
    441   ],
    442   "cited_papers": [
    443     {
    444       "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
    445       "relevance": "Foundational LRM paper; primary model family evaluated and source of the 'Aha Moment' phenomenon motivating the initial token entropy hypothesis."
    446     },
    447     {
    448       "title": "SpecReason: Fast and Accurate Inference-Time Compute via Speculative Reasoning",
    449       "relevance": "Primary step-level collaborative inference baseline; post-hoc verification approach contrasted mechanistically with GlimpRouter's proactive routing."
    450     },
    451     {
    452       "title": "SpecCoT: Accelerating Chain-of-Thought Reasoning through Speculative Exploration",
    453       "relevance": "Multi-path selection collaborative inference baseline; compared in all main experiments."
    454     },
    455     {
    456       "title": "Reward-Guided Speculative Decoding for Efficient LLM Reasoning",
    457       "relevance": "Training-based reward-guided collaborative inference baseline (RSD); contrasted as requiring a trained PRM unlike GlimpRouter's training-free approach."
    458     },
    459     {
    460       "title": "Fast Inference from Transformers via Speculative Decoding",
    461       "relevance": "Token-level acceleration technique shown to be orthogonal to and composable with GlimpRouter for compound speedups."
    462     },
    463     {
    464       "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code",
    465       "relevance": "Primary code generation benchmark used in all experiments; cited for contamination-free design."
    466     },
    467     {
    468       "title": "GPQA: A Graduate-Level Google-Proof Q&A Benchmark",
    469       "relevance": "General reasoning benchmark (GPQA-Diamond subset) used to evaluate GlimpRouter on non-mathematical domains."
    470     },
    471     {
    472       "title": "Efficient Memory Management for Large Language Model Serving with PagedAttention",
    473       "relevance": "vLLM inference engine used for all experiments; prefix caching mechanism leveraged for efficient model switching in GlimpRouter."
    474     },
    475     {
    476       "title": "Qwen3 Technical Report",
    477       "relevance": "Primary model family evaluated; Qwen3-4B and Qwen3-32B used as SLM and LLM respectively in main experiments."
    478     }
    479   ],
    480   "engagement_factors": {
    481     "practical_relevance": {
    482       "score": 3,
    483       "justification": "Directly addresses LRM inference cost, a major deployment bottleneck; training-free design and composability with existing engines (vLLM) means practitioners can adopt it without retraining."
    484     },
    485     "surprise_contrarian": {
    486       "score": 2,
    487       "justification": "The counterintuitive finding that collaborative inference exceeds standalone large model accuracy by 10.7% relative (46.67%→51.67% on AIME25) challenges the conventional framing that collaboration always trades accuracy for efficiency."
    488     },
    489     "fear_safety": {
    490       "score": 0,
    491       "justification": "No AI safety or risk concerns raised."
    492     },
    493     "drama_conflict": {
    494       "score": 0,
    495       "justification": "No controversy or conflict angle."
    496     },
    497     "demo_ability": {
    498       "score": 2,
    499       "justification": "Algorithm is clearly specified with pseudocode and the method is training-free, making reimplementation feasible, but no code is released to lower the barrier."
    500     },
    501     "brand_recognition": {
    502       "score": 1,
    503       "justification": "Shanghai Jiao Tong University affiliation without industry lab partnership; no famous brand association."
    504     }
    505   },
    506   "hn_data": {
    507     "threads": [],
    508     "top_points": 0,
    509     "total_points": 0,
    510     "total_comments": 0
    511   }
    512 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs