scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28819B)
      1 {
      2   "paper": {
      3     "title": "MentorCollab: Selective Large-to-Small Inference-Time Guidance for Efficient Reasoning",
      4     "authors": [
      5       "Haojin Wang",
      6       "Yike Wang",
      7       "Shangbin Feng",
      8       "Hannaneh Hajishirzi",
      9       "Yulia Tsvetkov"
     10     ],
     11     "year": 2026,
     12     "venue": "arXiv",
     13     "arxiv_id": "2602.05307"
     14   },
     15   "scan_version": 2,
     16   "active_modules": ["experimental_rigor", "data_leakage"],
     17   "methodology_tags": ["benchmark-eval"],
     18   "key_findings": "MentorCollab proposes selective inference-time collaboration where a large reasoning model sparsely guides a small generator model through short lookahead segments verified by a lightweight verifier. Across 15 generator-mentor pairs and 3 domains (math, general knowledge, commonsense), the method improves accuracy in 12/15 settings with average gains of 3.0% and up to 8.0%, while using only 18.4% mentor-generated tokens on average. Ablation analysis shows short segments (4-8 tokens) and moderate probing frequency suffice, and the verifier is critical for strong generators but can be overly conservative for weak ones.",
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "A GitHub URL is provided in the abstract footnote: 'Code available at github.com/haojinw0027/Mentorcollab.'"
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "All evaluation datasets are publicly available standard benchmarks: MATH (Hendrycks et al., 2021), SuperGPQA (Team, 2025b), and Com2-hard-Intervention (Xiong et al., 2025)."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No requirements.txt, Dockerfile, conda environment, or detailed environment setup is provided in the paper."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No step-by-step reproduction instructions are included in the paper. A code repository is linked but the paper itself contains no reproduction guide."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "All results in Table 1 and throughout the paper are reported as point estimates (e.g., '43.60%', '46.60%') with no confidence intervals, error bars, or ± notation."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper claims MentorCollab 'improves accuracy' and 'outperforms' baselines based solely on comparing raw accuracy numbers. No statistical significance tests (p-values, t-tests, bootstrap tests, etc.) are reported."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "The paper reports absolute percentage improvements ('average gains of 3.0% and up to 8.0%') alongside baseline scores in Table 1, providing full context (e.g., generator 43.60% → MentorCollab-MLP 46.60%). The reader can assess the magnitude of improvements."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No justification is given for sample sizes. The paper uses 500 random samples from SuperGPQA and 241 from Com2-hard-Intervention without explaining why these sizes are sufficient."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No variance, standard deviation, or spread measures are reported. Although greedy decoding is deterministic, the Bernoulli sampling of decision positions introduces randomness, yet only single-run results are shown."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Five baselines are compared: Average Decoding, Nudging (Fei et al., 2025), CoSD (Wang et al., 2025b), R-Stitch (Chen et al., 2025), and Co-LLM (Shen et al., 2024). Single-model performance (generator-only and mentor-only) is also reported."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "All baselines are recent: Nudging (2025), CoSD (2025), R-Stitch (2025), and Co-LLM (2024). These represent the current state of the art in model collaboration methods."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Multiple ablations are conducted: decision proportion ρ (§4.1, Fig. 4), verifier necessity (§4.2, Fig. 5), and segment size L (§4.3, Fig. 6). Each isolates a single component's contribution."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "Only accuracy is reported as the evaluation metric across all three domains. No secondary metrics (e.g., reasoning trace quality, inference latency, token efficiency as a formal metric) are used alongside accuracy."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No human evaluation is conducted. All evaluation is automated via answer matching against benchmark ground truth. Human evaluation of reasoning quality or correctness of reasoning traces could have strengthened the claims about 'concise and correct reasoning.'"
     94       },
     95       "held_out_test_set": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are reported on held-out test sets: MATH official test split, 500 test examples from SuperGPQA, and 241 test instances from Com2-hard-Intervention. The MLP verifier is trained on separate training splits (MATH training split, 4000 SuperGPQA training examples)."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Table 1 provides detailed per-domain, per-generator, and per-mentor breakdowns across all 15 model pair combinations and 3 domains. Figure 8 provides per-generator averages."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper discusses cases where MentorCollab does not improve (3/15 settings), where the verifier causes overly conservative behavior for weak generators (§4.2), and where increasing decision proportion leads to degradation (§4.1). Case studies in Tables 3-5 show both successes and the generator's failures."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Several negative results are reported: 3/15 settings show no improvement; high decision proportions degrade performance (Fig. 4); removing the verifier helps weak generators at low intervention rates (Fig. 5); the method does not help strong generators on commonsense tasks (§A.1)."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Abstract claims are supported: '12 settings' improvement matches Table 1 (verified across 15 pairs); 'average gains of 3.0% and up to 8.0%' is consistent with results; '18.4% tokens generated by the expensive mentor model' is supported by Fig. 7."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Causal claims like 'the verifier is crucial for achieving performance gains' and 'MentorCollab improves accuracy' are supported by controlled ablation studies (removing verifier in §4.2, varying single parameters in §4.1 and §4.3), which constitute adequate single-variable manipulation."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The title claims 'Efficient Reasoning' broadly. The abstract says 'selective inference-time guidance restores large-model reasoning ability' without bounding to the tested domains. While 15 model pairs and 3 domains are tested, the claims extend to 'reasoning systems' and 'efficient reasoning' generally without explicit scope limitations."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper does not substantively discuss alternative explanations for the observed improvements. For instance, the additional compute from running two models is not controlled for (only mentor token ratio is reported, not total inference cost), and the possibility that improvements come from ensembling effects rather than 'selective guidance' is not considered."
    136       },
    137       "proxy_outcome_distinction": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper measures benchmark accuracy (MATH, SuperGPQA, Com2) but frames results as 'reasoning ability' and 'efficient reasoning' throughout. The gap between benchmark accuracy and the broader claim of 'restoring large-model reasoning ability' is not acknowledged or discussed."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Specific model names with parameter counts are provided: Llama3.1-8B, Gemma3-4B-PT, Qwen3-8B-Base, Llama3.2-3B-Instruct, Qwen3-1.7B, Qwen3-14B, Qwen3-32B, DeepSeek-R1-Distilled-Llama-70B. These are open-source models with unambiguous identifiers (§3.1)."
    148       },
    149       "prompts_provided": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Full prompt templates are provided in Appendix C for all three benchmarks: 4-shot for MATH (§C.1), 5-shot for SuperGPQA (§C.2), and zero-shot for Com2-hard-Intervention (§C.3). The verification prompt for MentorCollab-FREE is also described in §2.2."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Key hyperparameters are reported: ρ=25%, L=16, max generation length 512, greedy decoding (§3.1). Baseline hyperparameters: γ=0.40 for Nudging, α=β=0.50 for CoSD, τ=0.03 for R-Stitch. MLP verifier architecture detailed in §B.2: three hidden layers {2d, d, d/2}, dropout 0.1, batch normalization, ReLU, Adam optimizer."
    158       },
    159       "scaffolding_described": {
    160         "applies": false,
    161         "answer": false,
    162         "justification": "No agentic scaffolding is used. MentorCollab is a decoding-time collaboration method operating at the token/segment level."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Data selection is described: MATH official test split, 500 randomly sampled from SuperGPQA, 241 single-answer instances from Com2-hard-Intervention (§3.1). MLP training data preparation is documented in §B.1: running MentorCollab-FREE on training split, curating samples where generator fails but FREE succeeds, and downsampling the majority class for label balance."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "A dedicated 'Limitations' section is present with three substantive points: reliance on random positions for intervention, unclear behavior when high-likelihood predictions are bypassed, and restriction to a dual-model setting."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "The limitations are specific to this study: (1) random position sampling may miss optimal intervention points, (2) high-likelihood generator predictions are sometimes incorrectly bypassed with unclear cause, (3) only single generator-mentor pairs are tested. These are study-specific, not boilerplate."
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "The paper does not explicitly state what the results do NOT show. The limitations mention future extensions (multi-mentor, better intervention criteria) but do not bound the scope of current claims. No explicit statements like 'our results apply only to X' or 'we do not claim Y.'"
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "No raw experimental outputs (model generations, verifier decisions, per-example results) are made available for independent verification. Only aggregated accuracy numbers are reported."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Data sources are clearly described: MATH test split from Hendrycks et al. (2021), 500 randomly sampled examples from SuperGPQA (Team, 2025b), and 241 single-answer instances from Com2-hard-Intervention (Xiong et al., 2025)."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "No human participants. All data comes from standard public benchmarks."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "The evaluation pipeline is documented: prompt construction (Appendix C), decoding with greedy strategy and 512-token cap, answer extraction via regex patterns (§C.2 describes cascaded extraction for SuperGPQA). MLP training pipeline is documented in §B.1 including data balancing."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Funding is disclosed in the Acknowledgements: 'DARPA SciFy program (Agreement No. HR00112520300) and NSF Grant No. IIS2142739.'"
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Author affiliations are disclosed: UIUC, University of Washington, Allen Institute for AI. Authors are not affiliated with any of the model providers being evaluated (Meta, Google, Alibaba, DeepSeek)."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "DARPA and NSF are government funding agencies with no financial interest in the performance of specific model collaboration methods or the evaluated models."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests or financial interests statement is present in the paper. Absence of disclosure is not the same as absence of conflict."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No training data cutoff dates are stated for any of the models used (Llama 3.1/3.2, Gemma 3, Qwen 3, DeepSeek-R1). This is necessary to assess whether benchmark data could be in the training sets."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "No discussion of whether any of the evaluated models may have seen MATH, SuperGPQA, or Com2-hard examples during training. MATH was published in 2021 and is widely available online."
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "MATH (2021) has been publicly available for years and could be in the training data of models released in 2024-2025. This contamination risk is not discussed."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "demographics_reported": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       },
    279       "attrition_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in this study."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "Mentor token ratio is reported: 18.4% of tokens from the mentor on average (Fig. 7). Figure 3 shows incorporated mentor token count vs. accuracy for different methods. Token-level cost breakdowns by domain are provided in Fig. 7."
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No total compute budget is stated — no GPU hours, wall-clock inference time, total API cost, or hardware specifications for running the experiments."
    295       }
    296     },
    297     "experimental_rigor": {
    298       "seed_sensitivity_reported": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The Bernoulli sampling of decision positions (B_t ~ Bernoulli(ρ)) introduces randomness into the method, yet no results across multiple random seeds are reported. Only single-run results are shown."
    302       },
    303       "number_of_runs_stated": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "The number of experimental runs is not stated. Results appear to be from single runs, but this is not explicitly confirmed."
    307       },
    308       "hyperparameter_search_budget": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The paper tests several values of ρ (10, 25, 50, 75%) and L (1, 2, 4, 8, 16, 32) in sensitivity analyses but does not report a systematic hyperparameter search budget or how the main configuration (ρ=25%, L=16) was selected."
    312       },
    313       "best_config_selection_justified": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "The main configuration (ρ=25%, L=16) is stated to 'balance inference cost and generation quality' but no validation-based selection process is described. The sensitivity analysis is post-hoc rather than a principled selection procedure."
    317       },
    318       "multiple_comparison_correction": {
    319         "applies": false,
    320         "answer": false,
    321         "justification": "No statistical significance tests are performed, so multiple comparison correction does not apply."
    322       },
    323       "self_comparison_bias_addressed": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "The authors implement all baseline methods (Average Decoding, Nudging, CoSD, R-Stitch) themselves. They use hyperparameters from the original papers, but do not acknowledge the systematic bias of authors implementing and evaluating baselines, as documented by Lucic et al. (2018)."
    327       },
    328       "compute_budget_vs_performance": {
    329         "applies": true,
    330         "answer": true,
    331         "justification": "Figure 3 directly plots incorporated mentor token count against accuracy for different methods, showing MentorCollab achieves better accuracy with comparable or fewer mentor tokens than baselines."
    332       },
    333       "benchmark_construct_validity": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "The paper uses MATH, SuperGPQA, and Com2-hard-Intervention as proxies for 'reasoning ability' without discussing whether benchmark accuracy on these tasks actually measures the reasoning capability claimed."
    337       },
    338       "scaffold_confound_addressed": {
    339         "applies": false,
    340         "answer": false,
    341         "justification": "No agentic scaffolding is involved. The collaboration method itself is the variable being tested, and all methods use the same underlying models."
    342       }
    343     },
    344     "data_leakage": {
    345       "temporal_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of temporal leakage. MATH was published in 2021 and models released in 2024-2025 were likely trained on data including MATH solutions."
    349       },
    350       "feature_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether the evaluation setup leaks answer information through the few-shot prompts or other contextual signals."
    354       },
    355       "non_independence_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No discussion of whether training and test data share structural similarities or whether benchmark problems have near-duplicates in model training data."
    359       },
    360       "leakage_detection_method": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No leakage detection or prevention method is applied (no canary strings, membership inference, temporal splits, or decontamination)."
    364       }
    365     }
    366   },
    367   "claims": [
    368     {
    369       "claim": "MentorCollab improves accuracy in 12 out of 15 generator-mentor pair settings across 3 domains, with average gains of 3.0% and up to 8.0%.",
    370       "evidence": "Table 1 shows per-pair results across MATH, SuperGPQA, and Com2-hard-Intervention for 15 generator-mentor combinations. Figure 8 shows averaged results.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "MentorCollab requires only 18.4% mentor-generated tokens on average in the final output.",
    375       "evidence": "Figure 7 shows token-level composition: ~19% non-decision tokens from mentor, ~2-4% from verifier decisions. Figure 3 compares token usage across methods.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Short lookahead segments (4-8 tokens) and moderate probing frequency are sufficient for effective collaboration.",
    380       "evidence": "Figure 6 shows segment size analysis: moderate sizes achieve similar or better accuracy than longer segments. Figure 4 shows decision proportion analysis.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "The verifier is critical for strong generators but may be overly conservative for weak generators.",
    385       "evidence": "Figure 5 shows that without the verifier, strong generators (Qwen3-8B-Base) suffer substantial degradation at high ρ, while weak generators (Gemma3-4B-PT) sometimes benefit from direct injection. Figure 9 provides additional analysis on two more generators.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "MentorCollab-MLP outperforms Co-LLM without requiring expensive collaboration-specific training.",
    390       "evidence": "Table 2 compares on 3 model pairs on MATH: MentorCollab-MLP achieves 46.60%, 46.80%, 18.00% vs Co-LLM's 25.00%, 21.80%, 4.40%.",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "Prior collaboration methods promote imitation of verbose chain-of-thought patterns rather than targeted error correction.",
    395       "evidence": "Case studies in Tables 4-5 show CoSD introducing hesitation tokens ('wait') and redundant reasoning. Qualitative analysis in §D.",
    396       "supported": "weak"
    397     }
    398   ],
    399   "red_flags": [
    400     {
    401       "flag": "No error bars or uncertainty quantification",
    402       "detail": "All results are single-run point estimates despite the method's inherent randomness from Bernoulli sampling of decision positions. The reported improvements of 1-8% could fall within run-to-run variance."
    403     },
    404     {
    405       "flag": "No contamination analysis",
    406       "detail": "MATH (2021) is widely available online and likely in the training data of 2024-2025 models. No discussion of whether model familiarity with benchmark problems confounds the comparison between collaboration methods."
    407     },
    408     {
    409       "flag": "Artificially constrained token budget favors the proposed method",
    410       "detail": "The 512-token cap causes mentor LRMs to 'frequently collapse mid-reasoning' (acknowledged in §3.2), making mentor-only baselines appear weak. MentorCollab benefits from this cap since it uses the SLM for most generation. A fairer comparison would allow mentors their natural token budget."
    411     },
    412     {
    413       "flag": "Selective reporting of averaged results",
    414       "detail": "The paper claims '12 out of 15' improvements but averages across 3 mentors per generator. On commonsense reasoning, 2/5 generators show degradation with MentorCollab. The averaging obscures that gains are inconsistent across domains."
    415     }
    416   ],
    417   "cited_papers": [
    418     {
    419       "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
    420       "authors": ["Daya Guo", "Dejian Yang"],
    421       "year": 2025,
    422       "arxiv_id": "2501.12948",
    423       "relevance": "Major large reasoning model used as mentor in experiments; demonstrates RL-based reasoning capability in LLMs."
    424     },
    425     {
    426       "title": "Nudging: Inference-Time Alignment of LLMs via Guided Decoding",
    427       "authors": ["Yu Fei", "Yasaman Razeghi", "Sameer Singh"],
    428       "year": 2025,
    429       "relevance": "Primary baseline for inference-time model collaboration; introduces mentor token injection based on generator uncertainty."
    430     },
    431     {
    432       "title": "Learning to Decode Collaboratively with Multiple Language Models",
    433       "authors": ["Zejiang Shen", "Hunter Lang", "Bailin Wang", "Yoon Kim", "David Sontag"],
    434       "year": 2024,
    435       "relevance": "Co-LLM baseline: trains a classifier for collaborative decoding between LLMs; requires compatible tokenizers."
    436     },
    437     {
    438       "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance",
    439       "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"],
    440       "year": 2024,
    441       "relevance": "Foundational work on cost-efficient LLM inference through query routing and model cascading."
    442     },
    443     {
    444       "title": "R-Stitch: Dynamic Trajectory Stitching for Efficient Reasoning",
    445       "authors": ["Zhuokun Chen", "Zeren Chen"],
    446       "year": 2025,
    447       "arxiv_id": "2507.17307",
    448       "relevance": "Baseline using distribution entropy as signal for switching between models during generation."
    449     },
    450     {
    451       "title": "Speculate, Then Collaborate: Fusing Knowledge of Language Models During Decoding",
    452       "authors": ["Ziyao Wang", "Muneeza Azmat"],
    453       "year": 2025,
    454       "relevance": "CoSD baseline: rule-based verification for model collaboration during decoding based on probability thresholds."
    455     },
    456     {
    457       "title": "RouteLLM: Learning to Route LLMs from Preference Data",
    458       "authors": ["Isaac Ong", "Amjad Almahairi"],
    459       "year": 2025,
    460       "relevance": "Query-level routing approach for cost-efficient LLM inference; contrasts with MentorCollab's token-level collaboration."
    461     },
    462     {
    463       "title": "Stop Overthinking: A Survey on Efficient Reasoning for Large Language Models",
    464       "authors": ["Yang Sui"],
    465       "year": 2025,
    466       "relevance": "Survey on efficient reasoning in LLMs; documents the overthinking problem that motivates inference-time collaboration approaches."
    467     },
    468     {
    469       "title": "Fast Inference from Transformers via Speculative Decoding",
    470       "authors": ["Yaniv Leviathan", "Matan Kalman", "Yossi Matias"],
    471       "year": 2023,
    472       "relevance": "Foundational speculative decoding method using small-large model collaboration for inference speedup."
    473     },
    474     {
    475       "title": "When One LLM Drools, Multi-LLM Collaboration Rules",
    476       "authors": ["Shangbin Feng", "Wenxuan Ding"],
    477       "year": 2025,
    478       "arxiv_id": "2502.04506",
    479       "relevance": "Comprehensive framework for multi-LLM collaboration covering text-level, logits-level, weights-level, and API-level approaches."
    480     },
    481     {
    482       "title": "Contrastive Decoding: Open-Ended Text Generation as Optimization",
    483       "authors": ["Xiang Lisa Li", "Ari Holtzman"],
    484       "year": 2023,
    485       "relevance": "Logit-level fusion method for text generation using contrasting distributions from expert and amateur models."
    486     },
    487     {
    488       "title": "R2R: Efficiently Navigating Divergent Reasoning Paths with Small-Large Model Token Routing",
    489       "authors": ["Tianyu Fu", "Yi Ge"],
    490       "year": 2025,
    491       "relevance": "Token-level routing between small and large models for efficient reasoning; related approach to inference-time model collaboration."
    492     }
    493   ]
    494 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs