scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (19861B)
      1 {
      2   "paper": {
      3     "title": "Confidence-Driven Multi-Scale Model Selection for Cost-Efficient Inference",
      4     "authors": ["Bo-Wei Chen", "Chung-Chi Chen", "An-Zi Yen"],
      5     "year": 2026,
      6     "venue": "",
      7     "arxiv_id": "",
      8     "doi": ""
      9   },
     10   "checklist": {
     11     "artifacts": {
     12       "code_released": {
     13         "applies": true,
     14         "answer": true,
     15         "justification": "GitHub repository URL provided: https://github.com/NYCU-NLP-Lab/ConfDrivenInference (footnote 1, Section 2)."
     16       },
     17       "data_released": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "Uses publicly available benchmarks: MMLU, GPQA, and PopQA. No proprietary data was collected."
     21       },
     22       "environment_specified": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "Appendix B specifies 4 NVIDIA RTX A6000 GPUs with 48GB VRAM, specific HuggingFace model identifiers, and generation parameters (do_sample=False, top_p=0.9, temperature=1.0)."
     26       },
     27       "reproduction_instructions": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link is given but no README or reproduction guide is described."
     31       }
     32     },
     33     "statistical_methodology": {
     34       "confidence_intervals_or_error_bars": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "All results are reported as point estimates without confidence intervals or error bars."
     38       },
     39       "significance_tests": {
     40         "applies": true,
     41         "answer": true,
     42         "justification": "McNemar's test is used for key comparisons (Appendix E, Table 9), with p-values reported for multiple configurations."
     43       },
     44       "effect_sizes_reported": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "Performance drops and cost reductions are reported with baseline context (e.g., '83.22% vs. 83.57%' with '36% reduction in cost'), providing magnitude of effects."
     48       },
     49       "sample_size_justified": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "Neither a sample-size justification nor a power analysis is provided. MMLU has ~14k questions, but there is no discussion of whether this is sufficient for the claims made."
     53       },
     54       "variance_reported": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "Appendix B explicitly states 'All results are based on a single run per model configuration without repeated sampling.' No variance or standard deviation reported."
     58       }
     59     },
     60     "evaluation_design": {
     61       "baselines_included": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "Individual model performance (3B, 8B, 70B alone) serves as baselines, and comparisons are made against using the largest model directly."
     65       },
     66       "baselines_contemporary": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "No comparison against other routing/cascade methods like RouteLLM (Ong et al., 2024), Eagle (Zhao et al., 2024), or Hybrid LLM (Ding et al., 2024), which are discussed in related work but not compared experimentally."
     70       },
     71       "ablation_study": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Section 4.1 presents ablation of P(IK) component (Table 5), and Appendix I provides sensitivity analysis of P(T) threshold (Table 15)."
     75       },
     76       "multiple_metrics": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Reports accuracy, Macro-Precision, Macro-Recall, Macro-F1 (Table 2), computational cost reduction, end-to-end time, and performance drop."
     80       },
     81       "human_evaluation": {
     82         "applies": false,
     83         "answer": false,
     84         "justification": "This is a benchmark evaluation paper measuring automated metrics on multiple-choice and QA tasks; human evaluation is not relevant to the claims."
     85       },
     86       "held_out_test_set": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "For P(IK) classifier: '80% of the dataset, with 10% used for validation and 10% for testing' (Section 2.2). GPQA serves as an OOD test set."
     90       },
     91       "per_category_breakdown": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "MMLU has 57 subjects but no per-subject breakdown is provided. Only aggregate results are reported."
     95       },
     96       "failure_cases_discussed": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Section 4.2 discusses failure on OOD (GPQA) where cost savings are minimal. Section 3.2 discusses error accumulation in longer chains. Section 4.1 discusses performance degradation without P(IK)."
    100       },
    101       "negative_results_reported": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Reports that longer transfer chains (3B->8B->70B) lead to 'more pronounced performance drop' and that OOD generalization to GPQA yields only ~4-5% cost savings vs 36% on MMLU."
    105       }
    106     },
    107     "claims_and_evidence": {
    108       "abstract_claims_supported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Abstract claims of 20-40% cost reduction and ~60% token reduction for GPT-4o are supported by Tables 1 and 4 respectively."
    112       },
    113       "causal_claims_justified": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Causal claims are primarily from ablation studies (removing P(IK)) which use controlled single-variable manipulation (Table 5). The paper frames improvements as resulting from the confidence-driven strategy and supports this with ablations."
    117       },
    118       "generalization_bounded": {
    119         "applies": true,
    120         "answer": false,
    121         "justification": "Abstract claims 'confidence-based model selection to enhance real-world LLM deployment' but experiments are limited to MMLU (multiple-choice NLU), GPQA, and PopQA. The title suggests broad applicability not supported by the narrow evaluation scope."
    122       },
    123       "alternative_explanations_discussed": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "No discussion of alternative explanations for results. For example, cost savings could be partly due to MMLU's question difficulty distribution rather than the confidence mechanism itself. No threats-to-validity section addresses this."
    127       }
    128     },
    129     "setup_transparency": {
    130       "model_versions_specified": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "Appendix B lists exact HuggingFace model identifiers: meta-llama/Llama-3.2-3B-Instruct, meta-llama/Meta-Llama-3.1-8B-Instruct, meta-llama/Llama-3.3-70B-Instruct, Qwen/Qwen3-4B, Qwen/Qwen3-8B, Qwen/Qwen3-32B."
    134       },
    135       "prompts_provided": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "Table 7 (Appendix C) provides the full prompt templates for both multiple-choice and open-ended questions, with the fill values being the dataset's subject, question, and choices."
    139       },
    140       "hyperparameters_reported": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "Appendix B reports do_sample=False, top_p=0.9, temperature=1.0. P(T) threshold set to 0.9 (Section 2.2). P(IK) classifier uses 80/10/10 split with hidden states from 24th transformer layer."
    144       },
    145       "scaffolding_described": {
    146         "applies": false,
    147         "answer": false,
    148         "justification": "No agentic scaffolding is used. The method is a cascade/routing mechanism, not an agent."
    149       },
    150       "data_preprocessing_documented": {
    151         "applies": true,
    152         "answer": false,
    153         "justification": "No description of how MMLU, GPQA, or PopQA data was preprocessed. The train/val/test split for P(IK) is mentioned but selection criteria and any filtering are not documented."
    154       }
    155     },
    156     "limitations_and_scope": {
    157       "limitations_section_present": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "A dedicated 'Limitations' section follows the Conclusions, discussing domain generalization, task coverage, and latency concerns."
    161       },
    162       "threats_to_validity_specific": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Limitations section discusses specific issues: P(IK) classifier may degrade in different domains, experiments primarily focus on NLU tasks (MMLU/GPQA), and multi-model escalation may introduce latency."
    166       },
    167       "scope_boundaries_stated": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Limitations explicitly state that applicability to 'generative language modeling, open-domain QA, or real-time conversational AI, remains an open challenge' and that results are primarily on NLU tasks."
    171       }
    172     },
    173     "data_integrity": {
    174       "raw_data_available": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "No raw experimental data (per-query predictions, confidence scores, routing decisions) is released. Only aggregate results in tables."
    178       },
    179       "data_collection_described": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Uses established public benchmarks (MMLU, GPQA, PopQA) with citations. P(IK) training data comes from model outputs on these benchmarks with 80/10/10 split."
    183       },
    184       "recruitment_methods_described": {
    185         "applies": false,
    186         "answer": false,
    187         "justification": "No human participants. All data comes from standard public benchmarks."
    188       },
    189       "data_pipeline_documented": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "The pipeline from raw benchmark data to final results is not fully documented. For example, how P(IK) training labels were generated, how queries were formatted, and how PopQA grounding API verification worked are insufficiently described."
    193       }
    194     },
    195     "conflicts_of_interest": {
    196       "funding_disclosed": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Acknowledgments section lists NSTC Taiwan grant, CITI Academia Sinica project, and AIST policy-based budget project."
    200       },
    201       "affiliations_disclosed": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "Author affiliations are listed: NYCU Taiwan and AIST Japan. No evaluated products are from these institutions."
    205       },
    206       "funder_independent_of_outcome": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "Funders are government agencies (NSTC Taiwan, AIST Japan) and academic institutions (Academia Sinica) with no financial stake in the results."
    210       },
    211       "financial_interests_declared": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No competing interests statement is provided in the paper."
    215       }
    216     },
    217     "contamination": {
    218       "training_cutoff_stated": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No training data cutoff dates are stated for any of the models used (LLaMA, Qwen, GPT-4o). MMLU was published in 2020 and is likely in training data."
    222       },
    223       "train_test_overlap_discussed": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No discussion of whether MMLU or GPQA questions appeared in model training data, despite MMLU being a widely-used benchmark likely contaminated in newer models."
    227       },
    228       "benchmark_contamination_addressed": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "MMLU (2020) predates all models used. No contamination analysis is provided despite this being a known concern for MMLU evaluation."
    232       }
    233     },
    234     "human_studies": {
    235       "pre_registered": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "No human participants in this study."
    239       },
    240       "irb_or_ethics_approval": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants in this study."
    244       },
    245       "demographics_reported": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants in this study."
    249       },
    250       "inclusion_exclusion_criteria": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in this study."
    254       },
    255       "randomization_described": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in this study."
    259       },
    260       "blinding_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in this study."
    264       },
    265       "attrition_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in this study."
    269       }
    270     },
    271     "cost_and_practicality": {
    272       "inference_cost_reported": {
    273         "applies": true,
    274         "answer": true,
    275         "justification": "Appendix G (Tables 12-13) provides detailed token pricing and estimated USD costs for all configurations. GFLOPs computation costs are reported in Section 3.1 and Table 1."
    276       },
    277       "compute_budget_stated": {
    278         "applies": true,
    279         "answer": true,
    280         "justification": "Hardware specified (4x NVIDIA RTX A6000). End-to-end time reported in Table 1. Total costs in USD estimated in Table 13. GFLOPs calculated for each configuration."
    281       }
    282     }
    283   },
    284   "claims": [
    285     {
    286       "claim": "The confidence-driven strategy achieves accuracy comparable to the largest model while reducing computational costs by 20% to 40%.",
    287       "evidence": "Table 1: 8B->70B achieves 83.22% vs 70B's 83.57% with 36.46% cost reduction. McNemar's test p=0.4048 shows no significant difference (Appendix E).",
    288       "supported": "strong"
    289     },
    290     {
    291       "claim": "When applied to GPT-4o, the method reduces token usage by approximately 60%.",
    292       "evidence": "Table 4: 70B->GPT-4o reduces tokens from 36,225 to 14,505 (59.96% reduction) while slightly improving accuracy (86.85% vs 86.43%).",
    293       "supported": "strong"
    294     },
    295     {
    296       "claim": "P(IK) classifier improves performance stability in the routing framework.",
    297       "evidence": "Table 5: With P(IK), 8B->70B drops only 0.35 percentage points; without P(IK), it drops 1.26 percentage points. Sensitivity analysis (Table 15) shows P(IK) stabilizes performance across threshold values.",
    298       "supported": "strong"
    299     },
    300     {
    301       "claim": "The method remains effective in out-of-distribution settings (GPQA).",
    302       "evidence": "Table 6: On GPQA, 8B->70B achieves 51.79% vs 52.23% with only 3.93% cost reduction. The cost savings are minimal compared to in-distribution MMLU.",
    303       "supported": "weak"
    304     },
    305     {
    306       "claim": "The method generalizes across model families (LLaMA and Qwen).",
    307       "evidence": "Table 3: Qwen 8B->32B achieves 80.00% vs 32B's 79.51% with 33.18% cost reduction, demonstrating cross-family effectiveness.",
    308       "supported": "moderate"
    309     }
    310   ],
    311   "methodology_tags": ["benchmark-eval"],
    312   "key_findings": "The paper proposes a confidence-driven multi-scale model selection strategy combining token probability P(T) and a trained P(IK) classifier to route queries from smaller to larger LLMs. On MMLU, the 8B->70B cascade matches the 70B model's accuracy (83.22% vs 83.57%, p=0.4048) while reducing compute by 36%. When cascading to GPT-4o API, the 70B->GPT-4o configuration reduces token usage by ~60% with no accuracy loss. The approach generalizes to Qwen models but shows limited cost savings on the harder OOD benchmark GPQA (~4% vs ~36% on MMLU).",
    313   "red_flags": [
    314     {
    315       "flag": "No comparison with competing routing methods",
    316       "detail": "RouteLLM, Eagle, Hybrid LLM, and SYMBOLIC-MOE are discussed in related work but none are compared against experimentally, making it impossible to assess whether this method improves over existing approaches."
    317     },
    318     {
    319       "flag": "Single-run results",
    320       "detail": "All results are explicitly from a single run per configuration (Appendix B). No variance or standard deviation is reported, making result stability unknown."
    321     },
    322     {
    323       "flag": "MMLU contamination risk unaddressed",
    324       "detail": "MMLU was published in 2020 and is widely known to be contaminated in modern LLMs. No discussion of how contamination might affect the routing strategy's apparent effectiveness."
    325     },
    326     {
    327       "flag": "Weak OOD claim",
    328       "detail": "The abstract and conclusion claim effectiveness in 'both in-distribution and out-of-distribution settings,' but GPQA results show only 3.93-5.11% cost savings (vs 36-44% on MMLU), making the OOD claim overstated."
    329     }
    330   ],
    331   "cited_papers": [
    332     {
    333       "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance",
    334       "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"],
    335       "year": 2023,
    336       "arxiv_id": "2305.05176",
    337       "relevance": "Foundational work on LLM cascading with trained scoring functions for cost reduction."
    338     },
    339     {
    340       "title": "RouteLLM: Learning to Route LLMs from Preference Data",
    341       "authors": ["Isaac Ong", "Amjad Almahairi", "Vincent Wu"],
    342       "year": 2024,
    343       "relevance": "Competing routing approach using preference-based scoring for multi-LLM inference."
    344     },
    345     {
    346       "title": "Hybrid LLM: Cost-Efficient and Quality-Aware Query Routing",
    347       "authors": ["Dujian Ding", "Ankur Mallick", "Chi Wang"],
    348       "year": 2024,
    349       "relevance": "Competing approach training a query router to predict response quality gaps between small and large models."
    350     },
    351     {
    352       "title": "Eagle: Efficient Training-Free Router for Multi-LLM Inference",
    353       "authors": ["Zesen Zhao", "Shuowei Jin", "Z Morley Mao"],
    354       "year": 2024,
    355       "arxiv_id": "2409.15518",
    356       "relevance": "Training-free routing method for multi-LLM inference based on benchmark performance."
    357     },
    358     {
    359       "title": "Language Models (Mostly) Know What They Know",
    360       "authors": ["Saurav Kadavath"],
    361       "year": 2022,
    362       "arxiv_id": "2207.05221",
    363       "relevance": "Foundational work on P(True) and P(I Know) confidence calibration that this paper builds upon."
    364     },
    365     {
    366       "title": "Quantifying Uncertainty in Answers from Any Language Model and Enhancing Their Trustworthiness",
    367       "authors": ["Jiuhai Chen", "Jonas Mueller"],
    368       "year": 2024,
    369       "relevance": "LLM uncertainty quantification methods relevant to confidence-driven routing."
    370     },
    371     {
    372       "title": "Semantic Uncertainty: Linguistic Invariances for Uncertainty Estimation in Natural Language Generation",
    373       "authors": ["Lorenz Kuhn", "Yarin Gal", "Sebastian Farquhar"],
    374       "year": 2023,
    375       "arxiv_id": "2302.09664",
    376       "relevance": "Semantic entropy approach to LLM uncertainty estimation."
    377     },
    378     {
    379       "title": "Scaling Laws for Neural Language Models",
    380       "authors": ["Jared Kaplan"],
    381       "year": 2020,
    382       "arxiv_id": "2001.08361",
    383       "relevance": "Scaling laws used to estimate computational cost of model forward passes in this paper."
    384     },
    385     {
    386       "title": "Confident or Seek Stronger: Exploring Uncertainty-Based On-Device LLM Routing from Benchmarking to Generalization",
    387       "authors": ["Yu-Neng Chuang"],
    388       "year": 2025,
    389       "arxiv_id": "2502.04428",
    390       "relevance": "Concurrent work on uncertainty-based LLM routing for on-device deployment."
    391     },
    392     {
    393       "title": "Symbolic Mixture-of-Experts: Adaptive Skill-Based Routing for Scalable Heterogeneous Reasoning",
    394       "authors": ["Justin Chih-Yao Chen"],
    395       "year": 2025,
    396       "arxiv_id": "2503.05641",
    397       "relevance": "Symbolic skill-based routing approach for multi-LLM inference without gradient-based optimization."
    398     }
    399   ]
    400 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs