scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29476B)
      1 {
      2   "paper": {
      3     "title": "Towards Resource-Efficient Multimodal Intelligence: Learned Routing among Specialized Expert Models",
      4     "authors": [
      5       "Mayank Saini",
      6       "Arit Kumar Bishwas"
      7     ],
      8     "year": 2025,
      9     "venue": "arXiv.org",
     10     "arxiv_id": "2511.06441",
     11     "doi": "10.48550/arXiv.2511.06441"
     12   },
     13   "scan_version": 2,
     14   "active_modules": ["experimental_rigor", "data_leakage"],
     15   "methodology_tags": ["benchmark-eval"],
     16   "key_findings": "The paper proposes a modular, cost-aware routing framework that dispatches queries across specialized models based on modality, complexity, and cost. On MMLU and VQA-v2, the system claims to match or exceed GPT-4 performance (88.5% vs 84.2% on MMLU, 93.2% vs 89.7% on VQA-v2) while reducing premium model usage by ~72%. Routing accuracy is reported at 92.3% across a 13-category classification task. However, key details are missing (model versions, hyperparameter values, dataset sizes) and several numerical claims are internally inconsistent.",
     17   "checklist": {
     18     "artifacts": {
     19       "code_released": {
     20         "applies": true,
     21         "answer": false,
     22         "justification": "No repository URL, GitHub link, or code archive is mentioned anywhere in the paper."
     23       },
     24       "data_released": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The evaluation uses standard public benchmarks: MMLU, GSM8K, MBPP, XSum, CNN/DailyMail, VQA-v2, and FUNSD. However, the custom 13-way routing evaluation dataset constructed via stratified sampling is not released."
     28       },
     29       "environment_specified": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No requirements.txt, Dockerfile, conda environment, or environment setup section is provided. Hardware specifications for inference are not mentioned."
     33       },
     34       "reproduction_instructions": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "No step-by-step reproduction instructions, README, or scripts are provided."
     38       }
     39     },
     40     "statistical_methodology": {
     41       "confidence_intervals_or_error_bars": {
     42         "applies": true,
     43         "answer": true,
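     44         "justification": "Table 3 reports ±1.8% for MMLU and ±2.3% for VQA-v2. Table 2 reports ±1.1% for routing accuracy. However, the paper does not label these as confidence intervals or standard deviations; per Section 4.2 ('we also provide variance to indicate stability across subsets'), they appear to reflect spread across subsets of the evaluation data rather than across repeated runs."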
     45       },
     46       "significance_tests": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "No statistical significance tests are reported. Claims like 'outperforming the Always-Premium baseline' are made by comparing point estimates without any statistical test (e.g., t-test, bootstrap)."
     50       },
     51       "effect_sizes_reported": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Results are reported with baseline context: 88.5% vs 84.2% on MMLU, 93.2% vs 89.7% on VQA-v2, cost reductions to 68-70% of baseline, and 18% latency improvement. Both absolute values and relative improvements are provided."
     55       },
     56       "sample_size_justified": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "No justification for sample sizes is provided. The paper does not state how many examples from each benchmark were used, nor why those quantities are sufficient for the claims made."
     60       },
     61       "variance_reported": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "The paper states 'reported metrics are mean values; where applicable, we also provide variance to indicate stability across subsets' (Section 4.2). The ± values represent variance across subsets of evaluation data, not variance across multiple experimental runs. No multi-run variance or standard deviation across seeds is reported."
     65       }
     66     },
     67     "evaluation_design": {
     68       "baselines_included": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "The paper compares against an 'Always-Premium' baseline where every query is handled by GPT-4 (Section 4)."
     72       },
     73       "baselines_contemporary": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "The only baseline is 'always use GPT-4.' The paper cites RouteLLM, HybridLLM, and FrugalGPT as related routing systems but does not compare against any of them experimentally. These are the most relevant contemporary baselines for a routing paper."
     77       },
     78       "ablation_study": {
     79         "applies": true,
     80         "answer": false,
     81         "justification": "The system has many components (modality classifier, complexity analyzer, intent classifier, Couplet, MoE aggregator, context agent) but no ablation study is conducted to measure the contribution of individual components."
     82       },
     83       "multiple_metrics": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Multiple metrics are reported: task accuracy (MMLU, VQA-v2), routing accuracy, precision/recall, F1, TF-IDF cosine similarity, BERT cosine similarity, latency, throughput, and relative cost."
     87       },
     88       "human_evaluation": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No human evaluation of system outputs is conducted. Human annotators were used for dataset labeling but not for evaluating the quality of the system's generated responses."
     92       },
     93       "held_out_test_set": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "The paper mentions 'τ is dynamically calibrated on a held-out validation set' (Section 3.2.1), but does not explicitly state that the reported evaluation results are on a separate test set distinct from the validation set used for threshold tuning."
     97       },
     98       "per_category_breakdown": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Table 3 breaks results down by benchmark (MMLU, VQA-v2, fine-grained routing). Table 4 breaks down by routing path (open-source vs premium). Table 5 breaks down latency by modality (text, vision). Figure 5 breaks down cost by model."
    102       },
    103       "failure_cases_discussed": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Section 4.1 discusses the main failure mode: 'The only source of routing error or ambiguity arises in the case of text queries that serve as follow-ups to previous non-text tasks... most failures arising from subtle or ambiguous textual references to prior modalities.' Follow-up detection accuracy is reported at 90%."
    107       },
    108       "negative_results_reported": {
    109         "applies": true,
    110         "answer": false,
    111         "justification": "Every reported experiment shows improvement over the baseline. No ablations that hurt performance, approaches tried and abandoned, or configurations that failed are discussed."
    112       }
    113     },
    114     "claims_and_evidence": {
    115       "abstract_claims_supported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Abstract claims of matching/exceeding Always-Premium performance are supported by Table 3 (88.5% vs 84.2% MMLU, 93.2% vs 89.7% VQA-v2). The 'over 67%' reduction in costly model reliance is supported by Table 4 (72% handled by open-source). Whether the underlying numbers are trustworthy is a separate concern."
    119       },
    120       "causal_claims_justified": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "The paper makes causal claims: 'intelligent routing can push the efficiency frontier, obtaining better results with fewer resources by leveraging specialized models' (Section 4.2). No controlled experiment isolates the causal contribution of routing vs. simply using different models. The comparison is uncontrolled (multi-model routing system vs. single-model baseline)."
    124       },
    125       "generalization_bounded": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "The title claims 'Resource-Efficient Multimodal Intelligence' and the abstract claims 'high-quality, resource-efficient AI at scale.' But the evaluation covers only MMLU, VQA-v2, and a custom routing dataset. The paper does not bound its claims to these specific benchmarks and settings."
    129       },
    130       "alternative_explanations_discussed": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "No alternative explanations for the results are discussed. For example, the surprising MMLU result (routing system beating GPT-4) is not examined — it could be due to model selection effects, benchmark contamination differences, or evaluation methodology artifacts."
    134       },
    135       "proxy_outcome_distinction": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "The paper measures accuracy on specific benchmarks and relative cost, then frames this as 'resource-efficient multimodal intelligence' and 'enterprise-grade requirements for accuracy, efficiency, and cost control' (Section 5). The gap between benchmark accuracy and real-world deployment effectiveness is not acknowledged."
    139       }
    140     },
    141     "setup_transparency": {
    142       "model_versions_specified": {
    143         "applies": true,
    144         "answer": false,
    145         "justification": "Models are referred to by marketing names only: 'GPT-4', 'Claude', 'Gemini', 'LLaMA', 'Mistral-7B', 'Qwen-VL', 'Mixtral 8×7B'. No specific version identifiers (e.g., 'gpt-4-0613') or snapshot dates are provided."
    146       },
    147       "prompts_provided": {
    148         "applies": true,
    149         "answer": false,
    150         "justification": "No actual prompts or system instructions used in experiments are provided. The intent classification and routing logic are described in natural language, but the prompts sent to LLMs for response generation are not shown."
    151       },
    152       "hyperparameters_reported": {
    153         "applies": true,
    154         "answer": false,
    155         "justification": "The paper introduces many tunable parameters (α, β, γ for complexity scoring; τ for threshold; δm, δu, δt, λc for routing weights; θs, θt, θm for context scoring) but provides no actual values for any of them. They are described as 'tunable' or 'empirically derived' without reporting the values used in experiments."
    156       },
    157       "scaffolding_described": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "The routing architecture is described extensively: modality classification (Section 3.2.1), complexity scoring equations, intent classification pipeline (Section 3.2.3), multi-agent LangGraph workflow (Section 3.2.4), MoE aggregation (Eq. 4-7), memory management (Section 3.2.5), and feedback loop (Section 3.2.6). Figures 1-4 illustrate the workflows."
    161       },
    162       "data_preprocessing_documented": {
    163         "applies": true,
    164         "answer": false,
    165         "justification": "While the paper describes the system's input preprocessing (MIME checks, modality classification), the evaluation data preparation is inadequately documented. The custom 13-way dataset was constructed via 'stratified sampling from established benchmarks... augmented with controlled generation' but the number of examples, per-category counts, and 'controlled generation' process are not described."
    166       }
    167     },
    168     "limitations_and_scope": {
    169       "limitations_section_present": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "The conclusion (Section 5) includes a limitations paragraph discussing the fixed 13-category classification scheme, orchestration complexity, and dependence on model availability. It is substantive though embedded in the conclusion rather than a standalone section."
    173       },
    174       "threats_to_validity_specific": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "The limitations are somewhat specific to this study: 'The current framework relies on a fixed 13-category classification scheme for task routing, which may not capture the full spectrum of emerging query types' and 'performance is also dependent on the quality and availability of specialized models for each domain.'"
    178       },
    179       "scope_boundaries_stated": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "The paper does not explicitly state what the results do NOT show. The limitations paragraph discusses potential issues but does not define scope boundaries (e.g., 'our results do not apply to X' or 'we did not test Y'). The broad title and conclusion claims are unbounded."
    183       }
    184     },
    185     "data_integrity": {
    186       "raw_data_available": {
    187         "applies": true,
    188         "answer": false,
    189         "justification": "No raw experimental data (model outputs, routing decisions, per-example results) is made available for verification. Only aggregate numbers are reported in tables."
    190       },
    191       "data_collection_described": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "Section 4.1 describes the evaluation dataset construction: 'balanced 13-way dataset via stratified sampling from established benchmarks... augmented with controlled generation for audio, video, and image-generation prompts.' Annotation process described: 'Labels were produced by two independent annotators under written guidelines, with disagreements resolved by consensus and GPT-4 arbitration; we measured inter-annotator agreement (Cohen's κ).'"
    195       },
    196       "recruitment_methods_described": {
    197         "applies": false,
    198         "answer": false,
    199         "justification": "No human participants in the study. Data sources are standard public benchmarks."
    200       },
    201       "data_pipeline_documented": {
    202         "applies": true,
    203         "answer": false,
    204         "justification": "The paper describes the system pipeline but not the data pipeline from raw benchmark examples to reported numbers. How many examples were drawn from each benchmark, any filtering applied, and how aggregate metrics were computed are not documented."
    205       }
    206     },
    207     "conflicts_of_interest": {
    208       "funding_disclosed": {
    209         "applies": true,
    210         "answer": false,
    211         "justification": "No funding source or acknowledgments section is present. Both authors are from PwC US, a professional services firm, but no funding disclosure is made."
    212       },
    213       "affiliations_disclosed": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "Both authors list 'PwC US' as their affiliation with institutional email addresses (mayank.s.saini@pwc.com, arit.kumar.bishwas@pwc.com)."
    217       },
    218       "funder_independent_of_outcome": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No funding information is disclosed. PwC, as a consulting firm offering AI services, could benefit commercially from demonstrating cost-efficient AI deployment frameworks. The potential conflict is not addressed."
    222       },
    223       "financial_interests_declared": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No competing interests or financial interests statement is present. PwC employees proposing a framework for 'enterprise-scale' AI deployment have potential commercial interests that go undisclosed."
    227       }
    228     },
    229     "contamination": {
    230       "training_cutoff_stated": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No training data cutoff dates are stated for any of the models used (GPT-4, LLaMA, Mistral, Qwen, etc.), despite evaluating on public benchmarks like MMLU and GSM8K."
    234       },
    235       "train_test_overlap_discussed": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "No discussion of potential train/test overlap. MMLU (2020), GSM8K (2021), MBPP (2021), and VQA-v2 (2017) are all well-known benchmarks that post-2023 models may have trained on."
    239       },
    240       "benchmark_contamination_addressed": {
    241         "applies": true,
    242         "answer": false,
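    243         "justification": "All benchmarks used (MMLU, VQA-v2, GSM8K, MBPP, XSum, CNN/DailyMail) were published years before the evaluated models were released, so they may be present in training data; the paper states no training cutoffs that would rule this out. No contamination analysis or discussion is provided."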
    244       }
    245     },
    246     "human_studies": {
    247       "pre_registered": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants in this study."
    251       },
    252       "irb_or_ethics_approval": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this study."
    256       },
    257       "demographics_reported": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study."
    261       },
    262       "inclusion_exclusion_criteria": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this study."
    266       },
    267       "randomization_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "blinding_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       },
    277       "attrition_reported": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants in this study."
    281       }
    282     },
    283     "cost_and_practicality": {
    284       "inference_cost_reported": {
    285         "applies": true,
    286         "answer": true,
    287         "justification": "Relative inference costs are reported in Table 3 (68-70% of baseline), Table 4 (query and cost share breakdown), and Figure 5 (cost breakdown by model). Latency is reported in Table 5 (419ms text, 530ms vision). However, no absolute costs (dollar amounts, tokens consumed) are reported."
    288       },
    289       "compute_budget_stated": {
    290         "applies": true,
    291         "answer": false,
    292         "justification": "No total computational budget is stated — no GPU hours, total API spend, hardware specifications, or training/evaluation compute costs are mentioned."
    293       }
    294     },
    295     "experimental_rigor": {
    296       "seed_sensitivity_reported": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "No mention of multiple random seeds or seed sensitivity analysis. Results appear to be from single runs."
    300       },
    301       "number_of_runs_stated": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "The number of experimental runs is never explicitly stated. The paper mentions 'mean values' with 'variance to indicate stability across subsets' but does not state how many runs produced the results."
    305       },
    306       "hyperparameter_search_budget": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "Despite many tunable parameters (α, β, γ, τ, δm, δu, δt, λc, θs, θt, θm), no hyperparameter search budget is reported. The paper states τ is selected by 'sweeping candidate values' but does not report the range or number of configurations tried."
    310       },
    311       "best_config_selection_justified": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "The paper mentions selecting τ by 'sweeping candidate values and selecting the operating point that maximizes routing accuracy under a fixed cost budget' (Section 3.2.1) but does not report what configurations were tried or how the best was selected."
    315       },
    316       "multiple_comparison_correction": {
    317         "applies": false,
    318         "answer": false,
    319         "justification": "No statistical significance tests are performed, so correction for multiple comparisons is not applicable."
    320       },
    321       "self_comparison_bias_addressed": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "The authors designed, implemented, and evaluated their own routing system. No discussion of self-comparison bias, no independent evaluation, and no mitigation strategies are mentioned."
    325       },
    326       "compute_budget_vs_performance": {
    327         "applies": true,
    328         "answer": true,
    329         "justification": "Figure 6 plots accuracy vs. relative cost for both MMLU and VQA-v2, showing the cost-accuracy trade-off between the proposed approach and the Always-Premium baseline."
    330       },
    331       "benchmark_construct_validity": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "No discussion of whether MMLU, VQA-v2, or other benchmarks actually measure the capabilities the paper claims to evaluate. Benchmarks are used without questioning their construct validity."
    335       },
    336       "scaffold_confound_addressed": {
    337         "applies": false,
    338         "answer": false,
    339         "justification": "The routing scaffold IS the thing being evaluated — the paper proposes a routing system and evaluates it as a bundled product. The comparison is routing-system-as-a-whole vs. always-GPT-4, not model-vs-model through different scaffolds."
    340       }
    341     },
    342     "data_leakage": {
    343       "temporal_leakage_addressed": {
    344         "applies": true,
    345         "answer": false,
    346         "justification": "No discussion of temporal leakage. All benchmarks (MMLU 2020, VQA-v2 2017, GSM8K 2021, MBPP 2021) predate the models used, meaning solutions could be in training data."
    347       },
    348       "feature_leakage_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "No discussion of feature leakage or whether the evaluation setup provides information not available in real deployment."
    352       },
    353       "non_independence_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No discussion of whether train and test data are independent."
    357       },
    358       "leakage_detection_method": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "No leakage detection or prevention method is used (no canary strings, membership inference, temporal splits, or decontamination)."
    362       }
    363     }
    364   },
    365   "claims": [
    366     {
    367       "claim": "The routing system matches or exceeds Always-Premium (GPT-4) performance: 88.5% vs 84.2% on MMLU and 93.2% vs 89.7% on VQA-v2.",
    368       "evidence": "Table 3 (Section 4.2) reports these accuracy figures. The MMLU improvement of 4.3 percentage points over GPT-4 is surprising and unexplained — routing to smaller models should not improve knowledge benchmark accuracy.",
    369       "supported": "weak"
    370     },
    371     {
    372       "claim": "72% of queries are handled by open-source models, reducing premium model reliance by over 67%.",
    373       "evidence": "Table 4 shows 72% open-source query share vs 28% premium. This supports the 'over 67%' claim.",
    374       "supported": "moderate"
    375     },
    376     {
    377       "claim": "Inference cost reduced to roughly one-third of the Always-Premium approach.",
    378       "evidence": "Table 3 shows costs of 70% and 68% of baseline for MMLU and VQA-v2 respectively, which is roughly two-thirds, not one-third. The 42% figure for fine-grained routing is also not one-third. The 'one-third' claim is inconsistent with the reported numbers.",
    379       "supported": "weak"
    380     },
    381     {
    382       "claim": "Routing accuracy of 92.3% with fine-grained 13-way accuracy of 86.78%.",
    383       "evidence": "Table 2 reports these figures. However, the evaluation dataset is not released, the number of evaluation examples is not stated, and the dataset construction process lacks detail.",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "94% of responses have BERT similarity above 0.8 compared to Always-Premium outputs.",
    388       "evidence": "Stated in Section 4.2 text. Average BERT cosine similarity reported as 0.93. However, BERT similarity is an imperfect proxy for answer quality, and the threshold of 0.8 is arbitrary.",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "18% latency improvement and 20% throughput improvement over Always-Premium.",
    393       "evidence": "Table 5 reports text latency of 419ms vs 512ms (18.2%) and throughput of 54 vs 45 qps (20%). However, no variance or confidence intervals are provided for latency measurements.",
    394       "supported": "moderate"
    395     }
    396   ],
    397   "red_flags": [
    398     {
    399       "flag": "Implausible MMLU result",
    400       "detail": "The paper claims 88.5% on MMLU, exceeding GPT-4's 84.2%. MMLU is a knowledge benchmark where routing to smaller models should not improve accuracy over GPT-4. This result is surprising and unexplained. It could stem from evaluation methodology artifacts, contamination differences between models, or unreported dataset subsetting."
    401     },
    402     {
    403       "flag": "Internal cost claim inconsistency",
    404       "detail": "The text claims 'inference cost was reduced to roughly one-third of the Always-Premium approach' but Table 3 shows costs of 68-70% of baseline (roughly two-thirds, not one-third). The 42% figure in the fine-grained eval is also not one-third. The claim overstates the measured savings."
    405     },
    406     {
    407       "flag": "No comparison to competing routing systems",
    408       "detail": "The paper cites RouteLLM, HybridLLM, and FrugalGPT as directly related routing systems but only compares against a naive 'always GPT-4' baseline. This makes the improvements appear larger than they might be against actual competing approaches."
    409     },
    410     {
    411       "flag": "Company evaluating its own product",
    412       "detail": "PwC employees propose a framework for 'enterprise-scale' AI deployment without disclosing potential commercial interests. No competing interests statement, no funding disclosure."
    413     },
    414     {
    415       "flag": "Many tunable parameters with no reported values",
    416       "detail": "The paper introduces 11+ tunable parameters (α, β, γ, τ, δm, δu, δt, λc, θs, θt, θm) but reports actual values for none of them, making reproduction impossible."
    417     },
    418     {
    419       "flag": "No contamination analysis on old benchmarks",
    420       "detail": "MMLU (2020), VQA-v2 (2017), GSM8K (2021), and MBPP (2021) are all well-known benchmarks that are likely in the training data of the models used. The paper includes no contamination discussion, despite this being critical to interpreting routing effectiveness."
    421     },
    422     {
    423       "flag": "Attachment detection precision/recall of 1.00",
    424       "detail": "Perfect attachment detection (precision and recall both 1.00) on the evaluation set is acknowledged as 'dataset-specific' but remains suspiciously clean. Either the evaluation set is too easy or too small for this task."
    425     }
    426   ],
    427   "cited_papers": [
    428     {
    429       "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance",
    430       "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"],
    431       "year": 2023,
    432       "arxiv_id": "2305.05176",
    433       "relevance": "Directly relevant prior work on cost-efficient LLM routing via query-dependent cascades."
    434     },
    435     {
    436       "title": "RouteLLM: Learning to Route LLMs with Preference Data",
    437       "authors": ["Isaac Ong"],
    438       "year": 2024,
    439       "arxiv_id": "2406.18665",
    440       "relevance": "LLM routing system that trains classifiers on human preference data to decide when to use large models."
    441     },
    442     {
    443       "title": "Hybrid LLM: Cost-Efficient and Quality-Aware Query Routing",
    444       "authors": ["Dujian Ding"],
    445       "year": 2024,
    446       "relevance": "Cost-efficient routing between small and large models for text queries."
    447     },
    448     {
    449       "title": "HuggingGPT: Solving AI Tasks with ChatGPT and its Friends in Hugging Face",
    450       "authors": ["Yongliang Shen"],
    451       "year": 2023,
    452       "relevance": "Multimodal orchestration framework using a controller LLM to invoke domain-specific expert models."
    453     },
    454     {
    455       "title": "Evaluating Large Language Models Trained on Code",
    456       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    457       "year": 2021,
    458       "arxiv_id": "2107.03374",
    459       "relevance": "Codex evaluation paper, foundational for LLM code generation benchmarking."
    460     },
    461     {
    462       "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
    463       "authors": ["Qingyun Wu"],
    464       "year": 2023,
    465       "arxiv_id": "2308.08155",
    466       "relevance": "Multi-agent framework for decomposing tasks and coordinating LLM agents."
    467     },
    468     {
    469       "title": "Small Language Models are the Future of Agentic AI",
    470       "authors": ["Peter Belcak"],
    471       "year": 2025,
    472       "arxiv_id": "2506.02153",
    473       "relevance": "Argues compact SLMs can be competitive for agentic subtasks, relevant to routing cost-quality tradeoffs."
    474     },
    475     {
    476       "title": "Beyond Monoliths: Expert Orchestration for More Capable, Democratic, and Safe Large Language Models",
    477       "authors": ["Philip Quirke"],
    478       "year": 2025,
    479       "arxiv_id": "2506.00051",
    480       "relevance": "Addresses orchestration challenges for multimodal LLM deployment, directly related to routing architectures."
    481     },
    482     {
    483       "title": "Measuring Massive Multitask Language Understanding",
    484       "authors": ["Dan Hendrycks"],
    485       "year": 2020,
    486       "arxiv_id": "2009.03300",
    487       "relevance": "MMLU benchmark paper, key benchmark used in this routing evaluation."
    488     },
    489     {
    490       "title": "Phi-3 Technical Report: A Highly Capable Language Model Locally on Your Phone",
    491       "authors": ["Marah Abdin", "Sam Ade Jacobs", "Ammar Ahmad Awan"],
    492       "year": 2024,
    493       "arxiv_id": "2404.14219",
    494       "relevance": "Small language model used as an efficient expert in the routing framework."
    495     },
    496     {
    497       "title": "Mixtral of Experts",
    498       "authors": ["Albert Q Jiang"],
    499       "year": 2024,
    500       "arxiv_id": "2401.04088",
    501       "relevance": "Mixture-of-experts model used for follow-up queries in the framework, relevant to efficient LLM architecture."
    502     }
    503   ]
    504 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs