scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27669B)
      1 {
      2   "paper": {
      3     "title": "Optimizing NetGPT via Routing-Based Synergy and Reinforcement Learning",
      4     "authors": [
      5       "Yuxuan Chen",
      6       "Rongpeng Li",
      7       "Xianfu Chen",
      8       "Celimuge Wu",
      9       "Chenghui Peng",
     10       "Zhifeng Zhao",
     11       "Honggang Zhang"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2511.22217",
     16     "doi": "10.48550/arXiv.2511.22217"
     17   },
     18   "scan_version": 2,
     19   "active_modules": ["experimental_rigor", "data_leakage"],
     20   "methodology_tags": ["benchmark-eval", "theoretical"],
     21   "key_findings": "The paper proposes a cloud-edge LLM routing framework with network-aware dynamic fallback thresholds and proves the optimal threshold is unique and monotone in RTT and bandwidth. Experiments on synthetic tool-calling tasks show dynamic controllers (FuncDyn, PolicyNet) outperform static routing approaches (RouteLLM, FrugalGPT) across simulated network regimes. SFT-anchored PPO improves edge model quality while preserving schema-correct tool-calling outputs, and periodic reward model refresh improves routing accuracy.",
     22   "checklist": {
     23     "artifacts": {
     24       "code_released": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No repository URL, code archive, or any link to source code is provided anywhere in the paper."
     28       },
     29       "data_released": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "The evaluation uses 'a private corpus of tool-calling tasks' (Section V.A) that is not released. The data is described as batch-generated with GPT-4o but no download link is provided."
     33       },
     34       "environment_specified": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The paper mentions '8× NVIDIA A800 (80 GB) GPUs' and model names (DeepSeek-R1-Distill-Qwen-7B, DeepSeek-V3.2-Exp, Qwen2.5-1.5B-Instruct) but provides no requirements.txt, Dockerfile, library versions, or environment setup details."
     38       },
     39       "reproduction_instructions": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No step-by-step reproduction instructions are provided. Algorithm 1 gives a pseudocode overview but no runnable scripts or commands."
     43       }
     44     },
     45     "statistical_methodology": {
     46       "confidence_intervals_or_error_bars": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "Results in Figs 5-12 and throughout Section V report only point estimates. No confidence intervals, error bars, or ± notation are present in any figure or table."
     50       },
     51       "significance_tests": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "Claims such as 'FuncDyn yields the highest J' (Section V.B) and 'the frontier strictly dominates' (Section V.D) are made by comparing numbers directly with no statistical significance tests."
     55       },
     56       "effect_sizes_reported": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "The paper reports specific threshold values (τ*_GOOD ≈ 4.54, τ*_MID ≈ 4.28, τ*_BAD ≈ 3.59) and shows metric curves, but does not report formal effect sizes or percentage improvements with baseline context for the main comparative claims."
     60       },
     61       "sample_size_justified": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "The evaluation uses 8,000 tasks and training uses 8,000 SFT + 2,000 RM examples (Section V.A) but no justification is given for why these specific sample sizes were chosen."
     65       },
     66       "variance_reported": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "No standard deviations, interquartile ranges, or variance across runs are reported. Fig 10 mentions 'lower variance' qualitatively but does not report numerical variance measures. Fig 6 shows step-level utility with a windowed mean but no spread measure."
     70       }
     71     },
     72     "evaluation_design": {
     73       "baselines_included": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Section V.A describes four baselines: RouteLLM [14], FrugalGPT [11], All Edge, and All Cloud. Results are compared in Fig 5."
     77       },
     78       "baselines_contemporary": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "RouteLLM (ICLR 2025) and FrugalGPT (TMLR 2024) are recent and represent state-of-the-art approaches in LLM routing."
     82       },
     83       "ablation_study": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Multiple ablation-style comparisons are provided: PPO with vs without SFT anchor (Fig 10), FuncDyn vs PolicyNet, before/after RL training (Fig 11), and before/after RM training (Fig 12)."
     87       },
     88       "multiple_metrics": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Four metrics are reported: utility J, composite quality Q, total cost C, and offload rate (Section V.A)."
     92       },
     93       "human_evaluation": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "All evaluation is automated using a frozen evaluation model (Eq. 9). No human evaluation of the system's outputs is performed. Given that the paper claims quality of tool-calling outputs, human evaluation would be relevant."
     97       },
     98       "held_out_test_set": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Section V.A describes separate data splits: 8,000 SFT examples, 2,000 RM examples, and 8,000 evaluation tasks. The evaluation tasks are distinct from training data."
    102       },
    103       "per_category_breakdown": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Results are broken down by network regime (GOOD/MID/BAD) in Fig 5, by different λ values in Fig 9, and by various operational conditions throughout Section V."
    107       },
    108       "failure_cases_discussed": {
    109         "applies": true,
    110         "answer": false,
    111         "justification": "No failure analysis, error cases, or discussion of where the approach breaks down. All presented results show the method performing well."
    112       },
    113       "negative_results_reported": {
    114         "applies": true,
    115         "answer": false,
    116         "justification": "Every experiment shows the proposed method performing positively. No configurations that failed, approaches that were tried and abandoned, or ablations that hurt performance are reported."
    117       }
    118     },
    119     "claims_and_evidence": {
    120       "abstract_claims_supported": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "The abstract claims 'smooth quality-cost frontiers' (supported by Fig 7), 'consistent gains of dynamic fallback thresholds over fixed policies' (supported by Figs 5-6), and 'sustained reductions in offloading while maintaining task success' (supported by Fig 10). Each claim has corresponding experimental evidence."
    124       },
    125       "causal_claims_justified": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "Causal claims like 'SFT-anchored on-device RL contributes to preserving task success' are supported by controlled ablation (PPO with vs without SFT in Fig 10). The ablation design manipulates single variables, which is adequate for these causal claims."
    129       },
    130       "generalization_bounded": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper tests on one private synthetic dataset with specific models (DeepSeek-R1-Distill-Qwen-7B, DeepSeek-V3.2-Exp) under simulated network conditions, but frames claims broadly as a 'cloud-edge synergy for NetGPT' applicable to general tool-calling workloads. The title and contributions claim generality not supported by the narrow evaluation setting."
    134       },
    135       "alternative_explanations_discussed": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "No alternative explanations for the observed improvements are discussed. No consideration of confounds such as whether the evaluation model (frozen q̃_ϕ) may systematically favor the proposed method, or whether the synthetic data generation process biases results."
    139       },
    140       "proxy_outcome_distinction": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "The paper measures quality via a frozen evaluation model q̃_ϕ (Eq. 9) and frames this as 'task quality' and 'task success.' The gap between the automated proxy score and actual tool-calling quality in real-world deployment is not discussed."
    144       }
    145     },
    146     "setup_transparency": {
    147       "model_versions_specified": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Specific model identifiers are provided: 'DeepSeek-R1-Distill-Qwen-7B (FP32, non-quantized)' for edge, 'DeepSeek-V3.2-Exp' for cloud, and 'Qwen2.5-1.5B-Instruct' for the reward model (Section V.A)."
    151       },
    152       "prompts_provided": {
    153         "applies": true,
    154         "answer": false,
    155         "justification": "Prompting schemes are described in natural language ('compact schema' at edge, 'ReAct-style prompt' for cloud in Section V.A) but the actual prompt text is not provided. Listings 1-2 in the appendix show data examples, not the prompts sent to models."
    156       },
    157       "hyperparameters_reported": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "The formulation introduces many hyperparameters (ε for PPO clipping, β_KL for KL penalty, γ for discount, η for SFT weight, λ for trade-off) but their specific numerical values used in experiments are not reported. No hyperparameter table is provided."
    161       },
    162       "scaffolding_described": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "The agentic scaffolding is described in detail: routing mechanism (Section III-IV), tool-calling interface (Fig 2, Fig 4), prompt construction differences between edge and cloud, history management (Eq. 42), and caching setup (Appendix B)."
    166       },
    167       "data_preprocessing_documented": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Appendix A describes the data generation process: batch generation with GPT-4o using a maintained template pool, followed by a two-stage filtering pass (schema validation and quality screening). Data splits are specified (8,000 SFT, 2,000 RM pairs, 8,000 evaluation)."
    171       }
    172     },
    173     "limitations_and_scope": {
    174       "limitations_section_present": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "No dedicated limitations section exists. Section VI (Conclusion and Future Work) briefly mentions future directions ('multi-edge/multi-cloud orchestration,' 'hardware-in-the-loop evaluations on real network traces') but these are framed as future work, not as limitations of the current study."
    178       },
    179       "threats_to_validity_specific": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "No threats to validity are discussed. There is no consideration of how synthetic data, simulated networks, or the specific model choices might limit the findings."
    183       },
    184       "scope_boundaries_stated": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "The paper does not explicitly state what the results do NOT show. No acknowledgment that findings are limited to simulated network conditions, synthetic tool-calling tasks, or the specific model pair tested."
    188       }
    189     },
    190     "data_integrity": {
    191       "raw_data_available": {
    192         "applies": true,
    193         "answer": false,
    194         "justification": "The evaluation data is a 'private corpus' (Section V.A) that is not publicly available. No data downloads or supplementary materials are provided."
    195       },
    196       "data_collection_described": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Appendix A describes data generation: batch-generated with GPT-4o from a template pool specifying output schema, with two-stage filtering (schema validation + quality screening). Task structure (10-20 candidate tools, 0-8 prior steps) is documented in Section V.A."
    200       },
    201       "recruitment_methods_described": {
    202         "applies": false,
    203         "answer": false,
    204         "justification": "No human participants. The data is synthetically generated, and the generation process is described in the paper."
    205       },
    206       "data_pipeline_documented": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "The pipeline from data generation (GPT-4o templates) through filtering (schema validation, quality screening) to final splits (8,000 SFT / 2,000 RM / 8,000 eval) is documented in Appendix A. The online caching pipeline is described in Appendix B."
    210       }
    211     },
    212     "conflicts_of_interest": {
    213       "funding_disclosed": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No funding acknowledgment or grant information is provided anywhere in the paper."
    217       },
    218       "affiliations_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "Author affiliations are listed in the header: Zhejiang University, Shenzhen CyberAray Network Technology, The University of Electro-Communications, Huawei Technologies, Zhejiang Lab, and Macau University of Science and Technology."
    222       },
    223       "funder_independent_of_outcome": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No funding source is disclosed, so independence cannot be verified. One author (Chenghui Peng) is affiliated with Huawei Technologies, which has commercial interest in edge computing and network infrastructure — directly relevant to this paper's domain."
    227       },
    228       "financial_interests_declared": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No competing interests statement or financial disclosure is present in the paper."
    232       }
    233     },
    234     "contamination": {
    235       "training_cutoff_stated": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "No training data cutoff dates are stated for any of the three models used (DeepSeek-R1-Distill-Qwen-7B, DeepSeek-V3.2-Exp, Qwen2.5-1.5B-Instruct)."
    239       },
    240       "train_test_overlap_discussed": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "No discussion of whether the synthetic evaluation tasks overlap with model training data. The tasks are generated by GPT-4o from templates, but the evaluated models may have seen similar patterns during pre-training."
    244       },
    245       "benchmark_contamination_addressed": {
    246         "applies": true,
    247         "answer": false,
    248         "justification": "Not addressed. While the evaluation data is synthetically generated (reducing traditional contamination risk), the paper does not discuss whether the generation templates or tool-calling patterns could overlap with model training data."
    249       }
    250     },
    251     "human_studies": {
    252       "pre_registered": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this study."
    256       },
    257       "irb_or_ethics_approval": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study."
    261       },
    262       "demographics_reported": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this study."
    266       },
    267       "inclusion_exclusion_criteria": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "randomization_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       },
    277       "blinding_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants in this study."
    281       },
    282       "attrition_reported": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants in this study."
    286       }
    287     },
    288     "cost_and_practicality": {
    289       "inference_cost_reported": {
    290         "applies": true,
    291         "answer": true,
    292         "justification": "Cost is a core evaluation metric. The paper reports normalized cost C (Eq. 8-10) as part of the utility function, including latency components and cloud token costs. Offload rates are reported across all experiments."
    293       },
    294       "compute_budget_stated": {
    295         "applies": true,
    296         "answer": false,
    297         "justification": "Hardware is mentioned ('8× NVIDIA A800 (80 GB) GPUs' in Section V.A) but total GPU hours, training time, and total computational budget are not quantified."
    298       }
    299     },
    300     "experimental_rigor": {
    301       "seed_sensitivity_reported": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "No mention of multiple random seeds or seed sensitivity analysis. Results appear to be from single runs."
    305       },
    306       "number_of_runs_stated": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "The number of experimental runs is never explicitly stated. It is unclear whether results are from single runs or averaged over multiple runs."
    310       },
    311       "hyperparameter_search_budget": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "No hyperparameter search budget is reported. The paper introduces many hyperparameters (ε, β_KL, γ, η, λ, α_RTT, β_BW, γ_hist) but does not describe how their values were selected or how many configurations were tried."
    315       },
    316       "best_config_selection_justified": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "No explanation of how the final configuration was selected. The paper presents results without describing whether the reported configuration was selected from many candidates."
    320       },
    321       "multiple_comparison_correction": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "Multiple methods are compared across multiple network regimes and metrics without any statistical tests, let alone corrections for multiple comparisons."
    325       },
    326       "self_comparison_bias_addressed": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "The authors evaluate their own FuncDyn and PolicyNet systems against their own re-implementations of baselines (RouteLLM, FrugalGPT) without acknowledging or addressing self-comparison bias."
    330       },
    331       "compute_budget_vs_performance": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "No comparison of performance at matched compute budgets. FuncDyn and PolicyNet have different computational requirements than RouteLLM and FrugalGPT (e.g., PolicyNet's MLP training, RM refresh), but these are not controlled for."
    335       },
    336       "benchmark_construct_validity": {
    337         "applies": true,
    338         "answer": false,
    339         "justification": "The private synthetic tool-calling benchmark's validity as a measure of real-world routing quality is not discussed. Whether GPT-4o-generated tasks represent realistic deployment workloads is not examined."
    340       },
    341       "scaffold_confound_addressed": {
    342         "applies": true,
    343         "answer": false,
    344         "justification": "Different methods use different routing architectures and decision procedures (single-shot for RouteLLM vs multi-step for FuncDyn/PolicyNet). The paper does not address whether performance differences are attributable to the routing framework structure or the specific routing logic."
    345       }
    346     },
    347     "data_leakage": {
    348       "temporal_leakage_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "No discussion of temporal leakage. The synthetic data is generated by GPT-4o, but whether the evaluated models (DeepSeek) may have trained on similar tool-calling patterns is not addressed."
    352       },
    353       "feature_leakage_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No discussion of whether the evaluation setup leaks information. The cloud prompt includes 'full tool and argument descriptions' while the edge prompt is 'compact' — this asymmetry could systematically favor cloud offloading but is not analyzed."
    357       },
    358       "non_independence_addressed": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "No discussion of independence between training and evaluation data. Both are generated by GPT-4o from the same template pool, creating potential structural similarity that is not addressed."
    362       },
    363       "leakage_detection_method": {
    364         "applies": true,
    365         "answer": false,
    366         "justification": "No leakage detection or prevention method is used or described."
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "Dynamic, network-aware fallback thresholds consistently dominate fixed policies on the quality-cost frontier across GOOD/MID/BAD network regimes.",
    373       "evidence": "Fig 5 shows FuncDyn yields highest J while keeping offload rate moderate; Fig 6 shows dynamic threshold maintains higher average J than fixed threshold across regime switches (Section V.B).",
    374       "supported": "moderate"
    375     },
    376     {
    377       "claim": "The optimal routing rule admits a unique fallback threshold that is monotonically dependent on bandwidth and RTT.",
    378       "evidence": "Theorem 1 proves uniqueness of optimal threshold under Assumptions 1-2; Theorem 2 proves monotone comparative statics. Empirical τ* values (4.54, 4.28, 3.59 for GOOD/MID/BAD) in Section V.C support the monotone pattern.",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "SFT-anchored PPO preserves schema-correct tool-calling while enabling sustained edge LLM improvement.",
    383       "evidence": "Fig 10 contrasts PPO with and without SFT anchor, showing monotonically increasing reward with lower variance and declining offload rate with SFT anchoring (Section V.D).",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "Quality-cost frontier strictly dominates the pre-training curve after PPO training across a range of λ values.",
    388       "evidence": "Fig 11 shows post-training frontier above pre-training frontier (Section V.D). However, no error bars or statistical tests accompany this claim.",
    389       "supported": "weak"
    390     },
    391     {
    392       "claim": "Periodic reward model refresh improves routing accuracy, as evidenced by reduced selective risk at fixed coverage.",
    393       "evidence": "Fig 12 shows post-training risk-coverage curve lies uniformly below pre-training curve, especially in medium-to-high coverage range (Section V.D).",
    394       "supported": "moderate"
    395     }
    396   ],
    397   "red_flags": [
    398     {
    399       "flag": "Private non-reproducible dataset",
    400       "detail": "The entire evaluation uses a private corpus of tool-calling tasks generated by GPT-4o. No data, code, or environment specifications are released, making independent reproduction impossible."
    401     },
    402     {
    403       "flag": "No uncertainty quantification",
    404       "detail": "All results are reported as point estimates without error bars, confidence intervals, standard deviations, or statistical tests. It is impossible to assess the reliability or stability of the reported improvements."
    405     },
    406     {
    407       "flag": "Simulated network conditions only",
    408       "detail": "Network states are simulated via a Gauss-Markov model with three fixed regimes (GOOD/MID/BAD). Claims about 'dynamic network conditions' may not transfer to real network traces, which the authors acknowledge as future work."
    409     },
    410     {
    411       "flag": "Missing hyperparameter values",
    412       "detail": "The formulation introduces numerous hyperparameters (ε, β_KL, γ, η, λ, α_RTT, β_BW, γ_hist) but their specific numerical values used in experiments are never reported, making reproduction impossible even with the algorithm pseudocode."
    413     },
    414     {
    415       "flag": "Undisclosed funding with industry affiliation",
    416       "detail": "One author is from Huawei Technologies, which has direct commercial interest in edge computing and network infrastructure. No funding source or competing interests are disclosed."
    417     },
    418     {
    419       "flag": "No failure analysis",
    420       "detail": "Every experiment shows the proposed method performing well. No failure cases, negative results, or conditions under which the approach degrades are discussed."
    421     }
    422   ],
    423   "cited_papers": [
    424     {
    425       "title": "RouteLLM: Learning to route LLMs from preference data",
    426       "authors": ["I. Ong", "A. Almahairi"],
    427       "year": 2025,
    428       "relevance": "Learned LLM routing from preference data, used as a primary baseline in this work."
    429     },
    430     {
    431       "title": "FrugalGPT: How to use large language models while reducing cost and improving performance",
    432       "authors": ["L. Chen", "M. Zaharia"],
    433       "year": 2024,
    434       "relevance": "Cost-aware LLM cascade with fixed thresholds, used as a primary baseline."
    435     },
    436     {
    437       "title": "RouterDC: Query-based router by dual contrastive learning for assembling large language models",
    438       "authors": ["S. Chen", "W. Jiang"],
    439       "year": 2024,
    440       "relevance": "Dual-contrastive learning approach for LLM routing that learns per-model utility scores."
    441     },
    442     {
    443       "title": "GraphRouter: A graph-based router for LLM selections",
    444       "authors": ["T. Feng", "Y. Shen"],
    445       "year": 2025,
    446       "relevance": "Graph-based inductive LLM selection via heterogeneous query-model-task graphs."
    447     },
    448     {
    449       "title": "Hybrid-LLM: Cost-efficient and quality-aware query routing",
    450       "authors": ["D. Ding", "A. Mallick"],
    451       "year": 2024,
    452       "relevance": "Two-model routing via learned difficulty predictor for cost-quality trade-off."
    453     },
    454     {
    455       "title": "MasRouter: Learning to route LLMs for multi-agent systems",
    456       "authors": ["Y. Yue", "G. Zhang"],
    457       "year": 2025,
    458       "relevance": "Multi-agent LLM routing with role allocation and collaboration mode selection."
    459     },
    460     {
    461       "title": "RouterBench: A benchmark for multi-LLM routing system",
    462       "authors": ["Q. J. Hu", "J. Bieker"],
    463       "year": 2024,
    464       "arxiv_id": "2403.12031",
    465       "relevance": "Standardized benchmark for evaluating multi-LLM routing systems."
    466     },
    467     {
    468       "title": "MetaLLM: A high-performant and cost-efficient dynamic framework for wrapping LLMs",
    469       "authors": ["Q. H. Nguyen", "T. Dao"],
    470       "year": 2024,
    471       "arxiv_id": "2407.10834",
    472       "relevance": "Contextual bandit framework for adaptive LLM model selection under cost/quality uncertainty."
    473     },
    474     {
    475       "title": "ReAct: Synergizing reasoning and acting in language models",
    476       "authors": ["S. Yao", "J. Zhao"],
    477       "year": 2023,
    478       "relevance": "Foundational work on LLM reasoning+acting patterns used as the cloud prompting scheme in this paper."
    479     },
    480     {
    481       "title": "Toolformer: Language models can teach themselves to use tools",
    482       "authors": ["T. Schick", "J. Dwivedi-Yu"],
    483       "year": 2023,
    484       "arxiv_id": "2302.04761",
    485       "relevance": "Foundational work on LLM tool use, directly relevant to the tool-calling evaluation in this paper."
    486     },
    487     {
    488       "title": "DisRouter: Distributed self-routing for LLM selections",
    489       "authors": ["H. Zheng", "H. Xu"],
    490       "year": 2025,
    491       "arxiv_id": "2510.19208",
    492       "relevance": "Distributed self-routing where each LLM decides to forward or reject, trained for system utility."
    493     },
    494     {
    495       "title": "A unified approach to routing and cascading for LLMs",
    496       "authors": ["J. Dekoninck", "M. Baader"],
    497       "year": 2025,
    498       "relevance": "Theoretical analysis of optimal routing and cascade strategies with quality/cost estimators."
    499     }
    500   ]
    501 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs