ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (27488B)


      1 {
      2   "paper": {
      3     "title": "Budget-Aware Agentic Routing via Boundary-Guided Training",
      4     "authors": [
      5       "Caiqi Zhang",
      6       "Menglin Xia",
      7       "Xuchao Zhang",
      8       "Daniel Madrigal",
      9       "Ankur Mallick",
     10       "Samuel Kessler",
     11       "Victor Rühle",
     12       "Saravan Rajmohan"
     13     ],
     14     "year": 2026,
     15     "venue": "arXiv preprint",
     16     "arxiv_id": "2602.21227"
     17   },
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "Appendix D states 'Codes will be released upon acceptance.' This is a promise of future release, not actual release. No repository URL is provided."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The paper uses three publicly available benchmarks: SciWorld, ALFWorld (via AgentGym framework), and AppWorld. All are established public datasets with citations and available implementations."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided. The paper mentions using Qwen2.5-1.5B and vLLM on an A100 GPU, and OpenAI APIs, but does not provide a reproducible environment specification with library versions."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "While Appendix D provides pipeline details (hyperparameters, training stages), there are no step-by-step reproduction instructions, no README, and no scripts. The code is not yet released."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper states 'All experiments are averaged over three random seeds' (Section 5.1) but does not report confidence intervals, error bars, or standard deviations in any table or figure. Only point estimates are presented."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The paper makes claims like 'BoPO consistently improves the efficiency frontier' and 'BoPO generally outperforms baselines' but provides no statistical significance tests (no p-values, no bootstrap tests, no paired comparisons)."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "The paper reports absolute success rates and costs for all methods across benchmarks (e.g., 'reaches the high-performance regime (> 63%) at a cost of $0.125'), and Table 1 provides per-setting breakdowns. Results are presented in context with baseline performance, allowing effect magnitude assessment."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "Test set sizes are reported in Table 3 (200 for ALFWorld and SciWorld, 168 for AppWorld) and 3 random seeds are used, but there is no justification for why these sizes are sufficient for the claims made, nor any power analysis."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "Despite running experiments over 3 random seeds (Section 5.1), no standard deviations, variance, or any spread measures are reported in the tables or figures. Only averaged point estimates are shown."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Section 5.1 describes seven baselines: Always-small/large, Random routing, Zero-shot GPT-5 router, Cascading (FrugalGPT-style), Single-turn (RouteLLM-style), SFT-only, and Vanilla RL. Additional baselines (First-Large) are included for hard-budget evaluation."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Baselines include recent work: RouteLLM (Ong et al., 2025), FrugalGPT (Chen et al., 2024), xRouter/Router-R1 (Qian et al., 2025; Zhang et al., 2025). The Zero-shot GPT-5 router is also included as an upper bound. These are all contemporary and competitive."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Section 6 and Figure 2 present a component-wise ablation study on SciWorld, testing: SFT-only, Vanilla RL, no stratified sampling, no reference advantage, and no difficulty bonus. Each component's contribution to the efficiency frontier is isolated."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Two primary metrics are reported: Success Rate (SR) and Average Cost per task ($). Additionally, Large Model Usage percentage (Use%) is reported in Table 1 for hard-budget experiments."
     88       },
     89       "human_evaluation": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "This is a benchmark evaluation paper about model routing. The system's outputs are routing decisions evaluated against task success metrics in automated environments. Human evaluation is not relevant to the claims."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Table 3 explicitly separates train and test sizes: ALFWorld (2420 train / 200 test), SciWorld (2120 train / 200 test), AppWorld (105 train / 168 test). Results are reported on these held-out test sets."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Results are broken down per benchmark (SciWorld, ALFWorld, AppWorld) in Figure 1 and Table 1. Figure 3 provides a per-difficulty-category breakdown (Easy/Hard/Intractable) of budget allocation. Table 1 provides per-K breakdowns for hard-budget experiments."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Section 5.2 discusses the limitation of static trade-off parameters, noting 'a small λ is too aggressive for low budgets (leading to early exhaustion), while a large λ is overly conservative for high budgets.' Section 6 discusses Vanilla RL's failure to identify high-leverage steps and policy collapse. Appendix F analyzes cost inversions where the small model performs worse."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The paper reports that BoPO with large λ shows a 'K=15 drop in AppWorld' (Section 5.2), that 'policies trained with a fixed soft-budget trade-off do not fully adapt to different hard caps' (Section 1), and that Vanilla RL collapses. Appendix A explicitly discusses limitations of static risk profiles."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The abstract claims BoPO 'improves the efficiency frontier, matching strong routing baselines at substantially lower cost while demonstrating generalization to strict inference-time budget constraints.' These are supported by Figure 1 (Pareto frontiers) and Table 1 (hard-budget results). The abstract also honestly notes the generalization is partial."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper makes causal claims through its ablation study (Section 6, Figure 2), which removes components one at a time (stratified sampling, reference advantage, difficulty bonus) to show each component's causal contribution. The ablation design uses controlled single-variable manipulation, which is adequate for causal inference about component contributions."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "The paper scopes claims to three specific benchmarks, discusses testing with two model pairs (GPT-4.1 pair and Llama-3.1 pair), and Appendix A ('Model Scope') explicitly states they focused on binary choice and acknowledges the need for N-model generalization. The limitations section discusses static risk profiles and incomplete hard-budget adaptation."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "Section 5.2 discusses that GPT-5's strong performance 'likely stems from GPT-5's strong instruction-following.' The paper considers why Vanilla RL fails (sparsity of reward signal) and why First-Large is competitive (importance of early-stage trajectory correctness). Section 6 discusses why BoPO may not fully generalize to hard budgets (static λ)."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "The paper specifies model names: 'GPT-4.1 mini' as Msmall, 'GPT-4.1' as Mlarge, and 'Qwen2.5-1.5B (instruction-tuned)' as the router. However, no API snapshot dates or specific version strings (e.g., gpt-4.1-2025-04-14) are provided. 'GPT-4.1' and 'GPT-4.1 mini' are marketing names without version identifiers. For the generalization experiment, 'Llama-3.1-8B-Instruct' and 'Llama-3.1-72B-Instruct' are specified, which are versioned model names."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Appendix G (Tables 4 and 5) provides the full prompt templates for both soft-budget and hard-budget routing modes, including the system prompt configuration, history formatting, and decision format. The templates include placeholders but the fill values are defined (task description, model name, action/observation previews, budget state)."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Appendix D provides detailed hyperparameters: global batch size 64, BoSFT learning rate 2×10⁻⁵ with cosine decay and 3% warmup, BoPO learning rate 1×10⁻⁶, GRPO group size G=8, KL coefficient β=0.04, reward weights r_success=1.0, r_hard=0.5, λ∈{0.1,0.3,0.5,0.7,0.9}, N=20 stratified trajectories, K=5 profiling trials."
    152       },
    153       "scaffolding_described": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "The paper describes the agent scaffolding: 'Our agent is instantiated with ReAct-style prompting (Yao et al., 2022)' (Section 5.1). The routing mechanism, decision-token generation, budget-constrained decoding, and history truncation are all described in detail. The router's role in the agentic pipeline is specified."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Appendix D documents the pipeline: difficulty profiling (K=5 trials per task), task partitioning into Easy/Hard/Intractable, stratified trajectory synthesis (N=20), expert trajectory selection, batch composition (70% Hard task oversampling), router input truncation (most recent 2k tokens), and decision-token masking."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Appendix A is titled 'Limitations and Future Work' and provides substantive discussion across three subsections: Training Overhead, Model Scope, and Static Risk Profiles."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Appendix A discusses specific limitations: (1) training overhead is a one-time cost amortized over inference, (2) the binary model choice is a simplification of N-model selection, (3) static λ does not adapt to varying hard-budget caps. Section 5.2 also identifies specific cases where the approach underperforms (BoPO Large λ at K=15 in AppWorld)."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Appendix A ('Model Scope') explicitly states: 'We intentionally focused this study on the binary choice between a cost-efficient model (Msmall) and a high-capability model (Mlarge) to isolate the fundamental economic trade-off.' The paper also states BoPO 'is not the final solution for hard-budget routing' (Section 5.2) and that future work should condition on budget state."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "No raw trajectory data, routing decisions, or per-task results are released. Only aggregated metrics are reported in figures and tables."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section 5.1 and Appendix D describe how data is collected: profiling with K=5 trials per task for difficulty taxonomy, N=20 stratified trajectories for SFT data synthesis, and evaluation on held-out test sets from three benchmarks. The evaluation protocol (max steps, termination conditions) is specified."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants. Data sources are standard public benchmarks (SciWorld, ALFWorld, AppWorld)."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Appendix D provides the full pipeline: (i) difficulty profiling with K=5 trials, (ii) Boundary-Guided SFT with stratified sampling (N=20), expert trajectory selection, Hard task oversampling (~70%), (iii) BoPO online optimization, (iv) Budget-Constrained Decoding at inference. Dataset statistics with train/test splits are in Table 3."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding or acknowledgments section is present in the paper. The affiliations show Microsoft and University of Cambridge, but no funding disclosures are made."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are clearly listed: Caiqi Zhang at University of Cambridge, and the remaining authors (Menglin Xia, Xuchao Zhang, Daniel Madrigal, Ankur Mallick, Samuel Kessler, Victor Rühle, Saravan Rajmohan) at M365 Research, Microsoft."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "Seven of eight authors are Microsoft employees (M365 Research). The paper evaluates routing using OpenAI models (GPT-4.1, GPT-4.1 mini) which are integrated into Microsoft products. While the paper does not evaluate Microsoft-specific products, Microsoft has a financial interest in efficient LLM routing for its agentic AI platforms. No funding disclosure is made to assess independence."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests statement or financial disclosures are provided. The paper lacks a conflicts of interest declaration."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "The paper does not state the training data cutoff dates for GPT-4.1, GPT-4.1 mini, Qwen2.5-1.5B, or Llama-3.1 models. While the router is fine-tuned, the underlying agent models' training cutoffs are not discussed."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No discussion of whether the benchmark tasks (SciWorld, ALFWorld, AppWorld) could have appeared in the training data of GPT-4.1 or other models. These are public benchmarks that could be in training data."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "SciWorld (2022), ALFWorld (2021), and AppWorld (2024) are all publicly available benchmarks. Models like GPT-4.1 may have been trained on data containing these benchmarks. No contamination analysis is provided."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this study. It is a benchmark evaluation paper."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": true,
    283         "justification": "Average cost per task in USD is a primary metric, reported in Figure 1 and Table 1. Section 5.1 provides OpenAI pricing: GPT-4.1 is $2.00/$8.00 per 1M tokens (in/out), GPT-4.1 mini is $0.40/$1.60. The router overhead is analyzed at <0.2 seconds and <2% of total latency (Section 6)."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No total computational budget is stated for training. While the paper mentions using A100 GPUs for vLLM inference of the router, the total GPU hours for difficulty profiling (K=5 trials per task), stratified sampling (N=20 trajectories per hard task), SFT training, and BoPO training are not quantified."
    289       }
    290     }
    291   },
    292   "claims": [
    293     {
    294       "claim": "BoPO consistently improves the cost-success efficiency frontier across all three benchmarks, matching the performance of always using the large model at a fraction of the cost.",
    295       "evidence": "Figure 1 shows Pareto efficiency frontiers for SciWorld, ALFWorld, and AppWorld. BoPO curves are consistently closer to the top-left (high success, low cost) compared to all baselines. In ALFWorld, BoPO reaches >63% success at $0.125.",
    296       "supported": "strong"
    297     },
    298     {
    299       "claim": "Boundary-Guided Training prevents the 'always-small' policy collapse that occurs with vanilla RL under sparse rewards.",
    300       "evidence": "Figure 1 and Section 5.2 show that Vanilla RL baseline performance is comparable to Random routing in ALFWorld and AppWorld. The ablation study (Figure 2) shows removing the reference-guided advantage causes destabilization and catastrophic forgetting.",
    301       "supported": "strong"
    302     },
    303     {
    304       "claim": "BoPO generalizes to hard-budget constraints via Budget-Constrained Decoding without hard-budget-specific training.",
    305       "evidence": "Table 1 shows BoPO performance under strict K∈{5,10,15} constraints. BoPO generally outperforms baselines, though the paper acknowledges First-Large is 'a remarkably strong baseline' and that static λ limits adaptability.",
    306       "supported": "moderate"
    307     },
    308     {
    309       "claim": "BoPO strategically allocates budget to Hard tasks where compute is most productive.",
    310       "evidence": "Figure 3 shows BoPO concentrates 52.2% of cost on Hard tasks vs. 35.7% for Random and 37.3% for Vanilla RL, while reducing Easy task spending to 21.8%.",
    311       "supported": "strong"
    312     },
    313     {
    314       "claim": "The approach generalizes to different model pairs (open-source Llama-3.1 models).",
    315       "evidence": "Figure 4 in the appendix shows BoPO maintains its effectiveness on SciWorld with Llama-3.1-8B-Instruct / Llama-3.1-72B-Instruct. However, this is shown on only one benchmark.",
    316       "supported": "moderate"
    317     },
    318     {
    319       "claim": "The routing overhead is negligible (<2% of total trajectory latency).",
    320       "evidence": "Section 6 provides a latency analysis estimating the router (Qwen2.5-1.5B generating single-token decisions) adds <0.2 seconds vs. 11-12 seconds baseline per task, based on standardized throughput figures.",
    321       "supported": "moderate"
    322     }
    323   ],
    324   "methodology_tags": [
    325     "benchmark-eval"
    326   ],
    327   "key_findings": "The paper proposes Budget-Aware Agentic Routing (BAAR), which formulates step-wise model selection in long-horizon agentic workflows as a sequential decision process. The Boundary-Guided Training approach (BoSFT + BoPO) uses task difficulty taxonomies and boundary reference policies to stabilize RL under sparse rewards. Across SciWorld, ALFWorld, and AppWorld, BoPO consistently dominates the cost-success Pareto frontier, matching expensive always-large baselines at substantially reduced costs. The method also transfers to hard-budget constraints via Budget-Constrained Decoding, though static trade-off parameters limit full adaptability to varying budget caps.",
    328   "red_flags": [
    329     {
    330       "flag": "No variance or uncertainty reported despite multi-seed experiments",
    331       "detail": "The paper runs all experiments over 3 random seeds but reports only averaged point estimates. No standard deviations, confidence intervals, or error bars are shown in any table or figure. This makes it impossible to assess result stability or whether differences between methods are statistically meaningful."
    332     },
    333     {
    334       "flag": "No benchmark contamination analysis",
    335       "detail": "SciWorld (2022), ALFWorld (2021), and AppWorld (2024) are publicly available benchmarks that may be in the training data of GPT-4.1. No contamination analysis is performed, and model training cutoff dates are not stated."
    336     },
    337     {
    338       "flag": "Code not released",
    339       "detail": "The paper states 'Codes will be released upon acceptance' but no code is available at the time of evaluation. This prevents independent verification of the results."
    340     },
    341     {
    342       "flag": "Router cost excluded from metrics",
    343       "detail": "Section 5.1 and Appendix E acknowledge that the computational cost of the routing module is excluded from reported metrics. While the paper argues this is justified for the lightweight trained router (<2% overhead), the Zero-shot GPT-5 router baseline — which incurs substantial cost — is compared on the same cost axis, making the comparison asymmetric."
    344     }
    345   ],
    346   "cited_papers": [
    347     {
    348       "title": "FrugalGPT: How to use large language models while reducing cost and improving performance",
    349       "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"],
    350       "year": 2024,
    351       "relevance": "Foundational work on cost-efficient LLM routing via cascading strategies, a key baseline in this paper."
    352     },
    353     {
    354       "title": "RouteLLM: Learning to route LLMs from preference data",
    355       "authors": ["Isaac Ong", "Amjad Almahairi", "Vincent Wu", "Wei-Lin Chiang", "Tianhao Wu", "Joseph E. Gonzalez", "M. Waleed Kadous", "Ion Stoica"],
    356       "year": 2025,
    357       "relevance": "Trains a preference classifier for single-turn LLM routing, used as a baseline in this paper."
    358     },
    359     {
    360       "title": "xRouter: Training cost-aware LLMs orchestration system via reinforcement learning",
    361       "authors": ["Cheng Qian", "Zhengming Liu", "Srinivas Kokane", "Anurag Prabhakar", "Junwei Qiu", "Hao Chen", "Zhaoyang Liu", "Heng Ji", "Weizhu Yao", "Shelby Heinecke", "Silvio Savarese", "Caiming Xiong", "Huan Wang"],
    362       "year": 2025,
    363       "arxiv_id": "2510.08439",
    364       "relevance": "RL-based multi-round LLM orchestration system, closely related work for agentic routing."
    365     },
    366     {
    367       "title": "Router-R1: Teaching LLMs multi-round routing and aggregation via reinforcement learning",
    368       "authors": ["Hao Zhang", "Tao Feng", "Jiaxuan You"],
    369       "year": 2025,
    370       "relevance": "RL-trained router for multi-round model routing and aggregation, directly comparable approach."
    371     },
    372     {
    373       "title": "MasRouter: Learning to route LLMs for multi-agent systems",
    374       "authors": ["Yanbo Yue", "Guoxin Zhang", "Biao Liu", "Guowei Wan", "Kai Wang", "Deheng Cheng", "Yuhui Qi"],
    375       "year": 2025,
    376       "doi": "10.18653/v1/2025.acl-long.757",
    377       "relevance": "Routing for multi-agent systems, addressing sub-task assignment to specialized agent roles."
    378     },
    379     {
    380       "title": "A unified approach to routing and cascading for LLMs",
    381       "authors": ["Jasper Dekoninck", "Maximilian Baader", "Martin Vechev"],
    382       "year": 2025,
    383       "relevance": "Unified routing-cascading framework for LLMs, advancing the routing paradigm."
    384     },
    385     {
    386       "title": "Hybrid LLM: Cost-efficient and quality-aware query routing",
    387       "authors": ["Dujian Ding", "Ankur Mallick", "Chi Wang", "Robert Sim", "Subhabrata Mukherjee", "Victor Rühle", "Laks V. S. Lakshmanan", "Ahmed H. Awadallah"],
    388       "year": 2024,
    389       "relevance": "Cost-quality routing framework for LLMs, prior work from some of the same authors."
    390     },
    391     {
    392       "title": "OmniRouter: Budget and performance controllable multi-LLM routing",
    393       "authors": ["Kai Mei", "Wenxuan Xu", "Mingyi Guo", "Shiwen Lin", "Yang Zhang"],
    394       "year": 2026,
    395       "doi": "10.1145/3787470.3787480",
    396       "relevance": "Casts routing as global constrained optimization over query distributions, relevant budget-aware approach."
    397     },
    398     {
    399       "title": "CARROT: A cost aware rate optimal router",
    400       "authors": ["Samuel Somerstep", "Felipe Maia Polo"],
    401       "year": 2025,
    402       "arxiv_id": "2502.03261",
    403       "relevance": "Lightweight cost-quality trade-off router with theoretical guarantees."
    404     },
    405     {
    406       "title": "Adaptive LLM routing under budget constraints",
    407       "authors": ["Pratik Panda", "R. Magazine", "C. Devaguptapu", "S. Takemori", "V. Sharma"],
    408       "year": 2025,
    409       "doi": "10.18653/v1/2025.findings-emnlp.1301",
    410       "relevance": "Budget-constrained adaptive routing policies for LLMs, directly related to the budget-aware paradigm."
    411     },
    412     {
    413       "title": "Best-route: Adaptive LLM routing with test-time optimal compute",
    414       "authors": ["Dujian Ding", "Ankur Mallick"],
    415       "year": 2025,
    416       "relevance": "Test-time compute optimization for LLM routing, related budget-constrained approach from overlapping authors."
    417     },
    418     {
    419       "title": "AppWorld: A controllable world of apps and people for benchmarking interactive coding agents",
    420       "authors": ["Harsh Trivedi", "Tushar Khot", "Marc Hartmann"],
    421       "year": 2024,
    422       "doi": "10.18653/v1/2024.acl-long.850",
    423       "relevance": "Primary benchmark for tool-use and coding complexity evaluation of agentic systems."
    424     },
    425     {
    426       "title": "AgentGym: Evolving large language model-based agents across diverse environments",
    427       "authors": ["Zhiheng Xi"],
    428       "year": 2024,
    429       "relevance": "Framework used for standardized evaluation on ALFWorld and SciWorld benchmarks."
    430     }
    431   ]
    432 }

Impressum · Datenschutz