ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (30342B)


      1 {
      2   "paper": {
      3     "title": "GreenServ: Energy-Efficient Context-Aware Dynamic Routing for Multi-Model LLM Inference",
      4     "authors": [
      5       "Thomas Ziller",
      6       "Shashikant Ilager",
      7       "Alessandro Tundo",
      8       "Ezio Bartocci",
      9       "Leonardo Mariani",
     10       "Ivona Brandic"
     11     ],
     12     "year": 2026,
     13     "venue": "arXiv.org",
     14     "arxiv_id": "2601.17551",
     15     "doi": "10.48550/arXiv.2601.17551"
     16   },
     17   "scan_version": 2,
     18   "active_modules": ["experimental_rigor", "data_leakage"],
     19   "methodology_tags": ["benchmark-eval"],
     20   "key_findings": "GreenServ, a contextual multi-armed bandit routing framework for LLM inference, achieves 22% higher accuracy and 31% lower energy consumption compared to random routing across 5 benchmarks and 16 open-source LLMs. Task type is the single most informative contextual feature for routing decisions, with full-feature contexts actually hurting convergence due to increased dimensionality. The system surpasses the static Pareto front of single-model deployments and adapts to new models at runtime without offline recalibration, with negligible overhead (6.7-7.8 ms per query).",
     21   "checklist": {
     22     "artifacts": {
     23       "code_released": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "The paper states 'All artifacts are open-source and available as an anonymous repository for review purposes' and provides a link to anonymous.4open.science. While this is an anonymous review link rather than a permanent public repository, a URL to the code is provided."
     27       },
     28       "data_released": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "All five evaluation datasets are publicly available standard benchmarks: MMLU, HellaSwag, Winogrande, GSM8K, and CNN/Daily Mail, loaded through the HuggingFace datasets library (§5)."
     32       },
     33       "environment_specified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The paper specifies Python 3.10, Ubuntu 22.04.5 LTS, CUDA 12.2, A100 80GB, bfloat16, and names libraries (FastAPI, sentence-transformers, scikit-learn, NumPy, PyTorch, transformers, zeus, textstat) but does not provide a requirements.txt, Dockerfile, or library version numbers sufficient to recreate the environment."
     37       },
     38       "reproduction_instructions": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No step-by-step reproduction instructions are included in the paper. The anonymous repository link is provided but the paper itself contains no README-equivalent or specific commands to replicate experiments."
     42       }
     43     },
     44     "statistical_methodology": {
     45       "confidence_intervals_or_error_bars": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "95% confidence intervals are reported throughout. §6.1.4: 'Results include 95% confidence intervals.' Figure 2a: 'Error bars represent 95% confidence intervals.' Figure 3 shows shaded areas representing 95% CIs."
     49       },
     50       "significance_tests": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "No statistical significance tests are reported. The paper claims GreenServ 'outperforms' baselines and notes '22% increase in accuracy' but relies on comparing point estimates and overlapping confidence intervals without formal hypothesis testing (no p-values, t-tests, or other tests)."
     54       },
     55       "effect_sizes_reported": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Effect sizes with baseline context are reported: '22% increase in accuracy while reducing cumulative energy consumption by 31%' compared to random routing (§6.3.1). Additional comparisons: Smallest '+90-100% accuracy', accuracy-optimized '-10-12% accuracy, -75% energy.' These provide magnitude and baseline context."
     59       },
     60       "sample_size_justified": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "500 instances per dataset (2,500 total queries per run) are used with no justification for this sample size. No power analysis or rationale for why 500 was sufficient."
     64       },
     65       "variance_reported": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Results are reported across 50 independent runs (§6.3.1, §6.3.3) and 20 runs (§6.2.2) with 95% confidence intervals and shaded variance regions in figures."
     69       }
     70     },
     71     "evaluation_design": {
     72       "baselines_included": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Six baselines are compared (§6.1.6): Random, Largest (Yi-34B), Smallest (Qwen2.5-0.5B), Highest Accuracy (Gemma-3-27B), ε-Greedy, and Thompson Sampling."
     76       },
     77       "baselines_contemporary": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Baselines include contemporary MAB algorithms (ε-Greedy, Contextual Thompson Sampling) and static strategies using current models (Gemma-3-27B, Yi-34B). The model pool includes recent models (Gemma 3, Qwen 2.5, Llama 3.1/3.2, Phi 4)."
     81       },
     82       "ablation_study": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "§6.2.3 and §6.3.3 present a thorough ablation study of contextual features: None (context-free), single features (Task, Cluster, Complexity), pairwise combinations, and Full features, with 50 runs per configuration."
     86       },
     87       "multiple_metrics": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Multiple evaluation metrics are used: mean normalized accuracy, total energy consumption (Wh), cumulative regret, moving average regret, model selection frequency, overhead latency (ms), and AIQ from RouterBench."
     91       },
     92       "human_evaluation": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "All evaluation is fully automated using Exact Match and ROUGE metrics. No human evaluation of routing quality or model output quality is performed. The paper acknowledges this limitation: 'our current evaluation focuses on tasks with objective ground truth' (§6.4)."
     96       },
     97       "held_out_test_set": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "§6.1.2: '500 instances were uniformly sampled from the test set partition.' The task classifier uses separate training/validation splits (§4.2.1). RouterBench evaluation uses its own held-out data."
    101       },
    102       "per_category_breakdown": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Results are broken down by algorithm type, by individual model (Figure 7, Table 3), by feature configuration (Figure 5, Figure 8), by λ value (Figure 4, Figure 9), and by overhead component (Table 4)."
    106       },
    107       "failure_cases_discussed": {
    108         "applies": true,
    109         "answer": false,
    110         "justification": "No specific failure case examples are shown. The paper discusses system-level limitations (cold start, stationarity assumption) but does not present individual queries where routing failed or analyze error patterns qualitatively."
    111       },
    112       "negative_results_reported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "§6.3.3 reports that including all features 'appears to raise regret levels notably' compared to task-only context, attributed to 'increased dimensionality which potentially slows convergence.' The Complexity feature slightly increases regret (+7). These are genuine negative findings."
    116       }
    117     },
    118     "claims_and_evidence": {
    119       "abstract_claims_supported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "Abstract claims of '22% increase in accuracy while reducing cumulative energy consumption by 31%' are supported by §6.3.1 and Figure 2. RouterBench claims of '71.7% average accuracy with peak 75.7%' are confirmed in Table 1."
    123       },
    124       "causal_claims_justified": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "Causal claims are made through controlled ablation (§6.3.3): removing/adding individual features demonstrates their contribution. The ablation design uses controlled single-variable manipulation across 50 runs. The model addition experiment (§6.3.4) uses controlled introduction at a specific timestep."
    128       },
    129       "generalization_bounded": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The abstract specifies 'five benchmark tasks and a pool of 16 contemporary open-access LLMs.' The discussion (§6.4) explicitly bounds generalization: controlled environments, single GPU hardware, objective ground truth tasks only, specific feature engineering choices."
    133       },
    134       "alternative_explanations_discussed": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The discussion (§6.4) covers limitations but does not consider alternative explanations for the observed improvements. For example, whether the specific benchmark selection or model pool composition could explain the routing advantages is not discussed."
    138       },
    139       "proxy_outcome_distinction": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "The paper explicitly distinguishes its direct GPU energy measurements from proxy metrics used in prior work: 'Prior work often relies on proxy metrics (API costs, token budgets). We measure actual GPU power draw in watt-hours' (§5). They criticize others' use of 'synthetic proxies' (§1)."
    143       }
    144     },
    145     "setup_transparency": {
    146       "model_versions_specified": {
    147         "applies": true,
    148         "answer": true,
    149         "justification": "Exact HuggingFace model handles are provided in Table 2 (Appendix A.1), e.g., 'Qwen/Qwen2.5-0.5B-Instruct', 'meta-llama/Llama-3.2-1B-Instruct', 'google/gemma-3-27b-it'. These are specific enough to identify exact model versions."
    150       },
    151       "prompts_provided": {
    152         "applies": true,
    153         "answer": false,
    154         "justification": "The paper does not provide the actual prompts used to format benchmark queries for each model. §4.2.1 mentions 'the common structure of instruction-based tasks' but the actual prompt text or formatting templates used are not shown."
    155       },
    156       "hyperparameters_reported": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "MAB hyperparameters are well-documented (§6.1.5: LinUCB α=0.1, λ_reg=0.05, ε-Greedy ε₀=1.0, δ=0.98, ε_min=0.01, CTS σ=0.01, K=3, N_bins=3). However, critical LLM inference parameters (temperature, top_p, max_tokens) are not reported. batch_size=1 and bfloat16 are stated but sampling parameters are missing."
    160       },
    161       "scaffolding_described": {
    162         "applies": false,
    163         "answer": false,
    164         "justification": "No agentic scaffolding is used. GreenServ is a routing system that sends queries directly to models for single-pass inference without retry logic, tool use, or agent loops."
    165       },
    166       "data_preprocessing_documented": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "§6.1.2: '500 instances were uniformly sampled from the test set partition using a fixed random seed.' §4.2.1-4.2.3 describe feature extraction pipeline. §5 describes data loading, embedding computation, and evaluation metric computation."
    170       }
    171     },
    172     "limitations_and_scope": {
    173       "limitations_section_present": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "§6.4 Discussion contains substantial limitations discussion covering stationarity assumption, ground truth requirements, hardware generalization, feature sensitivity, and controlled vs. operational environments."
    177       },
    178       "threats_to_validity_specific": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Limitations in §6.4 are specific to this study: 'MAB algorithms assume stationary rewards and adapt slowly to drifts', 'our current evaluation focuses on tasks with objective ground truth', 'generalization across hardware requires latency profiles for various GPU architectures', 'sensitivity to specific feature engineering choices, such as the number of clusters K or the number of complexity bins N.'"
    182       },
    183       "scope_boundaries_stated": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "§6.4 explicitly states scope boundaries: deterministic evaluation only (not open-ended generation), controlled environments (not operational conditions with concurrency and queuing), single GPU hardware. §3.1.1: 'we focus exclusively on tasks where accuracy can be measured objectively.'"
    187       }
    188     },
    189     "data_integrity": {
    190       "raw_data_available": {
    191         "applies": true,
    192         "answer": false,
    193         "justification": "No raw experimental data (individual query results, per-query energy measurements, routing decisions) is mentioned as being released. Only aggregate results are shown in figures and tables."
    194       },
    195       "data_collection_described": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "§6.1.2 describes dataset selection and sampling (500 instances per test partition with fixed random seed). §5 describes energy measurement via zeus library from GPU power monitoring, latency via Python's time module, accuracy via EM and ROUGE."
    199       },
    200       "recruitment_methods_described": {
    201         "applies": false,
    202         "answer": false,
    203         "justification": "No human participants. Data sources are standard public benchmarks (MMLU, HellaSwag, Winogrande, GSM8K, CNN/DailyMail)."
    204       },
    205       "data_pipeline_documented": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "The pipeline is documented: datasets loaded from HuggingFace → 500 samples per dataset with fixed seed → feature extraction (embedding, clustering, complexity) → routing via MAB → inference → accuracy evaluation (EM/ROUGE) → energy/latency monitoring via zeus (§5, §6.1)."
    209       }
    210     },
    211     "conflicts_of_interest": {
    212       "funding_disclosed": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "No acknowledgments section, funding statement, or grant numbers appear in the paper."
    216       },
    217       "affiliations_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "Author affiliations are clearly listed: TU Wien, University of Amsterdam, University of Milano-Bicocca. All are academic institutions with no apparent conflict with the evaluated open-source models."
    221       },
    222       "funder_independent_of_outcome": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No funding information is disclosed, so independence of the funder cannot be assessed. The absence of funding disclosure is itself a gap."
    226       },
    227       "financial_interests_declared": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No competing interests statement or financial disclosure appears in the paper."
    231       }
    232     },
    233     "contamination": {
    234       "training_cutoff_stated": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "No training data cutoff dates are stated for any of the 16 models evaluated. The models (Qwen 2.5, Gemma 3, Llama 3.1/3.2, Phi 4, etc.) are recent but their training data boundaries are not discussed."
    238       },
    239       "train_test_overlap_discussed": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "No discussion of whether the 16 models may have been trained on the benchmark datasets. All five benchmarks (MMLU 2021, HellaSwag 2019, Winogrande 2021, GSM8K 2021, CNN/DailyMail 2015) predate the models and are widely available online."
    243       },
    244       "benchmark_contamination_addressed": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "All five benchmarks were published years before the models' training. MMLU, HellaSwag, Winogrande, GSM8K, and CNN/DailyMail are all publicly available and likely in training data. No contamination analysis is performed or discussed."
    248       }
    249     },
    250     "human_studies": {
    251       "pre_registered": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants in this study. Evaluation is entirely computational using benchmark datasets and automated metrics."
    255       },
    256       "irb_or_ethics_approval": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants. The study evaluates LLM routing performance on public benchmarks."
    260       },
    261       "demographics_reported": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants."
    265       },
    266       "inclusion_exclusion_criteria": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants."
    270       },
    271       "randomization_described": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants."
    275       },
    276       "blinding_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants."
    280       },
    281       "attrition_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants."
    285       }
    286     },
    287     "cost_and_practicality": {
    288       "inference_cost_reported": {
    289         "applies": true,
    290         "answer": true,
    291         "justification": "Energy consumption is reported in Wh throughout (e.g., ~165 Wh for GreenServ at λ=0.4). Overhead latency is reported per component (Table 4: 6.68-7.77 ms total). Per-model inference latency is detailed in Table 3 with min, P25, median, average, P75, max."
    292       },
    293       "compute_budget_stated": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "Hardware is specified (A100 80GB, 512GB RAM, AMD EPYC 9354P) but total computational budget is not stated. With 50 runs for baseline comparison, 20 runs × 11 λ values for trade-off analysis, 50 runs × 8 configs for ablation, plus additional experiments, total GPU hours are never quantified."
    297       }
    298     },
    299     "experimental_rigor": {
    300       "seed_sensitivity_reported": {
    301         "applies": true,
    302         "answer": true,
    303         "justification": "Results are reported across 50 independent runs for baseline comparison (§6.3.1) and ablation (§6.3.3), and 20 runs for λ sweep (§6.2.2), with 95% confidence intervals and variance shown in figures."
    304       },
    305       "number_of_runs_stated": {
    306         "applies": true,
    307         "answer": true,
    308         "justification": "Explicitly stated: 50 runs for baseline comparison (§6.3.1), 20 runs for λ sweep (§6.2.2, 'we executed 20 runs'), 50 runs for ablation (§6.2.3, 'We executed 50 runs for each configuration'), single run for model addition (§6.3.4)."
    309       },
    310       "hyperparameter_search_budget": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "§6.1.5 states 'we conducted preliminary experiments to tune hyperparameters' and §4.2.4 calls the search 'non-exhaustive tuning,' but the number of configurations tried, search method, and compute spent on tuning are not reported."
    314       },
    315       "best_config_selection_justified": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "Final hyperparameter values are stated (§6.1.5) but the selection process is not described. No mention of selection on validation vs. test set, no reporting of all configurations tried, and no explanation of why these specific values were chosen."
    319       },
    320       "multiple_comparison_correction": {
    321         "applies": true,
    322         "answer": false,
    323         "justification": "The paper makes many comparisons (GreenServ vs. 6 baselines across multiple λ values and feature configurations) but applies no correction for multiple comparisons (no Bonferroni, Holm, or Benjamini-Hochberg)."
    324       },
    325       "self_comparison_bias_addressed": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "The authors implement all baselines (ε-Greedy, Thompson Sampling, static strategies) themselves and compare against their own GreenServ system without acknowledging the bias of evaluating one's own system against self-implemented baselines."
    329       },
    330       "compute_budget_vs_performance": {
    331         "applies": true,
    332         "answer": true,
    333         "justification": "Performance is plotted directly against energy consumption (a compute proxy) in Figures 2b and 4. The accuracy-energy trade-off is the central analysis, with the Pareto front explicitly shown for comparison."
    334       },
    335       "benchmark_construct_validity": {
    336         "applies": true,
    337         "answer": false,
    338         "justification": "The paper uses MMLU, HellaSwag, Winogrande, GSM8K, and CNN/DailyMail without discussing whether these benchmarks are appropriate proxies for real-world LLM routing scenarios or whether they measure the claimed capabilities."
    339       },
    340       "scaffold_confound_addressed": {
    341         "applies": false,
    342         "answer": false,
    343         "justification": "No agentic scaffolding is involved. All models use the same direct inference pipeline (transformers + PyTorch with bfloat16), so there is no scaffold confound to address."
    344       }
    345     },
    346     "data_leakage": {
    347       "temporal_leakage_addressed": {
    348         "applies": true,
    349         "answer": false,
    350         "justification": "Not discussed. All five benchmarks (MMLU 2021, HellaSwag 2019, Winogrande 2021, GSM8K 2021, CNN/DailyMail 2015) predate the training of the 16 models used (2024-2025 era), creating potential temporal leakage."
    351       },
    352       "feature_leakage_addressed": {
    353         "applies": true,
    354         "answer": false,
    355         "justification": "Not discussed. The paper does not consider whether the formatting or presentation of benchmark queries could leak answer information, or whether multiple-choice formatting provides hints."
    356       },
    357       "non_independence_addressed": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "Not discussed. No analysis of whether training data and benchmark data share structural similarities or overlapping sources."
    361       },
    362       "leakage_detection_method": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "No leakage detection or prevention method is applied. No canary strings, membership inference tests, n-gram overlap analysis, or decontamination pipelines are mentioned."
    366       }
    367     }
    368   },
    369   "claims": [
    370     {
    371       "claim": "GreenServ achieves 22% higher accuracy and 31% lower energy consumption compared to random routing.",
    372       "evidence": "§6.3.1, Figure 2a: GreenServ accuracy ~0.65 vs. random ~0.51 with ~165 Wh vs. ~240 Wh, shown across 50 runs with 95% confidence intervals.",
    373       "supported": "strong"
    374     },
    375     {
    376       "claim": "GreenServ and contextual baselines surpass the static Pareto front, achieving accuracy-energy operating points unreachable by single-model deployments.",
    377       "evidence": "§6.3.1, Figure 2b: GreenServ and contextual baselines position in the upper-left region beyond the static Pareto front (red dashed line).",
    378       "supported": "strong"
    379     },
    380     {
    381       "claim": "Task type is the single most informative contextual feature for routing decisions, dropping median cumulative regret to approximately 400.",
    382       "evidence": "§6.3.3, Figure 5: Task-only context shows the largest regret reduction among single features across 50 runs. Adding all features raises regret due to increased dimensionality.",
    383       "supported": "strong"
    384     },
    385     {
    386       "claim": "GreenServ adapts to new models at runtime without offline recalibration.",
    387       "evidence": "§6.3.4, Figure 6: After Gemma-3-12b is introduced at query 1000, selection frequency rises to 20-25% within ~100 queries. Shown for a single run only.",
    388       "supported": "moderate"
    389     },
    390     {
    391       "claim": "GreenServ's total average overhead per query ranges between 6.68 and 7.77 ms.",
    392       "evidence": "§6.3.5, Table 4: Task classification 3.04ms, semantic clustering 3.37ms, complexity 0.15ms, LinUCB routing 0.86ms. Directly measured.",
    393       "supported": "strong"
    394     },
    395     {
    396       "claim": "GreenServ achieves an average accuracy of 71.7% with peak accuracy of 75.7% on RouterBench.",
    397       "evidence": "§6.3.6, Table 1: GreenServ peak accuracy 75.7%, average accuracy 71.7%, AIQ 0.607 across ~36k queries spanning 9 tasks.",
    398       "supported": "moderate"
    399     },
    400     {
    401       "claim": "Contextual routing consistently outperforms non-contextual and static baselines across all λ configurations.",
    402       "evidence": "§6.3.2, Figures 4 and 9: Across λ from 0 to 1, contextual algorithms maintain higher accuracy and lower energy consumption than non-contextual ε-Greedy across 20 runs.",
    403       "supported": "strong"
    404     }
    405   ],
    406   "red_flags": [
    407     {
    408       "flag": "Complete absence of contamination analysis",
    409       "detail": "All five benchmarks (MMLU, HellaSwag, Winogrande, GSM8K, CNN/DailyMail) were published in 2015-2021 and are widely available online. The 16 models tested are from the 2024-2025 era and almost certainly saw these benchmarks during training. This could inflate absolute accuracy numbers and distort routing decisions, yet contamination is never discussed."
    410     },
    411     {
    412       "flag": "No formal significance tests",
    413       "detail": "The paper claims GreenServ 'outperforms' baselines but notes that 'confidence intervals for both accuracy and energy consumption largely overlap across GreenServ and the contextual baselines' (§6.3.1). No formal hypothesis tests are conducted, making it unclear whether differences between contextual MAB variants are statistically significant."
    414     },
    415     {
    416       "flag": "Model addition experiment based on single run",
    417       "detail": "The adaptability claim (§6.3.4) relies on a single run, unlike baseline comparison (50 runs) and trade-off analysis (20 runs). This is insufficient evidence for a key contribution claim."
    418     },
    419     {
    420       "flag": "Missing LLM inference parameters",
    421       "detail": "Critical inference parameters (temperature, top_p, max_tokens) for the 16 models are not reported. These significantly affect model outputs and could influence routing behavior and accuracy measurements."
    422     },
    423     {
    424       "flag": "Anonymous repository only",
    425       "detail": "The code is provided via an anonymous review link (anonymous.4open.science) which is temporary and may not persist after review, undermining the reproducibility claim."
    426     }
    427   ],
    428   "cited_papers": [
    429     {
    430       "title": "RouteLLM: Learning to Route LLMs from Preference Data",
    431       "authors": ["Isaac Ong", "Amjad Almahairi", "Vincent Wu", "Wei-Lin Chiang", "Tianhao Wu", "Joseph E Gonzalez", "M Waleed Kadous", "Ion Stoica"],
    432       "year": 2025,
    433       "relevance": "Core related work on LLM routing using preference data, directly compared in the paper's related work section."
    434     },
    435     {
    436       "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance",
    437       "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"],
    438       "year": 2023,
    439       "arxiv_id": "2305.05176",
    440       "doi": "10.48550/ARXIV.2305.05176",
    441       "relevance": "Foundational work on cost-efficient LLM inference through cascading, motivates the routing-based alternative."
    442     },
    443     {
    444       "title": "MixLLM: Dynamic Routing in Mixed Large Language Models",
    445       "authors": ["Xinyuan Wang", "Yanchi Liu", "Wei Cheng"],
    446       "year": 2025,
    447       "relevance": "Contextual-bandit-based LLM routing with tag-enhanced embeddings, closest comparison to GreenServ's approach."
    448     },
    449     {
    450       "title": "LLM Bandit: Cost-Efficient LLM Generation via Preference-Conditioned Dynamic Routing",
    451       "authors": ["Yang Li"],
    452       "year": 2025,
    453       "arxiv_id": "2502.02743",
    454       "relevance": "Multi-armed bandit approach to LLM routing with preference conditioning, directly related methodology."
    455     },
    456     {
    457       "title": "RouterBench: A Benchmark for Multi-LLM Routing System",
    458       "authors": ["Qitian Jason Hu", "Jacob Bieker", "Xiuyu Li", "Nan Jiang"],
    459       "year": 2024,
    460       "relevance": "Evaluation framework used for external validation of GreenServ, establishes routing benchmark methodology."
    461     },
    462     {
    463       "title": "GPT-4 Technical Report",
    464       "authors": ["Josh Achiam", "Steven Adler", "Sandhini Agarwal"],
    465       "year": 2023,
    466       "arxiv_id": "2303.08774",
    467       "relevance": "Major LLM reference relevant to the survey's coverage of foundational model capabilities and deployment costs."
    468     },
    469     {
    470       "title": "Eagle: Efficient Training-Free Router for Multi-LLM Inference",
    471       "authors": ["Zesen Zhao", "Shuowei Jin", "Z. Morley Mao"],
    472       "year": 2024,
    473       "arxiv_id": "2409.15518",
    474       "relevance": "Training-free multi-LLM routing approach, represents the zero-calibration end of the routing spectrum."
    475     },
    476     {
    477       "title": "Hybrid LLM: Cost-Efficient and Quality-Aware Query Routing",
    478       "authors": ["Dujian Ding", "Ankur Mallick", "Chi Wang"],
    479       "year": 2024,
    480       "relevance": "Meta-learning approach to LLM routing that reduces large model calls by 40%, relevant to cost-efficiency claims."
    481     },
    482     {
    483       "title": "Smoothie: Label Free Language Model Routing",
    484       "authors": ["Neel Guha", "Mayee Chen", "Trevor Chow"],
    485       "year": 2024,
    486       "relevance": "Label-free routing via embedding-based similarity, addresses calibration overhead problem."
    487     },
    488     {
    489       "title": "EmbedLLM: Learning Compact Representations of Large Language Models",
    490       "authors": ["Richard Zhuang", "Tianhao Wu", "Zhaojin Wen"],
    491       "year": 2025,
    492       "relevance": "Compact model representations for routing, relevant to efficient model selection approaches."
    493     },
    494     {
    495       "title": "GraphRouter: A Graph-Based Router for LLM Selections",
    496       "authors": ["Tao Feng", "Yanzhen Shen", "Jiaxuan You"],
    497       "year": 2025,
    498       "relevance": "Graph neural network approach to LLM routing, models task-query-LLM interactions."
    499     },
    500     {
    501       "title": "Measuring Massive Multitask Language Understanding",
    502       "authors": ["Dan Hendrycks", "Collin Burns", "Steven Basart"],
    503       "year": 2021,
    504       "relevance": "MMLU benchmark used as one of five evaluation datasets, major LLM capability benchmark."
    505     }
    506   ]
    507 }

Impressum · Datenschutz