ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (25909B)


      1 {
      2   "paper": {
      3     "title": "BOUTE: Cost-Efficient LLM Serving with Heterogeneous LLMs and GPUs via Multi-Objective Bayesian Optimization",
      4     "authors": ["Youhe Jiang", "Fangcheng Fu", "Eiko Yoneki"],
      5     "year": 2026,
      6     "venue": "MLSys 2026",
      7     "arxiv_id": "2602.10729"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No repository URL, GitHub link, or code archive is provided anywhere in the paper. The system implementation section (§6) describes the components but does not release code."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The evaluation uses publicly available datasets: GSM8K (Cobbe et al., 2021) and MTBench (Zheng et al., 2023), both standard public benchmarks. The ETH EASL Scratchpad simulator is also publicly available at https://github.com/eth-easl/Scratchpad."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "Section 7.1 specifies the GPU hardware in detail: NVIDIA H100-80G, RTX PRO 6000-96G, RTX 5090-32G, and RTX 4090-24G, with interconnect bandwidths (NVLink 300GB/s, PCIe 60GB/s). Section 6 specifies GPyTorch and BoTorch frameworks. However, no requirements.txt or Dockerfile is provided."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions, README with commands, or scripts for replicating the main experiments are provided. The paper describes the system architecture but not how to reproduce the results."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No confidence intervals, error bars, or ± notation are reported for the main results in Figure 4 or Tables 1-3. All results are presented as point estimates."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper makes comparative claims (e.g., 'up to 157% improvement') but no statistical significance tests (p-values, t-tests, etc.) are used to validate these differences."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper consistently reports percentage improvements with baseline context. For example, 'BOUTE reduces P95 latency by up to 66% (50% on average)' and 'reducing costs by 15%-61% (38% on average)' in §7.2 and Table 3, with absolute values provided in the figures."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No justification is given for the number of workload traces, the system load level (100 req/s), or the number of quality requirement thresholds tested. No power analysis is discussed."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No standard deviations, variance across runs, or spread measures are reported. The simulator accuracy in Table 4 shows absolute percentage errors but no variance across multiple runs of the full system."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Section 7.1 describes four baselines: Stand-Alone-Homo (vLLM with homogeneous GPUs), Stand-Alone-Hetero (vLLM with heterogeneous GPUs), RouteLLM, and BOUTE-Homo (BOUTE without heterogeneous GPUs)."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The baselines include RouteLLM (2024) and vLLM (2023); systems such as Helix (2025) and ThunderServe (2025) are discussed in related work. The baselines are recent and represent state-of-the-art open-source LLM serving systems."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Table 2 presents ablations: BOUTE vs. BOUTE (w/o structural info) and BOUTE (w/o offline prep). Figure 5 also compares BOUTE, BOUTE-Homo, and BOUTE (w/o structural info) across quality requirements. §3 provides a step-by-step workload characterization showing the contribution of each component (routing, resource allocation, heterogeneous GPUs)."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper evaluates on multiple metrics: P95 latency, average latency, P96-P99 latencies, throughput (req/s), quality (accuracy on GSM8K, score on MTBench), and cost ($/h) in Table 3."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "This is a systems paper about LLM serving infrastructure. Human evaluation of the system's outputs is not relevant — the paper evaluates system-level metrics (latency, throughput, cost) using automated measurements."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "This is not a machine learning modeling paper. The evaluation uses workload traces for system benchmarking, not a train/dev/test split. The concept of a held-out test set does not structurally apply."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are broken down by workload (GSM8K vs. MTBench), by quality requirement levels (87/89/91 for GSM8K, 8.1/8.3/8.5 for MTBench), and by percentile latency metrics (AVG, P95-P99) in Figure 4. Table 1 provides per-model routing and resource allocation details."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 3 discusses failure cases: Approach 1 shows that naive routing with uniform resource allocation actually increases latency by 10% over the baseline. §7.3 discusses how BOUTE (w/o structural info) frequently converges to local optima, resulting in inferior performance. Figure 5 shows cases where the ablated system underperforms even BOUTE-Homo."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 3, Approach 1 reports that model routing with uniform resource allocation increases P95 latency by 10% compared to the baseline. The paper also notes that BOUTE (w/o structural info) can perform worse than BOUTE-Homo (§7.3, Figure 5), and that RouteLLM achieves even lower throughput than standalone deployment in some cases (§7.2)."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims 'up to 157% and 59% on average' throughput improvement and '15%-61% (38% on average)' cost reduction. These are supported by Figure 4 and Table 3 in §7.2 and §7.4."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper makes causal claims about component contributions via controlled ablations. The workload characterization (§3) systematically adds components one at a time (routing, resource allocation, heterogeneous GPUs), and §7.3/Table 2 ablates structural information and offline preparation. These are controlled single-variable manipulations."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The title and abstract claim 'cost-efficient LLM serving' generally, but results are only shown for Llama3.1-8B and Llama3.1-70B on two benchmarks (GSM8K, MTBench) with a single router. The paper does not bound its claims to these specific models or workloads, nor acknowledge that results may differ with other model families, sizes, or workload patterns."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper does not discuss alternative explanations for its results. For example, it does not consider whether the improvements are due to the specific model pairing (8B vs 70B having a large capability gap), the particular GPU price ratios tested, or the workload characteristics of GSM8K/MTBench. No threats-to-validity or alternative explanations section is present."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "Section 7.1 specifies 'Llama3.1-8B' and 'Llama3.1-70B' which are specific model versions with known architectures and parameter counts. The router is from RouteLLM (Ong et al., 2024) trained on specific data."
    133       },
    134       "prompts_provided": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "This is a systems paper evaluating serving infrastructure. The evaluation replays queries from benchmark datasets (GSM8K, MTBench) as workload traces rather than relying on hand-crafted prompts. No custom prompts are designed as part of the methodology."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "The paper reports system-level hyperparameters: system load of 100 req/s, budget of $30/h, quality requirements (87/89/91 for GSM8K, 8.1/8.3/8.5 for MTBench), GPU costs per type (§7.1), MOBO convergence criterion (stable for 20 consecutive iterations, Table 2), and kernel specifications (Matérn-5/2, Matérn-3/2, §6)."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. BOUTE is a scheduling/serving system, not an agentic LLM system."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 7.1 states workload traces are 'subsampled from GSM8K and MTBench' following prior work (Peng et al., 2025; Zhong et al., 2024). The routing score computation and system load configuration (100 req/s) are specified. Appendix A describes simulator inputs including workload-specific characteristics."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "There is no dedicated limitations or threats-to-validity section anywhere in the paper. The paper moves directly from evaluation results to the conclusion."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "No specific threats to validity are discussed. There is no mention of potential limitations such as simulator accuracy affecting conclusions, generalizability to other model families, or sensitivity to GPU pricing."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No explicit statements about what the results do NOT show. The paper does not clarify that results are specific to the tested model pair, GPU types, benchmarks, or routing strategy. The conclusion makes broad claims about 'democratization of LLM serving' without qualifying scope."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "No raw experimental data, performance logs, or simulation outputs are made available. Only aggregated results in figures and tables are provided."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 7.1 describes the experimental setup: GPU server configurations, GPU types and costs, interconnect bandwidths, models used, router design, and workload trace generation methodology referencing prior work."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. The study uses computational benchmarks and simulations."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The full pipeline is documented: offline preparation (§5.1) profiles GPUs and simulates deployments, building a performance database. Online optimization (§5.2) generates deployment candidates via Pareto-skimming and knapsack optimization, then runs MOBO. The coordinator (§6, Appendix C) dispatches queries based on optimal configurations."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding sources or acknowledgments section is present in the paper."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are clearly stated: University of Cambridge (Youhe Jiang, Eiko Yoneki) and Shanghai Jiao Tong University (Fangcheng Fu). These are academic institutions, not vendors of the evaluated products."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "Funding is not disclosed, so independence of the funder cannot be assessed. The absence of any funding or acknowledgments statement means this criterion is not satisfied."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement or financial interest declaration is present in the paper. First author Youhe Jiang has multiple related papers on heterogeneous LLM serving (HexGen, ThunderServe, etc.), indicating an ongoing research program in this area, but no declaration is made."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "This paper does not evaluate model capabilities on benchmarks to measure model knowledge. It uses benchmark datasets (GSM8K, MTBench) to generate workload traces for system-level performance evaluation (latency, throughput). The quality metric measures how well the routing system preserves the models' existing capabilities, not the models' raw benchmark performance."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "Same rationale: the paper evaluates serving system performance (latency, throughput, cost), not model benchmark performance. Contamination of the benchmark in model training data would not affect the validity of the systems-level claims."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "Same rationale: the paper's claims are about system-level serving efficiency, not about model accuracy on benchmarks. Benchmark contamination is irrelevant to the systems claims."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "Cost is a central focus of the paper. Table 3 reports the hourly cost for each system to meet specific quality and latency targets (e.g., BOUTE at $32.25/h vs. Stand-Alone-Homo at $61.83/h for GSM8K with quality 87 and latency 8s). GPU per-hour costs are specified in §7.1."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": true,
    279         "justification": "The paper specifies the budget constraint ($30/h for main experiments, §7.1), GPU costs per type ($2.64-3.07/h for H100, $0.89/h for RTX 5090, etc.), and scheduling convergence time (23.5s to 1.2min for BOUTE, Table 2). The offline preparation completes 'within several dozen minutes' (§5.1)."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "BOUTE outperforms state-of-the-art LLM serving systems by up to 157% and 59% on average in throughput under identical cost budgets and quality requirements.",
    286       "evidence": "Figure 4 shows throughput comparisons across GSM8K and MTBench workloads with varying quality requirements. The 157% improvement is against Stand-Alone-Homo on MTBench-8.1. The 59% average is across all baselines and settings (§7.2).",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "BOUTE reduces serving costs by 15%-61% (38% on average) while maintaining the same performance targets.",
    291       "evidence": "Table 3 shows hourly costs under identical quality and latency requirements. BOUTE achieves $30.98-$32.25/h vs. $37.50-$79.93/h for baselines across four test scenarios (§7.4).",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "Heterogeneous model deployment naturally complements heterogeneous model routing, creating a synergistic system.",
    296       "evidence": "Section 3 provides a step-by-step workload characterization showing that combining routing with heterogeneous GPU deployment reduces P95 latency by 33% compared to single-model deployment. Figure 1 and the comparison between BOUTE and BOUTE-Homo (14% average improvement, §7.2) support this.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "Incorporating structural information into the MOBO framework reduces scheduling time by approximately 8x.",
    301       "evidence": "Table 2 compares BOUTE vs. BOUTE (w/o structural info): 23.5s vs. 3.6min for 3 GPU types, 32.5s vs. 5.3min for 4 GPU types (§7.3). However, this is a single measurement with no variance reported.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "Different models favor different GPU types for cost-efficient serving.",
    306       "evidence": "Section 3 shows that Llama3.1-8B achieves 1.5x lower P95 latency on RTX 5090 vs H100 under equal cost, while Llama3.1-70B achieves 2x lower P95 latency on H100 vs RTX 5090. Demonstrated with one model pair only.",
    307       "supported": "moderate"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval"],
    311   "key_findings": "BOUTE is a quality-aware scheduling system that jointly optimizes heterogeneous model routing and GPU deployment for cost-efficient LLM serving using multi-objective Bayesian optimization (MOBO). The system achieves up to 157% higher throughput (59% on average) under identical cost budgets and quality requirements, or 15-61% cost reduction at the same performance targets, compared to state-of-the-art baselines (vLLM, RouteLLM) on GSM8K and MTBench workloads with Llama3.1-8B/70B. The key insight is that heterogeneous model deployment (matching GPU types to model characteristics) naturally complements heterogeneous model routing, and co-optimizing both dimensions through MOBO with structural information achieves superior Pareto-optimal solutions. The offline preparation phase (simulator-based performance profiling) enables the online optimization to converge in under a minute.",
    312   "red_flags": [
    313     {
    314       "flag": "No uncertainty quantification",
    315       "detail": "All results are presented as single point estimates with no confidence intervals, error bars, or variance across experimental runs. For a systems paper, run-to-run variance in latency measurements can be significant due to queuing effects, network contention, and GPU thermal throttling."
    316     },
    317     {
    318       "flag": "No limitations section",
    319       "detail": "The paper has no limitations, threats-to-validity, or scope-bounding discussion. Given the narrow experimental scope (one model pair, two benchmarks, one router), this is a significant omission."
    320     },
    321     {
    322       "flag": "Narrow model diversity",
    323       "detail": "All experiments use only Llama3.1-8B and Llama3.1-70B. The approach's effectiveness with other model families (e.g., Mistral, Qwen), more than two model types, or models with different architectural characteristics is unknown. The abstract and conclusion make general claims about 'LLM serving' without this caveat."
    324     },
    325     {
    326       "flag": "Simulator-based evaluation concerns",
    327       "detail": "The offline preparation relies on a simulator (ETH EASL Scratchpad) with 2-7% estimation errors (Table 4). While simulator accuracy is validated for throughput, its accuracy for latency percentiles (P95-P99) under realistic conditions is not independently verified. The end-to-end results mix real GPU execution with simulator-optimized configurations."
    328     },
    329     {
    330       "flag": "Self-citation density",
    331       "detail": "Several of the cited system works (HexGen, ThunderServe, HexGen-2, HexiScale, etc.) share the first author (Youhe Jiang), suggesting the evaluation ecosystem may be somewhat self-referential. No potential conflicts are declared."
    332     }
    333   ],
    334   "cited_papers": [
    335     {
    336       "title": "RouteLLM: Learning to Route LLMs with Preference Data",
    337       "authors": ["I. Ong", "A. Almahairi", "V. Wu", "W.-L. Chiang", "T. Wu", "J. E. Gonzalez", "M. W. Kadous", "I. Stoica"],
    338       "year": 2024,
    339       "arxiv_id": "2406.18665",
    340       "relevance": "Core baseline for heterogeneous model routing, directly compared against BOUTE in the evaluation."
    341     },
    342     {
    343       "title": "HybridLLM: Cost-Efficient and Quality-Aware Query Routing",
    344       "authors": ["D. Ding", "A. Mallick", "C. Wang", "R. Sim", "S. Mukherjee", "V. Ruhle", "L. V. Lakshmanan", "A. H. Awadallah"],
    345       "year": 2024,
    346       "arxiv_id": "2404.14618",
    347       "relevance": "Adaptive LLM routing system that switches between large and small models based on query complexity."
    348     },
    349     {
    350       "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance",
    351       "authors": ["L. Chen", "M. Zaharia", "J. Zou"],
    352       "relevance": "Cascading approach for cost-efficient LLM usage that routes through models from weakest to strongest."
    353     },
    354     {
    355       "title": "Efficient Memory Management for Large Language Model Serving with PagedAttention",
    356       "authors": ["W. Kwon", "Z. Li", "S. Zhuang", "Y. Sheng", "L. Zheng", "C. H. Yu", "J. Gonzalez", "H. Zhang", "I. Stoica"],
    357       "year": 2023,
    358       "relevance": "vLLM serving system used as the primary baseline for standalone LLM serving in the evaluation."
    359     },
    360     {
    361       "title": "Helix: Serving Large Language Models over Heterogeneous GPUs and Network via Max-Flow",
    362       "authors": ["Y. Mei", "Y. Zhuang", "X. Miao", "J. Yang", "Z. Jia", "R. Vinayak"],
    363       "year": 2025,
    364       "relevance": "Heterogeneous GPU deployment system using max-flow algorithm, directly relevant to the LLM serving cost-efficiency research."
    365     },
    366     {
    367       "title": "ThunderServe: High-Performance and Cost-Efficient LLM Serving in Cloud Environments",
    368       "authors": ["Y. Jiang", "F. Fu", "X. Yao", "T. Wang", "B. Cui", "A. Klimovic", "E. Yoneki"],
    369       "year": 2025,
    370       "arxiv_id": "2502.09334",
    371       "relevance": "Heterogeneous GPU deployment for LLM serving that separates prefill and decoding phases across different GPU types."
    372     },
    373     {
    374       "title": "Mélange: Cost Efficient Large Language Model Serving by Exploiting GPU Heterogeneity",
    375       "authors": ["T. Griggs", "X. Liu", "J. Yu", "D. Kim", "W.-L. Chiang", "A. Cheung", "I. Stoica"],
    376       "year": 2024,
    377       "arxiv_id": "2404.14527",
    378       "relevance": "Explores GPU heterogeneity for cost-efficient LLM serving, closely related to the deployment optimization in BOUTE."
    379     },
    380     {
    381       "title": "Rerouting LLM Routers",
    382       "authors": ["A. Shafran", "R. Schuster", "T. Ristenpart", "V. Shmatikov"],
    383       "year": 2025,
    384       "arxiv_id": "2501.01818",
    385       "relevance": "Examines security and adversarial aspects of LLM routing systems, relevant to the robustness of routing-based serving approaches."
    386     },
    387     {
    388       "title": "Learning to Route LLMs with Confidence Tokens",
    389       "authors": ["Y.-N. Chuang", "P. K. Sarma", "P. Gopalan", "J. Boccio", "S. Bolouki", "X. Hu", "H. Zhou"],
    390       "year": 2024,
    391       "arxiv_id": "2410.13284",
    392       "relevance": "Alternative approach to LLM routing using confidence tokens, directly related to the heterogeneous model routing paradigm."
    393     },
    394     {
    395       "title": "Adaptive LLM Routing under Budget Constraints",
    396       "authors": ["P. Panda", "R. Magazine", "C. Devaguptapu", "S. Takemori", "V. Sharma"],
    397       "year": 2025,
    398       "arxiv_id": "2508.21141",
    399       "relevance": "Addresses the same problem space of budget-constrained LLM routing, directly relevant to cost-efficient serving research."
    400     },
    401     {
    402       "title": "DistServe: Disaggregating Prefill and Decoding for Goodput-optimized Large Language Model Serving",
    403       "authors": ["Y. Zhong", "S. Liu", "J. Chen", "J. Hu", "Y. Zhu", "X. Liu", "X. Jin", "H. Zhang"],
    404       "year": 2024,
    405       "relevance": "Key LLM serving system that disaggregates prefill and decoding phases, influencing the serving architecture design space."
    406     }
    407   ]
    408 }

Impressum · Datenschutz