scan.json (26627B)
1 { 2 "paper": { 3 "title": "Apt-Serve: Adaptive Request Scheduling on Hybrid Cache for Scalable LLM Inference Serving", 4 "authors": ["Shihong Gao", "Xin Zhang", "Yanyan Shen", "Lei Chen"], 5 "year": 2025, 6 "venue": "Proc. ACM Manag. Data (SIGMOD)", 7 "arxiv_id": "2504.07494", 8 "doi": "10.1145/3725394" 9 }, 10 "checklist": { 11 "artifacts": { 12 "code_released": { 13 "applies": true, 14 "answer": true, 15 "justification": "A GitHub repository is provided: https://github.com/eddiegaoo/Apt-Serve (footnote 4, Section 6.1). The appendix with the greedy scheduling algorithm is also linked from the repository." 16 }, 17 "data_released": { 18 "applies": true, 19 "answer": true, 20 "justification": "The paper uses three publicly available datasets: ShareGPT, HumanEval, and LongBench. These are standard public benchmarks referenced with citations. The ultra-long context experiments use WikiText, Arxiv, and BookCorpus, also public datasets." 21 }, 22 "environment_specified": { 23 "applies": true, 24 "answer": false, 25 "justification": "The paper specifies hardware (NVIDIA A100 40GB GPUs with NVLink) and mentions FP16 precision, FlashAttention, NCCL, and building on vLLM, but does not provide a requirements.txt, Dockerfile, or detailed library version listing sufficient to recreate the environment." 26 }, 27 "reproduction_instructions": { 28 "applies": true, 29 "answer": false, 30 "justification": "No step-by-step reproduction instructions are provided in the paper. While a GitHub link is given, the paper itself does not contain a 'Reproducing Results' section or specific commands to replicate the experiments." 31 } 32 }, 33 "statistical_methodology": { 34 "confidence_intervals_or_error_bars": { 35 "applies": true, 36 "answer": false, 37 "justification": "All results are reported as point estimates (SLO attainment percentages, throughput multipliers). No confidence intervals, error bars, or ± notation appear in any table or figure." 38 }, 39 "significance_tests": { 40 "applies": true, 41 "answer": false, 42 "justification": "The paper makes comparative claims (e.g., '2.3× higher average request rates') but uses no statistical significance tests. Comparisons are based solely on comparing numbers from single experimental runs." 43 }, 44 "effect_sizes_reported": { 45 "applies": true, 46 "answer": true, 47 "justification": "The paper consistently reports effect sizes as multipliers with baseline context, e.g., 'Apt-Serve achieves up to 8.8× improvement in effective throughput compared to the state-of-the-art' and detailed per-dataset, per-model comparisons (Section 6.3) such as '2.3×, 2.0×, and 1.9× higher request rates on average compared to vLLM, Sarathi-Serve, and DeepSpeed-FastGen.'" 48 }, 49 "sample_size_justified": { 50 "applies": true, 51 "answer": false, 52 "justification": "The paper uses 500 or 1,000 sampled requests per dataset but never justifies why these specific numbers were chosen or discusses whether they are sufficient for the claims made." 53 }, 54 "variance_reported": { 55 "applies": true, 56 "answer": false, 57 "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. It is unclear whether experiments were run multiple times. All results appear to be from single runs." 58 } 59 }, 60 "evaluation_design": { 61 "baselines_included": { 62 "applies": true, 63 "answer": true, 64 "justification": "Three baselines are compared: vLLM, Sarathi-Serve, and DeepSpeed-FastGen (Section 6.2). These are described as 'representative state-of-the-art LLM inference serving systems.'" 65 }, 66 "baselines_contemporary": { 67 "applies": true, 68 "answer": true, 69 "justification": "All three baselines are from 2023-2024 and represent recent state-of-the-art systems: vLLM (SOSP 2023), Sarathi-Serve (OSDI 2024), and DeepSpeed-FastGen (2024). These are contemporary and competitive." 70 }, 71 "ablation_study": { 72 "applies": true, 73 "answer": true, 74 "justification": "Section 6.5 presents ablation studies: (1) Apt-Serve with KV-only cache vs. hybrid cache (Table 4), and (2) FCFS scheduling vs. adaptive scheduling (Table 5). Both isolate the contribution of individual components across multiple settings." 75 }, 76 "multiple_metrics": { 77 "applies": true, 78 "answer": true, 79 "justification": "The paper reports SLO attainment rate (%), effective throughput (maximum sustainable request rate), TTFT and P99 TBT distributions (CDFs in Figure 10), and scheduling algorithm execution time (Table 6)." 80 }, 81 "human_evaluation": { 82 "applies": false, 83 "answer": false, 84 "justification": "This is a systems paper evaluating infrastructure performance (throughput, latency). Human evaluation of system outputs is not relevant to the claims about scheduling and cache management." 85 }, 86 "held_out_test_set": { 87 "applies": false, 88 "answer": false, 89 "justification": "This is not a machine learning model evaluation paper. The system is evaluated on serving workloads, not on train/test splits. The concept of held-out test sets does not apply to systems benchmarking." 90 }, 91 "per_category_breakdown": { 92 "applies": true, 93 "answer": true, 94 "justification": "Results are broken down per dataset (ShareGPT, HumanEval, LongBench), per model size (13B, 30B, 66B), per SLO attainment threshold (90% and 60%), and per metric (TTFT vs. TBT) in Figures 8-12 and Tables 4-5." 95 }, 96 "failure_cases_discussed": { 97 "applies": true, 98 "answer": true, 99 "justification": "Section 6.6 discusses that Apt-Serve may cause 'a small fraction of requests (10%) experience starvation' due to the SLO-aware fallback mechanism, with high tail latency shown in Figure 10. Section 6.7 acknowledges challenges in ultra-long context scenarios where both systems struggle with TBT SLO." 100 }, 101 "negative_results_reported": { 102 "applies": true, 103 "answer": true, 104 "justification": "The paper reports that on HumanEval, improvement is 'moderate' compared to other datasets (Section 6.3), explains why (shorter cache lifetimes, lower variance), and notes that in ultra-long context scenarios on BookCorpus with Yi-6B-200K, 'both systems struggle to exceed 60% TBT SLO attainment' (Section 6.7)." 105 } 106 }, 107 "claims_and_evidence": { 108 "abstract_claims_supported": { 109 "applies": true, 110 "answer": true, 111 "justification": "The abstract claims 'up to 8.8× improvement in effective throughput.' Section 6.3 confirms: 'At 60% SLO attainment, Apt-Serve can handle 4.9×, 4.4×, and 4.3× higher request rates on average, with maximum values up to 8.8×, 7.9×, and 7.5×.' The 8.8× is the peak value at 60% SLO attainment comparing to vLLM." 112 }, 113 "causal_claims_justified": { 114 "applies": true, 115 "answer": true, 116 "justification": "Causal claims about the two factors (KV cache memory and FCFS scheduling) causing TTFT SLO violations are supported by controlled ablation studies. Section 6.5 isolates each component: Table 4 shows the effect of hybrid cache with adaptive scheduling held constant, and Table 5 shows the effect of adaptive scheduling with hybrid cache held constant." 117 }, 118 "generalization_bounded": { 119 "applies": true, 120 "answer": true, 121 "justification": "The paper explicitly bounds claims to the tested settings: specific model families (OPT 13B/30B/66B, LLaMA3-8B, Yi-6B), specific datasets, and specific hardware configurations (A100 GPUs). Section 6.7 frames generalization experiments as 'further investigate Apt-Serve's generalization capabilities' rather than claiming universal applicability. Future work explicitly mentions extending to multi-instance scenarios." 122 }, 123 "alternative_explanations_discussed": { 124 "applies": true, 125 "answer": false, 126 "justification": "The paper does not substantively discuss alternative explanations for the observed improvements. For example, it does not consider whether the gains might be partially attributable to differences in implementation quality, tuning of baselines, or workload-specific properties beyond what is analyzed. There is no threats-to-validity discussion." 127 } 128 }, 129 "setup_transparency": { 130 "model_versions_specified": { 131 "applies": true, 132 "answer": true, 133 "justification": "Specific model names with sizes are given: OPT-13B, OPT-30B, OPT-66B (Section 6.2, Table 2), LLaMA3-8B-Instruct262K, and Yi-6B-200K (Section 6.7). These are well-defined open-source model checkpoints, not API-based models where version ambiguity is a concern." 134 }, 135 "prompts_provided": { 136 "applies": false, 137 "answer": false, 138 "justification": "This paper does not use prompting in the sense of prompt engineering for LLM capabilities. The workloads consist of request datasets (ShareGPT conversations, HumanEval code completions, LongBench summaries) fed directly to models for inference serving evaluation. No prompt design is involved." 139 }, 140 "hyperparameters_reported": { 141 "applies": true, 142 "answer": true, 143 "justification": "The paper reports: FP16 precision (Section 6.2), SLO thresholds (Table 3), Poisson/Gamma distribution parameters for request arrivals, coefficient of variation values, request rates, memory pool block sizes, and the preprocessing cost of ~30 seconds for the coefficient ρ (Section 4.2). Hardware configurations are in Table 2." 144 }, 145 "scaffolding_described": { 146 "applies": false, 147 "answer": false, 148 "justification": "This paper does not use agentic scaffolding. It is a systems paper about inference serving infrastructure, not an agentic AI system." 149 }, 150 "data_preprocessing_documented": { 151 "applies": true, 152 "answer": true, 153 "justification": "Section 6.2 describes: 1,000 requests randomly sampled from each dataset, Poisson distribution for arrival generation, Figure 7 shows input/output length distributions. LongBench sequence lengths are limited due to OPT's 2048-token positional embedding constraint (footnote 5). Ultra-long dataset sampling is described in Section 6.7 with statistics in Table 7." 154 } 155 }, 156 "limitations_and_scope": { 157 "limitations_section_present": { 158 "applies": true, 159 "answer": false, 160 "justification": "There is no dedicated Limitations or Threats to Validity section. The conclusion mentions future work (multi-instance scenario, integrating with disaggregated architectures) but does not substantively discuss limitations of the current work." 161 }, 162 "threats_to_validity_specific": { 163 "applies": true, 164 "answer": false, 165 "justification": "No specific threats to validity are discussed. The paper does not address potential concerns such as the limited model family (mostly OPT), synthetic workload generation, or the gap between simulated and production workloads." 166 }, 167 "scope_boundaries_stated": { 168 "applies": true, 169 "answer": false, 170 "justification": "The paper does not explicitly state what the results do NOT show. While future work mentions multi-instance scenarios and disaggregated architectures, there is no explicit articulation of boundaries such as 'our results do not address production-scale deployment with heterogeneous GPUs' or similar scoping statements." 171 } 172 }, 173 "data_integrity": { 174 "raw_data_available": { 175 "applies": true, 176 "answer": false, 177 "justification": "Raw experimental data (e.g., per-request latency traces, scheduling decisions, memory utilization logs) are not made available. Only aggregated results in figures and tables are presented." 178 }, 179 "data_collection_described": { 180 "applies": true, 181 "answer": true, 182 "justification": "Section 6.2 describes data collection: 1,000 requests randomly sampled from each public dataset (ShareGPT, HumanEval, LongBench), request arrivals generated via Poisson distribution at various rates. Section 6.7 describes ultra-long context dataset sampling with statistics in Table 7." 183 }, 184 "recruitment_methods_described": { 185 "applies": false, 186 "answer": false, 187 "justification": "No human participants. Data sources are standard public benchmarks (ShareGPT, HumanEval, LongBench, WikiText, Arxiv, BookCorpus)." 188 }, 189 "data_pipeline_documented": { 190 "applies": true, 191 "answer": true, 192 "justification": "The pipeline from datasets to experimental results is documented: sampling requests from public datasets → generating arrival patterns via Poisson/Gamma distributions → running on specified hardware with specified models → measuring SLO attainment. Distribution statistics are shown in Figure 7 and Table 7." 193 } 194 }, 195 "conflicts_of_interest": { 196 "funding_disclosed": { 197 "applies": true, 198 "answer": true, 199 "justification": "Extensive funding disclosure in the Acknowledgments section, including NSFC, Hong Kong RGC, Guangdong Province grants, Microsoft Research Asia Collaborative Research Grant, HKUST-Webank joint research lab, and multiple other sources." 200 }, 201 "affiliations_disclosed": { 202 "applies": true, 203 "answer": true, 204 "justification": "Author affiliations are clearly stated: HKUST, HKUST(GZ), and Shanghai Jiao Tong University. This is an academic paper evaluating open-source systems, not a product evaluation by its creators." 205 }, 206 "funder_independent_of_outcome": { 207 "applies": true, 208 "answer": true, 209 "justification": "Funding sources include government grants (NSFC, Hong Kong RGC, Guangdong Province), university funds, and Microsoft Research Asia. The paper evaluates open-source inference serving systems (vLLM, Sarathi-Serve, DeepSpeed-FastGen) and none of the funders have a direct financial stake in the specific outcome of which system performs better." 210 }, 211 "financial_interests_declared": { 212 "applies": true, 213 "answer": false, 214 "justification": "No competing interests or financial interests statement is present in the paper. There is no declaration regarding patents, equity, or other financial interests related to the findings." 215 } 216 }, 217 "contamination": { 218 "training_cutoff_stated": { 219 "applies": false, 220 "answer": false, 221 "justification": "This paper does not evaluate a pre-trained model's capability on a benchmark. It evaluates inference serving system performance (throughput, latency) using models as workload generators. The models' knowledge of benchmark content is irrelevant to the claims." 222 }, 223 "train_test_overlap_discussed": { 224 "applies": false, 225 "answer": false, 226 "justification": "Not applicable for the same reason: the paper evaluates systems infrastructure performance, not model knowledge or capability. Whether the model has seen HumanEval problems does not affect the serving throughput measurements." 227 }, 228 "benchmark_contamination_addressed": { 229 "applies": false, 230 "answer": false, 231 "justification": "Not applicable. The benchmarks (ShareGPT, HumanEval, LongBench) are used to generate request workloads with representative length distributions, not to evaluate model quality. Contamination of model training data is irrelevant to the serving performance claims." 232 } 233 }, 234 "human_studies": { 235 "pre_registered": { 236 "applies": false, 237 "answer": false, 238 "justification": "No human participants in this study. This is a systems evaluation paper." 239 }, 240 "irb_or_ethics_approval": { 241 "applies": false, 242 "answer": false, 243 "justification": "No human participants in this study." 244 }, 245 "demographics_reported": { 246 "applies": false, 247 "answer": false, 248 "justification": "No human participants in this study." 249 }, 250 "inclusion_exclusion_criteria": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants in this study." 254 }, 255 "randomization_described": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants in this study." 259 }, 260 "blinding_described": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants in this study." 264 }, 265 "attrition_reported": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants in this study." 269 } 270 }, 271 "cost_and_practicality": { 272 "inference_cost_reported": { 273 "applies": true, 274 "answer": true, 275 "justification": "Table 6 reports the execution time of the scheduling algorithm (0.3ms to 10.8ms for 50-1600 requests), and compares it to the practical computation time ('a single decode iteration with 50 requests using the OPT-13B model takes approximately 120 milliseconds'). The ~30-second preprocessing cost for coefficient ρ is also stated." 276 }, 277 "compute_budget_stated": { 278 "applies": true, 279 "answer": false, 280 "justification": "The paper specifies hardware (A100 GPUs, configurations in Table 2) but does not state the total computational budget, such as total GPU hours spent on experiments, wall-clock time for the full experimental suite, or total energy/cost." 281 } 282 } 283 }, 284 "claims": [ 285 { 286 "claim": "Apt-Serve achieves up to 8.8× improvement in effective throughput compared to state-of-the-art inference serving systems.", 287 "evidence": "Section 6.3: At 60% SLO attainment, Apt-Serve handles up to 8.8× higher request rates than vLLM on ShareGPT with OPT-13B. Average improvements are 4.9×, 4.4×, and 4.3× over vLLM, Sarathi-Serve, and DeepSpeed-FastGen.", 288 "supported": "strong" 289 }, 290 { 291 "claim": "KV cache memory consumption limits batch size and is a primary bottleneck for TTFT SLO attainment.", 292 "evidence": "Section 3.1, Figure 2: At 3 req/s, the system hits batch size limit for over 80% of serving time, causing SLO attainment to drop sharply. TTFT SLO attainment declines correlate with time spent at batch size limit.", 293 "supported": "strong" 294 }, 295 { 296 "claim": "FCFS scheduling produces suboptimal batch compositions and even random scheduling outperforms it.", 297 "evidence": "Section 3.2, Figure 4: Random scheduling consistently achieves higher SLO attainment than FCFS across all request rates. Per-request analysis (Figures 4b, 4c) shows FCFS causes clustered TTFT violations.", 298 "supported": "strong" 299 }, 300 { 301 "claim": "The hybrid-cache-based scheduling problem is NP-hard and the greedy solution has an approximation ratio of 2.", 302 "evidence": "Section 5 establishes NP-hardness by reduction from 0-1 knapsack (Definition 1). The proof of the approximation ratio is deferred to an online appendix at the GitHub repository.", 303 "supported": "moderate" 304 }, 305 { 306 "claim": "Apt-Serve's hybrid cache consistently improves SLO attainment over KV-cache-only under the same adaptive scheduling.", 307 "evidence": "Section 6.5, Table 4: Across all tested request rates and burstiness levels on ShareGPT and LongBench, hybrid cache achieves equal or higher SLO attainment. Gains are more prominent at higher request rates and burstier loads.", 308 "supported": "strong" 309 }, 310 { 311 "claim": "Apt-Serve achieves up to 7.5× higher SLO attainment under bursty request conditions compared to baselines.", 312 "evidence": "Section 6.4, Figure 9: Under varying burstiness (Gamma distribution with different CV values), Apt-Serve consistently outperforms baselines with widening gaps as burstiness increases.", 313 "supported": "strong" 314 }, 315 { 316 "claim": "Apt-Serve's optimizations are complementary to other techniques like Sarathi-Serve's chunked prefill.", 317 "evidence": "Section 6.7, Figure 11: Apt-Serve-S (Apt-Serve atop Sarathi-Serve) outperforms both standalone Apt-Serve and standalone Sarathi-Serve, demonstrating complementary benefits.", 318 "supported": "strong" 319 } 320 ], 321 "methodology_tags": ["benchmark-eval"], 322 "key_findings": "Apt-Serve introduces a hybrid cache scheme combining KV cache with a memory-efficient hidden cache (storing input hidden state vectors at half the memory cost) and an adaptive runtime scheduling mechanism that dynamically optimizes batch composition. The hybrid-cache-based scheduling problem is formulated as an NP-hard optimization with a greedy 2-approximation solution. Extensive experiments on three datasets (ShareGPT, HumanEval, LongBench) and three OPT model sizes (13B-66B) show Apt-Serve achieves up to 8.8× higher effective throughput than vLLM, with consistent gains across varying request patterns and model scales. The approach is demonstrated to be complementary to existing optimizations like Sarathi-Serve's chunked prefill.", 323 "red_flags": [ 324 { 325 "flag": "No error bars or repeated runs", 326 "detail": "All experimental results appear to be from single runs with no confidence intervals, standard deviations, or indications of variance across repetitions. For a systems paper with stochastic workloads (Poisson/Gamma arrivals), variance across runs could be substantial." 327 }, 328 { 329 "flag": "No limitations or threats-to-validity section", 330 "detail": "The paper lacks any discussion of limitations. Potential concerns include: reliance primarily on the OPT model family, synthetic workload generation vs. production traces, limited GPU types tested (only A100), and the gap between controlled benchmarks and real deployment scenarios." 331 }, 332 { 333 "flag": "Peak improvement number in abstract", 334 "detail": "The abstract highlights 'up to 8.8× improvement' which is the peak value at 60% SLO attainment on one dataset (ShareGPT) with OPT-13B vs. vLLM. Average improvements are more modest (2.3× at 90% SLO attainment). While supported by data, leading with the peak number overstates typical gains." 335 } 336 ], 337 "cited_papers": [ 338 { 339 "title": "Efficient memory management for large language model serving with pagedattention", 340 "authors": ["Woosuk Kwon", "Zhuohan Li", "Siyuan Zhuang", "Ying Sheng", "Lianmin Zheng", "Cody Hao Yu", "Joseph Gonzalez", "Hao Zhang", "Ion Stoica"], 341 "year": 2023, 342 "relevance": "vLLM is the primary baseline and the system Apt-Serve is built upon; foundational work on PagedAttention for KV cache management in LLM serving." 343 }, 344 { 345 "title": "Taming Throughput-Latency Tradeoff in LLM Inference with Sarathi-Serve", 346 "authors": ["Amey Agrawal", "Nitin Kedia", "Ashish Panwar", "Jayashree Mohan", "Nipun Kwatra", "Bhargav Gulavani", "Alexey Tumanov", "Ramachandran Ramjee"], 347 "year": 2024, 348 "relevance": "State-of-the-art baseline system for LLM inference serving with chunked prefill and prefill-decode coalescing batching." 349 }, 350 { 351 "title": "Deepspeed-fastgen: High-throughput text generation for llms via mii and deepspeed-inference", 352 "authors": ["Connor Holmes", "Masahiro Tanaka", "Michael Wyatt"], 353 "year": 2024, 354 "arxiv_id": "2401.08671", 355 "relevance": "State-of-the-art baseline for LLM inference serving with prefill-decode coalescing batching techniques." 356 }, 357 { 358 "title": "DistServe: Disaggregating Prefill and Decoding for Goodput-optimized Large Language Model Serving", 359 "authors": ["Yinmin Zhong", "Shengyu Liu", "Junda Chen", "Jianbo Hu", "Yibo Zhu", "Xuanzhe Liu", "Xin Jin", "Hao Zhang"], 360 "year": 2024, 361 "relevance": "Disaggregated serving approach that distributes prefill and decode across separate GPUs; complementary direction to Apt-Serve." 362 }, 363 { 364 "title": "Orca: A distributed serving system for Transformer-Based generative models", 365 "authors": ["Gyeong-In Yu", "Joo Seong Jeong", "Geon-Woo Kim", "Soojeong Kim", "Byung-Gon Chun"], 366 "year": 2022, 367 "relevance": "Introduced iteration-level batching for LLM serving, a foundational technique used by all systems in this paper." 368 }, 369 { 370 "title": "Towards efficient generative large language model serving: A survey from algorithms to systems", 371 "authors": ["Xupeng Miao", "Gabriele Oliaro", "Zhihao Zhang"], 372 "year": 2023, 373 "arxiv_id": "2312.15234", 374 "relevance": "Comprehensive survey of LLM inference serving optimizations covering the algorithmic and systems landscape." 375 }, 376 { 377 "title": "Mooncake: Kimi's KVCache-centric Architecture for LLM Serving", 378 "authors": ["Ruoyu Qin", "Zheming Li", "Weiran He"], 379 "year": 2024, 380 "arxiv_id": "2407.00079", 381 "relevance": "KV-cache-centric serving architecture that addresses effective throughput optimization, directly related to the cache management problem." 382 }, 383 { 384 "title": "Llumnix: Dynamic Scheduling for Large Language Model Serving", 385 "authors": ["Biao Sun", "Ziming Huang", "Hanyu Zhao"], 386 "year": 2024, 387 "relevance": "Dynamic scheduling for LLM serving using KV cache migration across instances, addressing similar scheduling challenges." 388 }, 389 { 390 "title": "Splitwise: Efficient generative llm inference using phase splitting", 391 "authors": ["Pratyush Patel", "Esha Choukse", "Chaojie Zhang"], 392 "year": 2024, 393 "relevance": "Phase-splitting approach to LLM serving that separates prefill and decode, a complementary optimization direction." 394 }, 395 { 396 "title": "Evaluating large language models trained on code", 397 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 398 "year": 2021, 399 "arxiv_id": "2107.03374", 400 "relevance": "HumanEval benchmark used as one of the three evaluation datasets in Apt-Serve's experiments." 401 }, 402 { 403 "title": "Efficient LLM Scheduling by Learning to Rank", 404 "authors": ["Yichao Fu", "Siqi Zhu", "Runlong Su", "Aurick Qiao", "Ion Stoica", "Hao Zhang"], 405 "year": 2024, 406 "relevance": "Learning-based LLM scheduling that predicts output length ranks; discussed as complementary to Apt-Serve's scheduling approach." 407 } 408 ] 409 }