scan.json (19448B)
1 { 2 "paper": { 3 "title": "Aegaeon: Effective GPU Pooling for Concurrent LLM Serving on the Market", 4 "authors": ["Yuxing Xiang", "Xue Li", "Kun Qian", "Yufan Yang", "Diwen Zhu", "Wenyuan Yu", "Ennan Zhai", "Xuanzhe Liu", "Xin Jin", "Jingren Zhou"], 5 "year": 2025, 6 "venue": "SOSP '25", 7 "doi": "10.1145/3731569.3764815" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": false, 14 "justification": "No repository URL or code archive is provided in the paper. The system is described as deployed at Alibaba Cloud but no open-source release is mentioned." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "The production workload data from Alibaba Cloud Model Studio is not released. ShareGPT is public, but the paper's proprietary workload traces are not available." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "Hardware is described (H800 GPUs, DDR5, Intel Xeon 8469C CPUs) but no software environment specifications (requirements.txt, library versions, CUDA version) are provided beyond mentioning vLLM and Ray." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "No reproduction instructions, README, or scripts are provided. The implementation details are described at the design level but not enough for reproduction." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "Results are presented as point estimates (e.g., '2-2.5x higher request arrival rates', '82% GPU resource saving') without confidence intervals or error bars." 37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper claims Aegaeon outperforms baselines based on comparing numbers directly without any statistical significance tests." 
42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "Effect sizes are reported with baseline context throughout, e.g., '2-2.5x higher request arrival rates', '1.5-9x more goodput', 'GPU utilization from 13.3%~33.9% to 48.1%', 'GPUs reduced from 1,192 to 213 (82% saving)'." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "No justification for the number of models, workload configurations, or experimental runs chosen. The evaluation covers specific setups but does not discuss why these are sufficient." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "No standard deviations or variance across runs is reported. Results appear to be single-run. CDFs are shown for auto-scaling latency but no variance across repeated experiments." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "ServerlessLLM, ServerlessLLM+ (with oracle SJF scheduling), and MuxServe are used as baselines (§7.1)." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "ServerlessLLM (OSDI 2024), MuxServe (ICML 2024), and BlitzScale (OSDI 2025) are all recent works. The baselines are state-of-the-art." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": true, 73 "justification": "§7.3 breaks down effectiveness of individual components: latency breakdown (Figure 14), auto-scaling speed (Figure 15), memory fragmentation (Figure 16). §5 shows progressive optimization stages (T0→T1→T2→T3) with component reuse removing 80%, then further reductions." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "SLO attainment, goodput, GPU utilization, auto-scaling latency, KV cache transfer overhead, and memory fragmentation are all reported." 
79 }, 80 "human_evaluation": { 81 "applies": false, 82 "answer": false, 83 "justification": "This is a systems paper about GPU resource management; human evaluation of system outputs is not relevant." 84 }, 85 "held_out_test_set": { 86 "applies": false, 87 "answer": false, 88 "justification": "This is a systems paper, not a machine learning evaluation. There is no train/test split concept applicable here." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Results are broken down by number of models, RPS, dataset type (ShareGPT, ShareGPT-ix2, ShareGPT-ox2), SLO strictness levels, hardware configurations (H800 vs A10), and model sizes (6B-72B)." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "Figure 13(c) shows that under the strictest SLO (0.2x), Aegaeon no longer outperforms MuxServe. The paper acknowledges 'stricter SLOs reduce slack time and limit GPU pooling opportunities.'" 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "The paper reports that under 0.2x SLO strictness, Aegaeon no longer outperforms MuxServe (Figure 13(c)), and that ServerlessLLM+ can actually underperform ServerlessLLM when SJF causes excessive auto-scaling." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "Abstract claims of '2-2.5x higher request arrival rates', '1.5-9x more goodput', '97% auto-scaling overhead reduction', and '82% GPU resource saving (1,192→213)' are all supported by results in §7." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": true, 115 "justification": "The paper makes causal claims about token-level auto-scaling improving performance. These are supported by ablation-style analysis (progressive optimization stages) and controlled comparisons against baselines with the same workloads." 
116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": true, 120 "justification": "The paper tests on specific hardware (H800, A10), model sizes (6B-72B), and datasets (ShareGPT variants). §7.4 explicitly tests generalization to different hardware and model sizes. Claims are generally scoped to the tested settings." 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": false, 125 "justification": "No discussion of alternative explanations for the results. The paper does not consider confounds such as whether the improvements are specific to the workload distribution or vLLM backend." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": false, 132 "justification": "Model families are listed (Qwen, Llama, InternLM, Yi) with parameter counts but no specific model versions or checkpoint identifiers are provided." 133 }, 134 "prompts_provided": { 135 "applies": false, 136 "answer": false, 137 "justification": "This is a systems paper about serving infrastructure; it does not use prompting as part of its methodology." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": true, 142 "justification": "Key hyperparameters are reported: MAX_GPSIZE=8, QMAX=4s, TTFT=10s, TBT=100ms, correction factor β=0.625, prefill/decode instance split (6/10). Latency model constants (C1-C5) are described as profiled." 143 }, 144 "scaffolding_described": { 145 "applies": false, 146 "answer": false, 147 "justification": "No agentic scaffolding is used. This is a systems paper about GPU resource management." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": true, 152 "justification": "Dataset construction is described: ShareGPT is used as base, ShareGPT-ix2 and ShareGPT-ox2 are created by scaling input/output lengths by 2x. Workloads are synthesized with scaled Poisson processes and random sampling (§7.1)." 
153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": false, 159 "justification": "No dedicated limitations or threats-to-validity section exists in the paper." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": false, 164 "justification": "No threats to validity are discussed anywhere in the paper." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": true, 169 "justification": "The paper acknowledges scope boundaries: §7.2 notes MuxServe has advantages 'in extremely latency-sensitive scenarios' (Figure 13(c)). The system targets sporadic multi-model workloads specifically, not single-model high-throughput scenarios." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": false, 176 "justification": "No raw experimental data, logs, or production traces are made available for verification." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "Production workload statistics are described: 779 models, 167.6M requests, 30K GPUs, arrival rate distributions (Figure 1). Experimental setup details hardware, models, and datasets (§7.1)." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants. The data comes from production workloads and standard benchmarks." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": true, 191 "justification": "The experimental pipeline is documented: workload synthesis from ShareGPT with Poisson processes, dataset variants created by scaling lengths, and the production deployment setup (213 H20 GPUs, 47 models) is described in §7.5." 
192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": true, 198 "justification": "Acknowledgments section lists National Key Research and Development Program of China (Grant 2022YFB4500700), Fundamental Research Funds for Central Universities, and NSFC grants." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Author affiliations are clearly listed: Peking University and Alibaba Group. The system is deployed at Alibaba Cloud Model Studio." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": false, 208 "justification": "Multiple authors are from Alibaba Group, and the system is deployed at Alibaba Cloud. Alibaba has a financial interest in demonstrating GPU cost savings for their cloud platform. Government grants are independent but the corporate affiliation creates a non-independent interest." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests or financial interests statement is present. Alibaba employees evaluating a system deployed at Alibaba Cloud represents an undeclared potential conflict." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": false, 219 "answer": false, 220 "justification": "This paper does not evaluate pre-trained model capabilities on benchmarks. It evaluates a serving system's resource efficiency." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": false, 224 "answer": false, 225 "justification": "Not applicable — the paper tests serving infrastructure, not model knowledge." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": false, 229 "answer": false, 230 "justification": "Not applicable — no model capability benchmarks are evaluated." 
231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants in this systems paper." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": true, 274 "justification": "The paper reports GPU reduction from 1,192 to 213 GPUs (82% saving) in production. Auto-scaling latencies are reported via CDF (Figure 15). GPU utilization improved from 13.3-33.9% to 48.1%." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": true, 279 "justification": "Testbed is specified: 2 nodes with 16 NVIDIA H800 80GB GPUs, 2TB DDR5, 192 Intel Xeon CPUs. Production deployment uses 213 H20 GPUs. Implementation is 5,700 lines of Python and CUDA/C++." 280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "Aegaeon sustains 2-2.5x higher request arrival rates compared to ServerlessLLM", 286 "evidence": "Figures 11 and 12 show SLO attainment across varying model counts and RPS. At RPS=0.1, Aegaeon supports up to 70 models vs ~35 for ServerlessLLM. 
At RPS=0.5, the advantage is 2.5x (§7.2).", 287 "supported": "strong" 288 }, 289 { 290 "claim": "Aegaeon achieves 1.5-9x more goodput compared to existing solutions", 291 "evidence": "Figures 11(c) and 12 show goodput comparisons with vertical lines indicating maximum sustainable load at 90% SLO target (§7.2).", 292 "supported": "strong" 293 }, 294 { 295 "claim": "Aegaeon reduces auto-scaling overhead by 97%", 296 "evidence": "§5 details progressive optimizations from T0 (26.9s for 13B model) to T3, with component reuse removing 80% and explicit memory management + KV cache synchronization accounting for the rest of the 97% reduction. Figure 15 shows sub-second scaling in practice.", 297 "supported": "strong" 298 }, 299 { 300 "claim": "Production deployment reduces GPU count from 1,192 to 213 (82% saving)", 301 "evidence": "§7.5 describes beta deployment at Alibaba Cloud Model Studio serving 47 models (1.8B-72B). Figure 18 shows GPU utilization improvement from 13.3-33.9% to 48.1% over 70 hours.", 302 "supported": "strong" 303 }, 304 { 305 "claim": "Token-level auto-scaling supports up to 7 models per GPU", 306 "evidence": "Figure 11(a) shows Aegaeon supporting 70 models on 10 decoding instances at RPS=0.1 with >90% SLO attainment (§7.2).", 307 "supported": "strong" 308 } 309 ], 310 "methodology_tags": ["benchmark-eval", "case-study"], 311 "key_findings": "Aegaeon introduces token-level auto-scaling for multi-model LLM serving, enabling preemptive model switching at per-token granularity rather than per-request. Through component reuse, explicit memory management, and fine-grained KV cache synchronization, it reduces auto-scaling overhead by 97%. In controlled experiments, Aegaeon sustains 2-2.5x higher request rates and 1.5-9x more goodput than ServerlessLLM and MuxServe. 
Production deployment at Alibaba Cloud serving 47 models reduced GPU requirements from 1,192 to 213, an 82% saving.", 312 "red_flags": [ 313 { 314 "flag": "Company evaluating own production system", 315 "detail": "Alibaba Group employees evaluate Aegaeon, which is deployed at Alibaba Cloud Model Studio. The production deployment results (§7.5) come from their own infrastructure with no independent verification possible." 316 }, 317 { 318 "flag": "No variance or repeated trials", 319 "detail": "All experimental results appear to be single runs with no error bars, confidence intervals, or variance across repeated experiments." 320 }, 321 { 322 "flag": "No limitations section", 323 "detail": "The paper lacks a dedicated limitations or threats-to-validity section despite being a systems paper with production claims." 324 }, 325 { 326 "flag": "No artifact release", 327 "detail": "Neither code nor production workload traces are released, making independent reproduction impossible." 328 } 329 ], 330 "cited_papers": [ 331 { 332 "title": "Efficient Memory Management for Large Language Model Serving with PagedAttention", 333 "authors": ["Woosuk Kwon", "Zhuohan Li", "Siyuan Zhuang"], 334 "year": 2023, 335 "relevance": "Foundational LLM serving system (vLLM) used as Aegaeon's execution backend; central to LLM inference infrastructure." 336 }, 337 { 338 "title": "ServerlessLLM: Low-Latency Serverless Inference for Large Language Models", 339 "authors": ["Yao Fu", "Leyang Xue"], 340 "year": 2024, 341 "relevance": "Primary baseline for auto-scaling LLM serving; represents state-of-the-art in serverless LLM inference." 342 }, 343 { 344 "title": "MuxServe: Flexible Spatial-Temporal Multiplexing for Multiple LLM Serving", 345 "authors": ["Jiangfei Duan", "Runyu Lu"], 346 "year": 2024, 347 "relevance": "Primary baseline for multiplexing-based multi-model serving on GPUs." 
348 }, 349 { 350 "title": "DistServe: Disaggregating Prefill and Decoding for Goodput-optimized Large Language Model Serving", 351 "authors": ["Yinmin Zhong", "Shengyu Liu"], 352 "year": 2024, 353 "relevance": "Pioneered prefill-decoding disaggregation adopted by Aegaeon; key architectural influence." 354 }, 355 { 356 "title": "BlitzScale: Fast and Live Large Model Autoscaling with O(1) Host Caching", 357 "authors": ["Dingyan Zhang", "Haotian Wang"], 358 "year": 2025, 359 "relevance": "Contemporary auto-scaling system for LLM serving, addressing cold start optimization." 360 }, 361 { 362 "title": "Splitwise: Efficient Generative LLM Inference Using Phase Splitting", 363 "authors": ["Pratyush Patel", "Esha Choukse"], 364 "year": 2024, 365 "relevance": "Another approach to disaggregating prefill and decoding phases in LLM inference." 366 }, 367 { 368 "title": "AlpaServe: Statistical Multiplexing with Model Parallelism for Deep Learning Serving", 369 "authors": ["Zhuohan Li", "Lianmin Zheng"], 370 "year": 2023, 371 "relevance": "Statistical multiplexing approach to multi-model serving that Aegaeon improves upon." 372 }, 373 { 374 "title": "Mooncake: Trading More Storage for Less Computation — A KVCache-centric Architecture for Serving LLM Chatbot", 375 "authors": ["Ruoyu Qin", "Zheming Li"], 376 "year": 2025, 377 "relevance": "KV cache management architecture for LLM serving; related industrial deployment at scale." 378 } 379 ] 380 }