scan.json (25627B)
1 { 2 "paper": { 3 "title": "Aladdin: Joint Placement and Scaling for SLO-Aware LLM Serving", 4 "authors": ["Chengyi Nie", "Rodrigo Fonseca", "Zhenhua Liu"], 5 "year": 2024, 6 "venue": "arXiv preprint", 7 "arxiv_id": "2405.06856" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": false, 14 "justification": "No GitHub link, Zenodo archive, or any other code repository URL is provided in the paper. No mention of code availability." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The paper uses the publicly available ShareGPT_V3_unfiltered_cleaned_split dataset (reference [27]) and publicly available Llama2 models (reference [28]). The datasets and models used are all public." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "The paper describes hardware (A100 80GB GPUs with PCIe, V100 32GB GPUs with NVLink, Intel Xeon processors, RAM amounts) in Section 6.1 but provides no software environment details — no Python version, no library versions, no requirements.txt, no Dockerfile." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "No reproduction instructions, README, or scripts are provided. The algorithms are described but there are no step-by-step instructions for reproducing the experiments." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": true, 36 "justification": "The paper includes prediction intervals (shaded areas) in Figures 6a, 6b, 7a, 7b, and 8 for the performance model validation. 
Section 6.2 states 'The shaded area is the prediction interval, which represents the estimation of the range in which future observations are likely to fall.'" 37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper claims Aladdin outperforms baselines (up to 71% cost reduction, 51% SLO improvement) but uses no statistical significance tests — comparisons are based solely on comparing point estimates from single simulation/experiment runs." 42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "The paper reports effect sizes with baseline context throughout. For example, Section 6.3: 'Aladdin reduces the SLO violation rate by up to 3.5X,' 'improves the SLO attainment rate by up to 51% compared with JSQ,' 'reduces the P99 ATGT by up to 40%.' Section 6.4: 'reduces the LLM serving cost by up to 71% and 40% compared with the default vLLM and JSQ.' Baseline and treatment values are visible in figures." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "No justification is provided for why specific workload sizes, arrival rates, or numbers of requests were chosen. The experiments use ShareGPT prompts and Poisson arrival distributions but no rationale for why these quantities are sufficient." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "The end-to-end experiments (Sections 6.3 and 6.4) report single-run results with no standard deviation, no variance across multiple runs, and no indication of how many times experiments were repeated. Only the performance model validation includes prediction intervals." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "The paper compares against multiple baselines: default vLLM, Aladdin optimal worker + JSQ (ablation), Power-of-Two algorithm, and JSQ. 
These are described in Section 6.1 under 'Baselines.'" 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "Baselines include vLLM (2023), DistServe (2024), Splitwise (2023), and the Power-of-Two algorithm from TetriInfer (2024). These are contemporary systems at the time of writing." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": true, 73 "justification": "Section 6.3 includes an ablation: 'Aladdin best worker + JSQ' isolates the contribution of optimal worker configuration from request placement. The comparison between full Aladdin and this ablated version shows the value of the placement algorithm specifically." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "The paper reports multiple metrics: SLO attainment rate, P99 ATGT (average token generation time), GPU count required, TTFT, and scheduling overhead (ms). See Sections 6.3, 6.4, and 6.5." 79 }, 80 "human_evaluation": { 81 "applies": false, 82 "answer": false, 83 "justification": "This is a systems paper about GPU resource scheduling for LLM inference. Human evaluation of system outputs is not relevant — the claims are about latency, cost, and SLO attainment." 84 }, 85 "held_out_test_set": { 86 "applies": false, 87 "answer": false, 88 "justification": "This is a systems paper, not a machine learning model evaluation. There is no train/test split concept applicable to the scheduling system evaluation." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Results are broken down by model size (7b, 13b, 70b), testbed (A100, V100), and inference setting (default continuous batching vs split-phase). Figures 9-12 show per-model, per-testbed results." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "Section 4.3 discusses prediction errors and how the system handles them. 
Section 6.3 discusses when baselines outperform (low arrival rates on V100: 'when the arrival rate is low, the P99 ATGT of baseline default vLLM outperforms the performance with optimal worker configuration'). The paper also discusses scheduling overhead limitations for high-demand scenarios." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "The paper reports that at low arrival rates, the default vLLM outperforms optimal worker configuration on V100 (Section 6.3). It also acknowledges the scheduling overhead becomes a problem at high arrival rates (Section 6.5) and discusses output length prediction errors as an inherent limitation." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "The abstract claims Aladdin 'reduces the serving cost of a single model by up to 71% for the same SLO level compared with the baselines.' This is supported by Section 6.4/Figure 11, which shows up to 71% GPU reduction compared to default vLLM. The 'millions of dollars per year' claim is not quantified in the paper body but the GPU reduction percentages are substantiated." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": true, 115 "justification": "The paper makes causal claims through ablation design — comparing full Aladdin against 'Aladdin best worker + JSQ' isolates the contribution of request placement. The performance improvements are attributed to specific components (worker configuration, request placement, re-balancing) with controlled comparisons." 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": false, 120 "justification": "The paper tests only Llama2 models (7b, 13b, 70b) on two testbeds but the title and abstract make general claims about 'LLM Serving' without bounding to Llama2. 
Section 5.3 notes 'Aladdin is specifically designed for single-model serving' but the broader framing does not adequately bound to the tested models and hardware." 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": false, 125 "justification": "The paper does not discuss alternative explanations for the observed improvements. For example, the improvements could be partially due to the specific workload characteristics of ShareGPT rather than general properties of LLM inference, but this is not explored." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": true, 132 "justification": "Specific model versions are provided: 'Llama2-chat 70b,' 'Llama2-chat 13b,' 'Llama2-chat 7b' (Table 2), and 'Llama2-70b-chat-hf,' 'Llama2-13b-chat-hf,' 'Llama2-7b-chat-hf' (Table 3, Figure 8). These are specific HuggingFace model identifiers." 133 }, 134 "prompts_provided": { 135 "applies": false, 136 "answer": false, 137 "justification": "The paper does not prompt LLMs for any task — it uses ShareGPT prompts as workload input to test the scheduling system. The prompts are the workload, not part of the methodology that needs reproducing." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": true, 142 "justification": "Temperature is set to 0 (Section 6.1), maximum output token limit is 2048, SLO values are specified in Table 2, gamma=0.5 for the scheduling bound, and theta is discussed as a hyperparameter. Poisson distribution parameters for arrival rates are varied and shown in figures." 143 }, 144 "scaffolding_described": { 145 "applies": false, 146 "answer": false, 147 "justification": "This is not an agentic AI system. It is a scheduling system for LLM inference, with no agentic scaffolding involved." 
148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": true, 152 "justification": "Section 6.1 documents workload preparation: 'we first collect the prompts from users of ShareGPT_V3_unfiltered_cleaned_split dataset, then submit the prompts follows a Poisson distribution. The outputs are generated by each evaluated model with a temperature of 0 and a maximum output token limit of 2048.' For simulation: prompt lengths from ShareGPT with output length predicted from CDF." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": false, 159 "justification": "There is no dedicated limitations or threats-to-validity section. Some limitations are mentioned inline (e.g., scheduling overhead at high arrival rates, no request migration support, no cold start consideration) but they are scattered, not collected in a dedicated section." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": false, 164 "justification": "No specific threats to validity are discussed. The paper mentions some inline limitations but does not systematically analyze threats — e.g., the reliance on Poisson arrival patterns, single workload dataset (ShareGPT), homogeneous GPU assumption, and single-model serving limitation are not framed as threats." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": true, 169 "justification": "Several scope boundaries are stated: 'Aladdin is specifically designed for single-model serving' (Section 5.3), 'we consider homogeneous GPUs in this paper' (Section 4.1), 'we focus on predicting the minimal GPU required for the varying arrival rate without considering the cold start problem and the switching cost' (Section 5.2), and 'we consider the tensor parallelism distributed inference' (Section 4.1)." 
170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": false, 176 "justification": "No raw experimental data (latency measurements, trace logs, simulation outputs) are made available. The ShareGPT dataset is public but the experimental results themselves are not independently verifiable." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "Section 6.1 describes the workload generation process: prompts from ShareGPT, Poisson arrival distribution, temperature 0, max output 2048 tokens. Section 6.2 describes how performance model traces are collected on the testbeds." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants. The data source is a standard public benchmark dataset (ShareGPT) and hardware testbed measurements." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": true, 191 "justification": "The pipeline is documented: collect ShareGPT prompts -> generate outputs with Llama2 models -> record input/output lengths -> use as workload for Poisson arrival simulation. For performance modeling: collect batch inference traces -> fit linear models -> validate predictions against observations." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": false, 198 "justification": "No funding or acknowledgments section is present in the paper. One author is from Azure Research - Systems (Microsoft), suggesting potential corporate funding, but this is not disclosed." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Author affiliations are listed: Chengyi Nie (Stony Brook University), Rodrigo Fonseca (Azure Research - Systems), Zhenhua Liu (Stony Brook University). The Microsoft/Azure affiliation is disclosed." 
204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": false, 208 "justification": "An author is from Azure Research - Systems (Microsoft), a major cloud provider that sells GPU compute for LLM inference. A system reducing GPU costs could be commercially valuable to Azure. No statement is made about funder independence. The funder is not independent of the outcome." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests statement or financial interests declaration is present in the paper." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": false, 219 "answer": false, 220 "justification": "This paper does not evaluate a pre-trained model's capability on any benchmark. It evaluates a scheduling system for LLM inference. The LLMs are used as workload generators, not evaluated for their knowledge or capability." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": false, 224 "answer": false, 225 "justification": "Not applicable — the paper evaluates a scheduling system, not a model's benchmark performance. There is no concern about train/test overlap for the scheduling algorithm." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": false, 229 "answer": false, 230 "justification": "Not applicable — the paper does not evaluate pre-trained model capability on benchmarks. The ShareGPT data is used as workload, not as a capability benchmark." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants in this study." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants in this study." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 
248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": true, 274 "justification": "The primary metric of the paper is GPU count required for SLO-guaranteed serving, which directly measures inference cost. Section 6.4 reports GPU numbers needed at various arrival rates. Section 6.5 reports scheduling overhead in milliseconds (Figure 13)." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": true, 279 "justification": "Hardware is specified: 4 A100 80GB GPUs per machine with Intel Xeon Platinum 8380 and 512GB RAM, and 4 V100 32GB GPUs per machine with Intel Xeon Gold 6230 and 128GB RAM (Section 6.1). The simulation scales up to 25 req/s arrival rates." 
280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "Aladdin reduces the serving cost of a single model by up to 71% for the same SLO level compared with baselines.", 286 "evidence": "Section 6.4, Figure 11 shows GPU reduction of up to 71% compared to default vLLM with continuous batching across different models and testbeds.", 287 "supported": "strong" 288 }, 289 { 290 "claim": "Aladdin reduces GPU number required for the decode phase by up to 60% compared with JSQ in split-phase inference.", 291 "evidence": "Section 6.4, Figure 12 shows decode-phase GPU reduction of up to 60% vs JSQ and 49% vs Power-of-Two across models and testbeds.", 292 "supported": "strong" 293 }, 294 { 295 "claim": "Aladdin improves the SLO attainment rate by up to 51% compared with JSQ with optimal workers.", 296 "evidence": "Section 6.3, Figure 9a shows 51% improvement for Llama2-13b on A100 testbed. However, the improvement for 70b is only up to 19% due to having only two workers.", 297 "supported": "moderate" 298 }, 299 { 300 "claim": "The performance models predict prefill and decode latency with maximum error less than 10%.", 301 "evidence": "Section 6.2: prefill latency prediction error less than 4% (Figure 6), decode latency prediction error less than 5% (Figure 7), KV cache prediction error less than 1% (Figure 8).", 302 "supported": "strong" 303 }, 304 { 305 "claim": "The scheduling overhead is acceptable for request arrival rates up to about 25 requests per second.", 306 "evidence": "Section 6.5, Figure 13 shows scheduling overhead under 50ms at 25 req/s. However, overhead grows significantly beyond this point, reaching ~500ms at 100 req/s.", 307 "supported": "strong" 308 } 309 ], 310 "methodology_tags": ["benchmark-eval"], 311 "key_findings": "Aladdin is a co-adaptive scheduler for LLM inference that jointly optimizes request placement and GPU resource scaling with SLO awareness. 
It models prefill and decode latency as linear functions of input length, batch size, and context length, achieving prediction errors under 10%. By formulating request placement as a multi-dimensional bin packing problem with a best-fit heuristic, and dynamically configuring worker GPU counts, Aladdin reduces serving costs by up to 71% compared to default vLLM, and reduces decode-phase GPU requirements by up to 60% compared to JSQ in split-phase inference, for SLO-guaranteed service across Llama2 models of varying sizes on A100 and V100 testbeds.", 312 "red_flags": [ 313 { 314 "flag": "No code release", 315 "detail": "The system is not open-sourced and no code repository is provided, making independent verification or reproduction of results impossible." 316 }, 317 { 318 "flag": "Single-run results without variance", 319 "detail": "End-to-end experiments (Sections 6.3, 6.4) report single-run results with no standard deviation, variance, or confidence intervals. It is unclear whether the improvements are consistent across multiple runs with different random seeds for Poisson arrivals." 320 }, 321 { 322 "flag": "Simulated arrival patterns only", 323 "detail": "All experiments use Poisson arrival distributions. The paper acknowledges there is 'no available trace of LLM inference that includes the arrival time of each request.' Real-world arrival patterns may differ significantly from Poisson (e.g., bursty traffic), and the system's performance under such conditions is unknown." 324 }, 325 { 326 "flag": "Cherry-picked 'up to' numbers", 327 "detail": "Claims use 'up to X%' phrasing (up to 71%, up to 60%, up to 51%), which reports best-case improvements. The improvements vary substantially by model size and testbed — for example, the SLO attainment improvement for 70b is only up to 19% vs the up to 51% reported for 13b." 
328 }, 329 { 330 "flag": "No limitations section", 331 "detail": "The paper lacks a dedicated limitations or threats-to-validity section, despite having several notable constraints: Llama2-only evaluation, homogeneous GPU assumption, single-model serving only, no cold start consideration, Poisson-only arrival patterns." 332 }, 333 { 334 "flag": "Azure affiliation without COI disclosure", 335 "detail": "One author is from Azure Research (Microsoft), a major cloud compute provider. A system that reduces GPU requirements for LLM serving has direct commercial relevance to Azure, but no conflict-of-interest statement is provided." 336 } 337 ], 338 "cited_papers": [ 339 { 340 "title": "Efficient Memory Management for Large Language Model Serving with PagedAttention", 341 "authors": ["Woosuk Kwon", "Zhuohan Li", "Siyuan Zhuang"], 342 "year": 2023, 343 "relevance": "vLLM is a foundational LLM serving system with PagedAttention that Aladdin builds on top of, directly relevant to LLM inference efficiency." 344 }, 345 { 346 "title": "Splitwise: Efficient Generative LLM Inference Using Phase Splitting", 347 "authors": ["Pratyush Patel", "Esha Choukse", "Chaojie Zhang"], 348 "year": 2023, 349 "relevance": "Split-phase inference approach that separates prefill and decode, a key baseline and complementary technique for LLM serving." 350 }, 351 { 352 "title": "DistServe: Disaggregating Prefill and Decoding for Goodput-optimized Large Language Model Serving", 353 "authors": ["Yinmin Zhong", "Shengyu Liu", "Junda Chen"], 354 "year": 2024, 355 "relevance": "Goodput-optimized LLM serving system that disaggregates prefill and decoding phases, a direct baseline for Aladdin." 356 }, 357 { 358 "title": "Orca: A Distributed Serving System for Transformer-Based Generative Models", 359 "authors": ["Gyeong-In Yu", "Joo Seong Jeong", "Geon-Woo Kim"], 360 "year": 2022, 361 "relevance": "Introduced continuous batching for LLM serving, foundational to all modern LLM inference systems." 
362 }, 363 { 364 "title": "Sarathi: Efficient LLM Inference by Piggybacking Decodes with Chunked Prefills", 365 "authors": ["Amey Agrawal", "Ashish Panwar", "Jayashree Mohan"], 366 "year": 2023, 367 "relevance": "Chunked prefill technique for balancing prefill and decode in LLM inference, complementary to cluster-level scheduling." 368 }, 369 { 370 "title": "Taming Throughput-Latency Tradeoff in LLM Inference with Sarathi-Serve", 371 "authors": ["Amey Agrawal", "Nitin Kedia", "Ashish Panwar"], 372 "year": 2024, 373 "relevance": "Addresses throughput-latency tradeoffs in LLM inference serving, closely related to SLO-aware scheduling." 374 }, 375 { 376 "title": "Inference without Interference: Disaggregate LLM Inference for Mixed Downstream Workloads", 377 "authors": ["Cunchen Hu", "Heyang Huang", "Liangliang Xu"], 378 "year": 2024, 379 "relevance": "Disaggregated LLM inference for mixed workloads with Power-of-Two scheduling, a direct baseline for request placement." 380 }, 381 { 382 "title": "Llama 2: Open Foundation and Fine-Tuned Chat Models", 383 "authors": ["Hugo Touvron", "Louis Martin", "Kevin Stone"], 384 "year": 2023, 385 "relevance": "The Llama2 model family used as the primary evaluation workload across all Aladdin experiments." 386 }, 387 { 388 "title": "Efficient Interactive LLM Serving with Proxy Model-Based Sequence Length Prediction", 389 "authors": ["Haoran Qiu", "Weichao Mao", "Archit Patke"], 390 "year": 2024, 391 "relevance": "Proxy model-based output length prediction for LLM serving, related to Aladdin's output length prediction component." 392 }, 393 { 394 "title": "ExeGPT: Constraint-Aware Resource Scheduling for LLM Inference", 395 "authors": ["Hyungjun Oh", "Kihong Kim", "Jaemin Kim"], 396 "year": 2024, 397 "relevance": "Constraint-aware resource scheduling for LLM inference at ASPLOS, directly related to resource management for LLM serving." 
398 }, 399 { 400 "title": "Andes: Defining and Enhancing Quality-of-Experience in LLM-Based Text Streaming Services", 401 "authors": ["Jiachen Liu", "Zhiyu Wu", "Jae-Won Chung"], 402 "year": 2024, 403 "relevance": "QoE-focused LLM serving system, related to Aladdin's ATGT SLO metric for quality of experience." 404 }, 405 { 406 "title": "FlexGen: High-Throughput Generative Inference of Large Language Models with a Single GPU", 407 "authors": ["Ying Sheng", "Lianmin Zheng", "Binhang Yuan"], 408 "year": 2023, 409 "relevance": "CPU-GPU offloading approach for LLM inference throughput, an alternative approach to efficient LLM serving." 410 } 411 ] 412 }