scan-v4.json (31214B)
1 { 2 "scan_version": 4, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Equinox: Holistic Fair Scheduling in Serving Large Language Models", 6 "authors": [ 7 "Zhixiang Wei", 8 "James Yen", 9 "Jingyi Chen", 10 "Ziyang Zhang", 11 "Zhibai Huang", 12 "Chen Chen", 13 "Xingzi Yu", 14 "Yicheng Gu", 15 "Chenggang Wu", 16 "Yun Wang", 17 "Mingyuan Xia", 18 "Jie Wu", 19 "Hao Wang", 20 "Zhengwei Qi" 21 ], 22 "year": 2025, 23 "venue": "arXiv.org", 24 "arxiv_id": "2508.16646", 25 "doi": "10.48550/arXiv.2508.16646" 26 }, 27 "checklist": { 28 "claims_and_evidence": { 29 "abstract_claims_supported": { 30 "applies": true, 31 "answer": true, 32 "justification": "Abstract claims of 1.3× throughput (Figure 9/17), 60% lower TTFT (Figure 9a), 13% higher fairness (Figure 13), and 94% GPU utilization (Figure 9b) are supported by results in the paper.", 33 "source": "opus" 34 }, 35 "causal_claims_justified": { 36 "applies": true, 37 "answer": true, 38 "justification": "The paper makes causal claims about Equinox improving fairness. The ablation study (Table 1) with controlled single-variable manipulation (same predictor, different scheduler; same scheduler, different predictor) adequately supports the causal attribution.", 39 "source": "opus" 40 }, 41 "generalization_bounded": { 42 "applies": true, 43 "answer": false, 44 "justification": "The paper claims to 'redefine fairness in multi-tenant LLM serving' (Section 9) broadly, but evaluates only on Llama-2-7b and Llama-2-70b models. No evaluation on other model families (e.g., Mistral, GPT) or model sizes. The title and abstract make broad claims not bounded to the tested setting.", 45 "source": "opus" 46 }, 47 "alternative_explanations_discussed": { 48 "applies": true, 49 "answer": false, 50 "justification": "No discussion of alternative explanations for the results. For example, could the improvements be due to the specific workload characteristics rather than the general framework design? 
No robustness checks against different workload distributions.", 51 "source": "opus" 52 }, 53 "proxy_outcome_distinction": { 54 "applies": true, 55 "answer": true, 56 "justification": "The paper explicitly discusses the gap between token count as a proxy and actual fairness (Section 1, Figure 1), arguing that token-level metrics are inadequate proxies for the multi-dimensional fairness they measure. The HF metric is clearly defined in terms of what it measures (Section 3.3).", 57 "source": "opus" 58 } 59 }, 60 "limitations_and_scope": { 61 "limitations_section_present": { 62 "applies": true, 63 "answer": false, 64 "justification": "No dedicated limitations section exists. The paper briefly mentions multi-node deployment needing 'additional engineering efforts' (Section 7.5) but this is not a substantive limitations discussion.", 65 "source": "opus" 66 }, 67 "threats_to_validity_specific": { 68 "applies": true, 69 "answer": false, 70 "justification": "No threats to validity are discussed. No mention of specific concerns about generalizability beyond the tested models, workloads, or hardware.", 71 "source": "opus" 72 }, 73 "scope_boundaries_stated": { 74 "applies": true, 75 "answer": false, 76 "justification": "No explicit scope boundaries are stated. 
The paper does not clarify what settings or configurations the results do NOT apply to.", 77 "source": "opus" 78 } 79 }, 80 "conflicts_of_interest": { 81 "funding_disclosed": { 82 "applies": true, 83 "answer": false, 84 "justification": "No funding information or acknowledgments section is present in the paper.", 85 "source": "opus" 86 }, 87 "affiliations_disclosed": { 88 "applies": true, 89 "answer": true, 90 "justification": "All author affiliations are listed: Shanghai Jiao Tong University, UltraRISC Shanghai, China Telecom Cloud Computing Research Institute, Stevens Institute of Technology.", 91 "source": "opus" 92 }, 93 "funder_independent_of_outcome": { 94 "applies": true, 95 "answer": false, 96 "justification": "Funding is not disclosed, so independence cannot be assessed. One author is from UltraRISC and another from China Telecom, which could have commercial interest in LLM serving.", 97 "source": "opus" 98 }, 99 "financial_interests_declared": { 100 "applies": true, 101 "answer": false, 102 "justification": "No competing interests or financial interests statement is present. One author is affiliated with UltraRISC (a commercial entity), which may have financial interest in LLM serving technology.", 103 "source": "opus" 104 } 105 }, 106 "scope_and_framing": { 107 "key_terms_defined": { 108 "applies": true, 109 "answer": true, 110 "justification": "Key terms are precisely defined with formulas: 'Holistic Fairness' in Section 3.3 (Eq. 
3.3), 'UFC' in Section 3.1, 'RFC' in Section 3.2, and 'MoPE' in Section 6.", 111 "source": "haiku" 112 }, 113 "intended_contribution_clear": { 114 "applies": true, 115 "answer": true, 116 "justification": "Three contributions are explicitly enumerated: the holistic fairness formalization, the deterministic MoPE prediction framework, and the Equinox open-source system implementation.", 117 "source": "haiku" 118 }, 119 "engagement_with_prior_work": { 120 "applies": true, 121 "answer": true, 122 "justification": "Section 8 engages specifically with VTC, FCFS/RPM, vLLM, SGLang, Sarathi-Serve, DistServe, and length prediction approaches, explaining how Equinox addresses the limitations of each rather than merely listing them.", 123 "source": "haiku" 124 } 125 } 126 }, 127 "type_checklist": { 128 "empirical": { 129 "artifacts": { 130 "code_released": { 131 "applies": true, 132 "answer": false, 133 "justification": "The paper describes Equinox as 'open-source' (Section 1, Section 4) but provides no repository URL, GitHub link, or archive link anywhere in the paper.", 134 "source": "opus" 135 }, 136 "data_released": { 137 "applies": true, 138 "answer": true, 139 "justification": "The paper uses publicly available datasets: LMSYS Chatbot Arena (lmsys-chat-1m) [47] and ShareGPT [38]. These are standard public benchmarks.", 140 "source": "opus" 141 }, 142 "environment_specified": { 143 "applies": true, 144 "answer": false, 145 "justification": "Hardware is specified (A100-80GB, 8×A100-40GB cluster, Intel Xeon Gold 5218, 256GB DRAM), but no software environment details are given — no Python version, no library versions, no requirements.txt or Dockerfile.", 146 "source": "opus" 147 }, 148 "reproduction_instructions": { 149 "applies": true, 150 "answer": false, 151 "justification": "No reproduction instructions, README, or scripts are provided. 
The paper mentions ~1000 lines of Python atop existing systems but provides no way to replicate experiments.", 152 "source": "opus" 153 } 154 }, 155 "statistical_methodology": { 156 "confidence_intervals_or_error_bars": { 157 "applies": true, 158 "answer": false, 159 "justification": "No confidence intervals or error bars are reported on any of the main results. Figures show point estimates only (e.g., Figures 9-13).", 160 "source": "opus" 161 }, 162 "significance_tests": { 163 "applies": true, 164 "answer": false, 165 "justification": "Claims like '1.3× higher throughput' and '60% lower TTFT' are made by comparing raw numbers with no statistical significance tests.", 166 "source": "opus" 167 }, 168 "effect_sizes_reported": { 169 "applies": true, 170 "answer": true, 171 "justification": "Relative improvements are reported with baseline context throughout: '1.3× higher throughput', '60% lower TTFT', '13% higher fairness', '42% worst-case service gap reduction', '86% average gap reduction' (Section 1, Section 7).", 172 "source": "opus" 173 }, 174 "sample_size_justified": { 175 "applies": true, 176 "answer": false, 177 "justification": "No justification for workload sizes (e.g., 1280 prompts for SGLang, 1000 requests per client for vLLM). No discussion of whether these are sufficient.", 178 "source": "opus" 179 }, 180 "variance_reported": { 181 "applies": true, 182 "answer": false, 183 "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. It is unclear whether experiments were run multiple times.", 184 "source": "opus" 185 } 186 }, 187 "evaluation_design": { 188 "baselines_included": { 189 "applies": true, 190 "answer": true, 191 "justification": "FCFS and VTC are used as baselines throughout all experiments (Section 7.1). 
For prediction, a single proxy model [31] is the baseline.", 192 "source": "opus" 193 }, 194 "baselines_contemporary": { 195 "applies": true, 196 "answer": true, 197 "justification": "VTC (Sheng et al., OSDI 2024 [39]) is the state-of-the-art fair scheduling approach for LLM serving. The proxy model baseline is from 2024 [31]. These are recent and competitive.", 198 "source": "opus" 199 }, 200 "ablation_study": { 201 "applies": true, 202 "answer": true, 203 "justification": "Section 7.4 presents an ablation study (Table 1) isolating MoPE's contribution by comparing Equinox and VTC with different predictors (Single, MoPE, Oracle).", 204 "source": "opus" 205 }, 206 "multiple_metrics": { 207 "applies": true, 208 "answer": true, 209 "justification": "Multiple metrics are used: service rate, service difference, TTFT (P50/P90), end-to-end latency, Jain's fairness index, throughput, GPU utilization (Section 7.1).", 210 "source": "opus" 211 }, 212 "human_evaluation": { 213 "applies": false, 214 "answer": false, 215 "justification": "This is a systems/scheduling paper; human evaluation of system outputs is not relevant to the claims about fairness and throughput.", 216 "source": "opus" 217 }, 218 "held_out_test_set": { 219 "applies": true, 220 "answer": true, 221 "justification": "MoPE is trained on LMSYS dataset and tested on the unseen ShareGPT dataset (Section 7.3), demonstrating generalization across datasets.", 222 "source": "opus" 223 }, 224 "per_category_breakdown": { 225 "applies": true, 226 "answer": true, 227 "justification": "Results are broken down per-client (Figures 9c, 10d, 12c), per-system (S-LoRA, vLLM, SGLang in Figure 13), per-metric (UFC/RFC components in Figure 5), and per-scenario (balanced, stochastic, overload, dynamic).", 228 "source": "opus" 229 }, 230 "failure_cases_discussed": { 231 "applies": true, 232 "answer": false, 233 "justification": "No failure cases or scenarios where Equinox underperforms are discussed. 
All results show improvements.", 234 "source": "opus" 235 }, 236 "negative_results_reported": { 237 "applies": true, 238 "answer": true, 239 "justification": "Figure 15 shows the fairness-throughput tradeoff: at α=0.9, throughput drops 20%. The paper also notes VTC+Single performs worse than baseline VTC (Table 1), and acknowledges multi-node deployment needs 'additional engineering efforts' (Section 7.5).", 240 "source": "opus" 241 } 242 }, 243 "setup_transparency": { 244 "model_versions_specified": { 245 "applies": true, 246 "answer": true, 247 "justification": "Specific model versions are given: Llama-2-7b for synthetic traces, Llama-2-70b for real-world traces (Section 7.1). BERT-base for MoPE experts (Section 6).", 248 "source": "opus" 249 }, 250 "prompts_provided": { 251 "applies": false, 252 "answer": false, 253 "justification": "This is a systems/scheduling paper that does not use prompting as part of its method. The LLM requests come from existing trace datasets.", 254 "source": "opus" 255 }, 256 "hyperparameters_reported": { 257 "applies": true, 258 "answer": true, 259 "justification": "Key hyperparameters are reported: α=0.7, β=0.3 (Section 7.6), δ=0.1 (Section 3.1), 3 MoPE experts with boundaries at 33rd/66th/99th percentiles (<53, 53-210, >210 tokens), BF16 precision (Section 6), TP=8 for multi-GPU (Section 7.1).", 260 "source": "opus" 261 }, 262 "scaffolding_described": { 263 "applies": false, 264 "answer": false, 265 "justification": "No agentic scaffolding is used. This is a scheduling system, not an agentic AI system.", 266 "source": "opus" 267 }, 268 "data_preprocessing_documented": { 269 "applies": true, 270 "answer": true, 271 "justification": "MoPE training data partitioning is documented: router trained on up to 120k samples from LMSYS, data split by output length boundaries at 33rd/66th/99th percentiles (Section 6, Figure 7c). 
Workload configurations are described in detail for each experiment (Sections 7.2-7.3).", 272 "source": "opus" 273 } 274 }, 275 "data_integrity": { 276 "raw_data_available": { 277 "applies": true, 278 "answer": false, 279 "justification": "No raw experimental data (logs, traces, measurements) is made available. Only aggregated results in figures and tables.", 280 "source": "opus" 281 }, 282 "data_collection_described": { 283 "applies": true, 284 "answer": true, 285 "justification": "Synthetic workload parameters are fully specified (client rates, input/output lengths). Real-world traces are from public datasets (LMSYS, ShareGPT) with setup described in Section 7.1.", 286 "source": "opus" 287 }, 288 "recruitment_methods_described": { 289 "applies": false, 290 "answer": false, 291 "justification": "No human participants. Data sources are standard public benchmark traces.", 292 "source": "opus" 293 }, 294 "data_pipeline_documented": { 295 "applies": true, 296 "answer": true, 297 "justification": "The MoPE training pipeline is documented: training dataset → router training → dataset splitting → expert training (Figure 8). Online prediction pipeline is also documented. Workload generation is specified per experiment.", 298 "source": "opus" 299 } 300 }, 301 "contamination": { 302 "training_cutoff_stated": { 303 "applies": false, 304 "answer": false, 305 "justification": "This paper evaluates a scheduling system, not a pre-trained model's capability on a benchmark. The LLMs (Llama-2) are used as workload generators, not as the subject of evaluation.", 306 "source": "opus" 307 }, 308 "train_test_overlap_discussed": { 309 "applies": false, 310 "answer": false, 311 "justification": "Not applicable — the paper evaluates scheduling fairness, not model knowledge. 
No benchmark contamination concern exists for the scheduling evaluation.", 312 "source": "opus" 313 }, 314 "benchmark_contamination_addressed": { 315 "applies": false, 316 "answer": false, 317 "justification": "Not applicable — scheduling system evaluation, not model capability evaluation.", 318 "source": "opus" 319 } 320 }, 321 "human_studies": { 322 "pre_registered": { 323 "applies": false, 324 "answer": false, 325 "justification": "No human participants in this study.", 326 "source": "opus" 327 }, 328 "irb_or_ethics_approval": { 329 "applies": false, 330 "answer": false, 331 "justification": "No human participants in this study.", 332 "source": "opus" 333 }, 334 "demographics_reported": { 335 "applies": false, 336 "answer": false, 337 "justification": "No human participants in this study.", 338 "source": "opus" 339 }, 340 "inclusion_exclusion_criteria": { 341 "applies": false, 342 "answer": false, 343 "justification": "No human participants in this study.", 344 "source": "opus" 345 }, 346 "randomization_described": { 347 "applies": false, 348 "answer": false, 349 "justification": "No human participants in this study.", 350 "source": "opus" 351 }, 352 "blinding_described": { 353 "applies": false, 354 "answer": false, 355 "justification": "No human participants in this study.", 356 "source": "opus" 357 }, 358 "attrition_reported": { 359 "applies": false, 360 "answer": false, 361 "justification": "No human participants in this study.", 362 "source": "opus" 363 } 364 }, 365 "cost_and_practicality": { 366 "inference_cost_reported": { 367 "applies": true, 368 "answer": true, 369 "justification": "MoPE overhead is reported: 0.02ms router overhead, 4.5ms total MoPE inference, less than 1% of average prompt latency (Section 6, Figure 7d). 
Memory usage is reported in Figure 7b.", 370 "source": "opus" 371 }, 372 "compute_budget_stated": { 373 "applies": true, 374 "answer": false, 375 "justification": "Hardware is listed but total compute budget (GPU hours for training MoPE, total experiment time) is not stated.", 376 "source": "opus" 377 } 378 }, 379 "experimental_rigor": { 380 "seed_sensitivity_reported": { 381 "applies": true, 382 "answer": false, 383 "justification": "No mention of multiple random seeds or seed sensitivity. Stochastic arrival experiments use Poisson processes but it's unclear if multiple seeds were tested.", 384 "source": "opus" 385 }, 386 "number_of_runs_stated": { 387 "applies": true, 388 "answer": false, 389 "justification": "The number of experimental runs is never stated. It is unclear whether results are from single runs or averaged.", 390 "source": "opus" 391 }, 392 "hyperparameter_search_budget": { 393 "applies": true, 394 "answer": false, 395 "justification": "α/β sensitivity is studied (Figure 15) but the search budget for other hyperparameters (δ, MoPE expert boundaries, training parameters) is not reported.", 396 "source": "opus" 397 }, 398 "best_config_selection_justified": { 399 "applies": true, 400 "answer": true, 401 "justification": "The selection of α=0.7, β=0.3 is justified by the sensitivity analysis in Section 7.6/Figure 15, showing the fairness-throughput tradeoff. The 3-expert MoPE configuration is justified by Figure 7a-b.", 402 "source": "opus" 403 }, 404 "multiple_comparison_correction": { 405 "applies": true, 406 "answer": false, 407 "justification": "No statistical tests are performed at all, so no multiple comparison correction is applied despite many comparisons across scenarios, metrics, and systems.", 408 "source": "opus" 409 }, 410 "self_comparison_bias_addressed": { 411 "applies": true, 412 "answer": false, 413 "justification": "The authors implement Equinox and compare it against their own implementations of FCFS and VTC. 
No acknowledgment of self-comparison bias.", 414 "source": "opus" 415 }, 416 "compute_budget_vs_performance": { 417 "applies": true, 418 "answer": false, 419 "justification": "No comparison of performance at matched compute budgets. MoPE adds prediction overhead (reported as <1%) but the training cost of MoPE vs. simpler predictors is not compared.", 420 "source": "opus" 421 }, 422 "benchmark_construct_validity": { 423 "applies": true, 424 "answer": true, 425 "justification": "Section 1 and Section 2 provide extensive analysis of why token count is an inadequate proxy for fairness (Figures 1-2), directly addressing construct validity of the VTC metric. The paper argues for multi-dimensional fairness as the construct.", 426 "source": "opus" 427 }, 428 "scaffold_confound_addressed": { 429 "applies": false, 430 "answer": false, 431 "justification": "No agentic scaffolding is involved. The system evaluates scheduling, not model capability through scaffolds.", 432 "source": "opus" 433 } 434 }, 435 "data_leakage": { 436 "temporal_leakage_addressed": { 437 "applies": false, 438 "answer": false, 439 "justification": "Not applicable — this evaluates a scheduling system, not model knowledge. 
The LLM is a workload generator, not the evaluation subject.", 440 "source": "opus" 441 }, 442 "feature_leakage_addressed": { 443 "applies": false, 444 "answer": false, 445 "justification": "Not applicable — scheduling evaluation does not involve prediction tasks where feature leakage is a concern.", 446 "source": "opus" 447 }, 448 "non_independence_addressed": { 449 "applies": true, 450 "answer": true, 451 "justification": "MoPE is trained on LMSYS data and tested on the unseen ShareGPT dataset (Section 7.1, 7.3), ensuring train-test independence for the prediction component.", 452 "source": "opus" 453 }, 454 "leakage_detection_method": { 455 "applies": false, 456 "answer": false, 457 "justification": "Not applicable — no model capability benchmark where leakage detection would be relevant.", 458 "source": "opus" 459 } 460 } 461 } 462 }, 463 "claims": [ 464 { 465 "claim": "Equinox achieves up to 1.3× higher throughput compared to FCFS and VTC", 466 "evidence": "Figure 9 (balanced load scenario on A100-80GB) shows 1.3× throughput; Figure 11 shows up to 25% throughput improvement in SGLang at high RPS with ShareGPT trace", 467 "supported": "moderate" 468 }, 469 { 470 "claim": "Equinox achieves up to 60% lower time-to-first-token latency compared to VTC", 471 "evidence": "Section 7.2.1 states '60% lower response times than VTC' and Figure 9a shows response time comparison; this appears to come from the synthetic balanced-load scenario only", 472 "supported": "moderate" 473 }, 474 { 475 "claim": "Equinox achieves 13% higher Jain's fairness index compared to both FCFS and VTC across serving systems", 476 "evidence": "Figure 13 shows consistent improvements: vLLM 0.90 vs 0.76 (VTC), SGLang 0.88 vs 0.73, S-LoRA 0.80 vs 0.66 — absolute gains of 0.14–0.15 (roughly 14–15pp) over VTC in each system", 477 "supported": "strong" 478 }, 479 { 480 "claim": "MoPE reduces L1 prediction error from 80 to 33 tokens versus single proxy model", 481 "evidence": "Figure 7a shows per-expert specialization with labeled L1 
values; contributions section states this improvement explicitly", 482 "supported": "strong" 483 }, 484 { 485 "claim": "Equinox reduces worst-case service gaps by 42% and average gaps by 86% versus VTC", 486 "evidence": "Table 1 ablation: VTC max diff=1505 vs Equinox+MoPE max diff=866 (~42%); VTC avg diff=1106 vs Equinox+MoPE avg diff=151 (~86%)", 487 "supported": "strong" 488 }, 489 { 490 "claim": "MoPE adds negligible scheduling overhead of less than 1% of average prompt latency", 491 "evidence": "Figure 7d shows total MoPE inference time of 4.5ms with router overhead of 0.02ms, compared to average prompt latency of 2400ms", 492 "supported": "strong" 493 }, 494 { 495 "claim": "Token count is a fundamentally insufficient fairness metric for LLM serving due to prefill-decode asymmetry", 496 "evidence": "Figure 1 demonstrates equal token counts yielding 2.8s vs 6.7s latency and 140% vs 5% throughput differences; Figure 2 provides systematic evidence across token ranges on A100 with LMSYS trace", 497 "supported": "strong" 498 } 499 ], 500 "methodology_tags": [ 501 "benchmark-eval" 502 ], 503 "key_findings": "Equinox formalizes multi-tenant LLM scheduling as a multi-objective optimization problem by separating user fairness (weighted tokens + latency via UFC) from resource fairness (GPU utilization + throughput via RFC), arguing that single-metric token-counting is fundamentally unable to capture the prefill-decode computational asymmetry. The Mixture of Prediction Experts (MoPE) resolves the scheduling paradox by predicting all four required metrics before execution, reducing L1 token-length prediction error from 80 to 33 tokens versus a single proxy model by routing requests to three specialized BERT-based regressors. 
Across S-LoRA, vLLM, and SGLang serving systems on A100 GPUs with LMSYS and ShareGPT traces, Equinox achieves 13% higher Jain's fairness index, up to 1.3× throughput, and 60% lower TTFT versus VTC, with the fairness-throughput trade-off explicitly controlled by tunable α/β weights. An ablation study confirms that both the holistic scheduling algorithm and accurate prediction independently contribute to fairness gains, with neither alone sufficient.", 504 "red_flags": [ 505 { 506 "flag": "No error bars or confidence intervals", 507 "detail": "All figures present point estimates without any uncertainty quantification; experiments appear to be single runs throughout, making it impossible to assess result stability or variance." 508 }, 509 { 510 "flag": "No statistical significance testing", 511 "detail": "All comparative claims rely solely on observed differences without significance tests; improvements of 5-13% in Jain's fairness index are presented without evidence they exceed experimental noise." 512 }, 513 { 514 "flag": "Open-source claim without repository URL", 515 "detail": "The paper repeatedly describes Equinox as 'open-source' but provides no repository URL, DOI, or link to any code release, making the claim unverifiable." 516 }, 517 { 518 "flag": "No limitations section", 519 "detail": "No dedicated limitations or threats-to-validity section exists; a single sentence about multi-node deployment in Section 7.5 is insufficient, and no threats such as workload representativeness or hardware specificity are addressed." 520 }, 521 { 522 "flag": "Generalization overclaim: A100-only results marketed as heterogeneous", 523 "detail": "The abstract claims 'proving fairness under bounded discrepancy across heterogeneous platforms' but all experiments use NVIDIA A100 GPUs (80GB and 40GB variants); results on other GPU architectures (H100, AMD, edge hardware) are entirely absent." 
524 }, 525 { 526 "flag": "94% GPU utilization claim untraceable", 527 "detail": "The abstract and conclusion claim 'maintaining 94% GPU utilization' but this specific figure does not appear in any labeled figure or table in the experimental sections." 528 }, 529 { 530 "flag": "No funding disclosure", 531 "detail": "No funding source is acknowledged; two authors affiliated with industry organizations (UltraRISC Shanghai, China Telecom) raise potential conflict-of-interest questions that are not addressed." 532 } 533 ], 534 "cited_papers": [ 535 { 536 "title": "Fairness in Serving Large Language Models (VTC)", 537 "relevance": "Primary baseline; Equinox directly extends and critiques the Virtual Token Counter's single-metric fairness approach, which is the central prior work" 538 }, 539 { 540 "title": "Efficient Memory Management for Large Language Model Serving with PagedAttention (vLLM)", 541 "relevance": "One of three serving systems Equinox is evaluated on; establishes PagedAttention and continuous batching foundations that Equinox builds upon" 542 }, 543 { 544 "title": "SGLang: Efficient Execution of Structured Language Model Programs", 545 "relevance": "One of three serving systems Equinox is evaluated on; standard benchmark integrated into SGLang is used for evaluation" 546 }, 547 { 548 "title": "Orca: A Distributed Serving System for Transformer-Based Generative Models", 549 "relevance": "Establishes continuous batching as the base serving paradigm that all scheduling approaches in this paper assume" 550 }, 551 { 552 "title": "Taming Throughput-Latency Tradeoff in LLM Inference with Sarathi-Serve", 553 "relevance": "Related serving optimization using chunked prefills; discussed in related work as addressing interference without resolving fairness" 554 }, 555 { 556 "title": "LMSYS-Chat-1M: A Large-Scale Real-World LLM Conversation Dataset", 557 "relevance": "Primary training dataset for MoPE and evaluation trace for real-world workload experiments; used as 
both training corpus and benchmark" 558 }, 559 { 560 "title": "Efficient Interactive LLM Serving with Proxy Model-based Sequence Length Prediction", 561 "relevance": "Direct baseline for MoPE's prediction component; Equinox's single-proxy model baseline uses the methodology from this work" 562 }, 563 { 564 "title": "DistServe: Disaggregating Prefill and Decoding for Goodput-optimized Large Language Model Serving", 565 "relevance": "Related work on prefill-decode disaggregation that addresses the same architectural bifurcation Equinox formalizes as the root cause of fairness failure" 566 } 567 ], 568 "engagement_factors": { 569 "practical_relevance": { 570 "score": 3, 571 "justification": "Directly addresses production LLM serving fairness with an open-source implementation on vLLM and SGLang, targeting a real pain point in multi-tenant deployments." 572 }, 573 "surprise_contrarian": { 574 "score": 1, 575 "justification": "The finding that token counts are poor fairness proxies is intuitive once stated; the dual-counter approach is a natural extension rather than a counterintuitive reversal of conventional wisdom." 576 }, 577 "fear_safety": { 578 "score": 0, 579 "justification": "No AI safety or risk concerns raised; this is a systems efficiency paper about resource allocation fairness." 580 }, 581 "drama_conflict": { 582 "score": 0, 583 "justification": "No controversy or conflict angle; the paper is a straightforward engineering improvement over existing scheduling methods." 584 }, 585 "demo_ability": { 586 "score": 2, 587 "justification": "Claims open-source release integrated with vLLM and SGLang, making it potentially deployable if a repository becomes available; no URL currently provided." 588 }, 589 "brand_recognition": { 590 "score": 1, 591 "justification": "Shanghai Jiao Tong University is a respected research institution but not a prominent AI lab; no affiliation with OpenAI, Google, Meta, or Anthropic." 
592 } 593 }, 594 "hn_data": { 595 "threads": [ 596 { 597 "hn_id": "42898914", 598 "title": "Gradual Disempowerment: How Even Incremental AI Progress Poses Existential Risks", 599 "points": 87, 600 "comments": 84, 601 "url": "https://news.ycombinator.com/item?id=42898914", 602 "created_at": "2025-02-01T15:12:22Z" 603 } 604 ], 605 "top_points": 87, 606 "total_points": 87, 607 "total_comments": 84 608 } 609 }