ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (19448B)


      1 {
      2   "paper": {
      3     "title": "Aegaeon: Effective GPU Pooling for Concurrent LLM Serving on the Market",
      4     "authors": ["Yuxing Xiang", "Xue Li", "Kun Qian", "Yufan Yang", "Diwen Zhu", "Wenyuan Yu", "Ennan Zhai", "Xuanzhe Liu", "Xin Jin", "Jingren Zhou"],
      5     "year": 2025,
      6     "venue": "SOSP '25",
      7     "doi": "10.1145/3731569.3764815"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No repository URL or code archive is provided in the paper. The system is described as deployed at Alibaba Cloud but no open-source release is mentioned."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The production workload data from Alibaba Cloud Model Studio is not released. ShareGPT is public, but the paper's proprietary workload traces are not available."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "Hardware is described (H800 GPUs, DDR5, Intel Xeon 8469C CPUs) but no software environment specifications (requirements.txt, library versions, CUDA version) are provided beyond mentioning vLLM and Ray."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No reproduction instructions, README, or scripts are provided. The implementation details are described at the design level but not enough for reproduction."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Results are presented as point estimates (e.g., '2-2.5x higher request arrival rates', '82% GPU resource saving') without confidence intervals or error bars."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims Aegaeon outperforms baselines based on comparing numbers directly without any statistical significance tests."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Effect sizes are reported with baseline context throughout, e.g., '2-2.5x higher request arrival rates', '1.5-9x more goodput', 'GPU utilization from 13.3%~33.9% to 48.1%', 'GPUs reduced from 1,192 to 213 (82% saving)'."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No justification for the number of models, workload configurations, or experimental runs chosen. The evaluation covers specific setups but does not discuss why these are sufficient."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No standard deviations or variance across runs is reported. Results appear to be single-run. CDFs are shown for auto-scaling latency but no variance across repeated experiments."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "ServerlessLLM, ServerlessLLM+ (with oracle SJF scheduling), and MuxServe are used as baselines (§7.1)."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "ServerlessLLM (OSDI 2024) and MuxServe (ICML 2024), the baselines evaluated in §7.1, are recent, state-of-the-art works. BlitzScale (OSDI 2025) is also a recent work, though it is not listed among the baselines in §7.1."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "§7.3 breaks down the effectiveness of individual components: latency breakdown (Figure 14), auto-scaling speed (Figure 15), memory fragmentation (Figure 16). §5 shows progressive optimization stages (T0→T1→T2→T3), with component reuse removing 80% of the auto-scaling overhead and the later stages reducing it further."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "SLO attainment, goodput, GPU utilization, auto-scaling latency, KV cache transfer overhead, and memory fragmentation are all reported."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "This is a systems paper about GPU resource management; human evaluation of system outputs is not relevant."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "This is a systems paper, not a machine learning evaluation. There is no train/test split concept applicable here."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are broken down by number of models, RPS, dataset type (ShareGPT, ShareGPT-ix2, ShareGPT-ox2), SLO strictness levels, hardware configurations (H800 vs A10), and model sizes (6B-72B)."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Figure 13(c) shows that under the strictest SLO (0.2x), Aegaeon no longer outperforms MuxServe. The paper acknowledges 'stricter SLOs reduce slack time and limit GPU pooling opportunities.'"
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports that under 0.2x SLO strictness, Aegaeon no longer outperforms MuxServe (Figure 13(c)), and that ServerlessLLM+ can actually underperform ServerlessLLM when SJF causes excessive auto-scaling."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Abstract claims of '2-2.5x higher request arrival rates', '1.5-9x more goodput', '97% auto-scaling overhead reduction', and '82% GPU resource saving (1,192→213)' are all supported by results in §7."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper makes causal claims about token-level auto-scaling improving performance. These are supported by ablation-style analysis (progressive optimization stages) and controlled comparisons against baselines with the same workloads."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper tests on specific hardware (H800, A10), model sizes (6B-72B), and datasets (ShareGPT variants). §7.4 explicitly tests generalization to different hardware and model sizes. Claims are generally scoped to the tested settings."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "No discussion of alternative explanations for the results. The paper does not consider confounds such as whether the improvements are specific to the workload distribution or vLLM backend."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "Model families are listed (Qwen, Llama, InternLM, Yi) with parameter counts but no specific model versions or checkpoint identifiers are provided."
    133       },
    134       "prompts_provided": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "This is a systems paper about serving infrastructure; it does not use prompting as part of its methodology."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Key hyperparameters are reported: MAX_GPSIZE=8, QMAX=4s, TTFT=10s, TBT=100ms, correction factor β=0.625, prefill/decode instance split (6/10). Latency model constants (C1-C5) are described as profiled."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. This is a systems paper about GPU resource management."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Dataset construction is described: ShareGPT is used as base, ShareGPT-ix2 and ShareGPT-ox2 are created by scaling input/output lengths by 2x. Workloads are synthesized with scaled Poisson processes and random sampling (§7.1)."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "No dedicated limitations or threats-to-validity section exists in the paper."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "No threats to validity are discussed anywhere in the paper."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "The paper acknowledges scope boundaries: §7.2 notes MuxServe has advantages 'in extremely latency-sensitive scenarios' (Figure 13(c)). The system targets sporadic multi-model workloads specifically, not single-model high-throughput scenarios."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "No raw experimental data, logs, or production traces are made available for verification."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Production workload statistics are described: 779 models, 167.6M requests, 30K GPUs, arrival rate distributions (Figure 1). Experimental setup details hardware, models, and datasets (§7.1)."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. The data comes from production workloads and standard benchmarks."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The experimental pipeline is documented: workload synthesis from ShareGPT with Poisson processes, dataset variants created by scaling lengths, and the production deployment setup (213 H20 GPUs, 47 models) is described in §7.5."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Acknowledgments section lists National Key Research and Development Program of China (Grant 2022YFB4500700), Fundamental Research Funds for Central Universities, and NSFC grants."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are clearly listed: Peking University and Alibaba Group. The system is deployed at Alibaba Cloud Model Studio."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "Multiple authors are from Alibaba Group, and the system is deployed at Alibaba Cloud. Alibaba has a financial interest in demonstrating GPU cost savings for their cloud platform. Government grants are independent but the corporate affiliation creates a non-independent interest."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is present. Alibaba employees evaluating a system deployed at Alibaba Cloud represents an undeclared potential conflict."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "This paper does not evaluate pre-trained model capabilities on benchmarks. It evaluates a serving system's resource efficiency."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "Not applicable — the paper tests serving infrastructure, not model knowledge."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "Not applicable — no model capability benchmarks are evaluated."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this systems paper."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "The paper reports GPU reduction from 1,192 to 213 GPUs (82% saving) in production. Auto-scaling latencies are reported via CDF (Figure 15). GPU utilization improved from 13.3-33.9% to 48.1%."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": true,
    279         "justification": "Testbed is specified: 2 nodes with 16 NVIDIA H800 80GB GPUs, 2TB DDR5, 192 Intel Xeon CPUs. Production deployment uses 213 H20 GPUs. Implementation is 5,700 lines of Python and CUDA/C++."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "Aegaeon sustains 2-2.5x higher request arrival rates compared to ServerlessLLM",
    286       "evidence": "Figures 11 and 12 show SLO attainment across varying model counts and RPS. At RPS=0.1, Aegaeon supports up to 70 models vs ~35 for ServerlessLLM. At RPS=0.5, the advantage is 2.5x (§7.2).",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "Aegaeon achieves 1.5-9x more goodput compared to existing solutions",
    291       "evidence": "Figures 11(c) and 12 show goodput comparisons with vertical lines indicating maximum sustainable load at 90% SLO target (§7.2).",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "Aegaeon reduces auto-scaling overhead by 97%",
    296       "evidence": "§5 details progressive optimizations from T0 (26.9s for 13B model) to T3, with component reuse removing 80% of the overhead and explicit memory management + KV cache synchronization accounting for the rest of the 97% reduction. Figure 15 shows sub-second scaling in practice.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "Production deployment reduces GPU count from 1,192 to 213 (82% saving)",
    301       "evidence": "§7.5 describes beta deployment at Alibaba Cloud Model Studio serving 47 models (1.8B-72B). Figure 18 shows GPU utilization improvement from 13.3-33.9% to 48.1% over 70 hours.",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "Token-level auto-scaling supports up to 7 models per GPU",
    306       "evidence": "Figure 11(a) shows Aegaeon supporting 70 models on 10 decoding instances at RPS=0.1 with >90% SLO attainment (§7.2).",
    307       "supported": "strong"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval", "case-study"],
    311   "key_findings": "Aegaeon introduces token-level auto-scaling for multi-model LLM serving, enabling preemptive model switching at per-token granularity rather than per-request. Through component reuse, explicit memory management, and fine-grained KV cache synchronization, it reduces auto-scaling overhead by 97%. In controlled experiments, Aegaeon sustains 2-2.5x higher request rates and 1.5-9x more goodput than ServerlessLLM and MuxServe. Production deployment at Alibaba Cloud serving 47 models reduced GPU requirements from 1,192 to 213, an 82% saving.",
    312   "red_flags": [
    313     {
    314       "flag": "Company evaluating own production system",
    315       "detail": "Alibaba Group employees evaluate Aegaeon, which is deployed at Alibaba Cloud Model Studio. The production deployment results (§7.5) come from their own infrastructure with no independent verification possible."
    316     },
    317     {
    318       "flag": "No variance or repeated trials",
    319       "detail": "All experimental results appear to be single runs with no error bars, confidence intervals, or variance across repeated experiments."
    320     },
    321     {
    322       "flag": "No limitations section",
    323       "detail": "The paper lacks a dedicated limitations or threats-to-validity section despite being a systems paper with production claims."
    324     },
    325     {
    326       "flag": "No artifact release",
    327       "detail": "Neither code nor production workload traces are released, making independent reproduction impossible."
    328     }
    329   ],
    330   "cited_papers": [
    331     {
    332       "title": "Efficient Memory Management for Large Language Model Serving with PagedAttention",
    333       "authors": ["Woosuk Kwon", "Zhuohan Li", "Siyuan Zhuang"],
    334       "year": 2023,
    335       "relevance": "Foundational LLM serving system (vLLM) used as Aegaeon's execution backend; central to LLM inference infrastructure."
    336     },
    337     {
    338       "title": "ServerlessLLM: Low-Latency Serverless Inference for Large Language Models",
    339       "authors": ["Yao Fu", "Leyang Xue"],
    340       "year": 2024,
    341       "relevance": "Primary baseline for auto-scaling LLM serving; represents state-of-the-art in serverless LLM inference."
    342     },
    343     {
    344       "title": "MuxServe: Flexible Spatial-Temporal Multiplexing for Multiple LLM Serving",
    345       "authors": ["Jiangfei Duan", "Runyu Lu"],
    346       "year": 2024,
    347       "relevance": "Primary baseline for multiplexing-based multi-model serving on GPUs."
    348     },
    349     {
    350       "title": "DistServe: Disaggregating Prefill and Decoding for Goodput-optimized Large Language Model Serving",
    351       "authors": ["Yinmin Zhong", "Shengyu Liu"],
    352       "year": 2024,
    353       "relevance": "Pioneered prefill-decoding disaggregation adopted by Aegaeon; key architectural influence."
    354     },
    355     {
    356       "title": "BlitzScale: Fast and Live Large Model Autoscaling with O(1) Host Caching",
    357       "authors": ["Dingyan Zhang", "Haotian Wang"],
    358       "year": 2025,
    359       "relevance": "Contemporary auto-scaling system for LLM serving, addressing cold start optimization."
    360     },
    361     {
    362       "title": "Splitwise: Efficient Generative LLM Inference Using Phase Splitting",
    363       "authors": ["Pratyush Patel", "Esha Choukse"],
    364       "year": 2024,
    365       "relevance": "Another approach to disaggregating prefill and decoding phases in LLM inference."
    366     },
    367     {
    368       "title": "AlpaServe: Statistical Multiplexing with Model Parallelism for Deep Learning Serving",
    369       "authors": ["Zhuohan Li", "Lianmin Zheng"],
    370       "year": 2023,
    371       "relevance": "Statistical multiplexing approach to multi-model serving that Aegaeon improves upon."
    372     },
    373     {
    374       "title": "Mooncake: Trading More Storage for Less Computation — A KVCache-centric Architecture for Serving LLM Chatbot",
    375       "authors": ["Ruoyu Qin", "Zheming Li"],
    376       "year": 2025,
    377       "relevance": "KV cache management architecture for LLM serving; related industrial deployment at scale."
    378     }
    379   ]
    380 }

Impressum · Datenschutz