scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25645B)
      1 {
      2   "paper": {
      3     "title": "Equinox: Holistic Fair Scheduling in Serving Large Language Models",
      4     "authors": [
      5       "Zhixiang Wei",
      6       "James Yen",
      7       "Jingyi Chen",
      8       "Ziyang Zhang",
      9       "Zhibai Huang",
     10       "Chen Chen",
     11       "Xingzi Yu",
     12       "Yicheng Gu",
     13       "Chenggang Wu",
     14       "Yun Wang",
     15       "Mingyuan Xia",
     16       "Jie Wu",
     17       "Hao Wang",
     18       "Zhengwei Qi"
     19     ],
     20     "year": 2025,
     21     "venue": "arXiv",
     22     "arxiv_id": "2508.16646",
     23     "doi": "10.48550/arXiv.2508.16646"
     24   },
     25   "scan_version": 2,
     26   "active_modules": ["experimental_rigor", "data_leakage"],
     27   "methodology_tags": ["benchmark-eval"],
     28   "key_findings": "Equinox proposes a dual-counter fairness framework (UFC + RFC) for LLM serving that separates user-facing metrics (latency, weighted tokens) from operator-facing metrics (GPU utilization, throughput). A Mixture of Prediction Experts (MoPE) with 3 specialized experts reduces L1 token prediction error from 80 to 33 versus single proxy models. Evaluations on ShareGPT and LMSYS traces across S-LoRA, vLLM, and SGLang show up to 1.3x throughput improvement, 60% lower TTFT, and 13% higher Jain's fairness index versus VTC.",
     29   "checklist": {
     30     "artifacts": {
     31       "code_released": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The paper describes Equinox as 'open-source' (Section 1, Section 4) but provides no repository URL, GitHub link, or archive link anywhere in the paper."
     35       },
     36       "data_released": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "The paper uses publicly available datasets: LMSYS Chatbot Arena (lmsys-chat-1m) [47] and ShareGPT [38]. These are standard public benchmarks."
     40       },
     41       "environment_specified": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "Hardware is specified (A100-80GB, 8×A100-40GB cluster, Intel Xeon Gold 5218, 256GB DRAM), but no software environment details are given — no Python version, no library versions, no requirements.txt or Dockerfile."
     45       },
     46       "reproduction_instructions": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "No reproduction instructions, README, or scripts are provided. The paper mentions ~1000 lines of Python atop existing systems but provides no way to replicate experiments."
     50       }
     51     },
     52     "statistical_methodology": {
     53       "confidence_intervals_or_error_bars": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No confidence intervals or error bars are reported on any of the main results. Figures show point estimates only (e.g., Figures 9-13)."
     57       },
     58       "significance_tests": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "Claims like '1.3× higher throughput' and '60% lower TTFT' are made by comparing raw numbers with no statistical significance tests."
     62       },
     63       "effect_sizes_reported": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Relative improvements are reported with baseline context throughout: '1.3× higher throughput', '60% lower TTFT', '13% higher fairness', '42% worst-case service gap reduction', '86% average gap reduction' (Section 1, Section 7)."
     67       },
     68       "sample_size_justified": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No justification for workload sizes (e.g., 1280 prompts for SGLang, 1000 requests per client for vLLM). No discussion of whether these are sufficient."
     72       },
     73       "variance_reported": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. It is unclear whether experiments were run multiple times."
     77       }
     78     },
     79     "evaluation_design": {
     80       "baselines_included": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "FCFS and VTC are used as baselines throughout all experiments (Section 7.1). For prediction, a single proxy model [31] is the baseline."
     84       },
     85       "baselines_contemporary": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "VTC (Sheng et al., OSDI 2024 [39]) is the state-of-the-art fair scheduling approach for LLM serving. The proxy model baseline is from 2024 [31]. These are recent and competitive."
     89       },
     90       "ablation_study": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Section 7.4 presents an ablation study (Table 1) isolating MoPE's contribution by comparing Equinox and VTC with different predictors (Single, MoPE, Oracle)."
     94       },
     95       "multiple_metrics": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Multiple metrics are used: service rate, service difference, TTFT (P50/P90), end-to-end latency, Jain's fairness index, throughput, GPU utilization (Section 7.1)."
     99       },
    100       "human_evaluation": {
    101         "applies": false,
    102         "answer": false,
    103         "justification": "This is a systems/scheduling paper; human evaluation of system outputs is not relevant to the claims about fairness and throughput."
    104       },
    105       "held_out_test_set": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "MoPE is trained on LMSYS dataset and tested on the unseen ShareGPT dataset (Section 7.3), demonstrating generalization across datasets."
    109       },
    110       "per_category_breakdown": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Results are broken down per-client (Figures 9c, 10d, 12c), per-system (S-LoRA, vLLM, SGLang in Figure 13), per-metric (UFC/RFC components in Figure 5), and per-scenario (balanced, stochastic, overload, dynamic)."
    114       },
    115       "failure_cases_discussed": {
    116         "applies": true,
    117         "answer": false,
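    118         "justification": "No failure cases or scenarios where Equinox underperforms its baselines are discussed. Every head-to-head comparison shows an improvement; the only downside reported is the fairness-throughput tradeoff at high α (Figure 15)."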
    119       },
    120       "negative_results_reported": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "Figure 15 shows the fairness-throughput tradeoff: at α=0.9, throughput drops 20%. The paper also notes VTC+Single performs worse than baseline VTC (Table 1), and acknowledges multi-node deployment needs 'additional engineering efforts' (Section 7.5)."
    124       }
    125     },
    126     "claims_and_evidence": {
    127       "abstract_claims_supported": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Abstract claims of 1.3× throughput (Figures 9c and 17a), 60% lower TTFT (Figure 9a), 13% higher fairness (Figure 13), and 94% GPU utilization (Figure 9b) are supported by results in the paper."
    131       },
    132       "causal_claims_justified": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper makes causal claims about Equinox improving fairness. The ablation study (Table 1) with controlled single-variable manipulation (same predictor, different scheduler; same scheduler, different predictor) adequately supports the causal attribution."
    136       },
    137       "generalization_bounded": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "The paper claims to 'redefine fairness in multi-tenant LLM serving' (Section 9) broadly, but evaluates only on Llama-2-7b and Llama-2-70b models. No evaluation on other model families (e.g., Mistral, GPT) or model sizes. The title and abstract make broad claims not bounded to the tested setting."
    141       },
    142       "alternative_explanations_discussed": {
    143         "applies": true,
    144         "answer": false,
    145         "justification": "No discussion of alternative explanations for the results. For example, could the improvements be due to the specific workload characteristics rather than the general framework design? No robustness checks against different workload distributions."
    146       },
    147       "proxy_outcome_distinction": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "The paper explicitly discusses the gap between token count as a proxy and actual fairness (Section 1, Figure 1), arguing that token-level metrics are inadequate proxies for the multi-dimensional fairness they are meant to capture. The holistic fairness (HF) metric is clearly defined in terms of what it measures (Section 3.3)."
    151       }
    152     },
    153     "setup_transparency": {
    154       "model_versions_specified": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Specific model versions are given: Llama-2-7b for synthetic traces, Llama-2-70b for real-world traces (Section 7.1). BERT-base for MoPE experts (Section 6)."
    158       },
    159       "prompts_provided": {
    160         "applies": false,
    161         "answer": false,
    162         "justification": "This is a systems/scheduling paper that does not use prompting as part of its method. The LLM requests come from existing trace datasets."
    163       },
    164       "hyperparameters_reported": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Key hyperparameters are reported: α=0.7, β=0.3 (Section 7.6), δ=0.1 (Section 3.1), 3 MoPE experts with boundaries at 33rd/66th/99th percentiles (<53, 53-210, >210 tokens), BF16 precision (Section 6), TP=8 for multi-GPU (Section 7.1)."
    168       },
    169       "scaffolding_described": {
    170         "applies": false,
    171         "answer": false,
    172         "justification": "No agentic scaffolding is used. This is a scheduling system, not an agentic AI system."
    173       },
    174       "data_preprocessing_documented": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "MoPE training data partitioning is documented: router trained on up to 120k samples from LMSYS, data split by output length boundaries at 33rd/66th/99th percentiles (Section 6, Figure 7c). Workload configurations are described in detail for each experiment (Sections 7.2-7.3)."
    178       }
    179     },
    180     "limitations_and_scope": {
    181       "limitations_section_present": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "No dedicated limitations section exists. The paper briefly mentions multi-node deployment needing 'additional engineering efforts' (Section 7.5) but this is not a substantive limitations discussion."
    185       },
    186       "threats_to_validity_specific": {
    187         "applies": true,
    188         "answer": false,
    189         "justification": "No threats to validity are discussed. No mention of specific concerns about generalizability beyond the tested models, workloads, or hardware."
    190       },
    191       "scope_boundaries_stated": {
    192         "applies": true,
    193         "answer": false,
    194         "justification": "No explicit scope boundaries are stated. The paper does not clarify what settings or configurations the results do NOT apply to."
    195       }
    196     },
    197     "data_integrity": {
    198       "raw_data_available": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "No raw experimental data (logs, traces, measurements) is made available. Only aggregated results in figures and tables."
    202       },
    203       "data_collection_described": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "Synthetic workload parameters are fully specified (client rates, input/output lengths). Real-world traces are from public datasets (LMSYS, ShareGPT) with setup described in Section 7.1."
    207       },
    208       "recruitment_methods_described": {
    209         "applies": false,
    210         "answer": false,
    211         "justification": "No human participants. Data sources are standard public benchmark traces."
    212       },
    213       "data_pipeline_documented": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "The MoPE training pipeline is documented: training dataset → router training → dataset splitting → expert training (Figure 8). Online prediction pipeline is also documented. Workload generation is specified per experiment."
    217       }
    218     },
    219     "conflicts_of_interest": {
    220       "funding_disclosed": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No funding information or acknowledgments section is present in the paper."
    224       },
    225       "affiliations_disclosed": {
    226         "applies": true,
    227         "answer": true,
    228         "justification": "All author affiliations are listed: Shanghai Jiao Tong University, UltraRISC Shanghai, China Telecom Cloud Computing Research Institute, Stevens Institute of Technology."
    229       },
    230       "funder_independent_of_outcome": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "Funding is not disclosed, so independence cannot be assessed. One author is from UltraRISC and another from China Telecom, which could have commercial interest in LLM serving."
    234       },
    235       "financial_interests_declared": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "No competing interests or financial interests statement is present. One author is affiliated with UltraRISC (a commercial entity), which may have financial interest in LLM serving technology."
    239       }
    240     },
    241     "contamination": {
    242       "training_cutoff_stated": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "This paper evaluates a scheduling system, not a pre-trained model's capability on a benchmark. The LLMs (Llama-2) are the models being served under trace-driven workloads, not the subject of evaluation."
    246       },
    247       "train_test_overlap_discussed": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "Not applicable — the paper evaluates scheduling fairness, not model knowledge. No benchmark contamination concern exists for the scheduling evaluation."
    251       },
    252       "benchmark_contamination_addressed": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "Not applicable — scheduling system evaluation, not model capability evaluation."
    256       }
    257     },
    258     "human_studies": {
    259       "pre_registered": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "irb_or_ethics_approval": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "demographics_reported": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "inclusion_exclusion_criteria": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       },
    279       "randomization_described": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in this study."
    283       },
    284       "blinding_described": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "No human participants in this study."
    288       },
    289       "attrition_reported": {
    290         "applies": false,
    291         "answer": false,
    292         "justification": "No human participants in this study."
    293       }
    294     },
    295     "cost_and_practicality": {
    296       "inference_cost_reported": {
    297         "applies": true,
    298         "answer": true,
    299         "justification": "MoPE overhead is reported: 0.02ms router overhead, 4.5ms total MoPE inference, less than 1% of average prompt latency (Section 6, Figure 7d). Memory usage is reported in Figure 7b."
    300       },
    301       "compute_budget_stated": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "Hardware is listed but total compute budget (GPU hours for training MoPE, total experiment time) is not stated."
    305       }
    306     },
    307     "experimental_rigor": {
    308       "seed_sensitivity_reported": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "No mention of multiple random seeds or seed sensitivity. Stochastic arrival experiments use Poisson processes but it's unclear if multiple seeds were tested."
    312       },
    313       "number_of_runs_stated": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "The number of experimental runs is never stated. It is unclear whether results are from single runs or averaged."
    317       },
    318       "hyperparameter_search_budget": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "α/β sensitivity is studied (Figure 15) but the search budget for other hyperparameters (δ, MoPE expert boundaries, training parameters) is not reported."
    322       },
    323       "best_config_selection_justified": {
    324         "applies": true,
    325         "answer": true,
    326         "justification": "The selection of α=0.7, β=0.3 is justified by the sensitivity analysis in Section 7.6/Figure 15, showing the fairness-throughput tradeoff. The 3-expert MoPE configuration is justified by Figure 7a-b."
    327       },
    328       "multiple_comparison_correction": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "No statistical tests are performed at all, so no multiple comparison correction is applied despite many comparisons across scenarios, metrics, and systems."
    332       },
    333       "self_comparison_bias_addressed": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "The authors implement Equinox and compare it against their own implementations of FCFS and VTC. No acknowledgment of self-comparison bias."
    337       },
    338       "compute_budget_vs_performance": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "No comparison of performance at matched compute budgets. MoPE adds prediction overhead (reported as <1%) but the training cost of MoPE vs. simpler predictors is not compared."
    342       },
    343       "benchmark_construct_validity": {
    344         "applies": true,
    345         "answer": true,
    346         "justification": "Section 1 and Section 2 provide extensive analysis of why token count is an inadequate proxy for fairness (Figures 1-2), directly addressing construct validity of the VTC metric. The paper argues for multi-dimensional fairness as the construct."
    347       },
    348       "scaffold_confound_addressed": {
    349         "applies": false,
    350         "answer": false,
    351         "justification": "No agentic scaffolding is involved. The system evaluates scheduling, not model capability through scaffolds."
    352       }
    353     },
    354     "data_leakage": {
    355       "temporal_leakage_addressed": {
    356         "applies": false,
    357         "answer": false,
    358         "justification": "Not applicable — this evaluates a scheduling system, not model knowledge. The LLM is the model being served, not the evaluation subject."
    359       },
    360       "feature_leakage_addressed": {
    361         "applies": false,
    362         "answer": false,
    363         "justification": "Not applicable — scheduling evaluation does not involve prediction tasks where feature leakage is a concern."
    364       },
    365       "non_independence_addressed": {
    366         "applies": true,
    367         "answer": true,
    368         "justification": "MoPE is trained on LMSYS data and tested on the unseen ShareGPT dataset (Section 7.1, 7.3), ensuring train-test independence for the prediction component."
    369       },
    370       "leakage_detection_method": {
    371         "applies": false,
    372         "answer": false,
    373         "justification": "Not applicable — no model capability benchmark where leakage detection would be relevant."
    374       }
    375     }
    376   },
    377   "claims": [
    378     {
    379       "claim": "Equinox achieves up to 1.3× higher throughput than FCFS and VTC",
    380       "evidence": "Figure 9c and Appendix Figure 17a show per-client service rates 1.3× higher than baselines in the balanced load scenario (Section 7.2.1).",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Equinox reduces time-to-first-token latency by up to 60% versus VTC",
    385       "evidence": "Figure 9a shows response time comparison. The 60% figure appears to come from the balanced load scenario (Section 7.2.1).",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Equinox achieves 13% higher Jain's fairness index compared to FCFS and VTC",
    390       "evidence": "Figure 13 shows cross-system fairness comparison: Equinox achieves 0.80-0.90 vs FCFS 0.60-0.72 and VTC 0.66-0.76 across S-LoRA, vLLM, and SGLang.",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "MoPE reduces L1 prediction error for output tokens from 80 to 33 versus single proxy models",
    395       "evidence": "Figure 7a shows expert specialization results with 3 experts achieving mean L1 error of 33 vs baseline of 80 (Section 6).",
    396       "supported": "moderate"
    397     },
    398     {
    399       "claim": "Equinox cuts worst-case service gaps by 42% and average gaps by 86% versus VTC",
    400       "evidence": "Table 1 ablation study: Equinox+MoPE max diff 865.62 vs VTC 1505.13 (42% reduction), avg diff 150.64 vs 1106.31 (86% reduction).",
    401       "supported": "strong"
    402     },
    403     {
    404       "claim": "MoPE introduces less than 1% latency overhead",
    405       "evidence": "Figure 7d shows MoPE total inference 4.5ms vs average prompt latency 2400ms. Router overhead is 0.02ms (Section 6).",
    406       "supported": "strong"
    407     }
    408   ],
    409   "red_flags": [
    410     {
    411       "flag": "No error bars or variance across runs",
    412       "detail": "None of the main results include error bars, confidence intervals, or variance across runs. For a systems paper with stochastic workloads (Poisson arrivals), this is a significant omission — it's impossible to assess result stability."
    413     },
    414     {
    415       "flag": "Open-source claim without URL",
    416       "detail": "The paper repeatedly describes Equinox as 'open-source' (abstract, Section 1, Section 4) but provides no repository URL or archive link. This makes the claim unverifiable."
    417     },
    418     {
    419       "flag": "No limitations section",
    420       "detail": "The paper has no limitations or threats-to-validity section despite making broad claims about 'redefining fairness in multi-tenant LLM serving.'"
    421     },
    422     {
    423       "flag": "Narrow model evaluation for broad claims",
    424       "detail": "All evaluations use only Llama-2 models (7b and 70b). The paper claims general applicability to LLM serving but does not test on other model families, architectures, or sizes."
    425     },
    426     {
    427       "flag": "Cherry-picked improvement numbers",
    428       "detail": "The abstract highlights 'up to 1.3×' throughput and 'up to 60%' TTFT reduction. These are best-case numbers from specific scenarios. The SGLang results show more modest improvements (up to 30% TTFT, ~25% throughput at high RPS only)."
    429     }
    430   ],
    431   "cited_papers": [
    432     {
    433       "title": "Fairness in Serving Large Language Models",
    434       "authors": ["Ying Sheng", "Shiyi Cao", "Dacheng Li"],
    435       "year": 2024,
    436       "relevance": "VTC paper — the primary baseline for fair scheduling in LLM serving, published at OSDI 2024."
    437     },
    438     {
    439       "title": "Efficient Memory Management for Large Language Model Serving with PagedAttention",
    440       "authors": ["Woosuk Kwon", "Zhuohan Li", "Siyuan Zhuang"],
    441       "year": 2023,
    442       "doi": "10.1145/3600006.3613165",
    443       "relevance": "vLLM paper — foundational LLM serving system with PagedAttention, one of the platforms Equinox is evaluated on."
    444     },
    445     {
    446       "title": "SGLang: Efficient Execution of Structured Language Model Programs",
    447       "authors": ["Lianmin Zheng", "Liangsheng Yin", "Zhiqiang Xie"],
    448       "year": 2024,
    449       "relevance": "SGLang serving system — another platform Equinox is evaluated on, published at NeurIPS 2024."
    450     },
    451     {
    452       "title": "Taming Throughput-Latency Tradeoff in LLM Inference with Sarathi-Serve",
    453       "authors": ["Amey Agrawal", "Nitin Kedia", "Ashish Panwar"],
    454       "year": 2024,
    455       "relevance": "Introduces chunked prefills for LLM serving, addressing prefill-decode interference — a key related system optimization."
    456     },
    457     {
    458       "title": "DistServe: Disaggregating Prefill and Decoding for Goodput-optimized Large Language Model Serving",
    459       "authors": ["Yinmin Zhong", "Shengyu Liu", "Junda Chen"],
    460       "year": 2024,
    461       "arxiv_id": "2401.09670",
    462       "relevance": "Proposes disaggregated prefill/decode serving — directly relevant to the prefill-decode bifurcation problem Equinox addresses."
    463     },
    464     {
    465       "title": "Orca: A Distributed Serving System for Transformer-Based Generative Models",
    466       "authors": ["Gyeong-In Yu", "Joo Seong Jeong", "Geon-Woo Kim"],
    467       "year": 2022,
    468       "relevance": "Introduced continuous batching for LLM serving at OSDI 2022 — foundational technique Equinox builds upon."
    469     },
    470     {
    471       "title": "Efficient Interactive LLM Serving with Proxy Model-based Sequence Length Prediction",
    472       "authors": ["Haoran Qiu", "Weichao Mao", "Archit Patke"],
    473       "year": 2024,
    474       "arxiv_id": "2404.08509",
    475       "relevance": "Proxy model prediction baseline that MoPE is compared against for token length prediction."
    476     },
    477     {
    478       "title": "LoongServe: Efficiently Serving Long-Context Large Language Models with Elastic Sequence Parallelism",
    479       "authors": ["Bingyang Wu", "Shengyu Liu", "Yinmin Zhong"],
    480       "year": 2024,
    481       "doi": "10.1145/3694715.3695948",
    482       "relevance": "Addresses long-context LLM serving challenges, relevant to the workload diversity that Equinox's fairness framework handles."
    483     },
    484     {
    485       "title": "LMSYS-Chat-1M: A Large-Scale Real-World LLM Conversation Dataset",
    486       "authors": ["Lianmin Zheng", "Wei-Lin Chiang", "Ying Sheng"],
    487       "year": 2024,
    488       "arxiv_id": "2309.11998",
    489       "relevance": "Primary dataset used for MoPE training and evaluation traces in Equinox experiments."
    490     },
    491     {
    492       "title": "Past-Future Scheduler for LLM Serving under SLA Guarantees",
    493       "authors": ["Ruihao Gong", "Shihao Bai", "Siyu Wu"],
    494       "year": 2025,
    495       "doi": "10.1145/3676641.3716011",
    496       "relevance": "SLA-aware LLM scheduling at ASPLOS 2025 — related approach to quality-of-service in LLM serving."
    497     }
    498   ]
    499 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs