scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (24926B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Equinox: Holistic Fair Scheduling in Serving Large Language Models",
      6     "authors": [
      7       "Zhixiang Wei",
      8       "James Yen",
      9       "Jingyi Chen",
     10       "Ziyang Zhang",
     11       "Zhibai Huang",
     12       "Chen Chen",
     13       "Xingzi Yu",
     14       "Yicheng Gu",
     15       "Chenggang Wu",
     16       "Yun Wang",
     17       "Mingyuan Xia",
     18       "Jie Wu",
     19       "Hao Wang",
     20       "Zhengwei Qi"
     21     ],
     22     "year": 2025,
     23     "venue": "arXiv.org",
     24     "arxiv_id": "2508.16646",
     25     "doi": "10.48550/arXiv.2508.16646"
     26   },
     27   "checklist": {
     28     "claims_and_evidence": {
     29       "abstract_claims_supported": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "The abstract's main claims (1.3× throughput, 60% lower TTFT, 13% higher fairness vs VTC, 94% GPU utilization) are all backed by figures and tables in Sections 7.2–7.3 and the ablation (Table 1).",
     33         "source": "haiku"
     34       },
     35       "causal_claims_justified": {
     36         "applies": true,
     37         "answer": true,
     38         "justification": "The paper uses controlled experiments with clear baselines (FCFS, VTC) and an ablation study in Table 1 that isolates MoPE vs. scheduling-algorithm contributions, providing adequate causal support for a systems paper.",
     39         "source": "haiku"
     40       },
     41       "generalization_bounded": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "The conclusion claims 'proving fairness under bounded discrepancy across heterogeneous platforms,' but experiments are limited to A100 GPUs only; different GPU architectures, CPU-bound serving, or non-chatbot workloads are not tested.",
     45         "source": "haiku"
     46       },
     47       "alternative_explanations_discussed": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "Equinox bundles adaptive batching and stall-free scheduling alongside the fairness algorithm; the ablation isolates MoPE but not these extra optimizations, and no alternative explanations for the throughput gains are discussed.",
     51         "source": "haiku"
     52       },
     53       "proxy_outcome_distinction": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "The paper uses well-defined metrics (Jain's Fairness Index, service difference, TTFT, throughput) and clearly matches claims to the metrics measured; no conflation of proxy metrics with higher-level goals.",
     57         "source": "haiku"
     58       }
     59     },
     60     "limitations_and_scope": {
     61       "limitations_section_present": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "There is no dedicated limitations or threats-to-validity section; Section 7.5 briefly notes multi-node deployment as future engineering work, but this is in the scalability subsection, not a limitations section.",
     65         "source": "haiku"
     66       },
     67       "threats_to_validity_specific": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "No threats to validity are discussed; the paper does not address potential confounds such as workload distribution assumptions, hardware-specific optimizations, or generalizability beyond Llama-2 models.",
     71         "source": "haiku"
     72       },
     73       "scope_boundaries_stated": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "The paper does not explicitly state what the results do not show (e.g., inapplicability to non-chat workloads, non-A100 hardware, models beyond Llama-2, multi-node clusters); the scope is implied but never stated as a boundary.",
     77         "source": "haiku"
     78       }
     79     },
     80     "conflicts_of_interest": {
     81       "funding_disclosed": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "No funding acknowledgment section appears anywhere in the paper; affiliations include UltraRISC Shanghai (a commercial chip company) and China Telecom, raising potential undisclosed interests.",
     85         "source": "haiku"
     86       },
     87       "affiliations_disclosed": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "All author affiliations are listed in the header: Shanghai Jiao Tong University, UltraRISC Shanghai, Cloud Computing Research Institute/China Telecom, and Stevens Institute of Technology.",
     91         "source": "haiku"
     92       },
     93       "funder_independent_of_outcome": {
     94         "applies": false,
     95         "answer": false,
     96         "justification": "No funding is disclosed, so independence cannot be assessed; the affiliations with UltraRISC Shanghai (a hardware company) and China Telecom are noted, but no funder/outcome relationship is established.",
     97         "source": "haiku"
     98       },
     99       "financial_interests_declared": {
    100         "applies": true,
    101         "answer": false,
    102         "justification": "No competing interests statement or declaration of financial interests (patents, equity, consulting) is present anywhere in the paper.",
    103         "source": "haiku"
    104       }
    105     },
    106     "scope_and_framing": {
    107       "key_terms_defined": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Holistic fairness, UFC, RFC, and Jain's Fairness Index are all given precise mathematical definitions in Section 3; prefill-decode bifurcation is explained with the roofline model in Figure 3.",
    111         "source": "haiku"
    112       },
    113       "intended_contribution_clear": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Three explicit contributions are listed: (1) formalizing holistic fairness, (2) the deterministic MoPE framework, and (3) the Equinox open-source system implementation.",
    117         "source": "haiku"
    118       },
    119       "engagement_with_prior_work": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "Section 8 (Related Work) explicitly positions Equinox against VTC, FCFS, chunked-prefill systems (Sarathi-Serve, DistServe), and existing length prediction methods, explaining why each falls short.",
    123         "source": "haiku"
    124       }
    125     }
    126   },
    127   "type_checklist": {
    128     "empirical": {
    129       "artifacts": {
    130         "code_released": {
    131           "applies": true,
    132           "answer": false,
    133           "justification": "The paper calls Equinox 'open-source' but provides no repository URL or code link anywhere in the paper; without a URL, this cannot be verified and fails the strict criterion.",
    134           "source": "haiku"
    135         },
    136         "data_released": {
    137           "applies": true,
    138           "answer": true,
    139           "justification": "Both ShareGPT and LMSYS Chat-1M are publicly available standard datasets used unmodified as workload traces.",
    140           "source": "haiku"
    141         },
    142         "environment_specified": {
    143           "applies": true,
    144           "answer": false,
    145           "justification": "Hardware (A100 GPUs, Intel Xeon Gold 5218) and TP=8 are specified, but no requirements.txt, Dockerfile, or dependency version list is provided; 'implemented in ~1000 lines of Python' is insufficient.",
    146           "source": "haiku"
    147         },
    148         "reproduction_instructions": {
    149           "applies": true,
    150           "answer": false,
    151           "justification": "No step-by-step instructions for reproducing experiments are included; the paper describes the system design but not how to replicate the evaluation setup.",
    152           "source": "haiku"
    153         }
    154       },
    155       "statistical_methodology": {
    156         "confidence_intervals_or_error_bars": {
    157           "applies": true,
    158           "answer": false,
    159           "justification": "No confidence intervals or error bars appear on any result figure or table; all results are reported as point estimates.",
    160           "source": "haiku"
    161         },
    162         "significance_tests": {
    163           "applies": true,
    164           "answer": false,
    165           "justification": "No statistical significance tests are performed for any comparative claims; improvements are stated as direct measurements without p-values or hypothesis tests.",
    166           "source": "haiku"
    167         },
    168         "effect_sizes_reported": {
    169           "applies": true,
    170           "answer": true,
    171           "justification": "Effect sizes are reported in context: 1.3× throughput improvement, 60% TTFT reduction, 13% fairness gain over VTC baseline, all with baselines specified.",
    172           "source": "haiku"
    173         },
    174         "sample_size_justified": {
    175           "applies": true,
    176           "answer": false,
    177           "justification": "No justification is given for the number of clients (256 in SGLang, 27 in S-LoRA, 1-8 in vLLM), request counts (1280, 1000), or experiment durations.",
    178           "source": "haiku"
    179         },
    180         "variance_reported": {
    181           "applies": true,
    182           "answer": false,
    183           "justification": "Table 1 reports variance of service difference, but main metrics (Jain's Index, TTFT, throughput) are reported as single values without standard deviation or variance across runs.",
    184           "source": "haiku"
    185         }
    186       },
    187       "evaluation_design": {
    188         "baselines_included": {
    189           "applies": true,
    190           "answer": true,
    191           "justification": "FCFS and VTC are used as baselines throughout all experiments, and a Single Proxy Model baseline is used for the prediction component in the ablation study.",
    192           "source": "haiku"
    193         },
    194         "baselines_contemporary": {
    195           "applies": true,
    196           "answer": true,
    197           "justification": "VTC (OSDI 2024) is a contemporary and directly relevant baseline; FCFS is the de facto production default and an appropriate lower bound.",
    198           "source": "haiku"
    199         },
    200         "ablation_study": {
    201           "applies": true,
    202           "answer": true,
    203           "justification": "Section 7.4 (Table 1) systematically ablates MoPE by comparing Equinox+Oracle, Equinox+MoPE, Equinox+Single, VTC+Oracle, VTC+MoPE, VTC+Single, isolating both scheduling algorithm and prediction contributions.",
    204           "source": "haiku"
    205         },
    206         "multiple_metrics": {
    207           "applies": true,
    208           "answer": true,
    209           "justification": "Evaluation uses service rate, absolute service difference, Jain's Fairness Index, P50/P90 TTFT, end-to-end latency, GPU utilization, memory bandwidth, and throughput (RPS).",
    210           "source": "haiku"
    211         },
    212         "human_evaluation": {
    213           "applies": false,
    214           "answer": false,
    215           "justification": "This is a systems scheduling paper; human evaluation of outputs is clearly irrelevant to the research questions.",
    216           "source": "haiku"
    217         },
    218         "held_out_test_set": {
    219           "applies": true,
    220           "answer": true,
    221           "justification": "MoPE is trained on LMSYS Chat-1M and explicitly tested for generalizability on the unseen ShareGPT dataset, as stated in Section 7.1.",
    222           "source": "haiku"
    223         },
    224         "per_category_breakdown": {
    225           "applies": true,
    226           "answer": true,
    227           "justification": "Per-client breakdowns are provided in all experiment figures (service rate, latency, fairness index per client); cross-system comparisons appear in Figure 13.",
    228           "source": "haiku"
    229         },
    230         "failure_cases_discussed": {
    231           "applies": true,
    232           "answer": false,
    233           "justification": "No failure cases are shown or discussed; Section 7.5 briefly notes multi-node deployment as future work but does not identify scenarios where Equinox would fail or underperform.",
    234           "source": "haiku"
    235         },
    236         "negative_results_reported": {
    237           "applies": true,
    238           "answer": true,
    239           "justification": "Table 1 shows VTC+Single performs worse than VTC alone (Max Diff 3344 vs 1505), and Section 7.4 explicitly states 'Equinox+single proxy model offers little benefit over VTC with the same predictor.'",
    240           "source": "haiku"
    241         }
    242       },
    243       "setup_transparency": {
    244         "model_versions_specified": {
    245           "applies": true,
    246           "answer": true,
    247           "justification": "Llama-2-7b and Llama-2-70b are specified as the serving models; MoPE uses BERT-base as the regression backbone.",
    248           "source": "haiku"
    249         },
    250         "prompts_provided": {
    251           "applies": false,
    252           "answer": false,
    253           "justification": "This is a serving-system scheduling paper; the concept of LLM prompts as an experimental variable is not applicable to the research design.",
    254           "source": "haiku"
    255         },
    256         "hyperparameters_reported": {
    257           "applies": true,
    258           "answer": true,
    259           "justification": "Key hyperparameters are reported: α=0.7, β=0.3, δ=0.1, 3 experts for MoPE, expert boundaries at 33rd/66th/99th percentiles (<53, 53–210, >210 tokens), and TP=8.",
    260           "source": "haiku"
    261         },
    262         "scaffolding_described": {
    263           "applies": false,
    264           "answer": false,
    265           "justification": "There is no agentic scaffolding in this paper; it is an LLM inference scheduling system, not an agentic pipeline.",
    266           "source": "haiku"
    267         },
    268         "data_preprocessing_documented": {
    269           "applies": true,
    270           "answer": true,
    271           "justification": "MoPE training pipeline is described in Section 6 and Figure 8: feature embedding, similarity lookups, rule-based + data-driven routing, stratified splits, and early stopping are all specified.",
    272           "source": "haiku"
    273         }
    274       },
    275       "data_integrity": {
    276         "raw_data_available": {
    277           "applies": true,
    278           "answer": false,
    279           "justification": "Raw experimental measurements and trace logs are not shared; only summarized results in figures and tables are available.",
    280           "source": "haiku"
    281         },
    282         "data_collection_described": {
    283           "applies": true,
    284           "answer": true,
    285           "justification": "Workloads are described precisely: synthetic scenarios give exact client rates, input/output lengths, and arrival distributions; real traces use publicly described LMSYS Chat-1M and ShareGPT datasets.",
    286           "source": "haiku"
    287         },
    288         "recruitment_methods_described": {
    289           "applies": false,
    290           "answer": false,
    291           "justification": "No human participants involved; workloads use existing public conversation datasets.",
    292           "source": "haiku"
    293         },
    294         "data_pipeline_documented": {
    295           "applies": true,
    296           "answer": true,
    297           "justification": "Figure 8 documents the full MoPE offline training pipeline (dataset → router training → dataset split → expert training) and online prediction workflow.",
    298           "source": "haiku"
    299         }
    300       },
    301       "contamination": {
    302         "training_cutoff_stated": {
    303           "applies": false,
    304           "answer": false,
    305           "justification": "This paper evaluates a scheduling system, not LLM model capabilities on benchmarks; LLM training cutoff is not relevant to the research question.",
    306           "source": "haiku"
    307         },
    308         "train_test_overlap_discussed": {
    309           "applies": false,
    310           "answer": false,
    311           "justification": "Contamination of LLM training data is not applicable; MoPE train/test split uses LMSYS for training and ShareGPT for testing, which is explicitly described.",
    312           "source": "haiku"
    313         },
    314         "benchmark_contamination_addressed": {
    315           "applies": false,
    316           "answer": false,
    317           "justification": "Not evaluating LLM model capabilities on benchmarks; contamination is irrelevant to scheduling algorithm evaluation.",
    318           "source": "haiku"
    319         }
    320       },
    321       "human_studies": {
    322         "pre_registered": {
    323           "applies": false,
    324           "answer": false,
    325           "justification": "No human participants.",
    326           "source": "haiku"
    327         },
    328         "irb_or_ethics_approval": {
    329           "applies": false,
    330           "answer": false,
    331           "justification": "No human participants.",
    332           "source": "haiku"
    333         },
    334         "demographics_reported": {
    335           "applies": false,
    336           "answer": false,
    337           "justification": "No human participants.",
    338           "source": "haiku"
    339         },
    340         "inclusion_exclusion_criteria": {
    341           "applies": false,
    342           "answer": false,
    343           "justification": "No human participants.",
    344           "source": "haiku"
    345         },
    346         "randomization_described": {
    347           "applies": false,
    348           "answer": false,
    349           "justification": "No human participants.",
    350           "source": "haiku"
    351         },
    352         "blinding_described": {
    353           "applies": false,
    354           "answer": false,
    355           "justification": "No human participants.",
    356           "source": "haiku"
    357         },
    358         "attrition_reported": {
    359           "applies": false,
    360           "answer": false,
    361           "justification": "No human participants.",
    362           "source": "haiku"
    363         }
    364       },
    365       "cost_and_practicality": {
    366         "inference_cost_reported": {
    367           "applies": true,
    368           "answer": true,
    369           "justification": "MoPE overhead is explicitly reported as 4.5ms total (<1% of average prompt latency of 2400ms); TTFT and end-to-end latency are the primary evaluation metrics throughout.",
    370           "source": "haiku"
    371         },
    372         "compute_budget_stated": {
    373           "applies": true,
    374           "answer": false,
    375           "justification": "Hardware is specified (A100 GPUs) but total compute budget, experiment runtimes, or GPU-hours for training MoPE and running all evaluations are not reported.",
    376           "source": "haiku"
    377         }
    378       }
    379     }
    380   },
    381   "claims": [
    382     {
    383       "claim": "Equinox achieves up to 1.3× higher throughput compared to VTC",
    384       "evidence": "Figure 9 (balanced load, synthetic) shows 1.3× service rate improvement; Figure 17 (overload) corroborates.",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "Equinox achieves up to 60% lower time-to-first-token latency compared to VTC",
    389       "evidence": "Section 7.2.1 states 'up to 60% lower response times than VTC' (Figure 9a); SGLang real-world shows up to 30% TTFT improvement (Figure 11).",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "Equinox achieves 13% higher fairness (Jain's Index) versus VTC and FCFS across S-LoRA, vLLM, and SGLang",
    394       "evidence": "Figure 13 shows Jain's index: S-LoRA (VTC 0.66 → Equinox 0.80), vLLM (0.76 → 0.90), SGLang (0.73 → 0.88).",
    395       "supported": "strong"
    396     },
    397     {
    398       "claim": "MoPE reduces L1 token prediction error from 80 to 33 tokens versus single proxy models",
    399       "evidence": "Figure 7a shows L1 error: single expert (baseline) = 80, three experts (MoPE) = 33, five experts = 25.",
    400       "supported": "strong"
    401     },
    402     {
    403       "claim": "Equinox+MoPE achieves fairness close to Oracle prediction with only a 17% gap",
    404       "evidence": "Table 1: Equinox+MoPE average service difference = 150.64 vs. Equinox+Oracle = 99.80; gap is approximately 51%, not 17% — the 17% figure appears in the abstract but Table 1 shows larger gaps.",
    405       "supported": "weak"
    406     },
    407     {
    408       "claim": "Token count (VTC) is an inadequate fairness metric due to prefill-decode bifurcation",
    409       "evidence": "Figures 1, 2, and 16 demonstrate that equal token counts produce divergent latency, throughput, and GPU utilization patterns across multiple serving systems.",
    410       "supported": "strong"
    411     }
    412   ],
    413   "methodology_tags": [
    414     "benchmark-eval",
    415     "observational"
    416   ],
    417   "key_findings": "Equinox demonstrates that token-count-based fairness (VTC) is fundamentally inadequate for LLM serving because identical token counts produce divergent latency, throughput, and GPU utilization due to the prefill-decode bifurcation. The dual-counter framework (UFC for user-perceived latency/tokens, RFC for GPU utilization/throughput) combined with MoPE prediction improves Jain's fairness index by ~13%, throughput by up to 1.3×, and TTFT by up to 60% vs. VTC across three serving systems. The ablation study establishes that both accurate prediction and the holistic scheduling algorithm are necessary — neither MoPE alone nor the scheduling algorithm alone achieves the combined benefit.",
    418   "red_flags": [
    419     {
    420       "flag": "No error bars or confidence intervals",
    421       "detail": "All performance results (fairness index, TTFT, throughput) are reported as point estimates, with no variance, confidence intervals, or repeated trial runs."
    422     },
    423     {
    424       "flag": "Open-source claim without repository URL",
    425       "detail": "The abstract and text repeatedly call Equinox 'open-source' but no code repository URL is provided anywhere in the paper, making the claim unverifiable."
    426     },
    427     {
    428       "flag": "Bundled optimizations confound fairness attribution",
    429       "detail": "Equinox includes adaptive batching and stall-free scheduling in addition to the holistic fairness algorithm; the ablation only isolates MoPE, not these extra components, so attribution of throughput gains to the fairness mechanism specifically is unclear."
    430     },
    431     {
    432       "flag": "17% Oracle gap claim inconsistent with Table 1",
    433       "detail": "The abstract claims Equinox+MoPE achieves fairness 'with only a 17% gap' to Oracle, but Table 1 shows average service difference 150.64 (MoPE) vs. 99.80 (Oracle), a ~51% gap, not 17%."
    434     },
    435     {
    436       "flag": "No funding disclosed despite commercial affiliations",
    437       "detail": "Authors include affiliates from UltraRISC Shanghai (commercial chip startup) and China Telecom; no funding source or competing interests are declared."
    438     }
    439   ],
    440   "cited_papers": [
    441     {
    442       "title": "Fairness in Serving Large Language Models (VTC)",
    443       "relevance": "Primary baseline system; Equinox's main contribution is to improve upon VTC's token-count fairness metric"
    444     },
    445     {
    446       "title": "Efficient Memory Management for Large Language Model Serving with PagedAttention (vLLM)",
    447       "relevance": "One of three serving systems Equinox is implemented on and evaluated against"
    448     },
    449     {
    450       "title": "SGLang: Efficient Execution of Structured Language Model Programs",
    451       "relevance": "One of three serving systems used in evaluation; ShareGPT benchmark integrated into SGLang"
    452     },
    453     {
    454       "title": "Taming Throughput-Latency Tradeoff in LLM Inference with Sarathi-Serve",
    455       "relevance": "Related work on chunked prefill; Equinox incorporates and extends this optimization"
    456     },
    457     {
    458       "title": "DistServe: Disaggregating Prefill and Decoding for Goodput-optimized LLM Serving",
    459       "relevance": "Related approach to the prefill-decode bifurcation problem that Equinox addresses from a fairness angle"
    460     },
    461     {
    462       "title": "Efficient Interactive LLM Serving with Proxy Model-based Sequence Length Prediction",
    463       "relevance": "Baseline prediction method that MoPE outperforms; directly compared in Figures 4 and 7"
    464     },
    465     {
    466       "title": "LMSYS-Chat-1M: A Large-Scale Real-World LLM Conversation Dataset",
    467       "relevance": "Primary training and evaluation dataset for MoPE and scheduling experiments"
    468     },
    469     {
    470       "title": "Orca: A Distributed Serving System for Transformer-Based Generative Models",
    471       "relevance": "Foundational continuous batching paper underlying the scheduling context"
    472     }
    473   ],
    474   "engagement_factors": {
    475     "practical_relevance": {
    476       "score": 3,
    477       "justification": "LLM serving fairness directly impacts production multi-tenant deployments; the system is implemented on vLLM/SGLang which are widely used."
    478     },
    479     "surprise_contrarian": {
    480       "score": 2,
    481       "justification": "The finding that VTC's token-count fairness actually worsens fairness compared to FCFS in some metrics (Figure 13) is counterintuitive and contrarian."
    482     },
    483     "fear_safety": {
    484       "score": 0,
    485       "justification": "No AI safety or risk concerns; purely a systems/infrastructure paper."
    486     },
    487     "drama_conflict": {
    488       "score": 1,
    489       "justification": "Frames VTC as fundamentally flawed and broken for LLM serving, but this is a technical argument rather than a dramatic controversy."
    490     },
    491     "demo_ability": {
    492       "score": 2,
    493       "justification": "Implemented on top of vLLM and SGLang which practitioners already use; if the code were released, it would be directly deployable."
    494     },
    495     "brand_recognition": {
    496       "score": 1,
    497       "justification": "Shanghai Jiao Tong University is a well-known Chinese institution but not a top-tier AI lab brand like Google, Meta, or OpenAI."
    498     }
    499   },
    500   "hn_data": {
    501     "threads": [
    502       {
    503         "hn_id": "42898914",
    504         "title": "Gradual Disempowerment: How Even Incremental AI Progress Poses Existential Risks",
    505         "points": 87,
    506         "comments": 84,
    507         "url": "https://news.ycombinator.com/item?id=42898914",
    508         "created_at": "2025-02-01T15:12:22Z"
    509       }
    510     ],
    511     "top_points": 87,
    512     "total_points": 87,
    513     "total_comments": 84
    514   }
    515 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs