scan.json (18286B)
1 { 2 "paper": { 3 "title": "Database Perspective on LLM Inference Systems", 4 "authors": [ 5 "James Pan", 6 "Guoliang Li" 7 ], 8 "year": 2025, 9 "venue": "PVLDB", 10 "doi": "10.14778/3750601.3750703" 11 }, 12 "scan_version": 3, 13 "active_modules": [ 14 "survey_methodology" 15 ], 16 "methodology_tags": [ 17 "meta-analysis" 18 ], 19 "key_findings": "This tutorial paper reviews LLM inference techniques from a database perspective, organizing them into request processing, model optimization/execution, and memory management. It surveys centralized systems (vLLM, SGLang), distributed systems (Mooncake, DeepFlow), and frontend frameworks (DSPy, LMQL, LangChain). The paper identifies open problems in cost estimation, adaptive scheduling, and inference benchmarking.", 20 "checklist": { 21 "artifacts": { 22 "code_released": { 23 "applies": true, 24 "answer": false, 25 "justification": "No code or analysis artifacts are released. This is a tutorial paper with no repository link." 26 }, 27 "data_released": { 28 "applies": true, 29 "answer": false, 30 "justification": "No dataset or curated corpus of surveyed systems/techniques is released." 31 }, 32 "environment_specified": { 33 "applies": false, 34 "answer": false, 35 "justification": "Tutorial/survey paper with no computational experiments requiring an environment." 36 }, 37 "reproduction_instructions": { 38 "applies": false, 39 "answer": false, 40 "justification": "No experiments to reproduce; this is a tutorial overview." 41 } 42 }, 43 "statistical_methodology": { 44 "confidence_intervals_or_error_bars": { 45 "applies": false, 46 "answer": false, 47 "justification": "Survey/tutorial paper with no experiments or quantitative results." 48 }, 49 "significance_tests": { 50 "applies": false, 51 "answer": false, 52 "justification": "No comparative claims based on the authors' own experiments." 
53 }, 54 "effect_sizes_reported": { 55 "applies": false, 56 "answer": false, 57 "justification": "No experiments; no effect sizes to report." 58 }, 59 "sample_size_justified": { 60 "applies": false, 61 "answer": false, 62 "justification": "No experiments or data collection performed." 63 }, 64 "variance_reported": { 65 "applies": false, 66 "answer": false, 67 "justification": "No experiments; no variance to report." 68 } 69 }, 70 "evaluation_design": { 71 "baselines_included": { 72 "applies": true, 73 "answer": false, 74 "justification": "The tutorial does not compare against prior survey/tutorial papers systematically. It mentions one related tutorial [10] briefly but does not compare coverage or depth." 75 }, 76 "baselines_contemporary": { 77 "applies": false, 78 "answer": false, 79 "justification": "No systematic baseline comparison is attempted." 80 }, 81 "ablation_study": { 82 "applies": false, 83 "answer": false, 84 "justification": "Tutorial paper; no system or method to ablate." 85 }, 86 "multiple_metrics": { 87 "applies": false, 88 "answer": false, 89 "justification": "No evaluation performed." 90 }, 91 "human_evaluation": { 92 "applies": false, 93 "answer": false, 94 "justification": "Tutorial paper with no system outputs to evaluate." 95 }, 96 "held_out_test_set": { 97 "applies": false, 98 "answer": false, 99 "justification": "No experiments requiring train/test splits." 100 }, 101 "per_category_breakdown": { 102 "applies": true, 103 "answer": true, 104 "justification": "The tutorial organizes techniques into clear categories: request processing, model optimization/execution, memory management, and inference systems (Sections 2.1-2.4)." 105 }, 106 "failure_cases_discussed": { 107 "applies": true, 108 "answer": true, 109 "justification": "Section 2.5 discusses open problems and limitations of current approaches, such as heuristic-based scheduling and lack of accurate cost estimates." 
110 }, 111 "negative_results_reported": { 112 "applies": true, 113 "answer": false, 114 "justification": "The tutorial does not discuss techniques that failed or approaches that were tried and abandoned." 115 } 116 }, 117 "claims_and_evidence": { 118 "abstract_claims_supported": { 119 "applies": true, 120 "answer": true, 121 "justification": "The abstract claims to review inference techniques from a database perspective across request processing, execution, and memory management. The paper delivers on this structure in Sections 2.1-2.4." 122 }, 123 "causal_claims_justified": { 124 "applies": false, 125 "answer": false, 126 "justification": "The paper makes no causal claims; it describes and categorizes existing techniques." 127 }, 128 "generalization_bounded": { 129 "applies": true, 130 "answer": true, 131 "justification": "The paper is appropriately scoped as a tutorial overview and does not overclaim. It explicitly states the intended audience and tutorial duration." 132 }, 133 "alternative_explanations_discussed": { 134 "applies": false, 135 "answer": false, 136 "justification": "Pure survey/tutorial presenting no empirical results requiring alternative explanations." 137 }, 138 "proxy_outcome_distinction": { 139 "applies": false, 140 "answer": false, 141 "justification": "This is a tutorial/survey paper with no empirical measurements. It reviews existing inference techniques without making claims backed by the authors' own measurements. No proxy-outcome gap exists because no measurements are taken." 142 } 143 }, 144 "setup_transparency": { 145 "model_versions_specified": { 146 "applies": false, 147 "answer": false, 148 "justification": "No models are evaluated; this is a survey of techniques." 149 }, 150 "prompts_provided": { 151 "applies": false, 152 "answer": false, 153 "justification": "No prompting is used in this tutorial paper." 
154 }, 155 "hyperparameters_reported": { 156 "applies": false, 157 "answer": false, 158 "justification": "No experiments conducted." 159 }, 160 "scaffolding_described": { 161 "applies": false, 162 "answer": false, 163 "justification": "No agentic scaffolding used." 164 }, 165 "data_preprocessing_documented": { 166 "applies": true, 167 "answer": false, 168 "justification": "No description of how surveyed papers/systems were selected. No search strategy, inclusion/exclusion criteria, or filtering pipeline is documented." 169 } 170 }, 171 "limitations_and_scope": { 172 "limitations_section_present": { 173 "applies": true, 174 "answer": false, 175 "justification": "No dedicated limitations section. Section 2.5 discusses open problems in the field but not limitations of the tutorial itself." 176 }, 177 "threats_to_validity_specific": { 178 "applies": true, 179 "answer": false, 180 "justification": "No threats to validity discussed for the tutorial's own coverage or methodology." 181 }, 182 "scope_boundaries_stated": { 183 "applies": true, 184 "answer": true, 185 "justification": "The paper states it covers inference from a 'database perspective' and distinguishes its scope from related tutorial [10] which focuses on trustworthiness/quality. It also specifies intended audience and tutorial duration (1.5 hours)." 186 } 187 }, 188 "data_integrity": { 189 "raw_data_available": { 190 "applies": true, 191 "answer": false, 192 "justification": "No supplementary data, list of surveyed systems, or structured dataset is made available." 193 }, 194 "data_collection_described": { 195 "applies": true, 196 "answer": false, 197 "justification": "No description of how the surveyed papers and systems were identified or selected." 198 }, 199 "recruitment_methods_described": { 200 "applies": false, 201 "answer": false, 202 "justification": "No human participants; data sources are published papers/systems." 
203 }, 204 "data_pipeline_documented": { 205 "applies": true, 206 "answer": false, 207 "justification": "No documentation of the selection or organization pipeline for the surveyed techniques." 208 } 209 }, 210 "conflicts_of_interest": { 211 "funding_disclosed": { 212 "applies": true, 213 "answer": true, 214 "justification": "Acknowledgments section lists funding: National Key R&D Program of China, NSF of China, Shenzhen Project, Zhongguancun Lab, Huawei, and BNRist." 215 }, 216 "affiliations_disclosed": { 217 "applies": true, 218 "answer": true, 219 "justification": "Both authors are listed as affiliated with Tsinghua University. The paper does not evaluate Tsinghua-affiliated products." 220 }, 221 "funder_independent_of_outcome": { 222 "applies": true, 223 "answer": true, 224 "justification": "Funding sources (government grants, Huawei, academic labs) do not have a direct stake in the tutorial's conclusions. Huawei is listed as a funder but none of its products are prominently featured or evaluated." 225 }, 226 "financial_interests_declared": { 227 "applies": true, 228 "answer": false, 229 "justification": "No competing interests statement is present in the paper." 230 } 231 }, 232 "contamination": { 233 "training_cutoff_stated": { 234 "applies": false, 235 "answer": false, 236 "justification": "Tutorial/survey paper that does not evaluate any pre-trained model on a benchmark." 237 }, 238 "train_test_overlap_discussed": { 239 "applies": false, 240 "answer": false, 241 "justification": "No model evaluation performed." 242 }, 243 "benchmark_contamination_addressed": { 244 "applies": false, 245 "answer": false, 246 "justification": "No model evaluation performed." 247 } 248 }, 249 "human_studies": { 250 "pre_registered": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants." 254 }, 255 "irb_or_ethics_approval": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants." 
259 }, 260 "demographics_reported": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants." 264 }, 265 "inclusion_exclusion_criteria": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants." 269 }, 270 "randomization_described": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants." 274 }, 275 "blinding_described": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants." 279 }, 280 "attrition_reported": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants." 284 } 285 }, 286 "cost_and_practicality": { 287 "inference_cost_reported": { 288 "applies": false, 289 "answer": false, 290 "justification": "Survey/tutorial paper with no method of its own to cost." 291 }, 292 "compute_budget_stated": { 293 "applies": false, 294 "answer": false, 295 "justification": "Survey/tutorial paper with no computational experiments." 296 } 297 }, 298 "survey_methodology": { 299 "prisma_or_structured_protocol": { 300 "applies": true, 301 "answer": false, 302 "justification": "No structured review protocol, PRISMA diagram, or reproducible search strategy is described. The paper selection appears ad-hoc." 303 }, 304 "quality_assessment_of_sources": { 305 "applies": true, 306 "answer": false, 307 "justification": "The tutorial treats all cited systems and techniques without assessing their methodological quality or maturity level." 308 }, 309 "publication_bias_discussed": { 310 "applies": true, 311 "answer": false, 312 "justification": "No discussion of publication bias or whether the surveyed techniques represent a biased sample of the field." 
313 } 314 } 315 }, 316 "claims": [ 317 { 318 "claim": "LLM inference techniques can be organized into a database-style stack: request processing, model optimization/execution, and memory management.", 319 "evidence": "Figure 1 and Sections 2.1-2.3 present this organizational framework.", 320 "supported": "moderate" 321 }, 322 { 323 "claim": "Distributed disaggregated architectures (e.g., Mooncake, DeepFlow) provide higher throughput by flexibly adapting to different needs of prefill and decode phases.", 324 "evidence": "Section 2.4 describes these systems but cites the original papers rather than providing independent evaluation.", 325 "supported": "weak" 326 }, 327 { 328 "claim": "Existing batching and scheduling techniques rely on heuristics and more accurate cost estimation is an open problem.", 329 "evidence": "Section 2.5 states this as an open problem but does not provide quantitative evidence of heuristic inadequacy.", 330 "supported": "moderate" 331 } 332 ], 333 "red_flags": [ 334 { 335 "flag": "No systematic selection methodology", 336 "detail": "The tutorial surveys 20 references but provides no methodology for how these systems and techniques were selected. The coverage appears ad-hoc, making it unclear whether important systems are missing." 337 }, 338 { 339 "flag": "Very short for scope claimed", 340 "detail": "At 4 pages, the paper attempts to cover request processing, optimization, memory management, and system architectures. Each topic receives only surface-level treatment with no independent analysis." 
341 } 342 ], 343 "cited_papers": [ 344 { 345 "title": "Efficient memory management for large language model serving with PagedAttention", 346 "authors": [ 347 "Woosuk Kwon", 348 "Zhuohan Li", 349 "Siyuan Zhuang", 350 "Ying Sheng", 351 "Lianmin Zheng", 352 "Cody Hao Yu", 353 "Joseph Gonzalez", 354 "Hao Zhang", 355 "Ion Stoica" 356 ], 357 "year": 2023, 358 "relevance": "Core infrastructure paper for LLM serving (vLLM); relevant to understanding compute costs and system design for LLM deployment." 359 }, 360 { 361 "title": "SGLang: Efficient execution of structured language model programs", 362 "authors": [ 363 "Lianmin Zheng", 364 "Liangsheng Yin", 365 "Zhiqiang Xie" 366 ], 367 "year": 2024, 368 "arxiv_id": "2312.07104", 369 "relevance": "Frontend-runtime co-designed inference system enabling structured output generation; relevant to agentic programming infrastructure." 370 }, 371 { 372 "title": "FlashAttention: Fast and memory-efficient exact attention with IO-awareness", 373 "authors": [ 374 "Tri Dao", 375 "Daniel Y. Fu", 376 "Stefano Ermon", 377 "Atri Rudra", 378 "Christopher Ré" 379 ], 380 "year": 2022, 381 "relevance": "Foundational kernel optimization for transformer inference, widely used in LLM systems." 382 }, 383 { 384 "title": "Mooncake: A KVCache-centric disaggregated architecture for LLM serving", 385 "authors": [ 386 "Ruoyu Qin", 387 "Zheming Li", 388 "Weiran He" 389 ], 390 "year": 2024, 391 "arxiv_id": "2407.00079", 392 "relevance": "Distributed LLM serving system with prefill/decode disaggregation; relevant to scaling LLM inference." 393 }, 394 { 395 "title": "DeepFlow: Serverless large language model serving at scale", 396 "authors": [ 397 "Junhao Hu", 398 "Jiang Xu", 399 "Zhixia Liu" 400 ], 401 "year": 2025, 402 "arxiv_id": "2501.14417", 403 "relevance": "Serverless LLM inference architecture; relevant to cost and scalability of deploying LLM-based applications." 
404 }, 405 { 406 "title": "LLM-Inference-Bench: Inference benchmarking of large language models on AI accelerators", 407 "authors": [ 408 "Krishna Teja Chitty-Venkata", 409 "Siddhisanket Raskar", 410 "Bharat Kale" 411 ], 412 "year": 2024, 413 "arxiv_id": "2411.00136", 414 "relevance": "Benchmark for LLM inference systems; relevant to evaluation methodology for LLM deployment." 415 }, 416 { 417 "title": "Taming throughput-latency tradeoff in LLM inference with Sarathi-Serve", 418 "authors": [ 419 "Amey Agrawal", 420 "Nitin Kedia", 421 "Ashish Panwar" 422 ], 423 "year": 2024, 424 "relevance": "Addresses chunked prefills and continuous batching for LLM inference throughput-latency tradeoffs." 425 }, 426 { 427 "title": "Fast inference from transformers via speculative decoding", 428 "authors": [ 429 "Yaniv Leviathan", 430 "Matan Kalman", 431 "Yossi Matias" 432 ], 433 "year": 2023, 434 "relevance": "Foundational technique for accelerating LLM inference using draft models." 435 }, 436 { 437 "title": "Attention is all you need", 438 "authors": [ 439 "Ashish Vaswani", 440 "Noam Shazeer", 441 "Niki Parmar" 442 ], 443 "year": 2017, 444 "relevance": "Original transformer architecture paper; foundational to all LLM inference work." 445 } 446 ], 447 "engagement_factors": { 448 "practical_relevance": { 449 "score": 2, 450 "justification": "Surveys actionable inference systems (vLLM, SGLang, Mooncake) and techniques practitioners deploying LLMs can directly apply." 451 }, 452 "surprise_contrarian": { 453 "score": 0, 454 "justification": "Organizes known techniques into a database framework without challenging any conventional wisdom or presenting unexpected findings." 455 }, 456 "fear_safety": { 457 "score": 0, 458 "justification": "No safety, security, or risk angle is discussed." 459 }, 460 "drama_conflict": { 461 "score": 0, 462 "justification": "A neutral tutorial survey with no controversy, no critique of specific companies, and no conflict." 
463 }, 464 "demo_ability": { 465 "score": 0, 466 "justification": "A 4-page tutorial paper with no code, demo, or reproducible artifact." 467 }, 468 "brand_recognition": { 469 "score": 1, 470 "justification": "From Tsinghua University (well-known in CS but not a tech-industry household name) and covers systems like vLLM and SGLang that are known in the MLOps community." 471 } 472 } 473 }