ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (15738B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "survey",
      4   "paper": {
      5     "title": "Database Perspective on LLM Inference Systems",
      6     "authors": [
      7       "James Pan",
      8       "Guoliang Li"
      9     ],
     10     "year": 2025,
     11     "venue": "PVLDB",
     12     "arxiv_id": null,
     13     "doi": "10.14778/3750601.3750703"
     14   },
     15   "checklist": {
     16     "claims_and_evidence": {
     17       "abstract_claims_supported": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "All abstract claims are supported by paper content: systematically covers request processing (§2.1), model optimization (§2.2), memory management (§2.3), and how systems combine techniques (§2.4).",
     21         "source": "haiku"
     22       },
     23       "causal_claims_justified": {
     24         "applies": false,
     25         "answer": false,
     26         "justification": "Tutorial/review format; no causal claims tested via study design. Technique descriptions attributed entirely to cited papers.",
     27         "source": "haiku"
     28       },
     29       "generalization_bounded": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "Claims stay within the paper's stated domain (LLM inference systems from a database perspective); nothing is generalized beyond it.",
     33         "source": "haiku"
     34       },
     35       "alternative_explanations_discussed": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "Multiple techniques presented (paged allocation vs vAttention, eviction vs offloading) but no comparison, trade-off discussion, or guidance on when each is preferable.",
     39         "source": "haiku"
     40       },
     41       "proxy_outcome_distinction": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "Paper clearly distinguishes measured outcomes (latency, throughput, memory) from claims; explicitly distinguishes prefill vs decode phase metrics.",
     45         "source": "haiku"
     46       }
     47     },
     48     "limitations_and_scope": {
     49       "limitations_section_present": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "Open Problems section (§2.5) discusses limitations: heuristic-based batching/scheduling, uncertain cost estimates, missing benchmarks.",
     53         "source": "haiku"
     54       },
     55       "threats_to_validity_specific": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "Open Problems section is generic and forward-looking ('develop better estimates', 'adaptive techniques') rather than identifying specific threats to reviewed techniques.",
     59         "source": "haiku"
     60       },
     61       "scope_boundaries_stated": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "Scope mentioned implicitly (request processing, optimization, memory) but not explicitly bounded. Does not state what is excluded (training, fine-tuning, inference quality, fairness).",
     65         "source": "haiku"
     66       }
     67     },
     68     "conflicts_of_interest": {
     69       "funding_disclosed": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Acknowledgments explicitly disclose: Chinese National Key R&D Program, NSF of China, Shenzhen Project, Huawei, Zhongguancun Lab, BNRist.",
     73         "source": "haiku"
     74       },
     75       "affiliations_disclosed": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Both authors from Tsinghua University (Li is ACM Fellow); no apparent affiliation with systems reviewed (vLLM, SGLang, Mooncake, DeepFlow).",
     79         "source": "haiku"
     80       },
     81       "funder_independent_of_outcome": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Diverse funders (government + corporate); Huawei involvement disclosed. Tutorial is balanced pedagogical framework, not product advocacy.",
     85         "source": "haiku"
     86       },
     87       "financial_interests_declared": {
     88         "applies": true,
     89         "answer": false,
     90         "justification": "No competing interests statement provided. Standard academic funding context, but no explicit declaration of patents, equity, or consulting relationships.",
     91         "source": "haiku"
     92       }
     93     },
     94     "scope_and_framing": {
     95       "key_terms_defined": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Key terms defined in context: LLM as 'transformer-based' with attention/FFN; prefill/decode phases explained; KV cache, batching, scheduling explained through usage.",
     99         "source": "haiku"
    100       },
    101       "intended_contribution_clear": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Explicitly frames contribution: pedagogical tutorial organizing LLM inference from database systems perspective. Intended audience and contribution clearly stated.",
    105         "source": "haiku"
    106       },
    107       "engagement_with_prior_work": {
    108         "applies": true,
    109         "answer": false,
    110         "justification": "Only a brief 'Related Tutorials' section mentioning one complementary tutorial. No engagement with survey literature, no discussion of how this framework compares to other organizing principles.",
    111         "source": "haiku"
    112       }
    113     }
    114   },
    115   "type_checklist": {
    116     "survey": {
    117       "search_and_selection": {
    118         "search_strategy_reproducible": {
    119           "applies": true,
    120           "answer": false,
    121           "justification": "No search strategy described. Paper does not explain how ~20 systems/techniques were identified or selected from a larger corpus.",
    122           "source": "haiku"
    123         },
    124         "inclusion_exclusion_explicit": {
    125           "applies": true,
    126           "answer": false,
    127           "justification": "No inclusion/exclusion criteria stated. Selection process for cited systems not documented.",
    128           "source": "haiku"
    129         },
    130         "prisma_or_structured_protocol": {
    131           "applies": true,
    132           "answer": false,
    133           "justification": "Organized as pedagogical tutorial (5 sections) rather than systematic review. No mention of PRISMA or structured review protocol.",
    134           "source": "haiku"
    135         },
    136         "search_terms_provided": {
    137           "applies": true,
    138           "answer": false,
    139           "justification": "No search terms, queries, or search strategy provided. Does not describe databases/sources searched.",
    140           "source": "haiku"
    141         },
    142         "databases_listed": {
    143           "applies": true,
    144           "answer": false,
    145           "justification": "Paper does not specify whether sources came from arXiv, Google Scholar, VLDB/SOSP proceedings, or other venues.",
    146           "source": "haiku"
    147         },
    148         "screening_process_documented": {
    149           "applies": true,
    150           "answer": false,
    151           "justification": "No screening documentation. No counts showing how many papers were considered vs. included.",
    152           "source": "haiku"
    153         },
    154         "review_scope_justified": {
    155           "applies": true,
    156           "answer": false,
    157           "justification": "Scope mentioned (request processing, optimization, memory) but not justified. No explanation for choice of techniques, timeframes, or venues.",
    158           "source": "haiku"
    159         }
    160       },
    161       "synthesis_quality": {
    162         "conflicting_findings_acknowledged": {
    163           "applies": true,
    164           "answer": false,
    165           "justification": "Paper presents techniques descriptively but does not discuss conflicting evidence, competing claims, or trade-offs between approaches.",
    166           "source": "haiku"
    167         },
    168         "quality_assessment_of_sources": {
    169           "applies": true,
    170           "answer": false,
    171           "justification": "No quality assessment, risk-of-bias tool, or structured appraisal of reviewed systems. All treated as equally credible.",
    172           "source": "haiku"
    173         },
    174         "publication_bias_discussed": {
    175           "applies": true,
    176           "answer": false,
    177           "justification": "No discussion of publication bias, positive-result bias, or whether reviewed literature skews toward particular findings.",
    178           "source": "haiku"
    179         },
    180         "quantitative_synthesis_present": {
    181           "applies": true,
    182           "answer": false,
    183           "justification": "No meta-analysis, vote counting, or effect size synthesis. Purely narrative descriptions of techniques.",
    184           "source": "haiku"
    185         },
    186         "recommendations_supported_by_evidence": {
    187           "applies": true,
    188           "answer": false,
    189           "justification": "No evidence-based recommendations (e.g., 'use technique X when Y'). Open Problems section is vague forward-looking speculation.",
    190           "source": "haiku"
    191         }
    192       }
    193     }
    194   },
    195   "claims": [
    196     {
    197       "claim": "Prefill phase is compute-intensive; decode phase is memory-intensive, motivating different operator designs",
    198       "evidence": "Stated in abstract and §2.1; motivates discussion of sparse attention vs. KV cache management.",
    199       "supported": "moderate"
    200     },
    201     {
    202       "claim": "FlashAttention reduces memory I/O costs through tiled matrix multiplication and online softmax",
    203       "evidence": "§2.2 Kernels section; cited from reference [6]",
    204       "supported": "moderate"
    205     },
    206     {
    207       "claim": "Request batching increases throughput but introduces ragged tensors that waste GPU computation",
    208       "evidence": "§2.2 Request Batching; mentions TurboTransformers and ByteTransformer solutions",
    209       "supported": "moderate"
    210     },
    211     {
    212       "claim": "KV cache size is unpredictable during autoregressive decoding, requiring dynamic memory management",
    213       "evidence": "§2.3: 'length-constrained generation' noted as exception; dynamic paged allocation presented as solution",
    214       "supported": "moderate"
    215     },
    216     {
    217       "claim": "Prefix sharing via radix trees identifies reusable KV cache across requests, reducing recomputation",
    218       "evidence": "§2.3 Cache Persistence; §2.4 describes SGLang's cache-aware scheduler exploiting prefix sharing",
    219       "supported": "moderate"
    220     },
    221     {
    222       "claim": "Disaggregated prefill/decode architecture improves throughput by adapting hardware to phase-specific requirements",
    223       "evidence": "§2.4 Distributed Systems (Mooncake, DeepFlow); no empirical throughput comparison provided",
    224       "supported": "weak"
    225     }
    226   ],
    227   "methodology_tags": [
    228     "case-study"
    229   ],
    230   "key_findings": "The paper organizes LLM inference system design from a database perspective around four dimensions: (1) request processing via prefill and decode phases with efficient operators (sparse attention, speculative decoding); (2) model execution optimization through specialized kernels (FlashAttention, PagedAttention), intelligent batching, and scheduling algorithms for job prioritization and load balancing; (3) dynamic KV cache management via paged allocation, eviction/offloading, quantization, and prefix-sharing persistence; (4) system architectures combining these techniques (centralized low-latency systems like vLLM vs. distributed high-throughput systems like Mooncake and DeepFlow). The framework suggests LLM inference challenges parallel classical database systems optimization problems.",
    231   "red_flags": [
    232     {
    233       "flag": "Misclassified as systematic survey",
    234       "detail": "Paper is a tutorial, not a systematic literature review. No search strategy, inclusion criteria, screening process, or methodology reported. The survey-specific evaluation criteria are a poor fit for this format, and every one of them is scored as unmet."
    235     },
    236     {
    237       "flag": "No empirical comparison",
    238       "detail": "Describes systems and techniques but provides no benchmarks, direct comparisons, or validation of claims. All effectiveness claims are second-hand citations."
    239     },
    240     {
    241       "flag": "Trade-offs not discussed",
    242       "detail": "Multiple techniques presented for same problem (vLLM vs. vAttention, paged vs. native allocation) without discussing relative costs, latency impact, or appropriateness in different scenarios."
    243     },
    244     {
    245       "flag": "No critical appraisal",
    246       "detail": "Zero quality assessment or risk-of-bias evaluation of reviewed systems. No discussion of limitations in vLLM, SGLang, Mooncake, or DeepFlow designs."
    247     },
    248     {
    249       "flag": "Implicit scope boundaries",
    250       "detail": "What is deliberately excluded is unstated (e.g., training efficiency, inference quality/accuracy, fairness, cost-benefit analysis, failure modes)."
    251     },
    252     {
    253       "flag": "Vague open problems",
    254       "detail": "§2.5 (5 min of 90-min tutorial) provides generic recommendations ('develop more accurate cost estimates') unmoored from evidence synthesis."
    255     }
    256   ],
    257   "cited_papers": [
    258     {
    259       "title": "Attention is All You Need",
    260       "authors": "Vaswani et al.",
    261       "year": 2017,
    262       "relevance": "Foundational transformer architecture underlying all reviewed LLM inference systems"
    263     },
    264     {
    265       "title": "Efficient memory management for large language model serving with PagedAttention",
    266       "authors": "Kwon et al.",
    267       "year": 2023,
    268       "relevance": "vLLM system exemplifying paged KV cache allocation for memory efficiency"
    269     },
    270     {
    271       "title": "FlashAttention: Fast and memory-efficient exact attention with IO-awareness",
    272       "authors": "Dao et al.",
    273       "year": 2022,
    274       "relevance": "Specialized kernel reducing memory I/O costs in attention computation"
    275     },
    276     {
    277       "title": "SGLang: Efficient execution of structured language model programs",
    278       "authors": "Zheng et al.",
    279       "year": 2024,
    280       "relevance": "Frontend-runtime co-design exemplifying structured output optimization and cache-aware scheduling"
    281     },
    282     {
    283       "title": "Mooncake: A KVCache-centric disaggregated architecture for LLM serving",
    284       "authors": "Qin et al.",
    285       "year": 2024,
    286       "relevance": "Distributed disaggregated system exemplifying prefill/decode separation"
    287     },
    288     {
    289       "title": "DeepFlow: Serverless large language model serving at scale",
    290       "authors": "Hu et al.",
    291       "year": 2025,
    292       "relevance": "Serverless distributed system with fine-grained task decomposition for hardware-agnostic scaling"
    293     },
    294     {
    295       "title": "Is the GPU half-empty or half-full? Practical scheduling techniques for LLMs",
    296       "authors": "Kossmann et al.",
    297       "year": 2025,
    298       "relevance": "Addresses job prioritization and scheduling for latency-throughput balance"
    299     },
    300     {
    301       "title": "Taming throughput-latency tradeoff in LLM inference with Sarathi-Serve",
    302       "authors": "Agrawal et al.",
    303       "year": 2024,
    304       "relevance": "System addressing chunked prefill and continuous batching techniques"
    305     }
    306   ],
    307   "engagement_factors": {
    308     "practical_relevance": {
    309       "score": 3,
    310       "justification": "Directly applicable to practitioners; database framework is immediately actionable for inference system design."
    311     },
    312     "surprise_contrarian": {
    313       "score": 1,
    314       "justification": "Frames known techniques in database perspective (useful but not contrarian); does not challenge conventional wisdom."
    315     },
    316     "fear_safety": {
    317       "score": 0,
    318       "justification": "Systems optimization paper; no discussion of AI safety, alignment, or risk concerns."
    319     },
    320     "drama_conflict": {
    321       "score": 0,
    322       "justification": "Straightforward technical tutorial; no controversy, competing claims, or dramatic angles."
    323     },
    324     "demo_ability": {
    325       "score": 3,
    326       "justification": "All systems discussed are open-source (vLLM, SGLang) or publicly described; techniques are implementable."
    327     },
    328     "brand_recognition": {
    329       "score": 3,
    330       "justification": "Top-tier PVLDB venue; Guoliang Li is ACM Fellow; systems reviewed are industry-standard (vLLM from Berkeley, Mooncake from Moonshot AI)."
    331     }
    332   },
    333   "hn_data": {
    334     "threads": [],
    335     "top_points": 0,
    336     "total_points": 0,
    337     "total_comments": 0
    338   }
    339 }

Impressum · Datenschutz