ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v4.json (15809B)


      1 {
      2   "scan_version": 4,
      3   "paper_type": "survey",
      4   "paper": {
      5     "title": "Database Perspective on LLM Inference Systems",
      6     "authors": [
      7       "James Pan",
      8       "Guoliang Li"
      9     ],
     10     "year": 2025,
     11     "venue": "PVLDB",
     12     "arxiv_id": null,
     13     "doi": "10.14778/3750601.3750703"
     14   },
     15   "checklist": {
     16     "claims_and_evidence": {
     17       "abstract_claims_supported": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "The abstract claims to review inference techniques from a database perspective across request processing, execution, and memory management. The paper delivers on this structure in Sections 2.1-2.4.",
     21         "source": "opus"
     22       },
     23       "causal_claims_justified": {
     24         "applies": false,
     25         "answer": false,
     26         "justification": "The paper makes no causal claims; it describes and categorizes existing techniques.",
     27         "source": "opus"
     28       },
     29       "generalization_bounded": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "The paper is appropriately scoped as a tutorial overview and does not overclaim. It explicitly states the intended audience and tutorial duration.",
     33         "source": "opus"
     34       },
     35       "alternative_explanations_discussed": {
     36         "applies": false,
     37         "answer": false,
     38         "justification": "Pure survey/tutorial presenting no empirical results requiring alternative explanations.",
     39         "source": "opus"
     40       },
     41       "proxy_outcome_distinction": {
     42         "applies": false,
     43         "answer": false,
     44         "justification": "This is a tutorial/survey paper with no empirical measurements. It reviews existing inference techniques without making claims backed by the authors' own measurements. No proxy-outcome gap exists because no measurements are taken.",
     45         "source": "opus"
     46       }
     47     },
     48     "limitations_and_scope": {
     49       "limitations_section_present": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "No dedicated limitations section. Section 2.5 discusses open problems in the field but not limitations of the tutorial itself.",
     53         "source": "opus"
     54       },
     55       "threats_to_validity_specific": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "No threats to validity discussed for the tutorial's own coverage or methodology.",
     59         "source": "opus"
     60       },
     61       "scope_boundaries_stated": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "The paper states it covers inference from a 'database perspective' and distinguishes its scope from related tutorial [10] which focuses on trustworthiness/quality. It also specifies intended audience and tutorial duration (1.5 hours).",
     65         "source": "opus"
     66       }
     67     },
     68     "conflicts_of_interest": {
     69       "funding_disclosed": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Acknowledgments section lists funding: National Key R&D Program of China, NSF of China, Shenzhen Project, Zhongguancun Lab, Huawei, and BNRist.",
     73         "source": "opus"
     74       },
     75       "affiliations_disclosed": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Both authors are listed as affiliated with Tsinghua University. The paper does not evaluate Tsinghua-affiliated products.",
     79         "source": "opus"
     80       },
     81       "funder_independent_of_outcome": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Funding sources (government grants, Huawei, academic labs) do not have a direct stake in the tutorial's conclusions. Huawei is listed as a funder but none of its products are prominently featured or evaluated.",
     85         "source": "opus"
     86       },
     87       "financial_interests_declared": {
     88         "applies": true,
     89         "answer": false,
     90         "justification": "No competing interests statement is present in the paper.",
     91         "source": "opus"
     92       }
     93     },
     94     "scope_and_framing": {
     95       "key_terms_defined": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Key terms (prefill/decode phases, KV cache, attention, MoE, quantization, batching, distributed inference) are explained or well-referenced in context.",
     99         "source": "haiku"
    100       },
    101       "intended_contribution_clear": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Contribution is explicit: 'review how these techniques lower inference costs... from the database perspective of request processing, model execution and optimization, and memory management.' Frames existing techniques through a novel database lens.",
    105         "source": "haiku"
    106       },
    107       "engagement_with_prior_work": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Extensively cites and discusses prior work in each section (FlashAttention, PagedAttention, vLLM, SGLang, Mooncake, DeepFlow); compares design approaches; positions relative to related tutorial on LLM trustworthiness.",
    111         "source": "haiku"
    112       }
    113     }
    114   },
    115   "type_checklist": {
    116     "survey": {
    117       "search_and_selection": {
    118         "search_strategy_reproducible": {
    119           "applies": true,
    120           "answer": false,
    121           "justification": "No search strategy described. This is a curated tutorial, not a systematic review. Paper selection is implicit rather than reproducibly defined.",
    122           "source": "haiku"
    123         },
    124         "inclusion_exclusion_explicit": {
    125           "applies": true,
    126           "answer": false,
    127           "justification": "No explicit inclusion/exclusion criteria stated. Systems and papers discussed are selected by the authors without documenting selection rules.",
    128           "source": "haiku"
    129         },
    130         "prisma_or_structured_protocol": {
    131           "applies": true,
    132           "answer": false,
    133           "justification": "No structured protocol (PRISMA, systematic review framework, or equivalent) followed. This is a traditional invited tutorial format.",
    134           "source": "haiku"
    135         },
    136         "search_terms_provided": {
    137           "applies": true,
    138           "answer": false,
    139           "justification": "No search terms or query strategy documented, as expected for a curated tutorial rather than a systematic literature search.",
    140           "source": "haiku"
    141         },
    142         "databases_listed": {
    143           "applies": true,
    144           "answer": false,
    145           "justification": "No databases searched (arXiv, Google Scholar, etc.) are listed, as this is not a systematic literature search.",
    146           "source": "haiku"
    147         },
    148         "screening_process_documented": {
    149           "applies": true,
    150           "answer": false,
    151           "justification": "No screening process (abstract review, full-text eligibility assessment, inter-rater agreement) is documented.",
    152           "source": "haiku"
    153         },
    154         "review_scope_justified": {
    155           "applies": true,
    156           "answer": true,
    157           "justification": "Scope is justified by the database systems perspective and the problem areas (request processing, optimization, memory management); clear motivation provided for focus areas.",
    158           "source": "haiku"
    159         }
    160       },
    161       "synthesis_quality": {
    162         "conflicting_findings_acknowledged": {
    163           "applies": true,
    164           "answer": true,
    165           "justification": "Different systems employ competing architectural choices (centralized vs distributed, latency vs throughput optimization) and trade-offs are discussed, though framed as design choices rather than conflicting empirical findings.",
    166           "source": "haiku"
    167         },
    168         "quality_assessment_of_sources": {
    169           "applies": true,
    170           "answer": false,
    171           "justification": "No systematic quality assessment, risk-of-bias evaluation, or structured rubric applied to cited systems and papers. All are presented equally without critical evaluation.",
    172           "source": "haiku"
    173         },
    174         "publication_bias_discussed": {
    175           "applies": true,
    176           "answer": false,
    177           "justification": "Publication bias not discussed. This is a review of systems rather than papers, but notably many citations are to 2024-2025 preprints; potential bias toward recent/visible work not acknowledged.",
    178           "source": "haiku"
    179         },
    180         "quantitative_synthesis_present": {
    181           "applies": true,
    182           "answer": false,
    183           "justification": "No quantitative synthesis (meta-analysis, performance tables, comparative metrics aggregation). Paper is entirely narrative/descriptive.",
    184           "source": "haiku"
    185         },
    186         "recommendations_supported_by_evidence": {
    187           "applies": false,
    188           "answer": false,
    189           "justification": "No explicit recommendations offered. Section 2.5 identifies open problems (cost estimation, adaptive scheduling, benchmarking) but does not recommend solutions.",
    190           "source": "haiku"
    191         }
    192       }
    193     }
    194   },
    195   "claims": [
    196     {
    197       "claim": "Database systems perspective provides a useful organizing framework for LLM inference optimization",
    198       "evidence": "Paper structures entire review around database concepts (request processing, optimization, memory management); demonstrates how database patterns (batching, scheduling, paging) directly apply to LLM inference.",
    199       "supported": "moderate"
    200     },
    201     {
    202       "claim": "Speculative decoding increases token throughput by using a smaller draft model to propose candidate tokens that the target model verifies in a single execution cycle",
    203       "evidence": "Stated in Section 2.1; referenced to [13] (Leviathan et al., ICML'23).",
    204       "supported": "strong"
    205     },
    206     {
    207       "claim": "PagedAttention enables flexible KV cache management by using non-contiguous memory blocks",
    208       "evidence": "Detailed in Section 2.3; described as dynamic page-based blockwise allocation; referenced to vLLM implementation.",
    209       "supported": "strong"
    210     },
    211     {
    212       "claim": "Prefix sharing (radix tree lookup) reduces recomputation in KV cache by identifying shareable token sequences",
    213       "evidence": "Discussed in Section 2.3 cache persistence; SGLang implementation described in Section 2.4.",
    214       "supported": "moderate"
    215     },
    216     {
    217       "claim": "Quantization of KV cache entries can reduce memory burden while preserving attention similarity",
    218       "evidence": "Section 2.3 describes quantization techniques with importance weighting and outlier tracking; referenced to [15].",
    219       "supported": "moderate"
    220     }
    221   ],
    222   "methodology_tags": [
    223     "meta-analysis",
    224     "theoretical"
    225   ],
    226   "key_findings": "This tutorial reviews LLM inference optimization through a database systems lens, organizing techniques across five areas: request processing (operator design, sequence generation), model optimization (kernels, batching, scheduling), memory management (paging, eviction, quantization, cache persistence), diverse inference system architectures serving different objectives (centralized, distributed, serverless), and open challenges in cost estimation and workload-aware scheduling. The database perspective shows how established systems concepts (request lifecycle management, resource optimization, distributed computing) illuminate LLM inference challenges and enable practitioners to design systems balancing latency, throughput, and memory efficiency.",
    227   "red_flags": [
    228     {
    229       "flag": "Not a systematic review",
    230       "detail": "This is a curated tutorial/invited paper. Selection of systems and papers discussed is implicit; no search strategy, inclusion/exclusion criteria, or screening process documented. Reproducibility limited."
    231     },
    232     {
    233       "flag": "No empirical evaluation",
    234       "detail": "Paper reviews existing systems and techniques but conducts no independent benchmarking or comparative evaluation. All claims rest on cited sources."
    235     },
    236     {
    237       "flag": "No quality assessment of sources",
    238       "detail": "Cited systems (vLLM, SGLang, Mooncake, DeepFlow) treated equally without critical evaluation of rigor, reproducibility, or contribution quality."
    239     },
    240     {
    241       "flag": "Heavy reliance on preprints",
    242       "detail": "Many citations (FlashDecoding, MagicPIG, Ring Attention, etc.) are 2024-2025 arXiv preprints, not peer-reviewed publications. Claims rest partially on preliminary work."
    243     },
    244     {
    245       "flag": "Exclusions not explicitly stated",
    246       "detail": "Paper states its database-perspective scope and distinguishes itself from a related tutorial, but does not state what it intentionally excludes (e.g., training optimization, model evaluation/quality metrics, fair comparisons). Exclusions are implicit."
    247     },
    248     {
    249       "flag": "Limited discussion of limitations",
    250       "detail": "Open problems (Section 2.5) are mentioned but no section discusses limitations of the tutorial itself—e.g., venues not covered, assumptions about audience, systems selection bias."
    251     }
    252   ],
    253   "cited_papers": [
    254     {
    255       "title": "Efficient Memory Management for Large Language Model Serving with PagedAttention",
    256       "relevance": "Core inference system architecture (vLLM); introduces paged memory management for KV cache"
    257     },
    258     {
    259       "title": "SGLang: Efficient Execution of Structured Language Model Programs",
    260       "relevance": "Frontend-runtime co-design for structured output generation; prefix caching optimization"
    261     },
    262     {
    263       "title": "FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness",
    264       "relevance": "Specialized kernel design for efficient attention computation"
    265     },
    266     {
    267       "title": "GQA: Training Generalized Multi-Query Transformer Models from Multi-Head Checkpoints",
    268       "relevance": "Grouped attention design to reduce fundamental compute/memory costs"
    269     },
    270     {
    271       "title": "Mooncake: A KVCache-Centric Disaggregated Architecture for LLM Serving",
    272       "relevance": "Distributed inference architecture; disaggregated prefill/decode design"
    273     },
    274     {
    275       "title": "DeepFlow: Serverless Large Language Model Serving at Scale",
    276       "relevance": "Serverless architecture for disaggregated inference; fine-grained task decomposition"
    277     },
    278     {
    279       "title": "Graph of Thoughts: Solving Elaborate Problems with Large Language Models",
    280       "relevance": "Structured sequence generation technique for improving output quality"
    281     },
    282     {
    283       "title": "Self-Consistency Improves Chain of Thought Reasoning in Language Models",
    284       "relevance": "Multi-pass sequence generation strategy for improved reasoning quality"
    285     },
    286     {
    287       "title": "Fast Inference from Transformers via Speculative Decoding",
    288       "relevance": "Token throughput acceleration via speculative verification"
    289     }
    290   ],
    291   "engagement_factors": {
    292     "practical_relevance": {
    293       "score": 2,
    294       "justification": "Surveys actionable inference systems (vLLM, SGLang, Mooncake) and techniques practitioners deploying LLMs can directly apply."
    295     },
    296     "surprise_contrarian": {
    297       "score": 0,
    298       "justification": "Organizes known techniques into a database framework without challenging any conventional wisdom or presenting unexpected findings."
    299     },
    300     "fear_safety": {
    301       "score": 0,
    302       "justification": "No safety, security, or risk angle is discussed."
    303     },
    304     "drama_conflict": {
    305       "score": 0,
    306       "justification": "A neutral tutorial survey with no controversy, no critique of specific companies, and no conflict."
    307     },
    308     "demo_ability": {
    309       "score": 0,
    310       "justification": "A 4-page tutorial paper with no code, demo, or reproducible artifact."
    311     },
    312     "brand_recognition": {
    313       "score": 1,
    314       "justification": "From Tsinghua University (well-known in CS but not a tech-industry household name) and covers systems like vLLM and SGLang that are known in the MLOps community."
    315     }
    316   },
    317   "hn_data": {
    318     "threads": [],
    319     "top_points": 0,
    320     "total_points": 0,
    321     "total_comments": 0
    322   }
    323 }

Impressum · Datenschutz