scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24534B)
      1 {
      2   "paper": {
      3     "title": "Break the Sequential Dependency of LLM Inference Using Lookahead Decoding",
      4     "authors": ["Yichao Fu", "Peter Bailis", "Ion Stoica", "Hao Zhang"],
      5     "year": 2024,
      6     "arxiv_id": "2402.02057"
      7   },
      8   "checklist": {
      9     "artifacts": {
     10       "code_released": {
     11         "applies": true,
     12         "answer": true,
     13         "justification": "The abstract states 'Our code is avialable [sic] at https://github.com/hao-ai-lab/LookaheadDecoding', providing a working GitHub URL."
     14       },
     15       "data_released": {
     16         "applies": true,
     17         "answer": true,
     18         "justification": "The paper uses publicly available benchmarks: MT-Bench, GSM8K, HumanEval, MBPP, ClassEval, XSum, and CNN/Daily Mail. All are standard public benchmarks the authors did not modify."
     19       },
     20       "environment_specified": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "The paper mentions GPU types (NVIDIA A100 80GB, A100 40GB, RTX 3090), FP16 precision, and frameworks (HuggingFace, FlashAttention, DeepSpeed, Accelerate), but does not provide a requirements.txt, Dockerfile, or detailed dependency/version listing sufficient to recreate the environment."
     24       },
     25       "reproduction_instructions": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No step-by-step reproduction instructions are included in the paper. The algorithms are described in detail and a GitHub link is provided, but the paper itself does not contain a 'Reproducing Results' section or specific commands."
     29       }
     30     },
     31     "statistical_methodology": {
     32       "confidence_intervals_or_error_bars": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "All speedup results (Figures 5, 6, 7, Tables 2, 3) are reported as point estimates (e.g., '1.8x', '4x') without confidence intervals or error bars."
     36       },
     37       "significance_tests": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper makes comparative claims (e.g., Lookahead Decoding vs. autoregressive, vs. TP, vs. PP) but uses no statistical significance tests. Differences are stated by comparing point estimates only."
     41       },
     42       "effect_sizes_reported": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "Speedup ratios are reported with baseline context throughout (e.g., '1.8x speedup on MT-Bench', '4x with strong scaling on multiple GPUs'), providing relative improvement over the autoregressive baseline. Table 2 shows ROUGE scores for both methods alongside speedups."
     46       },
     47       "sample_size_justified": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "No justification is given for why the specific benchmarks, dataset sizes, or number of examples were chosen. For example, GSM8K uses 'the first 1k questions' with no stated rationale."
     51       },
     52       "variance_reported": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No standard deviations, error bars, or variance across runs are reported. The speedup numbers appear to be single-run measurements. The paper does not state results were averaged over multiple runs."
     56       }
     57     },
     58     "evaluation_design": {
     59       "baselines_included": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "The paper compares against HuggingFace greedy search, FlashAttention-augmented autoregressive decoding, tensor parallelism (DeepSpeed), pipeline parallelism (Accelerate), and prompt lookup decoding (Section 5, Table 3)."
     63       },
     64       "baselines_contemporary": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper does not compare against speculative decoding methods (e.g., Medusa, EAGLE, SpecInfer, REST) despite extensively discussing them in Section 6 as related work. The primary baselines are vanilla autoregressive decoding and standard parallelism methods, which are not the state-of-the-art in LLM inference acceleration."
     68       },
     69       "ablation_study": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Section 5.4 provides an ablation study (Table 3) examining the contribution of the lookahead branch and verification branch separately, with various settings of N, W, G, and with/without prompt as reference."
     73       },
     74       "multiple_metrics": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The paper reports tokens/s throughput, speedup ratios, compression ratio (S), and ROUGE-1/2/L scores for generation quality (Table 2). Multiple metrics are used across experiments."
     78       },
     79       "human_evaluation": {
     80         "applies": false,
     81         "answer": false,
     82         "justification": "Human evaluation is irrelevant here. The paper's claims are about inference speed (measured by throughput and latency) and output distribution preservation (measured by ROUGE scores and exact match), both of which are objectively measurable."
     83       },
     84       "held_out_test_set": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "The benchmarks used (MT-Bench, HumanEval, GSM8K, MBPP, ClassEval, XSum, CNN/Daily Mail) are standard evaluation sets. Since the method does not involve training or tuning on these datasets, the test sets are inherently held out."
     88       },
     89       "per_category_breakdown": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Results are broken down by model size (7B, 13B, 34B, 70B), by dataset (MT-Bench, GSM8K, MBPP, HumanEval, ClassEval), by GPU count (1, 4, 8), and by configuration (with/without FlashAttention, LP/TP/PP) in Figures 5, 6, 7."
     93       },
     94       "failure_cases_discussed": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Section 5.5 discusses limitations including compute-bound environments causing slowdowns, lower speedups on RTX 3090 GPUs (Figure 8), and diminishing returns from exponential FLOPs investment. The paper also notes TP and PP cause slowdowns (0.75x-0.82x)."
     98       },
     99       "negative_results_reported": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "The paper reports that tensor parallelism and pipeline parallelism cause slowdowns (0.75x-0.82x) in Figures 6-7, and that the method shows lower speedups on less powerful GPUs (RTX 3090 in Figure 8). Table 3 row 5 shows only 1.04x speedup without prompt reference."
    103       }
    104     },
    105     "claims_and_evidence": {
    106       "abstract_claims_supported": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The abstract claims 'up to 1.8x on MT-bench and 4x with strong scaling on multiple GPUs in code completion tasks.' Figure 6 shows ~1.8x on MT-Bench with FlashAttention, and ClassEval shows up to ~4x on 8 GPUs. The 'exact' decoding claim is supported by the proof in Appendix B and ROUGE results in Table 2."
    110       },
    111       "causal_claims_justified": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The paper makes causal claims through ablation (Section 5.4, Table 3) showing the contribution of individual components (lookahead branch, verification branch, prompt as reference). These are controlled single-variable manipulations. The scaling law derivation (Section 4) provides theoretical grounding for the mechanism."
    115       },
    116       "generalization_bounded": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "The paper tests only on LLaMA-2 and CodeLlama model families but makes general claims about 'LLM inference' and 'autoregressive decoding' in the title and abstract. Results are on specific GPU hardware (A100, RTX 3090) with batch size 1. The title claims to 'break the sequential dependency of LLM inference' broadly, not bounded to tested settings."
    120       },
    121       "alternative_explanations_discussed": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The paper does not discuss alternative explanations for the observed speedups. For example, the higher speedup in code completion vs. chat could be partly due to benchmark-specific patterns rather than the method's general properties. No threats-to-validity section or discussion of confounding factors is present."
    125       }
    126     },
    127     "setup_transparency": {
    128       "model_versions_specified": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "Specific model names are provided: LLaMA-2-Chat (7B, 13B, 70B), CodeLlama (7B, 13B, 34B), CodeLlama-Inst (7B, 13B, 34B), CodeLlama-Python (7B, 13B). These are specific open-source model releases with known weights, not API-accessed models with version ambiguity."
    132       },
    133       "prompts_provided": {
    134         "applies": true,
    135         "answer": true,
    136         "justification": "Appendix D provides the full prompt used for LLaMA-2-Chat on summarization tasks with the complete template text. Other benchmarks (HumanEval, MBPP, GSM8K) use standardized prompts from their benchmark definitions."
    137       },
    138       "hyperparameters_reported": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Key hyperparameters are reported: window size W, n-gram size N, max speculations G, FP16 precision, batch size 1, maximum sequence lengths (512 for HumanEval, 2048 for ClassEval), greedy sampling, and optimal configurations in Table 4. Temperature is stated for sampling experiments (Table 2: 0.0 and 1.0)."
    142       },
    143       "scaffolding_described": {
    144         "applies": false,
    145         "answer": false,
    146         "justification": "No agentic scaffolding is used. This paper proposes a parallel decoding algorithm, not an agent-based system."
    147       },
    148       "data_preprocessing_documented": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Table 1 specifies the datasets, model-dataset pairings, and server configurations. The paper notes using 'the first 1k questions' from GSM8K and states max sequence lengths for code tasks (Section 5). The experimental setup is documented with enough detail."
    152       }
    153     },
    154     "limitations_and_scope": {
    155       "limitations_section_present": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Section 5.5 'Discussion and Limitation' provides a dedicated discussion of limitations including extra computation requirements, diminishing returns, and compute-bound environments."
    159       },
    160       "threats_to_validity_specific": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "Section 5.5 discusses specific threats: the method requires extra FLOPs (120x, 80x, 56x for 7B, 13B, 34B respectively), compute-bound environments may cause slowdowns, lower-end GPUs (RTX 3090) show reduced speedups, and batch serving is problematic. These are specific to this work, not generic disclaimers."
    164       },
    165       "scope_boundaries_stated": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "The paper does not explicitly state what the results do NOT show. For example, it doesn't note that results may not hold for other model architectures beyond LLaMA, for larger batch sizes in production serving, or for non-greedy/non-standard sampling methods. The limitations discuss compute requirements but don't bound the scope of the claims."
    169       }
    170     },
    171     "data_integrity": {
    172       "raw_data_available": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "No raw experimental data (per-example latencies, per-run throughput measurements) is released. Only aggregated speedup numbers are shown in figures and tables."
    176       },
    177       "data_collection_described": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "The experimental setup describes the hardware (S1: A100 80GB, S2: DGX 8xA100 40GB NVLink), models, datasets, precision (FP16), and batch size (1). Table 1 summarizes the experimental matrix."
    181       },
    182       "recruitment_methods_described": {
    183         "applies": false,
    184         "answer": false,
    185         "justification": "No human participants. The study uses standard benchmarks for automated evaluation."
    186       },
    187       "data_pipeline_documented": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "The paper does not document how speedup numbers were measured (e.g., warm-up runs, number of repetitions, whether outliers were excluded). The pipeline from running experiments to producing the reported numbers is not described."
    191       }
    192     },
    193     "conflicts_of_interest": {
    194       "funding_disclosed": {
    195         "applies": true,
    196         "answer": false,
    197         "justification": "No funding acknowledgment or disclosure section is present in the paper."
    198       },
    199       "affiliations_disclosed": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "Author affiliations are clearly stated: Yichao Fu (UCSD), Peter Bailis (Google), Ion Stoica (UC Berkeley), Hao Zhang (UCSD)."
    203       },
    204       "funder_independent_of_outcome": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding is disclosed, so independence cannot be assessed. One author (Peter Bailis) is affiliated with Google, which has a commercial interest in LLM inference acceleration, but no funding relationship is stated."
    208       },
    209       "financial_interests_declared": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No competing interests statement or financial interest disclosure is present in the paper."
    213       }
    214     },
    215     "contamination": {
    216       "training_cutoff_stated": {
    217         "applies": false,
    218         "answer": false,
    219         "justification": "This paper proposes an inference-time decoding algorithm. It does not evaluate a model's knowledge or capability on benchmarks — it measures decoding speed. The benchmarks are used to provide diverse generation tasks, not to test whether the model 'knows' the answers. Contamination is irrelevant to the speedup claims."
    220       },
    221       "train_test_overlap_discussed": {
    222         "applies": false,
    223         "answer": false,
    224         "justification": "Same rationale: the paper measures inference latency/throughput, not model accuracy on benchmarks. Train/test overlap does not affect speedup measurements."
    225       },
    226       "benchmark_contamination_addressed": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "Same rationale: contamination is irrelevant because the paper's claims are about inference speed, not model task performance."
    230       }
    231     },
    232     "human_studies": {
    233       "pre_registered": {
    234         "applies": false,
    235         "answer": false,
    236         "justification": "No human participants in this study."
    237       },
    238       "irb_or_ethics_approval": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "No human participants in this study."
    242       },
    243       "demographics_reported": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this study."
    247       },
    248       "inclusion_exclusion_criteria": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "randomization_described": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "blinding_described": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "attrition_reported": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       }
    268     },
    269     "cost_and_practicality": {
    270       "inference_cost_reported": {
    271         "applies": true,
    272         "answer": true,
    273         "justification": "The paper reports inference throughput (tokens/s) across all experiments and discusses the extra per-step FLOPs overhead (120x, 80x, 56x for 7B, 13B, 34B models respectively in Section 5.5). Wall-clock speedups are the primary metric."
    274       },
    275       "compute_budget_stated": {
    276         "applies": true,
    277         "answer": false,
    278         "justification": "The paper does not state the total computational budget for the experiments (total GPU hours, number of runs, total wall-clock time). Only per-example throughput is reported."
    279       }
    280     }
    281   },
    282   "claims": [
    283     {
    284       "claim": "Lookahead Decoding achieves up to 1.8x speedup on MT-Bench and up to 4x speedup with strong scaling on multiple GPUs in code completion tasks.",
    285       "evidence": "Figure 6 shows 1.8x speedup for 7B model on MT-Bench with FlashAttention on single GPU. Figures 6-7 show up to ~4x speedup on ClassEval with 8 GPUs using Lookahead Parallelism with FlashAttention.",
    286       "supported": "strong"
    287     },
    288     {
    289       "claim": "Lookahead Decoding preserves the output distribution of the original LLM (exact/lossless decoding).",
    290       "evidence": "Theorem A in Appendix B provides a formal proof for sampling verification. Table 2 shows matching ROUGE scores between autoregressive and Lookahead Decoding with greedy search. Appendix E verifies FP32 output matches exactly, and FP16 differences are within numerical error range.",
    291       "supported": "strong"
    292     },
    293     {
    294       "claim": "Lookahead Decoding linearly reduces the number of decoding steps according to per-step log(FLOPs).",
    295       "evidence": "Section 4.2 derives the scaling law formula. Figure 4 shows empirical results on LLaMA-2-Chat-7B on MT-Bench aligning with the theoretical formulation. The authors note the trend 'aligns well with the formulation to some extent.'",
    296       "supported": "moderate"
    297     },
    298     {
    299       "claim": "Lookahead Decoding does not require any auxiliary model or data store, unlike speculative decoding.",
    300       "evidence": "The algorithm (Algorithm 2, Section 3) uses only the base LLM with Jacobi iteration and an n-gram pool generated from the model's own trajectory. No external draft model is needed. This is confirmed by the implementation description.",
    301       "supported": "strong"
    302     },
    303     {
    304       "claim": "FlashAttention integration provides approximately 20% end-to-end speedup over native PyTorch implementation.",
    305       "evidence": "Section 3.3 states 'Applying FlashAttention to Lookahead Decoding brings about 20% end-to-end speedup.' Figures 6-7 show the differences between w/ flash and w/o flash variants.",
    306       "supported": "moderate"
    307     },
    308     {
    309       "claim": "Lookahead Parallelism introduces near-zero communication per step, unlike tensor and pipeline parallelism.",
    310       "evidence": "Section 3.4 explains that LP distributes disjoint branches to different GPUs without interaction during forward pass. Figures 6-7 show LP achieving speedups while TP/PP show slowdowns (0.75x-0.82x).",
    311       "supported": "moderate"
    312     }
    313   ],
    314   "methodology_tags": ["benchmark-eval"],
    315   "key_findings": "Lookahead Decoding is a parallel decoding algorithm that accelerates LLM inference by reformulating autoregressive decoding as Jacobi iteration, generating and verifying n-grams from the iteration trajectory. On LLaMA-2 and CodeLlama models, it achieves 1.5x-2.3x speedups on a single GPU across multiple benchmarks, and up to 4x speedup with multi-GPU Lookahead Parallelism, while provably preserving the output distribution. The method requires no auxiliary draft model and reduces the number of decoding steps linearly in the logarithm of per-step FLOPs, though it requires surplus compute capacity and shows diminishing returns in compute-bound settings.",
    316   "red_flags": [
    317     {
    318       "flag": "Missing comparison with speculative decoding baselines",
    319       "detail": "The paper extensively discusses speculative decoding methods (Medusa, EAGLE, SpecInfer, REST) in the introduction and related work but does not include any direct experimental comparison against them. Since these are the primary alternative approaches for the same problem, their absence from the evaluation makes it impossible to assess relative merit."
    320     },
    321     {
    322       "flag": "No variance or uncertainty quantification",
    323       "detail": "All throughput and speedup results are reported as point estimates without error bars, standard deviations, or confidence intervals. It is unclear whether results are from single runs or averaged, making it impossible to assess result stability."
    324     },
    325     {
    326       "flag": "Batch size 1 only",
    327       "detail": "All experiments use batch size 1. The paper acknowledges in Section 5.5 that compute-bound environments (like larger batch sizes) may cause slowdowns, but does not test this. Real-world serving systems typically use larger batch sizes."
    328     },
    329     {
    330       "flag": "Google affiliation not declared as a potential conflict of interest",
    331       "detail": "Co-author Peter Bailis is affiliated with Google, which has commercial interest in LLM inference optimization. No conflict of interest statement or funding disclosure is provided."
    332     }
    333   ],
    334   "cited_papers": [
    335     {
    336       "title": "Accelerating large language model decoding with speculative sampling",
    337       "authors": ["C. Chen", "S. Borgeaud", "G. Irving", "J.-B. Lespiau", "L. Sifre", "J. Jumper"],
    338       "year": 2023,
    339       "arxiv_id": "2302.01318",
    340       "relevance": "Foundational speculative decoding method that Lookahead Decoding aims to improve upon by removing the need for a draft model."
    341     },
    342     {
    343       "title": "Fast inference from transformers via speculative decoding",
    344       "authors": ["Y. Leviathan", "M. Kalman", "Y. Matias"],
    345       "year": 2023,
    346       "relevance": "Independently proposed speculative decoding; the paper's theoretical analysis builds on this work's acceptance rate framework."
    347     },
    348     {
    349       "title": "Medusa: Simple LLM inference acceleration framework with multiple decoding heads",
    350       "authors": ["T. Cai", "Y. Li", "Z. Geng", "H. Peng", "J. D. Lee", "D. Chen", "T. Dao"],
    351       "year": 2024,
    352       "relevance": "Alternative LLM inference acceleration using trained decoding heads, a key competitor not directly compared in experiments."
    353     },
    354     {
    355       "title": "EAGLE: Lossless acceleration of LLM decoding by feature extrapolation",
    356       "authors": ["Y. Li", "C. Zhang", "H. Zhang"],
    357       "year": 2023,
    358       "relevance": "Training-based draft model approach for LLM inference acceleration, representing an alternative paradigm to Lookahead Decoding."
    359     },
    360     {
    361       "title": "REST: Retrieval-based speculative decoding",
    362       "authors": ["Z. He", "Z. Zhong", "T. Cai", "J. D. Lee", "D. He"],
    363       "year": 2023,
    364       "arxiv_id": "2311.08252",
    365       "relevance": "Uses retrieval from a datastore for speculation, representing the data-store-based approach that Lookahead Decoding eliminates."
    366     },
    367     {
    368       "title": "SpecInfer: Accelerating generative large language model serving with speculative inference and token tree verification",
    369       "authors": ["X. Miao", "G. Oliaro", "Z. Zhang"],
    370       "year": 2023,
    371       "relevance": "Tree-based multi-model speculative decoding whose sampling verification algorithm Lookahead Decoding adapts."
    372     },
    373     {
    374       "title": "Accelerating transformer inference for translation via parallel decoding",
    375       "authors": ["A. Santilli", "S. Severino", "E. Postolache"],
    376       "year": 2023,
    377       "relevance": "Original Jacobi decoding paper that provides the theoretical foundation for Lookahead Decoding's approach."
    378     },
    379     {
    380       "title": "Online speculative decoding",
    381       "authors": ["X. Liu", "L. Hu", "P. Bailis", "I. Stoica", "Z. Deng", "A. Cheung", "H. Zhang"],
    382       "year": 2023,
    383       "relevance": "Related work on online adaptation of speculative decoding from the same research group."
    384     },
    385     {
    386       "title": "Evaluating large language models trained on code",
    387       "authors": ["M. Chen", "J. Tworek", "H. Jun"],
    388       "year": 2021,
    389       "relevance": "Introduces HumanEval benchmark used to evaluate Lookahead Decoding on code completion tasks."
    390     },
    391     {
    392       "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena",
    393       "authors": ["L. Zheng", "W.-L. Chiang", "Y. Sheng"],
    394       "year": 2023,
    395       "relevance": "Introduces MT-Bench, the primary multi-turn chat evaluation dataset used throughout the paper's experiments."
    396     },
    397     {
    398       "title": "FlashAttention-2: Faster attention with better parallelism and work partitioning",
    399       "authors": ["T. Dao"],
    400       "year": 2023,
    401       "relevance": "Key infrastructure component integrated with Lookahead Decoding for practical speedups; compatibility is a contribution of this paper."
    402     }
    403   ]
    404 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs