ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (28115B)


      1 {
      2   "paper": {
      3     "title": "Fast Inference from Transformers via Speculative Decoding",
      4     "authors": [
      5       "Yaniv Leviathan",
      6       "Matan Kalman",
      7       "Yossi Matias"
      8     ],
      9     "year": 2022,
     10     "venue": "International Conference on Machine Learning (ICML 2023)",
     11     "arxiv_id": "2211.17192",
     12     "doi": "10.48550/arXiv.2211.17192"
     13   },
     14   "scan_version": 2,
     15   "active_modules": [
     16     "experimental_rigor",
     17     "data_leakage"
     18   ],
     19   "methodology_tags": [
     20     "theoretical",
     21     "benchmark-eval"
     22   ],
     23   "key_findings": "Speculative decoding accelerates autoregressive Transformer inference by 2X-3X on T5-XXL (11B) compared to the standard T5X implementation, with mathematically proven identical output distributions. The method uses a smaller approximation model to generate speculative token prefixes that the target model verifies in parallel, requiring no retraining or architecture changes. Approximation models roughly two orders of magnitude smaller than the target (e.g., T5-small 77M for T5-XXL 11B) provide the best speed-cost tradeoff, and even trivial n-gram models yield non-zero acceptance rates.",
     24   "checklist": {
     25     "artifacts": {
     26       "code_released": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No source code repository, GitHub link, or code archive is mentioned anywhere in the paper or appendix."
     30       },
     31       "data_released": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "The paper uses publicly available standard benchmarks: WMT EnDe for translation, CNN/DM for summarization, and lm1b for unconditional generation. All datasets are publicly accessible."
     35       },
     36       "environment_specified": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The paper mentions 'single TPU-v4' and 'T5X codebase' but provides no requirements.txt, dependency versions, or environment specification sufficient to recreate the setup."
     40       },
     41       "reproduction_instructions": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The algorithm is described in pseudocode (Algorithm 1) but no runnable implementation or reproduction guide is given."
     45       }
     46     },
     47     "statistical_methodology": {
     48       "confidence_intervals_or_error_bars": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "All speedup results in Table 2 are reported as single point estimates (e.g., '3.4X', '2.6X') with no confidence intervals, error bars, or uncertainty measures."
     52       },
     53       "significance_tests": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No statistical significance tests are performed. Claims of speedup are based on comparing point-estimate walltime measurements (the number of underlying runs is not stated) without any tests for statistical significance."
     57       },
     58       "effect_sizes_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Speedup factors are reported with clear baseline context (e.g., '2.6X' and '3.4X' relative to standard T5X decoding at 1X). Table 1 reports operations and speed vs baseline. Theoretical and empirical values are compared in Table 4."
     62       },
     63       "sample_size_justified": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "α values are measured on '10K tokens generated by Mp' (Section 4.2) but no justification is given for why 10K tokens is sufficient. No power analysis or sample size justification for walltime measurements."
     67       },
     68       "variance_reported": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No variance, standard deviation, or spread measures are reported for any results. All walltime speedups and α values are reported as point estimates, and the paper does not state whether they come from a single run or an average over several runs."
     72       }
     73     },
     74     "evaluation_design": {
     75       "baselines_included": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper compares against the standard T5X autoregressive decoding implementation (Roberts et al., 2022) as the baseline for walltime measurements."
     79       },
     80       "baselines_contemporary": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "T5X is the standard, optimized implementation for T5 models at the time of writing. The paper also discusses and compares against Blockwise Parallel Decoding (Stern et al., 2018) and SAD (Sun et al., 2021) in the related work."
     84       },
     85       "ablation_study": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The paper systematically tests different approximation models (T5-small, T5-base, T5-large, unigram, bigram) and different γ values, showing how each component affects speedup. Tables 2 and 3 serve as ablation studies over the key parameters."
     89       },
     90       "multiple_metrics": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are reported across multiple metrics: walltime speedup (Table 2), acceptance rate α (Table 3), expected number of generated tokens (Equation 1, Figure 2), total arithmetic operations factor (Theorem 3.11, Table 1), and theoretical vs empirical comparison (Table 4)."
     94       },
     95       "human_evaluation": {
     96         "applies": false,
     97         "answer": false,
     98         "justification": "Human evaluation is irrelevant because the paper mathematically proves that the output distribution is identical to standard decoding (Appendix A.1). There is nothing subjective to evaluate."
     99       },
    100       "held_out_test_set": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper uses standard benchmark test sets: WMT EnDe for translation and CNN/DM for summarization, with existing fine-tuned checkpoints. These are established evaluation splits."
    104       },
    105       "per_category_breakdown": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Results are broken down by task (EnDe vs CNN/DM), approximation model size (T5-small, T5-base, T5-large), temperature (0 vs 1), and model family (GPT-like, T5-XXL, LaMDA) in Tables 2 and 3."
    109       },
    110       "failure_cases_discussed": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 6 discusses when the method fails: 'our method is not helpful for configurations where additional computation resources are not available.' Section 3.4 analyzes increased arithmetic operations. Table 2 shows T5-large as Mq gives lower speedup (1.4X-1.7X) due to high cost coefficient c."
    114       },
    115       "negative_results_reported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The paper reports that larger approximation models (T5-large) yield lower speedup despite higher α due to high c values (Table 2: T5-large at 1.4X vs T5-small at 2.6X for EnDe temp=1). Also reports trivial n-gram models yield only small improvements (~1.25X)."
    119       }
    120     },
    121     "claims_and_evidence": {
    122       "abstract_claims_supported": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Abstract claims '2X-3X acceleration' supported by Table 2 (2.3X-3.4X range). 'Identical outputs' proven in Appendix A.1. 'Without retraining or architecture changes' follows from algorithm design (uses off-the-shelf models). All claims verified in the body."
    126       },
    127       "causal_claims_justified": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "The paper's causal claim — that speculative decoding causes inference speedup — is justified through both formal proofs (Theorem 3.8 for expected walltime improvement) and controlled empirical validation (Table 2, same hardware/model with only the decoding algorithm changed)."
    131       },
    132       "generalization_bounded": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The abstract specifies 'We demonstrate it on T5-XXL.' Section 6 bounds applicability: 'our method is not helpful for configurations where additional computation resources are not available' and 'We tested speculative decoding only in the text modality.' The paper explicitly lists untested domains as future work."
    136       },
    137       "alternative_explanations_discussed": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "Appendix A.3 discusses why empirical results differ from theoretical predictions: '(1) optimization differences between our implementation and the baseline, and (2) the simplifying assumption that the βs are i.i.d. being only an approximation.' Section 3.4 discusses increased arithmetic operations as a tradeoff."
    141       },
    142       "proxy_outcome_distinction": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "The paper measures walltime speedup and claims walltime speedup — there is no proxy gap. The measured quantity (wall-clock time reduction) directly corresponds to the claimed benefit (faster inference). The paper also distinguishes between walltime improvement (Theorem 3.8) and arithmetic operations increase (Theorem 3.11)."
    146       }
    147     },
    148     "setup_transparency": {
    149       "model_versions_specified": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "The paper specifies 'standard encoder-decoder T5 version 1.1' with exact model sizes (T5-XXL 11B, T5-small 77M, T5-base 250M, T5-large 800M). GPT-like model architecture is fully described (dim 768, 12 layers, 12 heads, 97M params). LaMDA sizes given (137B, 8B, 2B, 100M)."
    153       },
    154       "prompts_provided": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "The paper does not use prompting. It evaluates standard autoregressive decoding on established NLP tasks (translation, summarization, language modeling) using fine-tuned checkpoints."
    158       },
    159       "hyperparameters_reported": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Temperature settings (0 and 1), γ values (3, 5, 7), batch size (1), and full architecture details for all models are reported. Table 2 lists γ for each experiment. Section 4.2 describes GPT-like model dimensions and tokenization details."
    163       },
    164       "scaffolding_described": {
    165         "applies": false,
    166         "answer": false,
    167         "justification": "No agentic scaffolding is used. The paper presents a decoding algorithm applied to standard autoregressive models."
    168       },
    169       "data_preprocessing_documented": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "The data setup is transparent: standard benchmarks (WMT EnDe, CNN/DM, lm1b) with existing checkpoints. Section 4.1 describes the exact model configurations. Section 4.2 describes α measurement on '10K tokens generated by Mp' with Bert tokenization '8k tokens for all models.'"
    173       }
    174     },
    175     "limitations_and_scope": {
    176       "limitations_section_present": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 6 (Discussion) includes substantive discussion of limitations: 'One limitation of speculative execution in general, and of speculative decoding in particular, is that latency is improved through increased concurrency at the cost of an increased number of arithmetic operations.'"
    180       },
    181       "threats_to_validity_specific": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "Section 6 discusses specific limitations: (1) method requires additional computation resources to be available, (2) not helpful when memory bandwidth is not the bottleneck, (3) only tested in text modality. The i.i.d. assumption for βs is noted as only an approximation (Section 3.1)."
    185       },
    186       "scope_boundaries_stated": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Section 6 explicitly states: 'not helpful for configurations where additional computation resources are not available,' 'in common cases where additional computation resources are available,' and 'We tested speculative decoding only in the text modality, but it might work well in other domains (e.g. images) which would be interesting to experiment with.'"
    190       }
    191     },
    192     "data_integrity": {
    193       "raw_data_available": {
    194         "applies": true,
    195         "answer": false,
    196         "justification": "No raw experimental data (walltime measurements, per-token acceptance decisions, profiler traces) is released. Only aggregated results in tables are provided."
    197       },
    198       "data_collection_described": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Section 4.1 describes the walltime measurement setup: 'batch size of 1 on a single TPU-v4 for both argmax sampling (temp=0) and standard sampling (temp=1).' Section 4.2 describes α measurement: '10K tokens generated by Mp, for each of the settings.'"
    202       },
    203       "recruitment_methods_described": {
    204         "applies": false,
    205         "answer": false,
    206         "justification": "No human participants. Data sources are standard public benchmarks (WMT EnDe, CNN/DM, lm1b)."
    207       },
    208       "data_pipeline_documented": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "The experimental pipeline is straightforward and documented: use existing model checkpoints → run standard decoding and speculative decoding on same inputs → measure walltime. For α measurement: generate 10K tokens with Mp, compute acceptance rates per Corollary 3.6."
    212       }
    213     },
    214     "conflicts_of_interest": {
    215       "funding_disclosed": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding source is disclosed. The acknowledgments section thanks individuals but mentions no grants, funding agencies, or corporate sponsorship. Authors are Google Research employees but this is not stated as a funding disclosure."
    219       },
    220       "affiliations_disclosed": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "Author affiliations are clearly stated: 'Google Research, Mountain View, CA, USA' for all three authors."
    224       },
    225       "funder_independent_of_outcome": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "Google Research employs the authors and has a direct financial interest in faster Transformer inference for its products (LaMDA, PaLM, T5). Google benefits commercially from the demonstrated speedup. The funder is not independent of the outcome."
    229       },
    230       "financial_interests_declared": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No competing interests statement or financial interest declarations are present in the paper."
    234       }
    235     },
    236     "contamination": {
    237       "training_cutoff_stated": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "The paper evaluates inference speed, not model capability on benchmarks. The output distribution is mathematically proven identical to standard decoding (Appendix A.1), so training data contamination is irrelevant to the findings."
    241       },
    242       "train_test_overlap_discussed": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "The paper measures walltime acceleration, not model accuracy. Whether the model saw benchmark data during training has no bearing on the speed measurements."
    246       },
    247       "benchmark_contamination_addressed": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "Contamination is irrelevant because the paper does not evaluate model capability. It evaluates an inference algorithm that produces outputs provably identical to standard decoding."
    251       }
    252     },
    253     "human_studies": {
    254       "pre_registered": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "irb_or_ethics_approval": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "demographics_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "inclusion_exclusion_criteria": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "randomization_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       },
    279       "blinding_described": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in this study."
    283       },
    284       "attrition_reported": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "No human participants in this study."
    288       }
    289     },
    290     "cost_and_practicality": {
    291       "inference_cost_reported": {
    292         "applies": true,
    293         "answer": true,
    294         "justification": "The entire paper is about inference cost/latency. Walltime speedups (2X-3X) reported in Table 2, cost coefficient c estimated from profiler traces (Table 4), and arithmetic operations increase analyzed in Theorem 3.11 and Table 1."
    295       },
    296       "compute_budget_stated": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "Hardware is described ('single TPU-v4', 'batch size of 1') but total computational budget (TPU hours, total experiment time) is not stated."
    300       }
    301     },
    302     "experimental_rigor": {
    303       "seed_sensitivity_reported": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No mention of multiple random seeds. Speculative sampling involves stochastic decisions, but results are reported as point estimates without any seed sensitivity analysis."
    307       },
    308       "number_of_runs_stated": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "For α measurement, '10K tokens generated by Mp' is stated (Section 4.2). However, the number of walltime measurement runs (Table 2) is never stated — it's unclear if speedup numbers are single-run or averaged."
    312       },
    313       "hyperparameter_search_budget": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "The key hyperparameter γ is analyzed theoretically (Section 3.5, Figure 3), and several values are tested empirically. However, no explicit search budget is reported. The selection of γ values for each experiment is not clearly justified beyond theoretical optimality curves."
    317       },
    318       "best_config_selection_justified": {
    319         "applies": true,
    320         "answer": true,
    321         "justification": "All tested configurations are reported in Table 2 (three approximation models × two temperatures × two tasks), not just the best. The star (⋆) marks the best configuration, but all results are transparently shown."
    322       },
    323       "multiple_comparison_correction": {
    324         "applies": false,
    325         "answer": false,
    326         "justification": "No statistical tests are performed, so multiple comparison correction is not applicable."
    327       },
    328       "self_comparison_bias_addressed": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The authors compare their implementation against the T5X baseline but do not discuss potential bias from implementing and optimizing their own method. Appendix A.3 notes 'optimization differences between our implementation and the baseline' but does not discuss this as a source of bias."
    332       },
    333       "compute_budget_vs_performance": {
    334         "applies": true,
    335         "answer": true,
    336         "justification": "A core contribution: Table 1 and Figure 4 present the tradeoff between speedup and arithmetic operations increase for various α and γ. Theorem 3.11 formally analyzes the operations factor. The paper explicitly separates walltime improvement from compute cost."
    337       },
    338       "benchmark_construct_validity": {
    339         "applies": true,
    340         "answer": true,
    341         "justification": "The paper's claim is about walltime speedup, and it measures walltime directly — there is no proxy gap between the benchmark and the claim. The theoretical analysis (Theorems 3.8, 3.11) provides formal guarantees, and empirical measurements validate the theory (Table 4)."
    342       },
    343       "scaffold_confound_addressed": {
    344         "applies": false,
    345         "answer": false,
    346         "justification": "No scaffolding is involved. The paper evaluates a decoding algorithm applied directly to standard models."
    347       }
    348     },
    349     "data_leakage": {
    350       "temporal_leakage_addressed": {
    351         "applies": false,
    352         "answer": false,
    353         "justification": "The paper measures inference speed, not model capability on benchmarks. Output distributions are proven identical to standard decoding, so data leakage has no bearing on the findings."
    354       },
    355       "feature_leakage_addressed": {
    356         "applies": false,
    357         "answer": false,
    358         "justification": "The paper evaluates an inference acceleration algorithm, not model accuracy. Feature leakage is irrelevant."
    359       },
    360       "non_independence_addressed": {
    361         "applies": false,
    362         "answer": false,
    363         "justification": "Independence of train/test data is irrelevant when measuring walltime speedup with provably identical outputs."
    364       },
    365       "leakage_detection_method": {
    366         "applies": false,
    367         "answer": false,
    368         "justification": "Leakage detection is inapplicable since the paper does not evaluate model capability on benchmarks."
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "Speculative decoding produces outputs with the same distribution as standard autoregressive decoding from the target model.",
    375       "evidence": "Formal proof in Appendix A.1 showing P(x = x') = p(x') for any distributions p(x) and q(x), via speculative sampling's accept/reject mechanism.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Speculative decoding achieves 2X-3X walltime speedup on T5-XXL compared to standard T5X implementation.",
    380       "evidence": "Table 2: 2.6X (temp=1) and 3.4X (temp=0) for English-German translation; 2.3X (temp=1) and 3.1X (temp=0) for CNN/DM summarization, all using T5-small as approximation model.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "The method requires no retraining, no architecture changes, and works with off-the-shelf models.",
    385       "evidence": "Algorithm 1 uses existing model checkpoints. Section 4.1: 'We use existing checkpoints for all models.' The algorithm only requires running the target and approximation models as-is.",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "Approximation models approximately two orders of magnitude smaller than the target model provide the best speed-cost tradeoff.",
    390       "evidence": "Table 2: T5-small (77M, ~140x smaller than T5-XXL 11B) yields highest speedup. T5-large (800M, ~14x smaller) has higher α but lower speedup due to cost coefficient c. Section 3.6 discusses the α vs c tradeoff.",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "Even trivial n-gram approximation models yield non-zero acceptance rates and measurable speedup.",
    395       "evidence": "Table 3: bigram model achieves α ≈ 0.2 for T5-XXL on EnDe (T=0), yielding theoretical 1.25X speedup since c ≈ 0 (Section 3.6).",
    396       "supported": "strong"
    397     },
    398     {
    399       "claim": "Theoretical predictions of speedup match empirical measurements.",
    400       "evidence": "Table 4 (Appendix A.3) compares expected vs empirical improvement factors. Most values are within 0.3-0.6X of each other, with larger differences attributed to optimization differences and the i.i.d. approximation.",
    401       "supported": "moderate"
    402     }
    403   ],
    404   "red_flags": [
    405     {
    406       "flag": "Company evaluating its own models",
    407       "detail": "All three authors are Google Research employees. The paper evaluates speedups on Google's T5 and LaMDA models. While the theoretical proofs are independently verifiable and the algorithm guarantees identical outputs, the walltime measurements depend on implementation quality which could favor the authors' method."
    408     },
    409     {
    410       "flag": "No variance or confidence intervals on walltime",
    411       "detail": "All speedup measurements in Table 2 are reported as single point estimates (e.g., '3.4X') with no error bars, standard deviations, or confidence intervals. Walltime measurements on hardware can vary significantly across runs. It is unclear whether results represent a single run or averaged measurements."
    412     },
    413     {
    414       "flag": "Single hardware configuration",
    415       "detail": "All walltime measurements are on a single TPU-v4 with batch size 1. Results may not generalize to other hardware (GPUs, different TPU versions), larger batch sizes, or distributed serving configurations commonly used in production."
    416     }
    417   ],
    418   "cited_papers": [
    419     {
    420       "title": "Language models are few-shot learners",
    421       "authors": ["Tom B. Brown", "Benjamin Mann", "Nick Ryder"],
    422       "year": 2020,
    423       "relevance": "GPT-3 paper — foundational work on large autoregressive language models that motivates the need for inference acceleration."
    424     },
    425     {
    426       "title": "PaLM: Scaling language modeling with pathways",
    427       "authors": ["Aakanksha Chowdhery", "Sharan Narang", "Jacob Devlin"],
    428       "year": 2022,
    429       "arxiv_id": "2204.02311",
    430       "relevance": "Large-scale language model demonstrating the scaling trend that makes inference acceleration increasingly important."
    431     },
    432     {
    433       "title": "LaMDA: Language models for dialog applications",
    434       "authors": ["Romal Thoppilan", "Daniel De Freitas", "Jamie Hall"],
    435       "year": 2022,
    436       "arxiv_id": "2201.08239",
    437       "relevance": "Dialog language model used as one of the target models in speculative decoding experiments (137B parameters)."
    438     },
    439     {
    440       "title": "Distilling the knowledge in a neural network",
    441       "authors": ["Geoffrey E. Hinton", "Oriol Vinyals", "Jeff Dean"],
    442       "year": 2015,
    443       "arxiv_id": "1503.02531",
    444       "relevance": "Foundational work on knowledge distillation — an alternative approach to making large model inference more efficient."
    445     },
    446     {
    447       "title": "Blockwise parallel decoding for deep autoregressive models",
    448       "authors": ["Mitchell Stern", "Noam Shazeer", "Jakob Uszkoreit"],
    449       "year": 2018,
    450       "relevance": "Key predecessor to speculative decoding — decodes several tokens in parallel but limited to greedy decoding and requires custom training."
    451     },
    452     {
    453       "title": "Accelerating large language model decoding with speculative sampling",
    454       "authors": ["Charlie Chen", "Sebastian Borgeaud", "Geoffrey Irving"],
    455       "year": 2023,
    456       "arxiv_id": "2302.01318",
    457       "relevance": "Independent concurrent work on speculative decoding showing similar 2X-2.5X improvements on Chinchilla 70B."
    458     },
    459     {
    460       "title": "Exploring the limits of transfer learning with a unified text-to-text transformer",
    461       "authors": ["Colin Raffel", "Noam Shazeer", "Adam Roberts"],
    462       "year": 2020,
    463       "relevance": "T5 paper — the primary model family (T5-XXL 11B) used for empirical walltime evaluation of speculative decoding."
    464     },
    465     {
    466       "title": "Fast transformer decoding: One write-head is all you need",
    467       "authors": ["Noam Shazeer"],
    468       "year": 2019,
    469       "arxiv_id": "1911.02150",
    470       "relevance": "Multi-query attention for faster Transformer decoding — complementary inference acceleration technique."
    471     },
    472     {
    473       "title": "Attention is all you need",
    474       "authors": ["Ashish Vaswani", "Noam Shazeer", "Niki Parmar"],
    475       "year": 2017,
    476       "relevance": "Original Transformer architecture paper — defines the autoregressive models that speculative decoding accelerates."
    477     },
    478     {
    479       "title": "Sparse is enough in scaling transformers",
    480       "authors": ["Sebastian Jaszczur", "Aakanksha Chowdhery", "Afroz Mohiuddin"],
    481       "year": 2021,
    482       "relevance": "Sparsification approach to efficient Transformer inference — alternative technique that could be combined with speculative decoding."
    483     }
    484   ]
    485 }

Impressum · Datenschutz