scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (25386B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Fast Inference from Transformers via Speculative Decoding",
      6     "authors": [
      7       "Yaniv Leviathan",
      8       "Matan Kalman",
      9       "Yossi Matias"
     10     ],
     11     "year": 2022,
     12     "venue": "International Conference on Machine Learning",
     13     "arxiv_id": "2211.17192",
     14     "doi": "10.48550/arXiv.2211.17192"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "All abstract claims (2-3X speedup, identical outputs, parallel token generation) are supported by Section 4 empirical results and Section 3 theoretical proofs of output distribution equivalence.",
     22         "source": "haiku"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "Causal claim 'speculative decoding accelerates inference' is justified by empirical measurement on T5X baseline implementation. Controlled comparison with identical model/task setup.",
     28         "source": "haiku"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "Scope bounded to settings where 'additional computation resources are available' and 'memory bandwidth is the bottleneck' (Section 6). Tested across translation, summarization, dialog; results are task/model dependent.",
     34         "source": "haiku"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "Section 5 discusses related acceleration methods (distillation, quantization, adaptive computation). Trade-off explicitly stated: 'latency improved through increased concurrency at the cost of increased arithmetic operations.'",
     40         "source": "haiku"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "Primary outcome is wall-time speedup (measured on TPU); clearly distinguished from number of arithmetic operations (which increases 1.2-1.6X). No conflation of speed with quality.",
     46         "source": "haiku"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "Section 6 Discussion contains explicit limitation: 'One limitation of speculative execution is that latency is improved through increased concurrency at the cost of increased arithmetic operations.' Not a dedicated section but substantive discussion.",
     54         "source": "haiku"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Specific threats: (1) 'Not helpful for configurations where additional computation resources are not available'; (2) i.i.d. β assumption 'being only an approximation' (Appendix A.3); (3) increased memory bandwidth needs.",
     60         "source": "haiku"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "Clear boundaries: 'in common cases where additional computation resources are available'; 'only in text modality' (Section 6); requires memory-bandwidth bottleneck. The paper explicitly states the conditions under which the method does not help.",
     66         "source": "haiku"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No explicit funding statement provided. Authors' Google Research affiliation is clear, but source of research funding is not stated.",
     74         "source": "haiku"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "All three authors clearly listed as Google Research, Mountain View, CA in author attribution line.",
     80         "source": "haiku"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Google funds the work but the algorithm is general-purpose (works with any models) and not promoting Google-specific products. Method is hardware/model-agnostic.",
     86         "source": "haiku"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests statement, no mention of patents or equity. Financial interests are not declared.",
     92         "source": "haiku"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Key terms formally defined: 'speculative decoding' (Section 2), acceptance rate β (Definition 3.1), D_LK divergence (Definition 3.2), approximation model M_q vs target model M_p (Section 2.1).",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Two main contributions explicitly stated at end of introduction: (1) generalization of speculative execution to stochastic setting with speculative sampling; (2) speculative decoding mechanism for inference acceleration.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section 5 systematically compares against prior work: discusses general efficiency approaches, adaptive computation methods, prior speculative execution work (Blockwise Parallel Decoding, SAD), showing how this differs from each.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "empirical": {
    118       "artifacts": {
    119         "code_released": {
    120           "applies": true,
    121           "answer": false,
    122           "justification": "Algorithm 1 (pseudocode) provided but no source code released. No repository, GitHub link, or code availability mentioned.",
    123           "source": "haiku"
    124         },
    125         "data_released": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "Uses standard public benchmarks (WMT EnDe, CNN/DM, lm1b) and existing model checkpoints from published sources. All data/models publicly available.",
    129           "source": "haiku"
    130         },
    131         "environment_specified": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "Hardware specified (TPU-v4) and batch size (1), but no reproducibility setup provided (no Dockerfile, requirements.txt, installation instructions, or software versions beyond model names).",
    135           "source": "haiku"
    136         },
    137         "reproduction_instructions": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "Algorithm 1 describes the method, but no step-by-step instructions for reproducing experiments. No code, no setup guide, no data download instructions provided.",
    141           "source": "haiku"
    142         }
    143       },
    144       "statistical_methodology": {
    145         "confidence_intervals_or_error_bars": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "Table 2 reports point estimates only (3.4X, 2.6X, etc.) with no error bars, confidence intervals, or uncertainty quantification across multiple runs.",
    149           "source": "haiku"
    150         },
    151         "significance_tests": {
    152           "applies": false,
    153           "answer": false,
    154           "justification": "Systems performance paper measuring concrete speedups; statistical significance testing not standard for this work type. No hypothesis tests conducted.",
    155           "source": "haiku"
    156         },
    157         "effect_sizes_reported": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "Speedup factors clearly reported as effect sizes (2.6X, 3.4X on translation; 2.3X, 3.1X on summarization). Compared against T5X baseline.",
    161           "source": "haiku"
    162         },
    163         "sample_size_justified": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "Section 4.2 evaluates acceptance rate α on '10K tokens generated by M_p' but provides no justification for this sample size or discussion of sufficiency.",
    167           "source": "haiku"
    168         },
    169         "variance_reported": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "Table 2 shows single-run measurements with no variance, standard deviation, or confidence intervals. No multiple runs or error bars reported.",
    173           "source": "haiku"
    174         }
    175       },
    176       "evaluation_design": {
    177         "baselines_included": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "Compared against 'robust T5X implementation' (standard baseline). Speculative decoding vs standard decoding comparison shown in Table 2.",
    181           "source": "haiku"
    182         },
    183         "baselines_contemporary": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "T5X is described as popular, optimized implementation contemporary to this work (2022). Roberts et al. 2022 cited for T5X baseline.",
    187           "source": "haiku"
    188         },
    189         "ablation_study": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "Ablations across: approximation model size (T5-small/base/large), temperature (0 vs 1), γ parameter (varying values), multiple tasks (translation, summarization), and model families (T5, LaMDA, GPT-like).",
    193           "source": "haiku"
    194         },
    195         "multiple_metrics": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "Wall-time speedup (primary), acceptance rate α, arithmetic operations increase, memory accesses, α values across different settings (Table 3). Multiple angles measured.",
    199           "source": "haiku"
    200         },
    201         "human_evaluation": {
    202           "applies": false,
    203           "answer": false,
    204           "justification": "Systems/efficiency paper measuring machine performance. Human evaluation not applicable.",
    205           "source": "haiku"
    206         },
    207         "held_out_test_set": {
    208           "applies": true,
    209           "answer": true,
    210           "justification": "Uses standard test sets from benchmarks: WMT test set for translation, CNN/DM test set for summarization. Already separated from training data.",
    211           "source": "haiku"
    212         },
    213         "per_category_breakdown": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "Results broken down by: task type (translation, summarization, dialog), temperature (0 vs 1), approximation model size, and model family (T5, LaMDA, GPT-like). Table 2 and Table 3 provide detailed breakdowns.",
    217           "source": "haiku"
    218         },
    219         "failure_cases_discussed": {
    220           "applies": true,
    221           "answer": true,
    222           "justification": "The paper explicitly discusses when the method fails: 'not helpful for configurations where additional computation resources are not available.' Trade-off between speedup and increased operations discussed.",
    223           "source": "haiku"
    224         },
    225         "negative_results_reported": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "Speedup decreases with larger approximation models (T5-large: 1.7X vs T5-small: 3.4X). Trade-off showing increased arithmetic operations (1.2-1.6X increase).",
    229           "source": "haiku"
    230         }
    231       },
    232       "setup_transparency": {
    233         "model_versions_specified": {
    234           "applies": true,
    235           "answer": true,
    236           "justification": "Model versions clearly specified: T5 version 1.1, LaMDA 137B/8B/2B/100M, GPT-like 97M. Parameter counts provided for all variants.",
    237           "source": "haiku"
    238         },
    239         "prompts_provided": {
    240           "applies": false,
    241           "answer": false,
    242           "justification": "Not a prompt-based paper. Tests inference speed on pre-trained models, not prompting. Not applicable.",
    243           "source": "haiku"
    244         },
    245         "hyperparameters_reported": {
    246           "applies": true,
    247           "answer": true,
    248           "justification": "Key hyperparameters specified: temperature (0 and 1), batch size (1), γ parameter values (varies by task), tokenizer (BERT 8k tokens). Top-40 filter for LaMDA noted.",
    249           "source": "haiku"
    250         },
    251         "scaffolding_described": {
    252           "applies": false,
    253           "answer": false,
    254           "justification": "No agentic scaffolding. Inference speed measurement, not agentic system. Not applicable.",
    255           "source": "haiku"
    256         },
    257         "data_preprocessing_documented": {
    258           "applies": true,
    259           "answer": false,
    260           "justification": "States tasks are 'finetuned on WMT EnDe' and 'CNN/DM' but preprocessing steps (tokenization details, data filtering, normalization) not documented.",
    261           "source": "haiku"
    262         }
    263       },
    264       "data_integrity": {
    265         "raw_data_available": {
    266           "applies": true,
    267           "answer": true,
    268           "justification": "Uses standard public benchmarks (WMT, CNN/DM, lm1b) and existing published model checkpoints. All raw data/models publicly available.",
    269           "source": "haiku"
    270         },
    271         "data_collection_described": {
    272           "applies": false,
    273           "answer": false,
    274           "justification": "Uses existing benchmark datasets, not collecting new data. Data collection procedures not applicable to this work type.",
    275           "source": "haiku"
    276         },
    277         "recruitment_methods_described": {
    278           "applies": false,
    279           "answer": false,
    280           "justification": "No human participants. Not applicable.",
    281           "source": "haiku"
    282         },
    283         "data_pipeline_documented": {
    284           "applies": true,
    285           "answer": true,
    286           "justification": "Pipeline reasonably clear: load pre-trained models, apply speculative decoding algorithm (Algorithm 1), measure wall-time on test data. Could be more detailed but is documented.",
    287           "source": "haiku"
    288         }
    289       },
    290       "contamination": {
    291         "training_cutoff_stated": {
    292           "applies": false,
    293           "answer": false,
    294           "justification": "Not evaluating model capabilities on benchmarks, but inference speed. Training cutoff not relevant. Not applicable.",
    295           "source": "haiku"
    296         },
    297         "train_test_overlap_discussed": {
    298           "applies": false,
    299           "answer": false,
    300           "justification": "Not evaluating model capabilities but inference algorithmic speed. Train-test overlap not a concern. Not applicable.",
    301           "source": "haiku"
    302         },
    303         "benchmark_contamination_addressed": {
    304           "applies": false,
    305           "answer": false,
    306           "justification": "Uses standard benchmarks that existed before model training. Not evaluating new model capabilities, so contamination risk absent. Not applicable.",
    307           "source": "haiku"
    308         }
    309       },
    310       "human_studies": {
    311         "pre_registered": {
    312           "applies": false,
    313           "answer": false,
    314           "justification": "No human participants. Not applicable.",
    315           "source": "haiku"
    316         },
    317         "irb_or_ethics_approval": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No human participants. Not applicable.",
    321           "source": "haiku"
    322         },
    323         "demographics_reported": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human participants. Not applicable.",
    327           "source": "haiku"
    328         },
    329         "inclusion_exclusion_criteria": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human participants. Not applicable.",
    333           "source": "haiku"
    334         },
    335         "randomization_described": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human participants. Not applicable.",
    339           "source": "haiku"
    340         },
    341         "blinding_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human participants. Not applicable.",
    345           "source": "haiku"
    346         },
    347         "attrition_reported": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human participants. Not applicable.",
    351           "source": "haiku"
    352         }
    353       },
    354       "cost_and_practicality": {
    355         "inference_cost_reported": {
    356           "applies": true,
    357           "answer": true,
    358           "justification": "Inference speedup reported (2-3X wall-time), arithmetic operations increase quantified (1.2-1.6X), memory accesses analyzed. Cost trade-offs thoroughly reported.",
    359           "source": "haiku"
    360         },
    361         "compute_budget_stated": {
    362           "applies": true,
    363           "answer": true,
    364           "justification": "Hardware specified (TPU-v4), batch size (1), model sizes specified. Could quantify total FLOPs/memory but hardware setup is clear.",
    365           "source": "haiku"
    366         }
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "Speculative decoding achieves 2-3X wall-time speedup on T5-XXL without changing output distribution",
    373       "evidence": "Table 2 shows 3.4X (temp=0) and 2.6X (temp=1) speedup on translation, 3.1X and 2.3X on summarization. Theorem 3.5 and Appendix A.1 prove output distribution equivalence.",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "Acceptance rate α can be computed from distribution divergence as α = 1 - D_LK(p, q)",
    378       "evidence": "Theorem 3.5 and Corollary 3.6 provide formal proof. Table 3 empirically validates α values across tasks and models.",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "Method works with any approximation model size and type without retraining target model",
    383       "evidence": "Section 4 tests T5-small/base/large, GPT-like 6M, LaMDA variants, unigram/bigram models. All work without target model modification.",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "Even trivial approximation models (bigrams) yield non-negligible speedup",
    388       "evidence": "Section 4.2 shows bigram model achieves α=0.2 for translation, yielding 1.25X speedup with negligible cost. Generalizes to any approximation model.",
    389       "supported": "strong"
    390     },
    391     {
    392       "claim": "Speedup depends on acceptance rate α and cost coefficient c, with optimal γ computable numerically",
    393       "evidence": "Theorem 3.8 provides expected speedup formula. Figure 3 shows optimal γ as function of α and c. Empirical results (Table 2) match theoretical predictions.",
    394       "supported": "strong"
    395     },
    396     {
    397       "claim": "Method trades off wall-time speedup for increased arithmetic operations and memory bandwidth requirements",
    398       "evidence": "Theorem 3.11 analyzes operation increase factor. Discussion (Section 6) explicitly states this trade-off. Appendix A.3 validates theoretical predictions against empirical runtimes.",
    399       "supported": "strong"
    400     }
    401   ],
    402   "methodology_tags": [
    403     "benchmark-eval",
    404     "theoretical"
    405   ],
    406   "key_findings": "Speculative decoding is a novel algorithm that accelerates autoregressive model inference by speculatively generating multiple token candidates using efficient approximation models in parallel, then verifying them with the large target model. The method achieves 2-3X wall-time speedup on T5-XXL without changing output distribution. Speedup is determined by the acceptance rate α (how well the approximation matches the target), which can be computed from distribution divergence. The method requires available compute resources and works best when memory bandwidth is the bottleneck; it trades wall-time improvements for increased arithmetic operations (1.2-1.6X increase).",
    407   "red_flags": [
    408     {
    409       "flag": "No error bars/confidence intervals",
    410       "detail": "Table 2 reports single-run measurements without variance estimates. No multiple runs or confidence bounds on speedup factors."
    411     },
    412     {
    413       "flag": "Code not released",
    414       "detail": "Algorithm provided as pseudocode but no source code, repository, or reproducibility package available for independent verification."
    415     },
    416     {
    417       "flag": "Sample size unjustified",
    418       "detail": "Acceptance rate α computed on 10K tokens (Section 4.2) without justification for why this sample size is sufficient."
    419     },
    420     {
    421       "flag": "I.I.D. assumption approximation",
    422       "detail": "Theoretical analysis assumes β values are i.i.d. (Equation 1), acknowledged in Appendix A.3 as 'being only an approximation' but impact not quantified."
    423     },
    424     {
    425       "flag": "Limited domain testing",
    426       "detail": "Section 6 states 'tested speculative decoding only in the text modality.' Generalization to images or other modalities unknown."
    427     },
    428     {
    429       "flag": "Funding not disclosed",
    430       "detail": "No explicit funding statement. Google Research affiliation is clear but source and any restrictions on the work not stated."
    431     }
    432   ],
    433   "cited_papers": [
    434     {
    435       "title": "Language models are few-shot learners",
    436       "relevance": "GPT-3 baseline model used for comparison; demonstrates scale of target models being accelerated."
    437     },
    438     {
    439       "title": "Exploring the limits of transfer learning with a unified text-to-text transformer",
    440       "relevance": "T5 model family is the primary testbed; establishes baseline models and fine-tuning approach."
    441     },
    442     {
    443       "title": "Scaling up models and data with T5X and SeqIO",
    444       "relevance": "T5X is the main baseline implementation compared against; critical for demonstrating practical speedup."
    445     },
    446     {
    447       "title": "LaMDA: Language Models for Dialog Applications",
    448       "relevance": "137B parameter model used to test speculative decoding at very large scale; dialog task evaluation."
    449     },
    450     {
    451       "title": "Blockwise Parallel Decoding for Deep Autoregressive Models",
    452       "relevance": "Prior speculative execution approach for decoding; directly compared, showing limitations of prior work (greedy-only, requires retraining)."
    453     },
    454     {
    455       "title": "Instantaneous Grammatical Error Correction with Shallow Aggressive Decoding",
    456       "relevance": "Prior speculative decoding work; compared to show generality advantage of this method."
    457     },
    458     {
    459       "title": "Distilling the knowledge in a neural network",
    460       "relevance": "Knowledge distillation as alternative acceleration method; discussed in related work."
    461     },
    462     {
    463       "title": "Dynamic Neural Networks: A Survey",
    464       "relevance": "Adaptive computation methods as alternative; contextualizes speculative decoding among efficiency approaches."
    465     }
    466   ],
    467   "engagement_factors": {
    468     "practical_relevance": {
    469       "score": 3,
    470       "justification": "Directly applicable to production inference systems; widely adopted (Chen et al. 2023 shows independent implementation). Solves real latency bottleneck."
    471     },
    472     "surprise_contrarian": {
    473       "score": 2,
    474       "justification": "Clever algorithmic contribution but builds on known speculative execution concepts from CPU architecture. The generalization to stochastic setting is novel but not shocking."
    475     },
    476     "fear_safety": {
    477       "score": 0,
    478       "justification": "Inference efficiency paper with no AI risk, safety, or alignment implications."
    479     },
    480     "drama_conflict": {
    481       "score": 0,
    482       "justification": "Technical contribution; no controversy, competing claims, or dramatic tension."
    483     },
    484     "demo_ability": {
    485       "score": 2,
    486       "justification": "Requires implementing algorithm and running large models on TPU hardware. Not trivial to reproduce but conceptually demonstrable with pseudocode."
    487     },
    488     "brand_recognition": {
    489       "score": 2,
    490       "justification": "Google Research affiliation provides credibility but not a famous lab (e.g., not DeepMind/OpenAI). Authors not independently famous."
    491     }
    492   },
    493   "hn_data": {
    494     "threads": [
    495       {
    496         "hn_id": "44830408",
    497         "title": "Flipper Zero dark web firmware bypasses rolling code security",
    498         "points": 486,
    499         "comments": 315,
    500         "url": "https://news.ycombinator.com/item?id=44830408",
    501         "created_at": "2025-08-07T21:10:42Z"
    502       },
    503       {
    504         "hn_id": "42217418",
    505         "title": "Samurai: Adapting Segment Anything Model for Zero-Shot Visual Tracking",
    506         "points": 55,
    507         "comments": 0,
    508         "url": "https://news.ycombinator.com/item?id=42217418",
    509         "created_at": "2024-11-22T21:14:30Z"
    510       },
    511       {
    512         "hn_id": "46099881",
    513         "title": "Training Foundation Models on a Full-Stack AMD Platform",
    514         "points": 26,
    515         "comments": 1,
    516         "url": "https://news.ycombinator.com/item?id=46099881",
    517         "created_at": "2025-11-30T20:02:36Z"
    518       },
    519       {
    520         "hn_id": "37387448",
    521         "title": "Fast Inference from Transformers via Speculative Decoding",
    522         "points": 2,
    523         "comments": 2,
    524         "url": "https://news.ycombinator.com/item?id=37387448",
    525         "created_at": "2023-09-05T03:17:05Z"
    526       },
    527       {
    528         "hn_id": "46071379",
    529         "title": "Training Foundation Models on a Full-Stack AMD Platform",
    530         "points": 2,
    531         "comments": 0,
    532         "url": "https://news.ycombinator.com/item?id=46071379",
    533         "created_at": "2025-11-27T17:28:29Z"
    534       }
    535     ],
    536     "top_points": 486,
    537     "total_points": 571,
    538     "total_comments": 318
    539   }
    540 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs