calibration.json (17848B)
1 { 2 "paper_slug": "accelerating-large-language-2023", 3 "total_questions": 50, 4 "agreement_count": 49, 5 "disagreement_count": 1, 6 "agreement_rate": 0.98, 7 "disagreements": [ 8 { 9 "category": "statistical_methodology", 10 "question": "variance_reported", 11 "sonnet_applies": true, 12 "sonnet_answer": true, 13 "opus_applies": true, 14 "opus_answer": false, 15 "opus_justification": "Figure 1 shows standard deviation for mean sampling time, but this appears to be within-run variation across different sequences in a single experimental run, not variance across independent experimental runs or seeds. The paper never mentions repeating experiments across multiple runs or seeds. The schema asks for 'variance or standard deviation reported across experimental runs' and states 'If the paper reports single-run numbers only, NO.' The benchmark quality metrics (ROUGE-2, HumanEval pass rate) are reported as single point estimates with no variance at all.", 16 "sonnet_justification": "Figure 1 shows standard deviation for the mean sampling time metric. The paper notes 'The time taken per SpS/ArS loop has low variance' and the figure caption confirms 'with standard deviation' is displayed.", 17 "direction": "sonnet_generous" 18 } 19 ], 20 "opus_checklist": { 21 "artifacts": { 22 "code_released": { 23 "applies": true, 24 "answer": false, 25 "justification": "No GitHub link, Zenodo archive, or code repository URL is provided anywhere in the paper. The algorithm is described in pseudocode (Algorithms 1 and 2) but no source code is released." 26 }, 27 "data_released": { 28 "applies": true, 29 "answer": true, 30 "justification": "The paper uses publicly available benchmarks: XSum (Narayan et al., 2018) and HumanEval (Chen et al., 2021). 
Per schema, 'If the data is a standard public benchmark they didn't modify, YES.'" 31 }, 32 "environment_specified": { 33 "applies": true, 34 "answer": false, 35 "justification": "The paper mentions '16 TPU v4s' and 'Megatron style' sharding but provides no software environment specification (no requirements.txt, Dockerfile, conda environment, or library versions). Hardware alone is insufficient per the schema." 36 }, 37 "reproduction_instructions": { 38 "applies": true, 39 "answer": false, 40 "justification": "No step-by-step reproduction instructions, README, or runnable scripts are provided. Reproduction requires access to proprietary DeepMind infrastructure, Chinchilla weights, and a custom-trained draft model." 41 } 42 }, 43 "statistical_methodology": { 44 "confidence_intervals_or_error_bars": { 45 "applies": true, 46 "answer": true, 47 "justification": "Figure 1 (left panel) shows standard deviation bars for mean sampling time across different K values. The caption explicitly states 'with standard deviation.'" 48 }, 49 "significance_tests": { 50 "applies": true, 51 "answer": false, 52 "justification": "No statistical significance tests are used. Speedup claims and benchmark score comparisons (e.g., ROUGE-2: 0.112 vs 0.114, HumanEval: 45.1% vs 47.0%) are made without p-values, t-tests, or any formal testing." 53 }, 54 "effect_sizes_reported": { 55 "applies": true, 56 "answer": true, 57 "justification": "Speedup factors are clearly reported with baseline context: 1.92x, 2.01x, and 2.46x speedups relative to ArS at 14.1ms/token (Table 1). The magnitude of improvement is fully quantified." 58 }, 59 "sample_size_justified": { 60 "applies": true, 61 "answer": false, 62 "justification": "The paper states 11,305 sequences for XSum and 16,400 samples for HumanEval but provides no justification for these specific sizes and no power analysis." 
63 }, 64 "variance_reported": { 65 "applies": true, 66 "answer": false, 67 "justification": "Figure 1 shows standard deviation for mean sampling time, but this is within-run variation across different sequences in a single experimental run, not variance across independent experimental runs or seeds. The paper never mentions repeating experiments across multiple runs. The benchmark quality metrics (ROUGE-2, HumanEval pass rate) are reported as single point estimates with no spread measures." 68 } 69 }, 70 "evaluation_design": { 71 "baselines_included": { 72 "applies": true, 73 "answer": true, 74 "justification": "Standard auto-regressive sampling (ArS) is the baseline throughout, with direct comparisons in Table 1 for both XSum and HumanEval benchmarks." 75 }, 76 "baselines_contemporary": { 77 "applies": true, 78 "answer": true, 79 "justification": "Standard auto-regressive sampling is the natural and current baseline for decoding acceleration work. The paper also discusses concurrent work by Leviathan et al. (2022) in related work." 80 }, 81 "ablation_study": { 82 "applies": true, 83 "answer": true, 84 "justification": "Figure 1 provides a systematic exploration of the K (lookahead) hyperparameter from 0 to 7, showing how speedup, acceptance rate, and loop time vary. The paper also discusses why a standard 7B model fails as a draft model in distributed settings, comparing draft model choices." 85 }, 86 "multiple_metrics": { 87 "applies": true, 88 "answer": true, 89 "justification": "Table 1 reports both quality metrics (ROUGE-2 for XSum, pass rate for HumanEval) and latency metrics (ms/token, speedup factor). Figure 1 additionally reports acceptance rate and total loop time." 90 }, 91 "human_evaluation": { 92 "applies": false, 93 "answer": false, 94 "justification": "The paper's claims concern decoding speed and distributional equivalence, both objectively verifiable through timing measurements and benchmark metrics. Human evaluation is irrelevant to these claims." 
95 }, 96 "held_out_test_set": { 97 "applies": true, 98 "answer": true, 99 "justification": "XSum and HumanEval are standard test benchmarks with defined test splits. No model training or fine-tuning was performed on these benchmarks; they are used purely for evaluation." 100 }, 101 "per_category_breakdown": { 102 "applies": true, 103 "answer": true, 104 "justification": "Results are broken down by task (XSum vs HumanEval), by decoding method (nucleus vs greedy for XSum), and Figure 1 shows separate curves per domain across K values." 105 }, 106 "failure_cases_discussed": { 107 "applies": true, 108 "answer": true, 109 "justification": "The paper discusses several cases where the approach underperforms: XSum with nucleus at larger K values experiences latency regression (optimal at K=3), larger K increases variance, which is problematic for P90/P99 latencies, and a standard 7B model fails as an efficient draft in distributed settings." 110 }, 111 "negative_results_reported": { 112 "applies": true, 113 "answer": true, 114 "justification": "The paper reports that a chinchilla-optimal 7B model 'would provide only a modest speedup' and that serving 7B on 16 TPUs 'actually increases the latency.' The speedup plateauing/regression at larger K is also a negative finding." 115 } 116 }, 117 "claims_and_evidence": { 118 "abstract_claims_supported": { 119 "applies": true, 120 "answer": true, 121 "justification": "The abstract claims '2-2.5x decoding speedup' — Table 1 shows 1.92x-2.46x. 'Without compromising sample quality' — benchmark scores match within noise (ROUGE-2 0.112 vs 0.114, HumanEval 45.1% vs 47.0%). Both claims are well supported." 122 }, 123 "causal_claims_justified": { 124 "applies": true, 125 "answer": true, 126 "justification": "The paper's causal claim (SpS causes speedup) is supported by controlled single-variable comparisons (SpS vs ArS on identical model, hardware, and benchmarks). The distributional equivalence claim is formally proven (Theorem 1). 
The K ablation is a controlled manipulation." 127 }, 128 "generalization_bounded": { 129 "applies": true, 130 "answer": false, 131 "justification": "Results are only on Chinchilla 70B with a specific 4B draft model on 16 TPU v4s, but the title says 'Accelerating Large Language Model Decoding' and the conclusion says the method 'scales well with the appropriate draft model and complements many existing techniques' without bounding these claims to the tested setup." 132 }, 133 "alternative_explanations_discussed": { 134 "applies": true, 135 "answer": false, 136 "justification": "No alternative explanations for results are discussed. The hypothesis for HumanEval's higher speedup (code has common sub-sequences) is stated without testing or considering alternatives. No threats-to-validity section exists." 137 } 138 }, 139 "setup_transparency": { 140 "model_versions_specified": { 141 "applies": true, 142 "answer": false, 143 "justification": "'Chinchilla' is named without a specific version, checkpoint, or snapshot date. The draft model is described by architecture hyperparameters (Table 2) but has no version identifier. Per schema, naming a model without version/snapshot is NO." 144 }, 145 "prompts_provided": { 146 "applies": true, 147 "answer": false, 148 "justification": "The paper mentions '1-shot prompt' for XSum and '100-shot' for HumanEval but the actual prompt text is never provided. Per schema, describing prompts without providing actual text is NO." 149 }, 150 "hyperparameters_reported": { 151 "applies": true, 152 "answer": true, 153 "justification": "Table 2 provides draft model architecture hyperparameters (d_model=6144, heads=48, layers=8, params=4B). Table 1 specifies sampling parameters: nucleus p=0.8 for XSum, p=0.95 and temperature=0.8 for HumanEval. K=4 for main results." 154 }, 155 "scaffolding_described": { 156 "applies": false, 157 "answer": false, 158 "justification": "No agentic scaffolding is used. 
This is a standalone sampling algorithm for transformer decoding, not an agent-based system." 159 }, 160 "data_preprocessing_documented": { 161 "applies": true, 162 "answer": false, 163 "justification": "No data preprocessing steps are documented. For XSum, the 1-shot prompt construction is not described. For HumanEval, the 100-shot setup details are not provided. Maximum sequence lengths are stated (128, 512) but no preprocessing pipeline is described." 164 } 165 }, 166 "limitations_and_scope": { 167 "limitations_section_present": { 168 "applies": true, 169 "answer": false, 170 "justification": "No dedicated limitations or threats-to-validity section exists. The conclusion briefly mentions the method works 'in the small batch size setting' but this is a single clause, not substantive discussion." 171 }, 172 "threats_to_validity_specific": { 173 "applies": true, 174 "answer": false, 175 "justification": "No threats to validity are discussed. The paper does not address specific concerns such as dependence on draft model quality, hardware specificity, limited benchmark coverage, or batch size constraints." 176 }, 177 "scope_boundaries_stated": { 178 "applies": true, 179 "answer": false, 180 "justification": "The paper does not explicitly state what the results do not show. No statements about what populations/settings/hardware are excluded from the claims. The 'small batch size setting' mention in the conclusion is the only scope qualifier but is not developed." 181 } 182 }, 183 "data_integrity": { 184 "raw_data_available": { 185 "applies": true, 186 "answer": false, 187 "justification": "No raw data (timing logs, TPU profiles, per-sequence benchmark outputs) is made available. Only aggregated results (means, speedup ratios, benchmark scores) are reported." 
188 }, 189 "data_collection_described": { 190 "applies": true, 191 "answer": true, 192 "justification": "The paper describes timing data collection: 'The time taken per SpS/ArS loop has low variance, and we can measure it directly from TPU profiles. To obtain the average speedup, standard deviations and other metrics, we log the amount of tokens generated for each speculative loop.' Benchmark parameters (11,305 XSum sequences, 16,400 HumanEval samples) are stated." 193 }, 194 "recruitment_methods_described": { 195 "applies": false, 196 "answer": false, 197 "justification": "No human participants. This is a systems paper using standard benchmarks. The data source is standard public benchmarks (XSum, HumanEval)." 198 }, 199 "data_pipeline_documented": { 200 "applies": true, 201 "answer": false, 202 "justification": "The pipeline from benchmark inputs to final aggregated results is not fully documented. How benchmark data was prepared, how the 1-shot/100-shot prompts were constructed, and how raw timing data was aggregated into the reported metrics are not explained step by step." 203 } 204 }, 205 "conflicts_of_interest": { 206 "funding_disclosed": { 207 "applies": true, 208 "answer": false, 209 "justification": "No funding sources are disclosed. The acknowledgments section thanks DeepMind colleagues but lists no grants, sponsors, or funding agencies. The work was clearly funded by DeepMind/Alphabet but this is not stated." 210 }, 211 "affiliations_disclosed": { 212 "applies": true, 213 "answer": true, 214 "justification": "All six authors are clearly identified as being from DeepMind ('All authors from DeepMind'). They evaluate Chinchilla, a DeepMind model, and the institutional affiliation is transparent." 215 }, 216 "funder_independent_of_outcome": { 217 "applies": true, 218 "answer": false, 219 "justification": "DeepMind employs all authors and owns Chinchilla. DeepMind/Alphabet has a direct commercial interest in demonstrating faster inference for its models. 
The funder is not independent of the outcome." 220 }, 221 "financial_interests_declared": { 222 "applies": true, 223 "answer": false, 224 "justification": "No competing interests or financial interests statement appears in the paper. Per schema, absence of disclosure is not the same as absence of conflict." 225 } 226 }, 227 "contamination": { 228 "training_cutoff_stated": { 229 "applies": true, 230 "answer": false, 231 "justification": "The training data cutoff for Chinchilla is not stated. While the paper's primary claim is about decoding speed rather than model capability, it does report HumanEval and XSum benchmark scores, and the training cutoff is relevant for assessing whether these scores are meaningful for the distributional equivalence verification." 232 }, 233 "train_test_overlap_discussed": { 234 "applies": true, 235 "answer": false, 236 "justification": "No discussion of whether HumanEval or XSum examples appeared in Chinchilla's training data. Both benchmarks predate Chinchilla's training and could plausibly be contaminated." 237 }, 238 "benchmark_contamination_addressed": { 239 "applies": true, 240 "answer": false, 241 "justification": "HumanEval (published 2021) and XSum (published 2018) were both available before Chinchilla's training. The paper does not address contamination risk, though contamination would affect both ArS and SpS equally, making it less critical for the speedup claim." 242 } 243 }, 244 "human_studies": { 245 "pre_registered": { 246 "applies": false, 247 "answer": false, 248 "justification": "No human participants. This is a systems paper evaluating a decoding algorithm on standard benchmarks." 249 }, 250 "irb_or_ethics_approval": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants. This is a systems paper evaluating a decoding algorithm on standard benchmarks." 254 }, 255 "demographics_reported": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants. 
This is a systems paper evaluating a decoding algorithm on standard benchmarks." 259 }, 260 "inclusion_exclusion_criteria": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants. This is a systems paper evaluating a decoding algorithm on standard benchmarks." 264 }, 265 "randomization_described": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants. This is a systems paper evaluating a decoding algorithm on standard benchmarks." 269 }, 270 "blinding_described": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants. This is a systems paper evaluating a decoding algorithm on standard benchmarks." 274 }, 275 "attrition_reported": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants. This is a systems paper evaluating a decoding algorithm on standard benchmarks." 279 } 280 }, 281 "cost_and_practicality": { 282 "inference_cost_reported": { 283 "applies": true, 284 "answer": true, 285 "justification": "Inference latency is the central metric of the paper. Table 1 reports mean token time in ms/token for both ArS and SpS. Figure 1 shows mean sampling time for 128 tokens across different K values." 286 }, 287 "compute_budget_stated": { 288 "applies": true, 289 "answer": false, 290 "justification": "The paper states the draft model was trained on '16 TPU v4s' but does not specify total training time, TPU hours, FLOPs, or total compute cost. The computational budget for training the draft model or running the experiments is not quantified." 291 } 292 } 293 } 294 }