scan.json (18573B)
1 { 2 "paper": { 3 "title": "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding", 4 "authors": ["Jacob Devlin", "Ming-Wei Chang", "Kenton Lee", "Kristina Toutanova"], 5 "year": 2018, 6 "venue": "NAACL", 7 "arxiv_id": "1810.04805" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": true, 14 "justification": "GitHub repository URL provided: https://github.com/google-research/bert (Section 1)." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "All evaluation datasets are publicly available standard benchmarks (GLUE, SQuAD, SWAG, CoNLL-2003). Pre-training data uses publicly available BooksCorpus and English Wikipedia." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided. The paper mentions Cloud TPUs but does not specify software versions or dependencies beyond the tensor2tensor library reference." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "No step-by-step reproduction instructions are provided in the paper. Hyperparameters are listed but there is no README-style guide for replicating experiments." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "Results are reported as point estimates without confidence intervals or error bars in any table." 37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "No statistical significance tests are reported. Claims of improvement are based solely on comparing point estimates across systems." 
42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "Absolute improvement figures are reported with baseline context throughout (e.g., 'GLUE score to 80.5% (7.7% point absolute improvement)', 'SQuAD v2.0 Test F1 to 83.1 (5.1 point absolute improvement)')." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "No justification for the number of random restarts (5) used for dev set evaluation, or discussion of whether this is sufficient." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "Table 6 reports 'average Dev Set accuracy from 5 random restarts' but no standard deviation or spread measure is provided." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "Multiple baselines are compared: OpenAI GPT, ELMo, BiLSTM+ELMo+Attn, and prior SOTA systems across all tasks (Tables 1-4, 7)." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "Baselines include OpenAI GPT (2018) and ELMo (2018), which were the most recent and competitive systems at the time of publication." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": true, 73 "justification": "Section 5 presents ablation studies: effect of pre-training tasks (Table 5), effect of model size (Table 6), feature-based vs fine-tuning (Table 7), and additional ablations in Appendix C." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "Multiple metrics are used: accuracy, F1, EM (exact match), Spearman correlation (STS-B), Matthews correlation (CoLA), and LM perplexity." 79 }, 80 "human_evaluation": { 81 "applies": true, 82 "answer": false, 83 "justification": "No human evaluation of BERT's outputs is conducted. All evaluation is via automated benchmarks. 
Human performance baselines are cited from original benchmark papers but not conducted by the authors." 84 }, 85 "held_out_test_set": { 86 "applies": true, 87 "answer": true, 88 "justification": "GLUE test results are obtained from the evaluation server (Table 1). SQuAD test results are from the leaderboard. Dev and test sets are clearly separated." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Results are broken down per-task across all 11 NLP tasks (Tables 1-4, 7), not just aggregate scores." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": false, 98 "justification": "No qualitative error analysis or failure case discussion is provided. The paper does not examine where BERT performs poorly or what types of examples it gets wrong." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "Ablation studies report configurations that hurt performance: removing NSP hurts QNLI substantially (Table 5), LTR model performs much worse on MRPC and SQuAD, adding BiLSTM hurts GLUE tasks." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "All numerical claims in the abstract (GLUE 80.5%, MultiNLI 86.7%, SQuAD v1.1 F1 93.2, SQuAD v2.0 F1 83.1) are supported by results in Tables 1-3." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": true, 115 "justification": "The paper claims bidirectionality and NSP cause improvements. These are supported by controlled ablation studies (Section 5.1, Table 5) that isolate individual components while holding other variables constant." 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": true, 120 "justification": "Claims are bounded to the tested NLP tasks. The paper says 'eleven natural language processing tasks' rather than claiming general language understanding. Results are task-specific." 
121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": true, 125 "justification": "Section A.4 explicitly discusses confounding differences between BERT and GPT (training data size, special tokens, batch size, learning rate selection) and the ablation in Section 5.1 is designed to isolate the effect of bidirectionality from these factors." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": true, 132 "justification": "Exact model architectures are specified: BERTBASE (L=12, H=768, A=12, 110M params) and BERTLARGE (L=24, H=1024, A=16, 340M params). This is a new model, so version specification means architecture specification." 133 }, 134 "prompts_provided": { 135 "applies": false, 136 "answer": false, 137 "justification": "BERT uses fine-tuning, not prompting. No prompts are used in the experiments." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": true, 142 "justification": "Pre-training hyperparameters detailed in Section A.2 (batch size 256, 1M steps, Adam lr 1e-4, β1=0.9, β2=0.999, dropout 0.1, warmup 10k steps). Fine-tuning hyperparameters in Section A.3." 143 }, 144 "scaffolding_described": { 145 "applies": false, 146 "answer": false, 147 "justification": "No agentic scaffolding is used. BERT is a pre-train/fine-tune model, not an agent." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": true, 152 "justification": "Pre-training data preprocessing is documented: WordPiece tokenization with 30k vocabulary, 15% masking rate, 80/10/10 mask/random/keep strategy, sequence length 128 for 90% of steps then 512. Pre-training corpus described (BooksCorpus + Wikipedia text passages)." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": false, 159 "justification": "No dedicated limitations or threats-to-validity section exists in the paper." 
160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": false, 164 "justification": "No specific threats to validity are discussed." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": false, 169 "justification": "No explicit scope boundaries are stated. The paper does not discuss what the results do NOT show or which settings were not tested." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": true, 176 "justification": "All evaluation benchmarks (GLUE, SQuAD, SWAG, CoNLL-2003) are publicly available. Pre-training data (BooksCorpus, Wikipedia) is publicly accessible. Pre-trained models released on GitHub." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "Pre-training data collection described: BooksCorpus (800M words) and English Wikipedia (2,500M words), extracting only text passages, ignoring lists/tables/headers (Section 3.1)." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants. All data sources are standard public benchmarks and corpora." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": true, 191 "justification": "The data pipeline is documented: corpus selection → WordPiece tokenization → sentence pair sampling → masking procedure → training. Section A.2 provides details." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": false, 198 "justification": "No funding or acknowledgments section is present. All authors are from Google AI Language but no explicit funding disclosure." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Author affiliations clearly listed as 'Google AI Language' with email addresses." 
204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": false, 208 "justification": "Google funds the research and has commercial interest in demonstrating strong NLP capabilities. The funder is not independent of the outcome." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests or financial interests statement is present in the paper." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": true, 219 "answer": false, 220 "justification": "No explicit training data cutoff date is stated. The pre-training corpus (BooksCorpus + Wikipedia) is described but no temporal boundary is given." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": true, 224 "answer": false, 225 "justification": "No discussion of whether any benchmark examples appeared in the pre-training corpus (Wikipedia + BooksCorpus). Some GLUE tasks draw from similar web sources." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": true, 229 "answer": false, 230 "justification": "No contamination analysis is performed. Several benchmarks (e.g., SQuAD, which uses Wikipedia passages) share a data source with the pre-training corpus, and this overlap is not discussed." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants in the study." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants in the study." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in the study." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in the study." 
253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in the study." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in the study." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in the study." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": false, 274 "justification": "Fine-tuning time is mentioned ('at most 1 hour on a single Cloud TPU') but inference cost/latency is not reported." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": true, 279 "justification": "Pre-training compute is stated: BERTBASE on 4 Cloud TPUs (16 chips) for 4 days, BERTLARGE on 16 Cloud TPUs (64 chips) for 4 days (Section A.2). Fine-tuning time also noted." 280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "BERT achieves new state-of-the-art on 11 NLP tasks, pushing GLUE score to 80.5% (7.7% absolute improvement over prior SOTA).", 286 "evidence": "Table 1 shows BERTLARGE achieves 82.1 average on GLUE tasks vs. 75.1 for OpenAI GPT. 
Official GLUE leaderboard score of 80.5 vs. 72.8 for OpenAI GPT, which is the source of the 7.7-point improvement (Section 4.1).", 287 "supported": "strong" 288 }, 289 { 290 "claim": "Bidirectional pre-training is critical: removing it (LTR & No NSP) causes large performance drops, especially on SQuAD (88.5 → 77.8 F1).", 291 "evidence": "Table 5 ablation study shows LTR & No NSP variant drops substantially on all tasks compared to full BERTBASE.", 292 "supported": "strong" 293 }, 294 { 295 "claim": "Next Sentence Prediction pre-training task is beneficial for QA and NLI tasks.", 296 "evidence": "Table 5 shows removing NSP hurts QNLI (88.4 → 84.9), MNLI (84.4 → 83.9), and SQuAD (88.5 → 87.9).", 297 "supported": "strong" 298 }, 299 { 300 "claim": "Scaling to extreme model sizes leads to large improvements even on very small-scale tasks.", 301 "evidence": "Table 6 shows monotonic improvement across model sizes from 3 to 24 layers on MNLI, MRPC, and SST-2, including MRPC with only 3,600 training examples.", 302 "supported": "strong" 303 }, 304 { 305 "claim": "BERT's feature-based approach (concatenating last 4 hidden layers) achieves 96.1 F1 on CoNLL-2003 NER, only 0.3 F1 behind fine-tuning.", 306 "evidence": "Table 7 shows feature-based Concat Last Four Hidden at 96.1 Dev F1 vs. BERTBASE fine-tuning at 96.4.", 307 "supported": "strong" 308 } 309 ], 310 "methodology_tags": ["benchmark-eval"], 311 "key_findings": "BERT introduces bidirectional pre-training via masked language modeling and next sentence prediction, achieving state-of-the-art results on 11 NLP tasks including GLUE (80.5%), SQuAD v1.1 (93.2 F1), and SQuAD v2.0 (83.1 F1). Ablation studies demonstrate that bidirectionality is the primary driver of improvement over unidirectional models like OpenAI GPT. 
The paper also shows that scaling model size yields consistent gains even on small datasets, and that BERT is effective in both fine-tuning and feature-based settings.", 312 "red_flags": [ 313 { 314 "flag": "No uncertainty quantification", 315 "detail": "Despite reporting averages over 5 random restarts (Table 6), no standard deviations, confidence intervals, or significance tests are reported anywhere in the paper. All comparisons are based on point estimates." 316 }, 317 { 318 "flag": "Potential train/test contamination", 319 "detail": "BERT is pre-trained on Wikipedia, and SQuAD passages come from Wikipedia. The paper does not discuss whether SQuAD test passages appeared in the pre-training corpus." 320 }, 321 { 322 "flag": "Company evaluating own product", 323 "detail": "Google AI Language researchers evaluate a Google-developed model. While this is standard practice, no conflict of interest statement is provided." 324 }, 325 { 326 "flag": "No limitations section", 327 "detail": "The paper contains no discussion of limitations, threats to validity, or scope boundaries, which is a significant omission for a paper making broad claims about language understanding." 328 } 329 ], 330 "cited_papers": [ 331 { 332 "title": "Attention is All You Need", 333 "authors": ["Ashish Vaswani", "Noam Shazeer", "Niki Parmar", "Jakob Uszkoreit", "Llion Jones", "Aidan N Gomez", "Lukasz Kaiser", "Illia Polosukhin"], 334 "year": 2017, 335 "relevance": "Foundational Transformer architecture that BERT builds upon; critical for understanding all subsequent LLM work." 336 }, 337 { 338 "title": "Improving Language Understanding with Unsupervised Learning", 339 "authors": ["Alec Radford", "Karthik Narasimhan", "Tim Salimans", "Ilya Sutskever"], 340 "year": 2018, 341 "relevance": "OpenAI GPT, the primary baseline and direct predecessor to BERT's fine-tuning approach for NLP." 
342 }, 343 { 344 "title": "Deep contextualized word representations", 345 "authors": ["Matthew Peters", "Mark Neumann", "Mohit Iyyer", "Matt Gardner", "Christopher Clark", "Kenton Lee", "Luke Zettlemoyer"], 346 "year": 2018, 347 "relevance": "ELMo, the key feature-based baseline representing an alternative pre-training paradigm to BERT." 348 }, 349 { 350 "title": "GLUE: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding", 351 "authors": ["Alex Wang", "Amanpreet Singh", "Julian Michael", "Felix Hill", "Omer Levy", "Samuel Bowman"], 352 "year": 2018, 353 "relevance": "Primary benchmark used for evaluating BERT; widely used for LLM evaluation methodology assessment." 354 }, 355 { 356 "title": "SQuAD: 100,000+ Questions for Machine Comprehension of Text", 357 "authors": ["Pranav Rajpurkar", "Jian Zhang", "Konstantin Lopyrev", "Percy Liang"], 358 "year": 2016, 359 "relevance": "Key question answering benchmark used to evaluate BERT; foundational dataset for LLM QA evaluation." 360 }, 361 { 362 "title": "Universal Language Model Fine-tuning for Text Classification", 363 "authors": ["Jeremy Howard", "Sebastian Ruder"], 364 "year": 2018, 365 "relevance": "ULMFiT, an important prior fine-tuning approach for transfer learning in NLP." 366 } 367 ] 368 }