scan-v5.json (26203B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "LLaMA: Open and Efficient Foundation Language Models", 6 "authors": [ 7 "Hugo Touvron", 8 "Thibaut Lavril", 9 "Gautier Izacard", 10 "Xavier Martinet", 11 "Marie-Anne Lachaux", 12 "Timothee Lacroix", 13 "Baptiste Rozière", 14 "Naman Goyal", 15 "Eric Hambro", 16 "Faisal Azhar", 17 "Aurelien Rodriguez", 18 "Armand Joulin", 19 "Edouard Grave", 20 "Guillaume Lample" 21 ], 22 "year": 2023, 23 "venue": "arXiv.org", 24 "arxiv_id": "2302.13971", 25 "doi": null 26 }, 27 "checklist": { 28 "claims_and_evidence": { 29 "abstract_claims_supported": { 30 "applies": true, 31 "answer": true, 32 "justification": "All abstract claims are substantiated: LLaMA-13B outperforming GPT-3 on most benchmarks is shown in Tables 3–9, and LLaMA-65B's competitiveness with Chinchilla-70B and PaLM-540B is demonstrated across common sense reasoning, QA, and MMLU.", 33 "source": "haiku" 34 }, 35 "causal_claims_justified": { 36 "applies": true, 37 "answer": false, 38 "justification": "The paper adopts architectural changes (RMSNorm, SwiGLU, RoPE) and attributes improved training stability and performance to them, but provides no ablation studies to isolate the contribution of each change.", 39 "source": "haiku" 40 }, 41 "generalization_bounded": { 42 "applies": true, 43 "answer": false, 44 "justification": "The abstract claims 'state-of-the-art models' can be trained on public data, but LLaMA-65B underperforms Chinchilla-70B and PaLM-540B on MMLU (63.4 vs 67.5/69.3), which is not acknowledged in the abstract's framing.", 45 "source": "haiku" 46 }, 47 "alternative_explanations_discussed": { 48 "applies": true, 49 "answer": false, 50 "justification": "The paper does not discuss alternative explanations for LLaMA's performance gains, such as whether better data curation alone (rather than more training tokens) explains results; MMLU underperformance is attributed to fewer books/academic papers but no competing hypotheses are 
considered.", 51 "source": "haiku" 52 }, 53 "proxy_outcome_distinction": { 54 "applies": true, 55 "answer": true, 56 "justification": "The paper consistently frames claims in terms of benchmark accuracy scores and does not conflate benchmark performance with broader notions of general intelligence or real-world capability.", 57 "source": "haiku" 58 } 59 }, 60 "limitations_and_scope": { 61 "limitations_section_present": { 62 "applies": true, 63 "answer": false, 64 "justification": "There is no dedicated limitations or threats-to-validity section. Section 5 covers bias and toxicity but this is an evaluation of model behavior rather than a methodological limitations discussion.", 65 "source": "haiku" 66 }, 67 "threats_to_validity_specific": { 68 "applies": true, 69 "answer": false, 70 "justification": "The paper notes that TriviaQA uses a different split than GPT-3/PaLM and that SIQA benchmark may be unreliable due to high variance, but there is no systematic discussion of threats to the validity of comparative claims.", 71 "source": "haiku" 72 }, 73 "scope_boundaries_stated": { 74 "applies": true, 75 "answer": false, 76 "justification": "The paper mentions instruction finetuning is 'beyond the scope of this paper' and future larger models are planned, but does not explicitly state what the presented results do NOT show or where the findings should not be applied.", 77 "source": "haiku" 78 } 79 }, 80 "conflicts_of_interest": { 81 "funding_disclosed": { 82 "applies": true, 83 "answer": false, 84 "justification": "No funding disclosure statement is present in the paper; the work is identified as from Meta AI by affiliation only.", 85 "source": "haiku" 86 }, 87 "affiliations_disclosed": { 88 "applies": true, 89 "answer": true, 90 "justification": "All authors are listed as affiliated with Meta AI, which is clearly stated on the paper's first page.", 91 "source": "haiku" 92 }, 93 "funder_independent_of_outcome": { 94 "applies": true, 95 "answer": false, 96 
"justification": "Meta AI employees are evaluating and promoting models they themselves built; the funder and developers are identical, creating an inherent conflict of interest.", 97 "source": "haiku" 98 }, 99 "financial_interests_declared": { 100 "applies": true, 101 "answer": false, 102 "justification": "There is no competing interests or financial interests declaration anywhere in the paper.", 103 "source": "haiku" 104 } 105 }, 106 "scope_and_framing": { 107 "key_terms_defined": { 108 "applies": true, 109 "answer": false, 110 "justification": "Core terms like 'foundation language models', 'state-of-the-art', and 'competitive' are used throughout without formal definition; the boundary for what counts as competitive performance is not made explicit.", 111 "source": "haiku" 112 }, 113 "intended_contribution_clear": { 114 "applies": true, 115 "answer": true, 116 "justification": "The contribution is explicitly stated: training competitive LLMs using only publicly available data and releasing them to the research community to democratize access to large language models.", 117 "source": "haiku" 118 }, 119 "engagement_with_prior_work": { 120 "applies": true, 121 "answer": true, 122 "justification": "Section 7 provides substantive engagement with prior LLM work covering architecture history, scaling laws, and concurrent open models; throughout the paper results are directly positioned against GPT-3, Chinchilla, PaLM, OPT, and BLOOM.", 123 "source": "haiku" 124 } 125 } 126 }, 127 "type_checklist": { 128 "empirical": { 129 "artifacts": { 130 "code_released": { 131 "applies": true, 132 "answer": true, 133 "justification": "Source code is released at https://github.com/facebookresearch/llama, referenced in the paper's first footnote.", 134 "source": "haiku" 135 }, 136 "data_released": { 137 "applies": true, 138 "answer": true, 139 "justification": "Training data consists entirely of publicly available datasets (CommonCrawl, C4, GitHub, Wikipedia, Gutenberg, Books3, ArXiv, 
StackExchange); all evaluation benchmarks are standard public benchmarks.", 140 "source": "haiku" 141 }, 142 "environment_specified": { 143 "applies": true, 144 "answer": false, 145 "justification": "The paper describes hardware (2048 A100-80GB GPUs) and references the xformers library but provides no requirements file, Dockerfile, or equivalent environment specification.", 146 "source": "haiku" 147 }, 148 "reproduction_instructions": { 149 "applies": true, 150 "answer": false, 151 "justification": "No step-by-step reproduction instructions are provided in the paper; reproduction would require inferring details from the code repository.", 152 "source": "haiku" 153 } 154 }, 155 "statistical_methodology": { 156 "confidence_intervals_or_error_bars": { 157 "applies": true, 158 "answer": false, 159 "justification": "No confidence intervals or error bars are reported for any benchmark results in Tables 3–14; all results are single point estimates.", 160 "source": "haiku" 161 }, 162 "significance_tests": { 163 "applies": true, 164 "answer": false, 165 "justification": "No statistical significance tests are performed for any comparative claims; performance differences are reported as raw accuracy values without significance testing.", 166 "source": "haiku" 167 }, 168 "effect_sizes_reported": { 169 "applies": true, 170 "answer": true, 171 "justification": "Absolute performance values and relative comparisons (e.g., '13B outperforms GPT-3 on most benchmarks despite being 10x smaller', specific accuracy gaps) are reported throughout the tables.", 172 "source": "haiku" 173 }, 174 "sample_size_justified": { 175 "applies": true, 176 "answer": false, 177 "justification": "The paper relies on established benchmarks without discussing sample sizes, statistical power, or whether the benchmarks are large enough to support the comparative claims made.", 178 "source": "haiku" 179 }, 180 "variance_reported": { 181 "applies": true, 182 "answer": false, 183 "justification": "No variance, 
standard deviation, or run-to-run variability is reported for any result; all figures are single-run point estimates.", 184 "source": "haiku" 185 } 186 }, 187 "evaluation_design": { 188 "baselines_included": { 189 "applies": true, 190 "answer": true, 191 "justification": "Comprehensive baselines included: GPT-3 (175B), Gopher (280B), Chinchilla (70B), PaLM (8B, 62B, 540B), OPT, GPT-J, and GPT-NeoX across all evaluation sections.", 192 "source": "haiku" 193 }, 194 "baselines_contemporary": { 195 "applies": true, 196 "answer": true, 197 "justification": "All baselines (Chinchilla, PaLM, GPT-3, Gopher) are the leading contemporary models from 2021–2022, directly competitive at the time of this work.", 198 "source": "haiku" 199 }, 200 "ablation_study": { 201 "applies": true, 202 "answer": false, 203 "justification": "No ablation studies are conducted; the paper adopts multiple architectural changes (RMSNorm, SwiGLU, RoPE) from prior work without ablating their individual contributions.", 204 "source": "haiku" 205 }, 206 "multiple_metrics": { 207 "applies": true, 208 "answer": true, 209 "justification": "Evaluation spans 20 benchmarks across seven domains: common sense reasoning (8 tasks), closed-book QA, reading comprehension, math reasoning (MATH, GSM8k), code generation (HumanEval, MBPP), MMLU, and safety benchmarks.", 210 "source": "haiku" 211 }, 212 "human_evaluation": { 213 "applies": false, 214 "answer": false, 215 "justification": "No human evaluation of model outputs is conducted; all evaluation uses automated benchmark scoring.", 216 "source": "haiku" 217 }, 218 "held_out_test_set": { 219 "applies": true, 220 "answer": true, 221 "justification": "All evaluation uses held-out test/dev sets of established benchmarks (NaturalQuestions test split, TriviaQA filtered dev set, MMLU test sets, etc.).", 222 "source": "haiku" 223 }, 224 "per_category_breakdown": { 225 "applies": true, 226 "answer": true, 227 "justification": "MMLU results are broken down by domain 
(Humanities, STEM, Social Sciences, Other) in Table 9 and per-task in Table 16; code results are broken down by benchmark and metric (pass@1, pass@100).", 228 "source": "haiku" 229 }, 230 "failure_cases_discussed": { 231 "applies": true, 232 "answer": true, 233 "justification": "The paper identifies where LLaMA underperforms: behind Chinchilla/PaLM on MMLU (attributed to fewer books in training), lower BoolQ scores, high variance on SIQA and WinoGrande scores that do not track training perplexity, and high hallucination rates on TruthfulQA.", 234 "source": "haiku" 235 }, 236 "negative_results_reported": { 237 "applies": true, 238 "answer": true, 239 "justification": "Negative results are explicitly reported: LLaMA-65B lags Chinchilla-70B and PaLM-540B on MMLU by several points, TruthfulQA correctness rates are low (0.57 max), and SIQA is flagged as potentially unreliable.", 240 "source": "haiku" 241 } 242 }, 243 "setup_transparency": { 244 "model_versions_specified": { 245 "applies": true, 246 "answer": true, 247 "justification": "Table 2 provides exact specifications for all four model sizes: parameter count, hidden dimension, number of heads, layers, learning rate, batch size, and training token count.", 248 "source": "haiku" 249 }, 250 "prompts_provided": { 251 "applies": true, 252 "answer": true, 253 "justification": "Appendix A shows formatted evaluation examples for NaturalQuestions and TriviaQA with the prepended instruction string; evaluation protocol details are provided for each benchmark.", 254 "source": "haiku" 255 }, 256 "hyperparameters_reported": { 257 "applies": true, 258 "answer": true, 259 "justification": "Section 2.3 and Table 2 report optimizer (AdamW), β1=0.9, β2=0.95, weight decay=0.1, gradient clipping=1.0, cosine LR schedule, warmup steps=2000, and per-model learning rates and batch sizes.", 260 "source": "haiku" 261 }, 262 "scaffolding_described": { 263 "applies": false, 264 "answer": false, 265 "justification": "This is a base LLM pretraining paper with no agentic scaffolding or tool use.", 
266 "source": "haiku" 267 }, 268 "data_preprocessing_documented": { 269 "applies": true, 270 "answer": true, 271 "justification": "Section 2.1 provides detailed preprocessing documentation for each data source, including deduplication methods, quality filtering heuristics, language identification, tokenizer choice (BPE via SentencePiece), and epoch counts per source.", 272 "source": "haiku" 273 } 274 }, 275 "data_integrity": { 276 "raw_data_available": { 277 "applies": true, 278 "answer": false, 279 "justification": "The preprocessed training data mixture is not released; only the constituent public sources are identified. The specific preprocessed dataset used to train LLaMA cannot be independently retrieved.", 280 "source": "haiku" 281 }, 282 "data_collection_described": { 283 "applies": true, 284 "answer": true, 285 "justification": "Section 2.1 describes data collection and processing for all seven data sources with specific pipelines (CCNet, C4 preprocessing, GitHub license filtering, Wikipedia dumps, BPE tokenization).", 286 "source": "haiku" 287 }, 288 "recruitment_methods_described": { 289 "applies": false, 290 "answer": false, 291 "justification": "No human participants are recruited; this is a pretraining study using web-crawled and curated text data.", 292 "source": "haiku" 293 }, 294 "data_pipeline_documented": { 295 "applies": true, 296 "answer": true, 297 "justification": "The full data pipeline from raw source to tokenized training data is documented, including sampling proportions (Table 1), epoch counts, disk sizes, and preprocessing steps for each source.", 298 "source": "haiku" 299 } 300 }, 301 "contamination": { 302 "training_cutoff_stated": { 303 "applies": true, 304 "answer": false, 305 "justification": "The paper specifies Wikipedia dumps from June–August 2022 and CommonCrawl dumps from 2017–2020, but no single explicit training data cutoff date is stated for the combined corpus.", 306 "source": "haiku" 307 }, 308 
"train_test_overlap_discussed": { 309 "applies": true, 310 "answer": false, 311 "justification": "No analysis of potential overlap between training data (including 2017–2020 CommonCrawl dumps, ArXiv, StackExchange) and evaluation benchmarks is performed or discussed.", 312 "source": "haiku" 313 }, 314 "benchmark_contamination_addressed": { 315 "applies": true, 316 "answer": false, 317 "justification": "Many evaluation benchmarks (BoolQ, HellaSwag, MMLU, NaturalQuestions, TriviaQA) were publicly available well before the training data collection window; no decontamination or overlap analysis is reported.", 318 "source": "haiku" 319 } 320 }, 321 "human_studies": { 322 "pre_registered": { 323 "applies": false, 324 "answer": false, 325 "justification": "No human participants.", 326 "source": "haiku" 327 }, 328 "irb_or_ethics_approval": { 329 "applies": false, 330 "answer": false, 331 "justification": "No human participants.", 332 "source": "haiku" 333 }, 334 "demographics_reported": { 335 "applies": false, 336 "answer": false, 337 "justification": "No human participants.", 338 "source": "haiku" 339 }, 340 "inclusion_exclusion_criteria": { 341 "applies": false, 342 "answer": false, 343 "justification": "No human participants.", 344 "source": "haiku" 345 }, 346 "randomization_described": { 347 "applies": false, 348 "answer": false, 349 "justification": "No human participants.", 350 "source": "haiku" 351 }, 352 "blinding_described": { 353 "applies": false, 354 "answer": false, 355 "justification": "No human participants.", 356 "source": "haiku" 357 }, 358 "attrition_reported": { 359 "applies": false, 360 "answer": false, 361 "justification": "No human participants.", 362 "source": "haiku" 363 } 364 }, 365 "cost_and_practicality": { 366 "inference_cost_reported": { 367 "applies": true, 368 "answer": false, 369 "justification": "The paper qualitatively notes that LLaMA-13B runs on a single V100 and smaller models run on a single GPU, but provides no quantitative latency or inference 
cost measurements.", 370 "source": "haiku" 371 }, 372 "compute_budget_stated": { 373 "applies": true, 374 "answer": true, 375 "justification": "Section 6 and Table 15 explicitly state training compute: 2048 A100-80GB GPUs for approximately 5 months, with per-model GPU-hours (e.g., 1,022,362 GPU-hours for 65B), total power (449 MWh), and carbon emissions (173 tCO2eq).", 376 "source": "haiku" 377 } 378 } 379 } 380 }, 381 "claims": [ 382 { 383 "claim": "LLaMA-13B outperforms GPT-3 (175B) on most benchmarks despite being more than 10x smaller in parameter count.", 384 "evidence": "Tables 3–9 show LLaMA-13B exceeding GPT-3 on BoolQ, PIQA, HellaSwag, WinoGrande, ARC, NaturalQuestions, TriviaQA, RACE, and MMLU (46.9 vs 43.9).", 385 "supported": "strong" 386 }, 387 { 388 "claim": "LLaMA-65B is competitive with Chinchilla-70B and PaLM-540B.", 389 "evidence": "LLaMA-65B outperforms Chinchilla-70B on most common sense reasoning benchmarks (Table 3) and matches PaLM-540B on several tasks, though it lags on MMLU (63.4 vs 67.5/69.3).", 390 "supported": "moderate" 391 }, 392 { 393 "claim": "State-of-the-art language model performance can be achieved using only publicly available data.", 394 "evidence": "All training data sources (CommonCrawl, C4, GitHub, Wikipedia, etc.) 
are publicly available, and the resulting models match or exceed proprietary-data models on most benchmarks.", 395 "supported": "moderate" 396 }, 397 { 398 "claim": "Training smaller models on more tokens yields better inference efficiency than training larger models to compute-optimality.", 399 "evidence": "The 7B model continues to improve after 1T tokens (beyond Chinchilla's recommended stopping point), and the 13B model achieves GPT-3 quality at far lower inference cost.", 400 "supported": "moderate" 401 }, 402 { 403 "claim": "Toxicity in generated text increases with model size within the LLaMA family.", 404 "evidence": "Table 11 shows RealToxicityPrompts scores increasing from 7B (0.106) to 65B (0.128) on basic prompts and more sharply on respectful prompts (0.081 to 0.141).", 405 "supported": "moderate" 406 }, 407 { 408 "claim": "LLaMA-65B exhibits gender bias in coreference resolution, performing worse on 'gotcha' cases where pronouns don't match occupational stereotypes.", 409 "evidence": "Table 13 shows WinoGender gotcha accuracy drops to 75.0% (her/her/she) and 63.3% (his/him/he) versus overall 77.5%, demonstrating the model relies on occupational gender stereotypes.", 410 "supported": "strong" 411 } 412 ], 413 "methodology_tags": [ 414 "benchmark-eval", 415 "empirical" 416 ], 417 "key_findings": "LLaMA demonstrates that competitive foundation language models can be trained using only publicly available data, with LLaMA-13B outperforming GPT-3 (175B) across most benchmarks at 10x fewer parameters. The work extends Chinchilla scaling insights to the inference-budget regime, showing continued improvement from training smaller models on more tokens. Safety evaluations reveal persistent bias and toxicity issues, with toxicity increasing with scale and significant gender bias in coreference resolution tasks. 
The compute and carbon footprint are transparently documented, and model weights are released.", 418 "red_flags": [ 419 { 420 "flag": "No ablation studies", 421 "detail": "Three architectural modifications (RMSNorm, SwiGLU, RoPE) are adopted together without any ablation to isolate their individual contributions to performance or training stability." 422 }, 423 { 424 "flag": "No statistical significance testing", 425 "detail": "All comparative claims across 20+ benchmarks are presented as raw accuracy numbers without confidence intervals, significance tests, or error bars, making it impossible to assess which differences are meaningful." 426 }, 427 { 428 "flag": "Benchmark contamination unaddressed", 429 "detail": "Training data includes 2017–2020 CommonCrawl, ArXiv, and StackExchange data that almost certainly contains benchmark examples (MMLU, HellaSwag, BoolQ, etc.); no decontamination analysis is performed." 430 }, 431 { 432 "flag": "Self-evaluation by developer", 433 "detail": "Meta AI employees evaluate their own models with no independent third-party verification; methodology choices (benchmark selection, evaluation protocols) may favor the presented models." 434 }, 435 { 436 "flag": "Inconsistent evaluation splits", 437 "detail": "TriviaQA uses the filtered dev set rather than the unfiltered test set used by GPT-3 and PaLM (acknowledged in Appendix A), making direct comparison potentially misleading." 438 }, 439 { 440 "flag": "Overclaiming on MMLU", 441 "detail": "The abstract claims LLaMA-65B is 'competitive with the best models,' but it underperforms Chinchilla-70B (63.4 vs 67.5) and PaLM-540B (63.4 vs 69.3) on MMLU without this caveat in the abstract." 442 } 443 ], 444 "cited_papers": [ 445 { 446 "title": "Training Compute-Optimal Large Language Models (Chinchilla)", 447 "relevance": "Central to LLaMA's motivation: the paper extends Chinchilla's compute-optimal scaling laws to the inference-budget regime, training smaller models on more tokens." 
448 }, 449 { 450 "title": "Language Models are Few-Shot Learners (GPT-3)", 451 "relevance": "Primary baseline throughout; LLaMA-13B is benchmarked against GPT-3-175B as the key competitive comparison." 452 }, 453 { 454 "title": "PaLM: Scaling Language Modeling with Pathways", 455 "relevance": "Key architectural inspiration (SwiGLU) and major comparison baseline across all evaluation tasks." 456 }, 457 { 458 "title": "OPT: Open Pre-trained Transformer Language Models", 459 "relevance": "Most relevant prior open-source LLM comparison; LLaMA is positioned as an improvement over OPT in both performance and openness." 460 }, 461 { 462 "title": "Scaling Laws for Neural Language Models (Kaplan et al.)", 463 "relevance": "Foundational scaling law work that LLaMA builds upon and extends." 464 }, 465 { 466 "title": "Emergent Abilities of Large Language Models (Wei et al.)", 467 "relevance": "Contextualizes why scaling matters for few-shot task performance." 468 }, 469 { 470 "title": "Measuring Massive Multitask Language Understanding (MMLU)", 471 "relevance": "Primary benchmark for knowledge evaluation; results expose LLaMA's data composition limitations." 472 }, 473 { 474 "title": "Evaluating Large Language Models Trained on Code (Codex/HumanEval)", 475 "relevance": "Provides HumanEval benchmark and pass@k methodology used for code generation evaluation." 476 }, 477 { 478 "title": "BLOOM: A 176B-Parameter Open-Access Multilingual Language Model", 479 "relevance": "Contemporary open LLM comparison for carbon footprint analysis and open-source LLM landscape context." 480 } 481 ], 482 "engagement_factors": { 483 "practical_relevance": { 484 "score": 3, 485 "justification": "Directly usable open model weights released publicly; 7B/13B models run on consumer hardware, enabling wide practitioner adoption." 
486 }, 487 "surprise_contrarian": { 488 "score": 2, 489 "justification": "Challenges the assumption that proprietary data is necessary for competitive LLM performance; the inference-budget framing of scaling laws is a meaningful reframing." 490 }, 491 "fear_safety": { 492 "score": 1, 493 "justification": "Documents bias and toxicity but frames these as known issues; primary concern is open release enabling misuse, which is not deeply addressed." 494 }, 495 "drama_conflict": { 496 "score": 2, 497 "justification": "Directly challenges closed-source LLM providers (OpenAI, Google) by releasing competitive open weights; fueled significant open vs. closed AI debate in 2023." 498 }, 499 "demo_ability": { 500 "score": 3, 501 "justification": "Openly released weights at GitHub; any researcher can download and run these models immediately on accessible hardware." 502 }, 503 "brand_recognition": { 504 "score": 3, 505 "justification": "Meta AI (Facebook Research), highly cited paper that spawned the entire LLaMA/Llama 2/3 ecosystem and directly influenced Alpaca, Vicuna, and dozens of derivative models." 506 } 507 }, 508 "hn_data": { 509 "threads": [ 510 { 511 "hn_id": "39856808", 512 "title": "LLaMA: Open and Efficient Foundation Language Models (2023)", 513 "points": 2, 514 "comments": 0, 515 "url": "https://news.ycombinator.com/item?id=39856808", 516 "created_at": "2024-03-28T20:19:31Z" 517 } 518 ], 519 "top_points": 2, 520 "total_points": 2, 521 "total_comments": 0 522 } 523 }