scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (31996B)
      1 {
      2   "paper": {
      3     "title": "LLaMA: Open and Efficient Foundation Language Models",
      4     "authors": [
      5       "Hugo Touvron",
      6       "Thibaut Lavril",
      7       "Gautier Izacard",
      8       "Xavier Martinet",
      9       "Marie-Anne Lachaux",
     10       "Timothée Lacroix",
     11       "Baptiste Rozière",
     12       "Naman Goyal",
     13       "Eric Hambro",
     14       "Faisal Azhar",
     15       "Aurelien Rodriguez",
     16       "Armand Joulin",
     17       "Edouard Grave",
     18       "Guillaume Lample"
     19     ],
     20     "year": 2023,
     21     "venue": "arXiv",
     22     "arxiv_id": "2302.13971"
     23   },
     24   "scan_version": 2,
     25   "active_modules": ["experimental_rigor", "data_leakage"],
     26   "methodology_tags": ["benchmark-eval"],
     27   "key_findings": "LLaMA is a family of foundation language models (7B–65B parameters) trained exclusively on publicly available data, demonstrating that state-of-the-art performance does not require proprietary datasets. LLaMA-13B outperforms GPT-3 (175B) on most benchmarks despite being 10× smaller, and LLaMA-65B is competitive with Chinchilla-70B and PaLM-540B. The paper challenges the Chinchilla scaling assumption by showing that smaller models trained on more tokens can match or exceed larger models, shifting the optimization target from training compute to inference efficiency. Responsible AI evaluations reveal increasing toxicity with model scale and societal biases captured from web training data.",
     28   "checklist": {
     29     "artifacts": {
     30       "code_released": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "The paper provides a GitHub link: https://github.com/facebookresearch/llama (footnote 1, Section 1)."
     34       },
     35       "data_released": {
     36         "applies": true,
     37         "answer": true,
     38         "justification": "All training data sources are publicly available and explicitly identified in Section 2.1 and Table 1: CommonCrawl (via CCNet), C4, public GitHub, Wikipedia, Gutenberg, Books3, ArXiv, and StackExchange. The paper emphasizes 'only using data that is publicly available, and compatible with open sourcing.'"
     39       },
     40       "environment_specified": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "The paper describes hardware (2048 A100-80GB GPUs) and mentions the xformers library and SentencePiece tokenizer, but does not provide a requirements.txt, Dockerfile, or detailed software environment specification with library versions."
     44       },
     45       "reproduction_instructions": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "While the paper describes the training procedure and hyperparameters in detail, there are no step-by-step reproduction instructions, scripts, or a 'Reproducing Results' section. The GitHub repository is linked but the paper itself lacks specific instructions."
     49       }
     50     },
     51     "statistical_methodology": {
     52       "confidence_intervals_or_error_bars": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "All results in Tables 3–10 are reported as single point estimates with no confidence intervals, error bars, or uncertainty notation."
     56       },
     57       "significance_tests": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No statistical significance tests are used. Claims of 'outperforms' and 'competitive with' are based solely on comparing raw numbers across models (e.g., 'LLaMA-65B outperforms Chinchilla-70B on all reported benchmarks but BoolQ')."
     61       },
     62       "effect_sizes_reported": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "The comparison tables (Tables 3–10) provide absolute scores for all baseline models alongside LLaMA results, providing full baseline context for assessing magnitude of improvements (e.g., BoolQ: LLaMA-65B 85.3 vs Chinchilla 83.7 vs GPT-3 60.5). The parameter efficiency comparison (13B vs 175B) also provides practical effect size context."
     66       },
     67       "sample_size_justified": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "No justification is given for the number of evaluation examples used. Standard benchmark sizes are used without discussion of whether they provide sufficient statistical power for the claims made."
     71       },
     72       "variance_reported": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No variance, standard deviation, or multi-run results are reported. All benchmark results appear to be from single runs. The only exception is pass@k for code generation, which uses sampling-based estimates but does not report variance across independent runs."
     76       }
     77     },
     78     "evaluation_design": {
     79       "baselines_included": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Extensive baselines are included: GPT-3 (175B), Gopher (280B), Chinchilla (70B), PaLM (8B/62B/540B), OPT, GPT-J, GPT-Neo, LaMDA, and Minerva. Numbers are taken from the corresponding papers (Section 3)."
     83       },
     84       "baselines_contemporary": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Baselines include Chinchilla (2022), PaLM (2022), and OPT (2022), which were the most recent and competitive models at time of publication. Older models (GPT-3, 2020) are also included for completeness."
     88       },
     89       "ablation_study": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No ablation study is conducted. The paper adopts architectural components from prior work (pre-normalization from GPT-3, SwiGLU from PaLM, RoPE from GPT-Neo) but does not ablate them to measure individual contributions. The scaling analysis (4 model sizes) is not an ablation."
     93       },
     94       "multiple_metrics": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "The paper evaluates across 20+ benchmarks spanning commonsense reasoning (8 tasks), question answering (exact match), reading comprehension, mathematical reasoning (MATH, GSM8k), code generation (pass@1, pass@k), MMLU, toxicity (RealToxicityPrompts), bias (CrowS-Pairs, WinoGender), and truthfulness (TruthfulQA)."
     98       },
     99       "human_evaluation": {
    100         "applies": true,
    101         "answer": false,
    102         "justification": "All evaluations are automated: benchmark accuracy, exact match, pass@k, PerspectiveAPI for toxicity, and perplexity-based bias measurement. No human evaluation of model outputs is conducted."
    103       },
    104       "held_out_test_set": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Standard benchmark test splits are used. Appendix A specifies: Natural Questions uses 'the test split used for open-domain question answering containing 3610 questions.' TriviaQA uses the dev set (test server unavailable). MMLU uses 'test sets' (Table 16)."
    108       },
    109       "per_category_breakdown": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Extensive breakdowns are provided: per-benchmark results across all model sizes (Tables 3–10), per-domain MMLU results (Table 16 in appendix, 57 tasks), per-category bias results (Table 12, 9 categories), and per-pronoun WinoGender results (Table 13)."
    113       },
    114       "failure_cases_discussed": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "The paper discusses where LLaMA underperforms: MMLU gap vs Chinchilla/PaLM (Section 3.6, attributed to limited books/academic data), increasing toxicity with model size (Table 11), gender bias in co-reference resolution (Section 5.3, gotcha examples), and hallucination tendency (TruthfulQA, Section 5.4)."
    118       },
    119       "negative_results_reported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "LLaMA-65B falls behind Chinchilla-70B and PaLM-540B on MMLU (Section 3.6). Toxicity increases with model size (Table 11). Gender bias is demonstrated via WinoGender gotcha cases (Table 13). SIQA shows high variance suggesting benchmark unreliability (Section 3.7). TruthfulQA scores are acknowledged as low (Section 5.4)."
    123       }
    124     },
    125     "claims_and_evidence": {
    126       "abstract_claims_supported": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Abstract claims are supported: (1) 'train on trillions of tokens' — confirmed by Table 1 (1.0T–1.4T tokens). (2) 'LLaMA-13B outperforms GPT-3 on most benchmarks' — confirmed by Tables 3–8. (3) 'LLaMA-65B is competitive with Chinchilla-70B and PaLM-540B' — confirmed across benchmarks. (4) 'using publicly available datasets exclusively' — confirmed by Section 2.1."
    130       },
    131       "causal_claims_justified": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "The paper's main causal claim is that training smaller models on more tokens (beyond Chinchilla-optimal) yields competitive performance. This is supported by systematic variation of model sizes (7B–65B) with controlled training procedures, training loss curves (Figure 1), and benchmark performance evolution during training (Figure 2). The study design (same architecture, same data, different scales) provides adequate evidence for this existence proof."
    135       },
    136       "generalization_bounded": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "Claims are generally bounded to tested settings. The abstract specifies exact model sizes and comparison targets. Claims are made about specific benchmarks with specific model sizes. The paper frames its contribution as an existence proof ('it is possible to train state-of-the-art models using publicly available datasets') rather than claiming universal superiority."
    140       },
    141       "alternative_explanations_discussed": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "Several alternative explanations are discussed: MMLU underperformance attributed to limited books/academic data in training (Section 3.6). Toxicity-size relationship 'may only apply within a model family' based on Chinchilla vs Gopher comparison (Section 5.1). PerspectiveAPI methodology differences affecting cross-paper comparisons (Section 5.1). SIQA variance suggesting benchmark unreliability (Section 3.7)."
    145       },
    146       "proxy_outcome_distinction": {
    147         "applies": true,
    148         "answer": true,
    149         "justification": "The paper's claims match the granularity of its measurements. It claims competitive benchmark performance and demonstrates it on specific benchmarks. It does not frame benchmark accuracy as a broader proxy for 'intelligence' or 'understanding.' The inference efficiency argument is clearly framed in terms of specific model sizes and benchmark scores."
    150       }
    151     },
    152     "setup_transparency": {
    153       "model_versions_specified": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Exact model architectures are specified in Table 2 (parameters, dimensions, heads, layers, learning rates, batch sizes, token counts for all 4 model sizes). Baseline models are identified by exact sizes (GPT-3 175B, Chinchilla 70B, PaLM 8B/62B/540B, etc.) with results from their corresponding papers."
    157       },
    158       "prompts_provided": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Appendix A provides formatted prompt examples for Natural Questions and TriviaQA (Figure 3), including the prefix string 'Answer these questions:\\n'. The paper references the lm-evaluation-harness framework (Gao et al., 2021) for other benchmarks. Evaluation methodology is described, including likelihood normalization approaches from Brown et al. (2020)."
    162       },
    163       "hyperparameters_reported": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section 2.3 reports optimizer hyperparameters: AdamW with β1=0.9, β2=0.95, weight decay 0.1, gradient clipping 1.0, 2000 warmup steps, cosine learning rate schedule. Table 2 provides per-model learning rates and batch sizes. Code generation evaluation specifies temperature 0.1 for pass@1 and 0.8 for pass@100."
    167       },
    168       "scaffolding_described": {
    169         "applies": false,
    170         "answer": false,
    171         "justification": "No agentic scaffolding is used. The models are evaluated directly on benchmarks without tool use, retrieval, or multi-step workflows."
    172       },
    173       "data_preprocessing_documented": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Section 2.1 provides detailed preprocessing documentation for each data source: CCNet pipeline for CommonCrawl (deduplication, language identification, quality filtering), C4 preprocessing (heuristic filtering), GitHub (license filtering, line length heuristics, boilerplate removal, file-level deduplication), Wikipedia (hyperlink/comment removal), Books (90% overlap deduplication), ArXiv (section/bibliography removal, macro expansion), StackExchange (HTML tag removal, score sorting). BPE tokenization with SentencePiece is described."
    177       }
    178     },
    179     "limitations_and_scope": {
    180       "limitations_section_present": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "Section 5 ('Bias, Toxicity and Misinformation') is a dedicated multi-subsection evaluation of model limitations, covering RealToxicityPrompts (5.1), CrowS-Pairs bias (5.2), WinoGender bias (5.3), and TruthfulQA (5.4). While not titled 'Limitations,' it serves as a substantive limitations evaluation with multiple subsections and quantitative analysis."
    184       },
    185       "threats_to_validity_specific": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "Specific threats are discussed: 'We do not have control over the pipeline used by the third-party PerspectiveAPI, making comparison with previous models difficult' (Section 5.1). MMLU underperformance attributed to specific data composition ('limited amount of books and academic papers,' Section 3.6). SIQA benchmark reliability questioned due to high variance (Section 3.7). Bias evaluations 'are not sufficient to fully understand the risks' (Section 5)."
    189       },
    190       "scope_boundaries_stated": {
    191         "applies": true,
    192         "answer": false,
    193         "justification": "The paper does not explicitly state what the results do NOT show. While it acknowledges specific weaknesses (MMLU, toxicity), it does not draw explicit boundaries around the scope of its performance claims or state what settings/populations/tasks are excluded from the claims."
    194       }
    195     },
    196     "data_integrity": {
    197       "raw_data_available": {
    198         "applies": true,
    199         "answer": false,
    200         "justification": "The processed training data is not released. While the source datasets are publicly available (CommonCrawl, C4, etc.), the specific processed and filtered versions used for training are not provided. Model outputs on benchmarks are not released for independent verification."
    201       },
    202       "data_collection_described": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "Section 2.1 describes data collection in detail for each source: CommonCrawl dumps from 2017–2020, C4 from Raffel et al. 2020, GitHub from Google BigQuery (Apache/BSD/MIT licensed), Wikipedia June–August 2022 (20 languages), Gutenberg and Books3, ArXiv LaTeX files, and StackExchange (28 largest websites). Sampling proportions and disk sizes are provided in Table 1."
    206       },
    207       "recruitment_methods_described": {
    208         "applies": false,
    209         "answer": false,
    210         "justification": "No human participants. Data sources are standard public datasets and benchmarks."
    211       },
    212       "data_pipeline_documented": {
    213         "applies": true,
    214         "answer": true,
    215         "justification": "The full data pipeline is documented in Section 2.1: raw data sources → preprocessing (deduplication, language identification, quality filtering per source) → BPE tokenization with SentencePiece → training data (~1.4T tokens). Each source has its filtering steps described. Table 1 shows final sampling proportions and epochs."
    216       }
    217     },
    218     "conflicts_of_interest": {
    219       "funding_disclosed": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No explicit funding statement is provided. The Acknowledgments section thanks individual Meta AI colleagues but does not disclose funding sources or grants."
    223       },
    224       "affiliations_disclosed": {
    225         "applies": true,
    226         "answer": true,
    227         "justification": "Author affiliations are clearly stated as 'Meta AI' in the paper header."
    228       },
    229       "funder_independent_of_outcome": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "Meta AI is both the employer/funder and the developer of the models being evaluated. Meta has a commercial interest in demonstrating that its models are competitive with proprietary alternatives (GPT-3, PaLM, Chinchilla)."
    233       },
    234       "financial_interests_declared": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "No competing interests or financial interests statement is provided anywhere in the paper."
    238       }
    239     },
    240     "contamination": {
    241       "training_cutoff_stated": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "Training data time periods are partially stated: CommonCrawl 2017–2020, Wikipedia June–August 2022. However, GitHub, ArXiv, StackExchange, and Books sources lack specific time period information. No single training data cutoff date is explicitly stated."
    245       },
    246       "train_test_overlap_discussed": {
    247         "applies": true,
    248         "answer": false,
    249         "justification": "No discussion of whether benchmark examples appeared in the training data. Many evaluation benchmarks (BoolQ 2019, PIQA 2020, HellaSwag 2019, ARC 2018, etc.) were published before or during the CommonCrawl collection window (2017–2020), so their contents could have been included in the training data."
    250       },
    251       "benchmark_contamination_addressed": {
    252         "applies": true,
    253         "answer": false,
    254         "justification": "Benchmark contamination is not addressed despite significant risk. Many benchmarks (PIQA, BoolQ, HellaSwag, WinoGrande, ARC, OBQA, NaturalQuestions, TriviaQA, HumanEval) were published before the training data collection period and their contents are available on the web. No decontamination analysis is performed."
    255       }
    256     },
    257     "human_studies": {
    258       "pre_registered": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in the study."
    262       },
    263       "irb_or_ethics_approval": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in the study."
    267       },
    268       "demographics_reported": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in the study."
    272       },
    273       "inclusion_exclusion_criteria": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in the study."
    277       },
    278       "randomization_described": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in the study."
    282       },
    283       "blinding_described": {
    284         "applies": false,
    285         "answer": false,
    286         "justification": "No human participants in the study."
    287       },
    288       "attrition_reported": {
    289         "applies": false,
    290         "answer": false,
    291         "justification": "No human participants in the study."
    292       }
    293     },
    294     "cost_and_practicality": {
    295       "inference_cost_reported": {
    296         "applies": true,
    297         "answer": false,
    298         "justification": "Inference cost is not systematically reported. The paper mentions that LLaMA-13B 'can be run on a single V100 GPU during inference' (Section 3.2) and that training processes '380 tokens/sec/GPU' (Section 2.4), but no inference latency, throughput, or cost-per-query numbers are provided."
    299       },
    300       "compute_budget_stated": {
    301         "applies": true,
    302         "answer": true,
    303         "justification": "Section 6 and Table 15 provide detailed compute budgets: GPU-hours for each model (7B: 82,432; 13B: 135,168; 33B: 530,432; 65B: 1,022,362), total power consumption (36–449 MWh), and carbon emissions (14–173 tCO2eq). Hardware specified as A100-80GB GPUs. Total development estimated at 2,638 MWh over ~5 months on 2048 GPUs."
    304       }
    305     },
    306     "experimental_rigor": {
    307       "seed_sensitivity_reported": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "No mention of multiple random seeds or seed sensitivity analysis. Results appear to be from single training runs for each model size."
    311       },
    312       "number_of_runs_stated": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "The number of experimental runs is not stated. Given the massive compute cost, results are likely from single runs, but this is not explicitly acknowledged."
    316       },
    317       "hyperparameter_search_budget": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "No hyperparameter search budget is reported. Architecture and hyperparameters are adopted from prior work (GPT-3, PaLM, GPT-Neo), but the paper does not state whether any configuration search was performed or how many configurations were tried."
    321       },
    322       "best_config_selection_justified": {
    323         "applies": true,
    324         "answer": true,
    325         "justification": "Architecture choices are justified by citing prior successful models: pre-normalization from GPT-3, SwiGLU from PaLM, RoPE from GPT-Neo (Section 2.2). All four model sizes are reported rather than cherry-picking the best. The training approach is inspired by the Chinchilla scaling laws, although the models are deliberately trained on more tokens than the Chinchilla-optimal budget."
    326       },
    327       "multiple_comparison_correction": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "No statistical tests are performed, so no multiple comparison correction is applied. The paper makes comparative claims across 20+ benchmarks and 4+ baseline models without any statistical testing or correction."
    331       },
    332       "self_comparison_bias_addressed": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "The authors do not acknowledge the bias of evaluating their own system. While they use baseline numbers from other papers (reducing re-implementation bias), they do not discuss the potential for favorable evaluation choices or setup advantages."
    336       },
    337       "compute_budget_vs_performance": {
    338         "applies": true,
    339         "answer": true,
    340         "justification": "This is a core contribution. The paper explicitly argues that compute-performance tradeoffs favor smaller models trained longer (challenging Chinchilla scaling laws). Figure 1 shows training loss vs tokens. Table 15 provides compute budgets. Performance is compared across model sizes (7B–65B) against larger models (GPT-3 175B, PaLM 540B), directly addressing the compute-performance relationship."
    341       },
    342       "benchmark_construct_validity": {
    343         "applies": true,
    344         "answer": false,
    345         "justification": "The paper does not discuss whether benchmarks actually measure what is claimed, with one minor exception: SIQA is noted as showing 'a lot of variance in performance, that may indicate that this benchmark is not reliable' (Section 3.7). No systematic discussion of construct validity for the other 19+ benchmarks."
    346       },
    347       "scaffold_confound_addressed": {
    348         "applies": false,
    349         "answer": false,
    350         "justification": "No scaffolding is involved. Models are evaluated directly on benchmarks without tools, agents, or multi-step workflows."
    351       }
    352     },
    353     "data_leakage": {
    354       "temporal_leakage_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "Temporal leakage is not discussed. Many benchmarks (HellaSwag 2019, PIQA 2020, BoolQ 2019, etc.) were created before or during the period covered by the CommonCrawl dumps (2017–2020), meaning benchmark content could appear in training data. No temporal analysis is provided."
    358       },
    359       "feature_leakage_addressed": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "Feature leakage is not discussed. No analysis of whether evaluation setup provides information not available in real usage scenarios."
    363       },
    364       "non_independence_addressed": {
    365         "applies": true,
    366         "answer": false,
    367         "justification": "Non-independence of training and test data is not addressed. CommonCrawl data could contain web pages that include benchmark questions and answers, creating train-test overlap."
    368       },
    369       "leakage_detection_method": {
    370         "applies": true,
    371         "answer": false,
    372         "justification": "No leakage detection or prevention method is used. No canary strings, membership inference tests, n-gram overlap analysis, or decontamination pipeline is described."
    373       }
    374     }
    375   },
    376   "claims": [
    377     {
    378       "claim": "LLaMA-13B outperforms GPT-3 (175B) on most benchmarks despite being 10× smaller",
    379       "evidence": "Tables 3–8 show LLaMA-13B exceeding GPT-3 on commonsense reasoning (7/8 tasks), NaturalQuestions (20.1 vs 14.6 zero-shot), TriviaQA (56.6 vs 43.5 zero-shot), RACE, MATH/GSM8k, code generation (HumanEval 15.8 vs 14.0), and MMLU (46.9 vs 43.9).",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "LLaMA-65B is competitive with Chinchilla-70B and PaLM-540B",
    384       "evidence": "Tables 3–9 show LLaMA-65B outperforming Chinchilla on all commonsense benchmarks except BoolQ, and surpassing PaLM-540B on all except BoolQ and WinoGrande. On MMLU (Table 9), LLaMA-65B (63.4%) trails Chinchilla (67.5%) and PaLM-540B (69.3%).",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "State-of-the-art performance can be achieved using publicly available datasets exclusively",
    389       "evidence": "Section 2.1 describes all data sources (CommonCrawl, C4, public GitHub, Wikipedia, Gutenberg, Books3, ArXiv, StackExchange) — all publicly available. Model performance is competitive with Chinchilla and PaLM which use proprietary data.",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "Performance continues to improve when training beyond Chinchilla-optimal token counts",
    394       "evidence": "Figure 1 shows training loss continuing to decrease for all model sizes up to 1.0T–1.4T tokens. Figure 2 shows benchmark performance continuing to improve during training. The 7B model is still improving after 1T tokens, 5× the 200B tokens that Hoffmann et al. (2022) recommend for a 10B model.",
    395       "supported": "strong"
    396     },
    397     {
    398       "claim": "Brief instruction finetuning improves MMLU performance",
    399       "evidence": "Table 10 shows LLaMA-I (65B) achieving 68.9% on MMLU vs LLaMA-65B at 63.4%, a 5.5-point improvement from instruction finetuning following Chung et al. (2022) protocol.",
    400       "supported": "moderate"
    401     },
    402     {
    403       "claim": "Toxicity increases with model size",
    404       "evidence": "Table 11 shows toxicity scores increasing from 0.106 (7B) to 0.128 (65B) for basic prompts and from 0.081 (7B) to 0.141 (65B) for respectful prompts. However, the authors note PerspectiveAPI methodology differences limit comparisons.",
    405       "supported": "moderate"
    406     }
    407   ],
    408   "red_flags": [
    409     {
    410       "flag": "Company evaluating own product",
    411       "detail": "All authors are Meta AI employees evaluating Meta's own LLaMA models. While baselines use numbers from other papers (reducing some bias), the evaluation setup, benchmark selection, and presentation choices are controlled by the model developers."
    412     },
    413     {
    414       "flag": "No error bars or uncertainty quantification",
    415       "detail": "All results across 20+ benchmarks are reported as single point estimates with no confidence intervals, standard deviations, or multi-run statistics. Given the massive compute cost, single-run results are understandable but claims of 'outperforming' specific models by small margins (e.g., 1–3 points) are not statistically validated."
    416     },
    417     {
    418       "flag": "No contamination analysis",
    419       "detail": "The models are trained on web data (CommonCrawl 2017–2020) that could contain benchmark questions and answers. Many benchmarks (HellaSwag, PIQA, BoolQ, ARC, WinoGrande, NaturalQuestions, TriviaQA) were published before or during the CommonCrawl collection window. No decontamination analysis is performed despite this significant risk."
    420     },
    421     {
    422       "flag": "No ablation of architectural choices",
    423       "detail": "The paper adopts three architectural changes from prior work (pre-normalization, SwiGLU, RoPE) without ablating their individual contributions. It is unclear how much each modification contributes to the final performance."
    424     }
    425   ],
    426   "cited_papers": [
    427     {
    428       "title": "Language models are few-shot learners",
    429       "authors": ["Tom B. Brown", "Benjamin Mann", "Nick Ryder"],
    430       "year": 2020,
    431       "relevance": "GPT-3 paper; foundational work on large language model few-shot capabilities and primary baseline for LLaMA comparisons."
    432     },
    433     {
    434       "title": "Palm: Scaling language modeling with pathways",
    435       "authors": ["Aakanksha Chowdhery", "Sharan Narang", "Jacob Devlin"],
    436       "year": 2022,
    437       "relevance": "PaLM scaling paper; key baseline for LLaMA and source of architectural choices (SwiGLU activation)."
    438     },
    439     {
    440       "title": "Training compute-optimal large language models",
    441       "authors": ["Jordan Hoffmann", "Sebastian Borgeaud", "Arthur Mensch"],
    442       "year": 2022,
    443       "relevance": "Chinchilla scaling laws paper; LLaMA directly challenges its recommendations by training smaller models longer."
    444     },
    445     {
    446       "title": "OPT: Open pre-trained transformer language models",
    447       "authors": ["Susan Zhang", "Stephen Roller", "Naman Goyal"],
    448       "year": 2022,
    449       "arxiv_id": "2205.01068",
    450       "relevance": "Open-source LLM release from Meta; key comparison point for open model availability and carbon footprint."
    451     },
    452     {
    453       "title": "Evaluating large language models trained on code",
    454       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    455       "year": 2021,
    456       "relevance": "Codex/HumanEval paper; introduces the code generation benchmark used to evaluate LLaMA and establishes pass@k methodology."
    457     },
    458     {
    459       "title": "BLOOM: A 176b-parameter open-access multilingual language model",
    460       "authors": ["Teven Le Scao", "Angela Fan", "Christopher Akiki"],
    461       "year": 2022,
    462       "arxiv_id": "2211.05100",
    463       "relevance": "Open-access multilingual LLM; comparison point for open model release and carbon footprint analysis."
    464     },
    465     {
    466       "title": "Measuring massive multitask language understanding",
    467       "authors": ["Dan Hendrycks", "Collin Burns", "Steven Basart"],
    468       "year": 2020,
    469       "relevance": "MMLU benchmark paper; widely-used LLM evaluation benchmark where LLaMA shows relative weakness."
    470     },
    471     {
    472       "title": "Emergent abilities of large language models",
    473       "authors": ["Jason Wei", "Yi Tay", "Rishi Bommasani"],
    474       "year": 2022,
    475       "arxiv_id": "2206.07682",
    476       "relevance": "Studies the effect of scaling on LLM abilities; directly relevant to LLaMA's scaling analysis and claims about model size vs capability."
    477     },
    478     {
    479       "title": "Scaling laws for neural language models",
    480       "authors": ["Jared Kaplan", "Sam McCandlish", "Tom Henighan"],
    481       "year": 2020,
    482       "arxiv_id": "2001.08361",
    483       "relevance": "Establishes neural scaling laws that LLaMA builds upon, showing how model and dataset size relate to performance."
    484     },
    485     {
    486       "title": "Realtoxicityprompts: Evaluating neural toxic degeneration in language models",
    487       "authors": ["Samuel Gehman", "Suchin Gururangan", "Maarten Sap"],
    488       "year": 2020,
    489       "arxiv_id": "2009.11462",
    490       "relevance": "Toxicity evaluation benchmark used to assess LLaMA's potential for generating toxic content."
    491     },
    492     {
    493       "title": "Program synthesis with large language models",
    494       "authors": ["Jacob Austin", "Augustus Odena", "Maxwell Nye"],
    495       "year": 2021,
    496       "relevance": "MBPP code generation benchmark paper; used to evaluate LLaMA's code generation capabilities."
    497     }
    498   ]
    499 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs