scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (31278B)
      1 {
      2   "paper": {
      3     "title": "The Llama 3 Herd of Models",
      4     "authors": ["Llama Team", "AI @ Meta"],
      5     "year": 2024,
      6     "venue": "arXiv",
      7     "arxiv_id": "2407.21783"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "methodology_tags": ["benchmark-eval"],
     12   "key_findings": "Llama 3 405B performs comparably to GPT-4 across a wide range of tasks including general knowledge, coding, math, and multilingual benchmarks. The paper demonstrates scaling laws that accurately predict downstream performance across four orders of magnitude of compute. Human evaluations show Llama 3 405B is competitive with GPT-4o and Claude 3.5 Sonnet. The paper also presents preliminary multimodal (vision, video, speech) extensions that are competitive but not yet released.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": true,
     18         "justification": "The paper states models are publicly released under the Llama 3 Community License at https://llama.meta.com. FP8 quantization kernels are released at https://github.com/pytorch/FBGEMM. Evaluation configurations and data are released on GitHub and Huggingface (Section 5.1)."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The paper states 'we are releasing the data generated as part of evaluations with publicly available benchmarks which can be found on Huggingface' (Section 5.1). Standard public benchmarks are used for evaluation. However, the pre-training data itself is not released."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "While hardware is described in detail (16K H100 GPUs, 80GB HBM3, etc.) in Section 3.3, there are no environment specifications (requirements.txt, Dockerfile, library versions) for reproducing the evaluation pipeline."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper provides extensive methodological detail but no step-by-step reproduction instructions. Evaluation setup details are deferred to external links ('can be found on our Github repository here') but these are not inline instructions."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "The paper reports 95% confidence intervals for most benchmark results (Tables 9-14, 17-22), following Madaan et al. (2024b). The CI formula is explicitly given: CI(S) = 1.96 × sqrt(S × (1-S) / N)."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "Despite reporting confidence intervals, the paper does not perform formal statistical significance tests when claiming one model outperforms another. Models are compared via point estimates and CIs, but no p-values or hypothesis tests are reported."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "Results are reported as absolute scores on benchmarks with baseline comparisons throughout (e.g., Table 2 shows Llama 3 405B at 87.3 MMLU vs GPT-4 at 85.1). The magnitude of differences is clear from the reported numbers with baselines."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No justification is provided for the number of evaluation examples or human evaluation sample sizes. The human evaluation uses ~7,000 prompts but the choice is not justified with power analysis."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The paper reports confidence intervals based on the binomial approximation from benchmark size, but does not report variance across multiple experimental runs or seeds. Results appear to be single-run evaluations."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Extensive baselines are included throughout: GPT-4, GPT-4o, Claude 3.5 Sonnet, Gemini, Mixtral, Mistral, Gemma, Nemotron 4 340B, and Llama 2 models (Tables 2, 9-22)."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Baselines include the most competitive contemporary models at the time of publication: GPT-4o, Claude 3.5 Sonnet, Gemini 1.5 Pro, and Gemma 2 (July 2024)."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Multiple ablation-style analyses are presented: scaling law experiments (Section 3.2.1), data mix experiments (Section 3.1.2), annealing experiments (Section 3.1.3), long-context data mix ablations (Section 4.3.4), and DPO modifications (Section 4.1.4)."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "The paper reports results across dozens of metrics including MMLU, HumanEval, MBPP, GSM8K, MATH, ARC, GPQA, IFEval, MGSM, pass@1, word error rate, BLEU, and human evaluation win rates."
     83       },
     84       "human_evaluation": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Section 5.3 describes extensive human evaluations comparing Llama 3 405B against GPT-4, GPT-4o, and Claude 3.5 Sonnet using ~7,000 prompts with 7-point preference ratings across multiple capabilities (Figure 17)."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "The paper reports on standard benchmark test sets and explicitly states the pre-training team was 'strongly incentivized to prevent contamination' (Section 10). Evaluation sets were held out from post-training via exact match decontamination (Section 5.2)."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Results are broken down extensively by category: commonsense reasoning, knowledge, reading comprehension, math/reasoning, code, multilingual, tool use, long context (Tables 8-22). Human evaluation results are broken down by capability (Figure 17)."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Adversarial benchmarks (Section 5.1.3) show where models struggle. Negative results on code interpreter abuse (10.4% compliance, Section 5.4.5), prompt injection susceptibility (21.7%, Figure 22), and safety violations are discussed."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Several negative results are reported: annealing improvements on 405B were 'negligible' (Section 3.1.3); markdown was found 'harmful' (Section 3.1.1); PPO was found to require more compute and perform worse than DPO (Section 4.1.4); training 405B on its own generated data was 'not helpful (and can even degrade performance)' (Section 4.3.1)."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The abstract claims 'comparable quality to leading language models such as GPT-4 on a plethora of tasks.' Table 2 and human evaluations (Figure 17) support this — Llama 3 405B is competitive with but does not uniformly beat GPT-4o and Claude 3.5 Sonnet."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Causal claims are generally backed by controlled ablations. E.g., scaling law experiments (Section 3.2.1) use controlled compute budgets, data mix experiments test individual variables, and DPO modifications are tested with before/after comparisons. The paper is appropriately cautious in framing."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Claims are generally bounded to the tested benchmarks and settings. The paper specifies model sizes (8B, 70B, 405B), specific benchmarks, and settings. Multimodal results are explicitly noted as 'still under development and not yet ready for release.' The paper avoids overly broad claims."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Section 5.1.4 discusses contamination as an alternative explanation for benchmark performance. Section 5.1.2 examines robustness to prompt format and label order. Section 5.1.3 tests adversarial alternatives. Human evaluation limitations regarding annotator bias are discussed (Section 5.3)."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "The paper reports specific benchmark metrics without inflating them to broader claims. It says 'performs on par with leading language models' based on specific benchmarks, not 'achieves AGI.' Human evaluations are framed as measuring specific capabilities (coding, reasoning, multilingual) rather than general intelligence."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Llama 3 model specifications are given in detail (Table 3: 8B, 70B, 405B parameters, architecture details). Competitor models are identified with specific names (GPT-4 0125 API version, GPT-4o API version, Claude 3.5 Sonnet API version — Section 5.3). Table 2 specifies GPT-4 (0125)."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Several actual prompts are provided: the system prompt for steerability (Section 4.3.7), ASR/AST system prompts (Section 8.3.1), video evaluation prompts (Section 7.7). The paper references external repos for full evaluation configurations. The steerability system prompt example is verbatim."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Extensive hyperparameters are reported: learning rates (Table 3, Sections 3.4.1, 4.1.3, 4.1.4), batch sizes, DPO β=0.1, NLL loss coefficient 0.2, temperature settings for rejection sampling, RoPE frequency, context lengths, and parallelism configurations (Table 4)."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "The paper does not use agentic scaffolding. Models are evaluated directly on benchmarks. Tool use is a capability of the model itself, not an external scaffold."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Section 3.1 provides extensive detail on data preprocessing: PII filtering, text extraction, URL/document/line-level deduplication, heuristic filtering, model-based quality filtering, knowledge classification, and multilingual processing. Post-training data processing is detailed in Section 4.2.3."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section 5.4.8 is titled 'Limitations' and discusses safety limitations. Section 5.3 acknowledges limitations of human evaluations. The conclusion (Section 10) notes 'development of high-quality foundation models is still in its infancy.'"
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Specific threats are discussed: contamination analysis (Section 5.1.4) identifies benchmarks where contamination may inflate scores (e.g., PiQA, HellaSwag). Section 5.3 notes 'human evaluations can still be influenced by personal biases, backgrounds, and preferences.' Safety limitations note models 'may still generate harmful content.'"
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "The paper explicitly states multimodal models 'are still under development and not yet ready for release.' Footnote 9 states Llama 3 'has not been optimized or safety tuned for use cases' beyond 8 supported languages. Section 5.4.8 states 'no testing can be guaranteed to be exhaustive.'"
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "Pre-training data is not released. Evaluation data generated from public benchmarks is released on Huggingface, but the raw training data, preference annotations, and synthetic data are proprietary."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Pre-training data collection is described in detail (Section 3.1): web crawling, HTML parsing, deduplication methods, quality filtering. Post-training data collection via human annotations is described in Section 4.2.1 with statistics in Tables 6-7."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": true,
    194         "answer": false,
    195         "justification": "The paper mentions using 'human annotators' and 'data vendors' for preference data and safety data, but does not describe how annotators were recruited, their qualifications, compensation, or potential biases from the recruitment process."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "The data pipeline is documented in detail: Section 3.1 describes web data curation (PII filtering → text extraction → deduplication → heuristic filtering → model-based filtering). Post-training data pipeline is in Section 4.2.3 (topic classification → quality scoring → difficulty scoring → semantic deduplication)."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "The paper contains neither an explicit funding disclosure nor an acknowledgments section that lists funding sources. The work is clearly done at Meta, but there is no formal funding statement."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "The paper is clearly attributed to 'Llama Team, AI @ Meta' with the full contributor list in the appendix. All contributors are identified as Meta employees."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "Meta is both the funder and the organization whose product (Llama 3) is being evaluated. Meta has a direct commercial interest in Llama 3 performing well. The funder is not independent of the outcome."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests statement is provided. Meta employees evaluating Meta's own product have clear financial interests that are not formally declared."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": true,
    229         "justification": "Section 3.1 states 'we create our dataset for language model pre-training from a variety of data sources containing knowledge until the end of 2023.'"
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": true,
    234         "justification": "Section 5.1.4 presents a detailed contamination analysis using 8-gram overlap between evaluation sets and the pre-training corpus, following Singh et al. (2024). Table 15 reports contamination percentages and estimated performance gains for each benchmark."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": true,
    239         "justification": "Section 5.1.4 explicitly addresses benchmark contamination with quantitative analysis. The organizational design separates the pre-training data team from the evaluation team (Section 10). Annealing data explicitly excludes 'training sets from commonly used benchmarks' (Section 3.1.3)."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": true,
    245         "answer": false,
    246         "justification": "The paper includes human evaluations (Section 5.3) and safety studies with human participants (Section 5.4.5 uplift testing), but none are pre-registered."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": true,
    250         "answer": false,
    251         "justification": "No IRB or ethics board approval is mentioned for the human evaluation studies or the cybersecurity/biosecurity uplift testing involving human participants."
    252       },
    253       "demographics_reported": {
    254         "applies": true,
    255         "answer": false,
    256         "justification": "Human evaluation annotator demographics are not reported. For the cybersecurity uplift study, participants are categorized as 'expert' (31) and 'novice' (31) based on offensive security experience, but detailed demographics are absent."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": true,
    260         "answer": true,
    261         "justification": "For the cybersecurity uplift study (Section 5.4.5), participants were 'categorized into expert (31 subjects) and novice (31 subjects) cohorts based on their offensive security experience.' CBRN study participants were 'recruited based on previous experience in relevant areas of scientific or operational expertise.'"
    262       },
    263       "randomization_described": {
    264         "applies": true,
    265         "answer": true,
    266         "justification": "The CBRN uplift study describes assignment to 'control' or 'LLM' conditions (Section 5.4.5). The cybersecurity study uses a two-stage within-subjects design in which participants complete challenges first without and then with LLM assistance."
    267       },
    268       "blinding_described": {
    269         "applies": true,
    270         "answer": false,
    271         "justification": "No blinding is described for any of the human studies. In the uplift testing, participants clearly know whether they have LLM access. The paper does not state whether human evaluation annotators were blinded to model identity."
    272       },
    273       "attrition_reported": {
    274         "applies": true,
    275         "answer": false,
    276         "justification": "No attrition information is provided for any human study. The uplift study starts with 62 volunteers but does not report whether all completed both stages."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": true,
    283         "justification": "Section 6 reports inference throughput and latency for both BF16 and FP8 quantization (Figures 24, 27), including tokens/sec and time-to-first-token. MFU is reported in Table 4."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": true,
    288         "justification": "The paper states the flagship model was 'pre-trained using 3.8 × 10^25 FLOPs' on 'up to 16K H100 GPUs' (Section 3.3.1). Training stability data covers a 54-day snapshot period (Table 5). BF16 MFU of 38-43% is reported."
    289       }
    290     },
    291     "experimental_rigor": {
    292       "seed_sensitivity_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "No seed sensitivity analysis is reported. Results appear to be from single training runs. The paper acknowledges CIs from finite samples but does not report variance across seeds."
    296       },
    297       "number_of_runs_stated": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "The number of evaluation runs is not explicitly stated. CIs are computed analytically from benchmark size rather than from multiple runs."
    301       },
    302       "hyperparameter_search_budget": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "While scaling law experiments are described (Section 3.2.1), the total hyperparameter search budget for the final model configuration is not stated. The paper mentions 'careful tuning' of parallelism configurations but no search budget."
    306       },
    307       "best_config_selection_justified": {
    308         "applies": true,
    309         "answer": true,
    310         "justification": "Scaling law experiments (Section 3.2.1) provide principled selection of model size (402B→405B). Data mix selection uses scaling law experiments. Post-training uses multiple rounds with progressive improvement. Model averaging selects from various configurations (Section 4.1.5)."
    311       },
    312       "multiple_comparison_correction": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "The paper reports results across dozens of benchmarks and makes many comparisons without any correction for multiple comparisons (no Bonferroni, Holm, or similar correction)."
    316       },
    317       "self_comparison_bias_addressed": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "Meta evaluates its own Llama 3 models against competitors. While they report reproducing competitor results where possible and selecting the best score, the paper does not explicitly acknowledge the systematic bias of self-evaluation."
    321       },
    322       "compute_budget_vs_performance": {
    323         "applies": true,
    324         "answer": true,
    325         "justification": "The scaling law analysis (Section 3.2.1, Figures 2-4) explicitly relates compute budget to performance. The paper discusses training smaller models 'for much longer than is compute-optimal' and the resulting performance tradeoffs."
    326       },
    327       "benchmark_construct_validity": {
    328         "applies": true,
    329         "answer": true,
    330         "justification": "The paper discusses robustness of benchmarks to design choices (Section 5.1.2), adversarial benchmarks (Section 5.1.3), and contamination analysis (Section 5.1.4). Human evaluations are used as a complement to automated metrics, acknowledging that 'well-designed human evaluations closely reflect the user experience.'"
    331       },
    332       "scaffold_confound_addressed": {
    333         "applies": false,
    334         "answer": false,
    335         "justification": "Models are evaluated directly on benchmarks without external scaffolding. Tool use evaluation tests the model's native capability. No scaffold confound exists."
    336       }
    337     },
    338     "data_leakage": {
    339       "temporal_leakage_addressed": {
    340         "applies": true,
    341         "answer": true,
    342         "justification": "The paper states pre-training data goes 'until the end of 2023' (Section 3.1) and the contamination analysis (Section 5.1.4) addresses temporal overlap between benchmarks and training data."
    343       },
    344       "feature_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "No explicit discussion of feature leakage (e.g., whether evaluation setup provides hints not available in real usage). The focus is on data contamination rather than feature leakage."
    348       },
    349       "non_independence_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "While deduplication is described for training data, the paper does not discuss whether evaluation examples share structural similarities with training data beyond n-gram overlap."
    353       },
    354       "leakage_detection_method": {
    355         "applies": true,
    356         "answer": true,
    357         "justification": "Section 5.1.4 applies 8-gram overlap analysis following Singh et al. (2024), computing contamination percentages and estimated performance gains for each benchmark (Table 15). Post-training decontamination uses exact match with benchmark prompts."
    358       }
    359     }
    360   },
    361   "claims": [
    362     {
    363       "claim": "Llama 3 405B performs on par with leading language models such as GPT-4 across a variety of tasks.",
    364       "evidence": "Table 2 shows Llama 3 405B achieves 87.3 MMLU (vs GPT-4 85.1), 89.0 HumanEval (vs GPT-4 86.6), 96.8 GSM8K (vs GPT-4 94.2). Human evaluations (Figure 17) show approximate parity with GPT-4 (0125).",
    365       "supported": "strong"
    366     },
    367     {
    368       "claim": "Scaling law predictions extrapolated over four orders of magnitude accurately forecast the flagship model's performance.",
    369       "evidence": "Section 3.2.1 and Figure 4 show the two-step scaling law prediction 'only slightly underestimates the final performance' on ARC Challenge, extrapolating from 10^22 to 3.8×10^25 FLOPs.",
    370       "supported": "moderate"
    371     },
    372     {
    373       "claim": "Llama 3 8B and 70B outperform competing models of similar sizes on most benchmarks.",
    374       "evidence": "Tables 9-14 and Figure 12 show Llama 3 8B outperforms Mistral 7B and Gemma 7B on virtually every category. Llama 3 70B outperforms Mixtral 8x22B on most benchmarks.",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "FP8 quantization has very limited impact on model response quality.",
    379       "evidence": "Figure 26 shows the reward score distribution for FP8 vs BF16 is nearly identical across 100,000 responses. FP8 inference provides up to 50% throughput improvement during pre-fill (Figure 27).",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "The Llama 3 release does not provide significant uplift for cybersecurity or CBRN attacks.",
    384       "evidence": "Section 5.4.5: cybersecurity uplift study with 62 participants showed 'insignificant uplift.' CBRN study showed 'no significant uplift in performance.' Autonomous attack agents 'fail to effectively use' reconnaissance information.",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "DPO performed better than PPO for alignment, especially on instruction-following benchmarks.",
    389       "evidence": "Section 4.1.4 states 'DPO required less compute for large-scale models and performed better, especially on instruction following benchmarks like IFEval.'",
    390       "supported": "weak"
    391     },
    392     {
    393       "claim": "Training the 405B model on its own generated data is not helpful and can degrade performance.",
    394       "evidence": "Section 4.3.1 states 'our initial experiments revealed that training Llama 3 405B on its own generated data is not helpful (and can even degrade performance).' No quantitative evidence provided.",
    395       "supported": "weak"
    396     }
    397   ],
    398   "red_flags": [
    399     {
    400       "flag": "Company evaluating its own product",
    401       "detail": "Meta employees evaluate Llama 3 models across all benchmarks. While they reproduce competitor results 'where possible,' the paper does not acknowledge the systematic bias of self-evaluation. Baselines are run through Meta's pipeline, and the 'best score' selection favors their setup."
    402     },
    403     {
    404       "flag": "Selective competitor availability",
    405       "detail": "For 405B class comparisons, many competitor numbers are unavailable ('it is not possible to (re)compute benchmark values' for comparable models). Category averages are omitted for 405B precisely because not all numbers are available. This makes comprehensive comparison impossible."
    406     },
    407     {
    408       "flag": "Human evaluation opacity",
    409       "detail": "Human evaluation annotators, their demographics, training, and potential biases are not described. Only a 'small set of researchers who do not contribute to model development' had access (Section 10), but their identities and potential biases are unknown."
    410     },
    411     {
    412       "flag": "Safety benchmarks are internal and non-reproducible",
    413       "detail": "Section 5.4.4 explicitly acknowledges 'these safety benchmarks are internal to Meta' and results are 'not reproducible externally.' Competitor safety results are anonymized ('Comp. 1', 'Comp. 2', 'Comp. 3'), preventing independent verification."
    414     },
    415     {
    416       "flag": "No seed sensitivity or multi-run analysis",
    417       "detail": "For a model trained with 3.8×10^25 FLOPs, no seed sensitivity analysis is performed. CIs are derived analytically from benchmark size rather than empirical variance across runs, underestimating true uncertainty."
    418     }
    419   ],
    420   "cited_papers": [
    421     {
    422       "title": "Evaluating large language models trained on code",
    423       "authors": ["Mark Chen"],
    424       "year": 2021,
    425       "arxiv_id": "2107.03374",
    426       "relevance": "Introduces HumanEval, a key code generation benchmark used extensively in Llama 3 evaluation."
    427     },
    428     {
    429       "title": "Training language models to follow instructions with human feedback",
    430       "authors": ["Long Ouyang"],
    431       "year": 2022,
    432       "arxiv_id": "2203.02155",
    433       "relevance": "Foundational RLHF paper that Llama 3's post-training methodology builds upon."
    434     },
    435     {
    436       "title": "Direct preference optimization: Your language model is secretly a reward model",
    437       "authors": ["Rafael Rafailov"],
    438       "year": 2023,
    439       "relevance": "Core alignment method used in Llama 3 post-training, preferred over PPO for scalability."
    440     },
    441     {
    442       "title": "Constitutional AI: harmlessness from AI feedback",
    443       "authors": ["Yuntao Bai"],
    444       "year": 2022,
    445       "arxiv_id": "2212.08073",
    446       "relevance": "Influential safety alignment approach; Llama 3's rejection sampling draws from this work."
    447     },
    448     {
    449       "title": "Llama 2: Open foundation and fine-tuned chat models",
    450       "authors": ["Hugo Touvron"],
    451       "year": 2023,
    452       "arxiv_id": "2307.09288",
    453       "relevance": "Direct predecessor to Llama 3; many training recipes are extended from this work."
    454     },
    455     {
    456       "title": "Code llama: Open foundation models for code",
    457       "authors": ["Baptiste Rozière"],
    458       "year": 2023,
    459       "arxiv_id": "2308.12950",
    460       "relevance": "Code expert training recipe that Llama 3's code capabilities build upon."
    461     },
    462     {
    463       "title": "Toolformer: Language models can teach themselves to use tools",
    464       "authors": ["Timo Schick"],
    465       "year": 2024,
    466       "relevance": "Influential tool-use approach; Llama 3 differs by relying on human annotations rather than self-taught tool use."
    467     },
    468     {
    469       "title": "Training compute-optimal large language models",
    470       "authors": ["Jordan Hoffmann"],
    471       "year": 2022,
    472       "arxiv_id": "2203.15556",
    473       "relevance": "Chinchilla scaling laws that inform Llama 3's model sizing and compute allocation decisions."
    474     },
    475     {
    476       "title": "Purple llama cyberseceval: A secure coding benchmark for language models",
    477       "authors": ["Manish Bhatt"],
    478       "year": 2023,
    479       "arxiv_id": "2312.04724",
    480       "relevance": "Safety evaluation framework used to assess Llama 3's cybersecurity risks."
    481     },
    482     {
    483       "title": "Many-shot jailbreaking",
    484       "authors": ["Cem Anil"],
    485       "year": 2024,
    486       "relevance": "Long-context jailbreaking attack that Llama 3 specifically mitigates in safety finetuning."
    487     },
    488     {
    489       "title": "Scaling laws for neural language models",
    490       "authors": ["Jared Kaplan"],
    491       "year": 2020,
    492       "arxiv_id": "2001.08361",
    493       "relevance": "Foundational scaling laws paper that Llama 3's compute-optimal training decisions build upon."
    494     },
    495     {
    496       "title": "Augmented language models: a survey",
    497       "authors": ["Grégoire Mialon"],
    498       "year": 2023,
    499       "arxiv_id": "2302.07842",
    500       "relevance": "Survey of tool-augmented LLMs relevant to Llama 3's tool use capabilities."
    501     }
    502   ]
    503 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs