scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (31903B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "The Llama 3 Herd of Models",
      6     "authors": [
      7       "Abhimanyu Dubey",
      8       "Abhinav Jauhri",
      9       "Abhinav Pandey",
     10       "Abhishek Kadian",
     11       "Ahmad Al-Dahle"
     12     ],
     13     "year": 2024,
     14     "venue": "arXiv",
     15     "arxiv_id": "2407.21783",
     16     "doi": null
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "All major abstract claims—comparable quality to GPT-4, native multilingual/coding/reasoning/tool support, competitive multimodal performance—are backed by extensive benchmark tables (Tables 2, 9–22, 29–32) and human evaluation results (Figure 17).",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper claims 'performance gains are primarily driven by improvements in data quality and diversity' (Section 3.2) but lacks controlled ablations isolating data quality from the ~50× compute increase and architectural changes applied simultaneously.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper makes broad capability claims ('delivers comparable quality to leading language models') that are not adequately bounded to specific benchmarks or settings; several key benchmarks show >85% contamination rates, undermining these generalizations.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper attributes performance improvements exclusively to data quality, scale, and simplicity without discussing alternative explanations such as benchmark-specific overfitting, contamination effects, or post-training data curation artifacts.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "Benchmark scores (MMLU, HumanEval, etc.) are treated as direct measures of capability throughout without discussing the gap between benchmark performance and real-world utility or known limitations of these proxies.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 5.4.8 'Limitations' explicitly acknowledges residual safety risks and testing incompleteness; Section 5.3 acknowledges human evaluation may be influenced by annotator personal biases.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Section 5.1.4 specifically quantifies benchmark contamination per-dataset with estimated performance gains; Section 5.4.8 specifically notes limitations for 'languages beyond English'; the limitations of the CI formula are also discussed (bounded scores, subsampling not being the only source of variation).",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The paper explicitly states multimodal models 'are still under development and not yet ready for release,' that 8B/70B were 'intended for use in English,' and that safety guarantees extend only to 8 supported languages.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding disclosure is present; the paper is entirely authored by Meta AI employees evaluating their own model without acknowledging this as a potential conflict of interest.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "The paper clearly states 'Llama Team, AI @ Meta' and the acknowledgements section lists all contributors as Meta employees.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "Meta employees are evaluating Meta's own flagship model; some safety comparisons use internal benchmarks with anonymized competitors, preventing independent verification of claimed superiority.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement, patent disclosures, or financial interests declarations are included anywhere in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Key terms including 'post-training,' 'pre-training,' 'foundation models,' 'SFT,' 'DPO,' 'rejection sampling,' 'effective training time,' and '4D parallelism' are explicitly defined in context throughout the paper.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The introduction clearly states contributions: new foundation models (8B/70B/405B), detailed training infrastructure, safety analysis, preliminary multimodal experiments, and public release of pre-trained and post-trained weights.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 9 provides comprehensive related work covering scale trends, small models, architectures, open source models, post-training methods, and multimodality, positioning Llama 3 relative to Llama 2 and contemporary models throughout.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "Model weights for all three sizes are publicly released; FP8 kernel code is on GitHub (pytorch/FBGEMM); evaluation setup is referenced on GitHub; though full training pipeline code is not open-sourced, the primary artifact (trained models) is available.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "All primary evaluations use standard publicly available benchmarks (MMLU, HumanEval, MBPP, GSM8K, etc.) used unmodified; the paper also states evaluation data generated from public benchmarks is released on HuggingFace.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "Hardware specifications (H100 GPUs, cluster configurations) are provided, but no requirements.txt, Dockerfile, or equivalent software dependency specification is included.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No step-by-step reproduction instructions for training are provided; training 405B on Meta's proprietary cluster with 16K H100 GPUs is inherently not reproducible by external researchers.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": true,
    150           "justification": "95% CIs are reported for all pre-trained and post-trained benchmark results (Tables 9–22) using CI = 1.96√(S(1−S)/N) with explicit discussion of the formula and its assumptions in Section 5.1.1.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "While CIs are reported, formal hypothesis tests (t-tests, bootstrap) are not used for comparative claims; the paper relies on non-overlapping CIs as informal significance indicators without correction for multiple comparisons.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Specific improvement magnitudes are consistently reported: annealing improved 8B on GSM8K by 24.0% and MATH by 6.4%; FP8 provides 'up to 50%' throughput improvement; Llama Guard reduces violations by 65% on average.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "Benchmark sizes are inherited from prior work without justification; the human evaluation describes difficulty distribution (10% easy, 30% medium, 60% hard) but provides no power analysis; the CBRNE uplift study mentions a power analysis but doesn't report the specifics.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": true,
    174           "justification": "95% CIs are reported for all benchmark scores; human evaluation win rates include 95% CIs (Figure 17); training MFU variance across configurations is shown in Table 4.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Extensive baseline comparisons against GPT-4, GPT-4o, Claude 3.5 Sonnet, Gemini 1.5 Pro, Mistral, Mixtral 8x22B, Gemma, Nemotron 4 340B throughout all evaluation tables.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "Baselines include GPT-4o and Claude 3.5 Sonnet (July 2024 contemporaries); specific API versions are cited (GPT-4 0125, GPT-4o API version, Claude 3.5 Sonnet API version) in Section 5.3.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Ablation studies include annealing data quality effects (Section 3.1.3), parallelism configurations (Table 4), FP8 vs BF16 (Figure 26), DPO with/without NLL regularization, long-context data mixing (0.1% optimal), and safety data ratio effects (Figure 18).",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Dozens of benchmarks across commonsense reasoning, knowledge, reading comprehension, math, code generation, long context, multilingual, tool use, adversarial robustness, and safety are evaluated simultaneously.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": true,
    206           "justification": "Section 5.3 describes human evaluation with ~7,000 prompts across 6 capabilities using pairwise comparisons with a 7-point scale against GPT-4, GPT-4o, and Claude 3.5 Sonnet; prosody modeling also uses human preference evaluation (Table 35).",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "Standard benchmark test sets are used; post-training data is decontaminated via exact match with benchmark prompts; annealing explicitly excludes any training sets from commonly used benchmarks (Section 3.1.3).",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Results are broken down by capability category (Figure 12), by programming language for code (Table 19), by language for multilingual (Table 20), by safety category (Table 26), and by harm type for Llama Guard (Tables 25–26).",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Section 5.1.3 shows systematic performance degradation on adversarial vs non-adversarial benchmarks; Section 5.4.6 documents specific red-teaming failure modes including multi-turn escalation, persona attacks, and code interpreter abuse patterns.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "Explicitly reported: training 405B on own data 'is not helpful (and can even degrade performance)'; annealing improvements 'are negligible' for 405B; naive long-context SFT 'resulted in significant regressions'; FP8 without mitigations produces corrupted responses despite strong benchmark scores.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "Specific model versions are stated throughout: 'Llama 3.1 8B/70B/405B,' 'GPT-4 (0125 API version),' 'GPT-4o (API version),' 'Claude 3.5 Sonnet (API version)' in Section 5.3; competitor evaluation dates are specified.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Multiple actual prompts provided: full steerability system prompt example (Section 4.3.7), video QA inference prompts (Section 7.7), ASR/AST system prompts (Section 8.3.1), and adversarial prompt examples (Table 23).",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Comprehensive hyperparameter reporting: learning rates, batch sizes, optimizer (AdamW), warmup steps, cosine schedule details for pre-training (Section 3.4.1), DPO β=0.1, NLL coefficient 0.2, SFT learning rate 1e-5 (Sections 4.1.3–4.1.4), all training configurations in Table 3.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "Tool use scaffolding explicitly described: Python objects with methods, JSON format for API calls, Python interpreter as executor, specific tools (Brave Search, Wolfram Alpha API, Python interpreter) with system prompt control in Section 4.3.5.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Section 3.1 extensively documents URL-level, document-level (MinHash), and line-level de-duplication; heuristic filtering (n-gram coverage ratio, dirty word counting, KL divergence); model-based quality filtering with fasttext and DistilRoBERTa classifiers.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "Training data (15T tokens from proprietary web crawl and curated sources) is not publicly released; human annotation data and internal preference datasets are also not released.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Section 3.1 provides extensive detail on web data curation; Section 4.2 describes human annotation procedures with data statistics (Tables 6, 7) including turn counts, token counts per capability domain, and annotation protocols.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": true,
    281           "answer": false,
    282           "justification": "Human annotators are referenced throughout but their recruitment criteria, qualifications, compensation, geographic distribution, or screening processes are not described in the paper.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The complete data pipeline is documented from raw web crawl through quality filtering, de-duplication, knowledge classification, and data mix optimization (Sections 3.1–3.1.3), plus the full post-training data pipeline (Sections 4.2–4.2.3) including rejection sampling and quality scoring.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": true,
    296           "justification": "Section 3.1 explicitly states the dataset contains 'knowledge until the end of 2023' as the training data cutoff.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": true,
    302           "justification": "Section 5.1.4 provides a dedicated contamination analysis using 8-gram overlap following Singh et al. (2024), reporting per-dataset contamination percentages and estimated performance gains from contamination.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": true,
    308           "justification": "Table 15 reports contamination levels per benchmark; notably AGIEval shows 98% contamination with +16.3pp estimated gain for 405B, and BIG-Bench Hard shows 95% contamination with +41pp estimated gain—results the paper presents without removing these benchmarks from main evaluations.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": true,
    315           "answer": false,
    316           "justification": "The cybersecurity uplift study (62 volunteers) and the CBRNE uplift study both involve human participants, but no pre-registration is mentioned; the study design was developed internally without public pre-registration.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": true,
    321           "answer": false,
    322           "justification": "Human participant studies (uplift testing with internal volunteers) are described in Section 5.4.5 without any mention of IRB approval or ethics board review.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": true,
    327           "answer": false,
    328           "justification": "The uplift study reports only expert/novice categorization (31 each) based on offensive security experience; no demographic information (age, gender, background distribution) is provided.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": true,
    333           "answer": true,
    334           "justification": "Section 5.4.5 states participants 'were categorized into expert and novice cohorts based on their offensive security experience' and 'recruited based on previous experience in relevant areas of scientific or operational expertise.'",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": true,
    339           "answer": true,
    340           "justification": "Section 5.4.5 describes random assignment to 'control' or 'LLM' conditions; the two-stage design is described with participants completing different challenges in each stage.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": true,
    345           "answer": false,
    346           "justification": "Participants clearly know whether they have LLM access by design; no blinding of participants or evaluators is described; SME evaluators who scored attack plans also could not be blinded to condition.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": true,
    351           "answer": false,
    352           "justification": "No attrition or dropout information is reported for the 62-volunteer uplift study or other human evaluation tasks.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": true,
    360           "justification": "Section 6 reports detailed throughput-latency tradeoffs (Figures 24, 27) including tokens/second and time-to-first-token for BF16 and FP8 inference across different batch sizes and pipeline configurations.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": true,
    366           "justification": "Pre-training compute is stated as '3.8 × 10^25 FLOPs' for 405B; hardware is 'up to 16K H100 GPUs, each running at 700W TDP'; BF16 MFU is reported as 38–43% in Table 4.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "Llama 3 405B delivers comparable quality to GPT-4 across a variety of tasks",
    375       "evidence": "Human evaluation shows win rates within margin of error vs GPT-4 on most capabilities (Figure 17); Table 2 shows 405B scores of 87.3 MMLU vs 85.1 for GPT-4, 89.0 HumanEval vs 86.6",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "Llama 3 8B and 70B are best-in-class models outperforming alternatives with similar parameter counts",
    380       "evidence": "Table 2 shows 8B outperforming Gemma 2 9B and Mistral 7B across all reported benchmarks; 70B outperforms Mixtral 8x22B; results include 95% CIs throughout Tables 9–21",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "Annealing on high-quality code and math data improves the 8B model by 24% on GSM8K and 6.4% on MATH validation sets",
    385       "evidence": "Section 3.1.3 directly reports these numbers; improvements are described as 'negligible' for the 405B model, showing a clear scale interaction effect",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "Llama 3 does not provide significant uplift for cybersecurity attacks or CBRNE weapon planning compared to web-only access",
    390       "evidence": "Section 5.4.5 reports a two-stage study with 62 volunteers showing 'insignificant uplift' for cybersecurity; CBRNE study with expert teams shows 'no significant uplift' with a robust Delphi process for SME scoring",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "FP8 quantization provides up to 50% throughput improvement vs BF16 inference with negligible quality impact",
    395       "evidence": "Figure 27 shows throughput-latency comparison; Figure 26 shows reward score distributions between BF16 and FP8 are nearly identical across 100,000 responses",
    396       "supported": "strong"
    397     },
    398     {
    399       "claim": "Several key benchmarks show high training contamination that may inflate reported scores",
    400       "evidence": "Table 15 shows AGIEval 98% contaminated (+16.3pp estimated gain for 405B) and BIG-Bench Hard 95% contaminated (+41pp estimated gain); these are primary general capability benchmarks used throughout the paper",
    401       "supported": "strong"
    402     },
    403     {
    404       "claim": "Llama 3-V 405B outperforms GPT-4V on all image understanding benchmarks",
    405       "evidence": "Table 29 shows Llama 3-V 405B scoring higher than GPT-4V on MMMU (64.5 vs 56.4), VQAv2 (80.2 vs 77.2), AI2 Diagram (94.1 vs 78.2), ChartQA (85.8 vs 78.4), TextVQA (84.8 vs 78.0), DocVQA (92.6 vs 88.4)",
    406       "supported": "strong"
    407     }
    408   ],
    409   "methodology_tags": [
    410     "benchmark-eval",
    411     "empirical"
    412   ],
    413   "key_findings": "Llama 3 presents a family of dense Transformer language models (8B, 70B, 405B parameters) trained on 15T multilingual tokens that achieve performance comparable to GPT-4 and are publicly released. Key technical contributions include improved data curation with multi-stage filtering and de-duplication, a 4D parallelism training infrastructure achieving 38–43% MFU on 16K H100 GPUs, and a multi-stage post-training pipeline using SFT, rejection sampling, and DPO with iterative rounds. Contamination analysis reveals high benchmark overlap for several key evaluations (AGIEval 98%, BIG-Bench Hard 95%), with estimated gains of up to 41 percentage points for 405B, substantially undermining claims based on these benchmarks. Safety uplift testing shows no significant risk increase for cybersecurity or CBRNE attacks compared to internet-only access.",
    414   "red_flags": [
    415     {
    416       "flag": "Severe benchmark contamination not acted upon",
    417       "detail": "AGIEval (98% contaminated, +16.3pp estimated gain for 405B) and BIG-Bench Hard (95% contaminated, +41pp estimated gain) are presented as key capability evidence despite the paper's own contamination analysis showing likely inflation of these scores."
    418     },
    419     {
    420       "flag": "Self-evaluation with non-reproducible safety benchmarks",
    421       "detail": "Safety comparisons use internal benchmarks 'not reproducible externally'; competitor models are anonymized, preventing independent verification of claimed Pareto-dominance on safety-helpfulness tradeoffs."
    422     },
    423     {
    424       "flag": "Causal overclaiming on data quality",
    425       "detail": "The paper claims performance gains are 'primarily driven by improvements in data quality' but conducts no controlled experiment separating data quality from the ~50× compute increase and various other simultaneous changes."
    426     },
    427     {
    428       "flag": "Annotator process opacity",
    429       "detail": "Human annotations are central to post-training quality but annotator recruitment criteria, qualifications, geographic distribution, compensation, and quality assurance procedures are not described, making assessment of annotation quality impossible."
    430     },
    431     {
    432       "flag": "Missing conflicts of interest disclosure",
    433       "detail": "No competing interests statement is included despite Meta employees evaluating Meta's own commercial product; financial interests and patents are not disclosed."
    434     }
    435   ],
    436   "cited_papers": [
    437     {
    438       "title": "Llama 2: Open Foundation and Fine-Tuned Chat Models",
    439       "relevance": "Direct predecessor; establishes baseline for Llama 3 improvements in data scale (1.8T → 15T tokens), compute (50×), and post-training methodology"
    440     },
    441     {
    442       "title": "Training Compute-Optimal Large Language Models (Chinchilla)",
    443       "relevance": "Scaling laws foundation used to determine optimal model size and training token count for 405B model via IsoFLOPs analysis"
    444     },
    445     {
    446       "title": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model",
    447       "relevance": "Core alignment algorithm used throughout Llama 3 post-training pipeline, preferred over PPO for stability and performance at scale"
    448     },
    449     {
    450       "title": "GPT-4 Technical Report",
    451       "relevance": "Primary comparison target; Llama 3 is explicitly positioned as achieving comparable quality to GPT-4 while being openly released"
    452     },
    453     {
    454       "title": "Measuring Massive Multitask Language Understanding (MMLU)",
    455       "relevance": "Primary general knowledge benchmark used for pre-trained and post-trained model evaluation throughout; also subject to contamination concerns"
    456     },
    457     {
    458       "title": "Many-Shot Jailbreaking",
    459       "relevance": "Directly motivated long-context safety mitigations in Section 5.4.4; cited as evidence that long-context models are vulnerable to in-context jailbreaking"
    460     },
    461     {
    462       "title": "Llama Guard: LLM-based Input-Output Safeguard for Human-AI Conversations",
    463       "relevance": "Predecessor to Llama Guard 3 released alongside Llama 3; describes the system-level safety classification approach extended in this work"
    464     },
    465     {
    466       "title": "Scaling Laws for Neural Language Models",
    467       "relevance": "Foundation for compute-optimal model size determination; Llama 3 extends these with a two-stage methodology for predicting downstream benchmark performance"
    468     },
    469     {
    470       "title": "Evaluation Data Contamination in LLMs: How Do We Measure It and (When) Does It Matter?",
    471       "relevance": "Methodology adopted for Section 5.1.4 contamination analysis; the 8-gram overlap approach and estimated performance gain framework come directly from this work"
    472     }
    473   ],
    474   "engagement_factors": {
    475     "practical_relevance": {
    476       "score": 3,
    477       "justification": "Models are publicly downloadable with weights; practitioners can immediately fine-tune or deploy for production use cases across a wide range of applications."
    478     },
    479     "surprise_contrarian": {
    480       "score": 1,
    481       "justification": "A dense transformer matching frontier closed models is consistent with prevailing expectations in mid-2024; no major paradigm challenges or counter-intuitive findings emerge."
    482     },
    483     "fear_safety": {
    484       "score": 2,
    485       "justification": "Structured CBRNE and cybersecurity uplift testing addresses genuine AI safety concerns; contamination findings raise questions about whether benchmark-based AI progress claims are reliable."
    486     },
    487     "drama_conflict": {
    488       "score": 1,
    489       "justification": "Open vs closed source narrative is present but understated; competitors are anonymized in safety comparisons, reducing the conflict angle."
    490     },
    491     "demo_ability": {
    492       "score": 3,
    493       "justification": "Models are immediately usable via public download on Hugging Face and llama.meta.com; Llama 3 is actively deployed in numerous products and research projects."
    494     },
    495     "brand_recognition": {
    496       "score": 3,
    497       "justification": "Meta/Llama is one of the most recognized AI brands globally; the Llama series has tens of millions of downloads and defines the open-weights LLM ecosystem."
    498     }
    499   },
    500   "hn_data": {
    501     "threads": [
    502       {
    503         "hn_id": "41131642",
    504         "title": "Beating GPT-4o and Claude 3.5 on SWE-bench Lite through repeated sampling",
    505         "points": 5,
    506         "comments": 0,
    507         "url": "https://news.ycombinator.com/item?id=41131642",
    508         "created_at": "2024-08-01T17:49:40Z"
    509       },
    510       {
    511         "hn_id": "41153305",
    512         "title": "Large Language Monkeys: Scaling Inference Compute with Repeated Sampling",
    513         "points": 2,
    514         "comments": 0,
    515         "url": "https://news.ycombinator.com/item?id=41153305",
    516         "created_at": "2024-08-04T13:12:54Z"
    517       },
    518       {
    519         "hn_id": "41130857",
    520         "title": "Large Language Monkeys: Scaling Inference Compute with Repeated Sampling",
    521         "points": 2,
    522         "comments": 0,
    523         "url": "https://news.ycombinator.com/item?id=41130857",
    524         "created_at": "2024-08-01T16:28:59Z"
    525       },
    526       {
    527         "hn_id": "44866644",
    528         "title": "Improving Generative Ad Text on Facebook Using Reinforcement Learning",
    529         "points": 2,
    530         "comments": 0,
    531         "url": "https://news.ycombinator.com/item?id=44866644",
    532         "created_at": "2025-08-11T17:05:08Z"
    533       },
    534       {
    535         "hn_id": "42363641",
    536         "title": "The Llama 3 Herd of Models",
    537         "points": 1,
    538         "comments": 0,
    539         "url": "https://news.ycombinator.com/item?id=42363641",
    540         "created_at": "2024-12-09T06:56:28Z"
    541       },
    542       {
    543         "hn_id": "41129377",
    544         "title": "The Llama 3 Herd of Models",
    545         "points": 1,
    546         "comments": 0,
    547         "url": "https://news.ycombinator.com/item?id=41129377",
    548         "created_at": "2024-08-01T14:21:40Z"
    549       },
    550       {
    551         "hn_id": "41439259",
    552         "title": "Help Finding LLM and Proof Based Refactoring Reference",
    553         "points": 1,
    554         "comments": 0,
    555         "url": "https://news.ycombinator.com/item?id=41439259",
    556         "created_at": "2024-09-03T21:11:48Z"
    557       },
    558       {
    559         "hn_id": "39546939",
    560         "title": "StableLM 1.6B Technical Report – includes all data, training, strategy",
    561         "points": 1,
    562         "comments": 1,
    563         "url": "https://news.ycombinator.com/item?id=39546939",
    564         "created_at": "2024-02-29T06:43:10Z"
    565       }
    566     ],
    567     "top_points": 5,
    568     "total_points": 15,
    569     "total_comments": 1
    570   }
    571 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs