scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29043B)
      1 {
      2   "paper": {
      3     "title": "Gemma 2: Improving Open Language Models at a Practical Size",
      4     "authors": ["Gemma Team", "Google DeepMind"],
      5     "year": 2024,
      6     "venue": "arXiv",
      7     "arxiv_id": "2408.00118",
      8     "doi": "10.48550/arXiv.2408.00118"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "Gemma 2 introduces 2B, 9B, and 27B parameter models using knowledge distillation, interleaved local-global attention, and grouped-query attention. Distillation from a larger teacher model significantly improves small model performance (e.g., 60.3→67.7 average on 3 benchmarks for 2B). The 27B model is competitive with LLaMA-3 70B despite being 2.5× smaller. Extensive safety evaluations show low memorization rates and limited dangerous capabilities.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No source code or training code repository is provided. The paper releases model weights but not training or evaluation code."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "Training data is not released. The paper states data comes from 'web documents, code, and science articles' but provides no dataset download or detailed composition."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No environment specifications, requirements files, or dependency lists are provided. Hardware is described (TPU types) but no software environment details."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided. The paper describes architecture and training at a high level but lacks sufficient detail to reproduce results."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Main benchmark results in Tables 12-13 report only point estimates with no confidence intervals or error bars. The Chatbot Arena table (Table 14) includes 95% CIs for Elo scores, but the core pre-training and post-training evaluations lack uncertainty quantification."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No statistical significance tests are used despite numerous comparative claims (e.g., 'outperforms Qwen1.5 32B'). All comparisons are based on raw score differences."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Results are reported with baseline context throughout (e.g., Table 6: 60.3 vs 67.7 for from-scratch vs distilled; Table 13 provides full comparison tables with absolute scores for all models). The reader can compute effect magnitudes."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No justification for the number of benchmarks chosen, the number of ablation configurations, or the sample sizes in human evaluations (100 participants for persuasion studies, 500 multi-turn scenarios). No power analysis."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "Table 11 reports standard deviations of MMLU across formatting variations, and some persuasion tables include ± figures. However, the core benchmark results in Tables 12-13 report single-run numbers with no variance across seeds or runs."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Extensive baselines are included: Gemma 1, Mistral 7B, LLaMA-3 8B/70B, Qwen1.5 32B, and multiple Gemini models for safety evaluations (Tables 12-14, 19-25)."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Baselines include LLaMA-3 (2024), Qwen1.5 (2024), GPT-4o (2024), Claude 3.5 Sonnet, and other contemporary models. These were state-of-the-art at time of publication."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Section 5 provides ablations: distillation vs from-scratch (Table 6), distillation impact across model sizes (Table 7), GQA vs MHA (Table 8), wide vs deep (Table 9), sliding window size (Table 10), and formatting impact (Table 11)."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Evaluation spans 17+ benchmarks (MMLU, GSM8K, ARC-c, HellaSwag, Winogrande, MATH, HumanEval, MBPP, etc.) plus human evaluations (Chatbot Arena Elo, safety win rates, instruction following, multi-turn satisfaction)."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Multiple human evaluations: LMSYS Chatbot Arena (Table 14), side-by-side preference studies (Table 15), multi-turn scenarios with 500 conversations rated by humans (Table 16), and extensive persuasion studies with human participants (Tables 22-25)."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Standard public benchmarks are used as test sets. Section 3.1 states they 'decontaminate evaluation sets from our pre-training data mixture.' Human evaluations use 'held-out collections of single-turn prompts' and 'a diverse, held-out set of 500 scenarios.'"
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Table 13 provides per-benchmark breakdowns across 17 benchmarks. Safety evaluations (Table 18) break down across multiple safety benchmarks. Memorization is broken down by data source (Figure 1)."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 9 acknowledges limitations in 'factuality, robustness to adversarial attacks, reasoning, and alignment.' Safety evaluations explicitly test for dangerous capabilities and report where the model fails (e.g., 0/10 end-to-end self-proliferation challenges, 0/13 Hack the Box). The persuasion studies show mixed results."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Several negative or neutral results: GQA shows minimal performance change vs MHA (Table 8), the 2B model is less format-robust than larger models (Table 11), Web of Lies shows Gemma 2 is 'significantly weaker than a human baseline at persuading participants of the incorrect answer' (Table 25), and Money Talks shows 'no significant difference' from baselines (Table 24)."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims models 'deliver the best performance for their size, and even offer competitive alternatives to models that are 2-3× bigger.' Tables 12-14 support this: 27B competitive with LLaMA-3 70B, 9B Elo similar to GPT-4-0314."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The main causal claim is that knowledge distillation improves performance. This is supported by controlled ablation: Table 6 compares from-scratch vs distilled with matched training tokens (500B), Table 7 shows the effect across model sizes, and Tables 8-9 compare architectural choices. These are single-variable manipulations."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The title says 'Improving Open Language Models at a Practical Size' which is broad. Results are primarily on English benchmarks (Section 3.1: 'primarily-English data', 'not trained specifically for state-of-the-art multilingual capabilities'). The abstract's claim of 'state-of-the-art' is not bounded to English or specific domains."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "No discussion of alternative explanations for the observed improvements. The distillation gains could partly be due to data quality differences, teacher model selection, or other confounds. Section 6.1 acknowledges models are 'likely in the same Pareto curve' but does not explore alternatives for the distillation gains."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper uses benchmark scores as proxies for 'performance' and 'capabilities' without discussing the gap between benchmark results and real-world utility. The abstract claims models are 'state-of-the-art' based on automated benchmarks, without acknowledging these are proxies for actual language understanding and generation quality."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "Baseline models are often specified only by name without exact versions: 'gpt4o-2024-05-13' is specified for human evals, but many baselines use generic names like 'LLaMA-3 70B', 'Mistral 7B', 'Qwen1.5 32B' without exact checkpoint identifiers."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "No actual prompts used for benchmark evaluation are provided. Table 5 shows the dialogue formatting template, but the actual evaluation prompts for benchmarks are not included."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Architecture hyperparameters are detailed in Tables 1-3 (model dimensions, layers, heads, attention spans, vocabulary size, sharding). Soft-cap values are specified (50.0 for attention, 30.0 for final layer). Training token counts are stated. However, optimizer settings (learning rate, schedule) are not provided."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. The paper evaluates base language models and instruction-tuned variants, not agent systems."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "Section 3.1 gives only a high-level description of data filtering: 'reduce the risk of unwanted or unsafe utterances, filter out certain personal information, decontaminate evaluation sets.' No specific filtering criteria, counts, or pipeline stages are documented."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "There is no dedicated limitations section. Section 9 (Discussion and Conclusion) mentions 'many limitations to these models' including factuality, robustness, reasoning, and alignment in a single sentence, but this is not a substantive limitations discussion."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No specific threats to validity are discussed. The brief mention in Section 9 of limitations is entirely generic ('future research is required to investigate and improve factuality, robustness to adversarial attacks, reasoning, and alignment')."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 3.1 states 'Our models are not multimodal and are not trained specifically for state-of-the-art multilingual capabilities.' Section 1 frames the work as primarily about improving small model performance. The safety section notes 'all Gemma 2 users should conduct rigorous safety testing specific to their use case.'"
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw data is available. Training data is not released, benchmark evaluation outputs are not provided, and human evaluation raw data is not shared."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "Training data collection is described only vaguely: 'a variety of data sources, including web documents, code, and science articles.' No details on specific sources, time period, or collection methodology."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "For human persuasion studies, participants were recruited through Prolific (Section 8.4). LMSYS Chatbot Arena uses public blind evaluations. The multi-turn evaluation used 'human raters' tasked with specific scenarios."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "The data pipeline from raw collection to final training data is not documented. Section 3.1 mentions filtering and decontamination but provides no counts, stages, or specific criteria."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding disclosure or acknowledgments section listing grants or sponsors. The work is by Google DeepMind but there is no explicit funding statement."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "The paper clearly states 'Gemma Team, Google DeepMind' as the author affiliation. The extensive contributor list identifies Google DeepMind employees."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "Google DeepMind funds this research and has a direct commercial interest in demonstrating Gemma 2's strong performance. The funder is not independent of the outcome."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is included. All authors are Google DeepMind employees with potential equity/financial interests in demonstrating strong model performance."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No explicit training data cutoff date is stated. Token counts are given (13T for 27B, 8T for 9B, 2T for 2B) but the temporal range of the training data is not specified."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": true,
    235         "justification": "Section 3.1 states they 'decontaminate evaluation sets from our pre-training data mixture,' indicating train/test overlap was addressed, though the specific decontamination method is not detailed."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "While decontamination is mentioned, the paper does not discuss whether specific benchmarks (e.g., HumanEval from 2021, MMLU from 2020) were available before training data collection. The decontamination method is not described in detail."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "The paper includes human participant studies (persuasion studies on Prolific, multi-turn evaluations) but no pre-registration is mentioned."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": true,
    251         "answer": false,
    252         "justification": "No IRB or ethics board approval is mentioned for the human participant studies (Prolific persuasion studies, human preference evaluations)."
    253       },
    254       "demographics_reported": {
    255         "applies": true,
    256         "answer": false,
    257         "justification": "No demographics reported for any human participants: not for Prolific participants in persuasion studies, not for human raters in preference evaluations, not for Chatbot Arena voters."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": true,
    261         "answer": false,
    262         "justification": "No inclusion or exclusion criteria stated for human participants in any of the evaluation studies."
    263       },
    264       "randomization_described": {
    265         "applies": true,
    266         "answer": false,
    267         "justification": "No randomization procedure described for the human evaluation studies. Chatbot Arena uses 'blind side by side evaluations' but the assignment procedure is not described."
    268       },
    269       "blinding_described": {
    270         "applies": true,
    271         "answer": true,
    272         "justification": "Table 14 states models were evaluated through 'blind side by side evaluations by human raters.' The Hidden Agenda study (Section 8.4) describes participants being told they interact with a 'generic chatbot' without knowing the model's secret goal."
    273       },
    274       "attrition_reported": {
    275         "applies": true,
    276         "answer": false,
    277         "justification": "No attrition or dropout information reported for any of the human studies. Sample sizes are stated (100 participants for persuasion, 500 scenarios for multi-turn) but no dropout information."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No inference cost or latency reported for any model size. GQA is noted to be 'faster at inference time' but no actual timing numbers are given."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "Training infrastructure is detailed in Table 3 (chip counts and configurations). Token counts are specified (13T, 8T, 2T). Carbon footprint is estimated at 1247.61 tCO2eq (Section 3.4). However, total GPU/TPU hours are not explicitly stated."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No results reported across multiple random seeds. All benchmark results appear to be single-run evaluations."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The number of experimental runs is not stated for any benchmark evaluation. It is unclear whether results are single-run or averaged."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "Section 4 mentions 'tuned hyperparameters' and 'different hyperparameters' for model merging but does not report the search budget or number of configurations tried."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "Section 3.1 mentions 'The final data mixture was determined through ablations similar to the approach in Gemini 1.0' and Section 4 mentions merging models with different hyperparameters, but the selection process is not documented."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "No multiple comparison corrections applied despite comparisons across 17+ benchmarks and multiple model pairs."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "Google DeepMind evaluates its own Gemma 2 models against competitors without acknowledging potential self-comparison bias. Some baseline numbers are taken from other sources (HuggingFace leaderboard, LLaMA-3 blogpost) but others are run by the authors."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "The paper notes the 27B model is 'trained on 2/3rds less data' than LLaMA-3 70B but does not systematically compare performance as a function of compute budget across models."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "No discussion of whether the benchmarks used actually measure the capabilities claimed. The paper uses standard benchmarks without questioning their validity."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No scaffolding is involved — the paper evaluates base and instruction-tuned models directly on benchmarks."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No discussion of temporal leakage despite using benchmarks (HumanEval 2021, MMLU 2020) that predate the model's training. The training data cutoff is not even stated."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether evaluation setups leak information. Few-shot examples are used in benchmarks but no analysis of whether this introduces leakage."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether training and test data share structural similarities despite training on web data that likely overlaps with benchmark source materials."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": true,
    358         "justification": "Section 3.1 states they 'decontaminate evaluation sets from our pre-training data mixture,' indicating a concrete decontamination pipeline was applied, though its specific method is not detailed."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "Knowledge distillation significantly improves small model performance compared to training from scratch.",
    365       "evidence": "Table 6: 2B model average improves from 60.3 to 67.7 (3 benchmarks) when distilled vs trained from scratch on 500B tokens. Table 7: perplexity improvements of 2-3 points across model sizes (200M-1B) with distillation.",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "Gemma 2 27B outperforms comparably-sized models and is competitive with models 2-3× bigger.",
    370       "evidence": "Table 12: 27B outperforms Qwen1.5 32B on all 5 benchmarks. Competitive with LLaMA-3 70B (e.g., MMLU 75.2 vs 79.2, GSM8K 74.0 vs 76.9). Table 14: Elo 1218 for 27B vs 1206 for LLaMA-3 70B on Chatbot Arena.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "Gemma 2 9B is comparable to GPT-4-0314 on Chatbot Arena.",
    375       "evidence": "Table 14: Gemma 2 9B IT Elo 1187, GPT-4-0314 Elo 1186. However, confidence intervals overlap (+3/-5 vs +2/-3).",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "Gemma 2 memorizes significantly less than prior models.",
    380       "evidence": "Figure 1 shows memorization rates below 0.1% for all Gemma 2 sizes, lower than Gemma 1, PaLM 2 Small, and Gemini 1.5 Flash.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "Gemma 2 models produce safer outputs than GPT-4o on safety prompts.",
    385       "evidence": "Table 15: All Gemma 2 models show positive win/loss ratios against GPT-4o on safety (e.g., 9B: 57.8% safety score vs GPT-4o baseline). However, this is based on a held-out safety prompt set whose composition and size are not disclosed.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "GQA has negligible impact on performance while improving inference speed.",
    390       "evidence": "Table 8: MHA 50.3 vs GQA 50.8 average across 4 benchmarks. Claims 'increased speed at inference time' but no actual speed numbers provided.",
    391       "supported": "moderate"
    392     }
    393   ],
    394   "red_flags": [
    395     {
    396       "flag": "Company evaluating its own product",
    397       "detail": "Google DeepMind employees evaluate Gemma 2 against competitors. While some baselines are from external leaderboards, the evaluation setup, benchmark selection, and reporting are controlled by the model developer."
    398     },
    399     {
    400       "flag": "Selective benchmark reporting",
    401       "detail": "Table 13 footnotes reveal evaluation methodology differences: some baselines use LLaMA-3's evaluation setup which gives +3-4% higher scores. The paper selectively chooses which evaluation protocol to use for different baselines."
    402     },
    403     {
    404       "flag": "No uncertainty quantification on main results",
    405       "detail": "Core benchmark results (Tables 12-13) report only point estimates. Without variance or confidence intervals, it is impossible to assess whether observed differences are meaningful or within noise."
    406     },
    407     {
    408       "flag": "Missing training details",
    409       "detail": "Critical training details are omitted: optimizer, learning rate schedule, training data composition ratios, data cutoff date, decontamination method details. These are essential for understanding and reproducing the results."
    410     },
    411     {
    412       "flag": "Opaque data filtering",
    413       "detail": "Both pre-training and post-training data filtering are described only in vague terms ('reduce the risk of unwanted or unsafe utterances'). No quantitative information on how much data was filtered or what criteria were applied."
    414     }
    415   ],
    416   "cited_papers": [
    417     {
    418       "title": "Evaluating large language models trained on code",
    419       "authors": ["M. Chen", "J. Tworek", "H. Jun"],
    420       "year": 2021,
    421       "arxiv_id": "2107.03374",
    422       "relevance": "Defines HumanEval benchmark used extensively in LLM code generation evaluation."
    423     },
    424     {
    425       "title": "GPT-4 technical report",
    426       "authors": ["J. Achiam", "S. Adler", "S. Agarwal"],
    427       "year": 2023,
    428       "arxiv_id": "2303.08774",
    429       "relevance": "Major LLM technical report establishing evaluation methodology patterns for frontier models."
    430     },
    431     {
    432       "title": "Training compute-optimal large language models",
    433       "authors": ["J. Hoffmann", "S. Borgeaud", "A. Mensch"],
    434       "year": 2022,
    435       "arxiv_id": "2203.15556",
    436       "relevance": "Chinchilla scaling laws paper; Gemma 2 trains 50× beyond compute-optimal predictions, directly testing scaling assumptions."
    437     },
    438     {
    439       "title": "Distilling the knowledge in a neural network",
    440       "authors": ["G. Hinton", "O. Vinyals", "J. Dean"],
    441       "year": 2015,
    442       "arxiv_id": "1503.02531",
    443       "relevance": "Foundational knowledge distillation paper; core technique used for Gemma 2 2B and 9B training."
    444     },
    445     {
    446       "title": "On-policy distillation of language models: Learning from self-generated mistakes",
    447       "authors": ["R. Agarwal", "N. Vieillard", "Y. Zhou"],
    448       "year": 2024,
    449       "relevance": "On-policy distillation technique used in Gemma 2 post-training SFT phase."
    450     },
    451     {
    452       "title": "Evaluating frontier models for dangerous capabilities",
    453       "authors": ["M. Phuong", "M. Aitchison", "E. Catt"],
    454       "year": 2024,
    455       "arxiv_id": "2403.13793",
    456       "relevance": "Methodology for dangerous capability evaluations (cyber, CBRN, self-proliferation) used to assess Gemma 2 safety."
    457     },
    458     {
    459       "title": "Chatbot arena: An open platform for evaluating LLMs by human preference",
    460       "authors": ["W.-L. Chiang", "L. Zheng", "Y. Sheng"],
    461       "year": 2024,
    462       "relevance": "Human preference evaluation platform used as primary post-training evaluation for Gemma 2."
    463     },
    464     {
    465       "title": "Scalable extraction of training data from (production) language models",
    466       "authors": ["M. Nasr", "N. Carlini", "J. Hayase"],
    467       "year": 2023,
    468       "arxiv_id": "2311.17035",
    469       "relevance": "Training data extraction attacks; motivates Gemma 2's memorization evaluation methodology."
    470     },
    471     {
    472       "title": "Measuring massive multitask language understanding",
    473       "authors": ["D. Hendrycks", "C. Burns", "S. Basart"],
    474       "year": 2020,
    475       "arxiv_id": "2009.03300",
    476       "relevance": "Defines MMLU benchmark, one of the primary evaluation metrics used."
    477     },
    478     {
    479       "title": "Ethical and social risks of harm from language models",
    480       "authors": ["L. Weidinger", "J. Mellor", "M. Rauh"],
    481       "year": 2021,
    482       "arxiv_id": "2112.04359",
    483       "relevance": "Framework for LLM risk assessment referenced in Gemma 2's safety impact assessment."
    484     },
    485     {
    486       "title": "Model evaluation for extreme risks",
    487       "authors": ["T. Shevlane", "S. Farquhar", "B. Garfinkel"],
    488       "year": 2023,
    489       "arxiv_id": "2305.15324",
    490       "relevance": "Framework for evaluating dangerous AI capabilities; informs Gemma 2's assurance evaluation approach."
    491     }
    492   ]
    493 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs