scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (36142B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Gemma 2: Improving Open Language Models at a Practical Size",
      6     "authors": [
      7       "Gemma Team, Google DeepMind"
      8     ],
      9     "year": 2024,
     10     "venue": "arXiv.org",
     11     "arxiv_id": "2408.00118",
     12     "doi": "10.48550/arXiv.2408.00118"
     13   },
     14   "checklist": {
     15     "claims_and_evidence": {
     16       "abstract_claims_supported": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The abstract claims models 'deliver the best performance for their size, and even offer competitive alternatives to models that are 2-3× bigger.' Tables 12-14 support this: 27B competitive with LLaMA-3 70B, 9B Elo similar to GPT-4-0314.",
     20         "source": "opus"
     21       },
     22       "causal_claims_justified": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "The main causal claim is that knowledge distillation improves performance. This is supported by controlled ablation: Table 6 compares from-scratch vs distilled with matched training tokens (500B), Table 7 shows the effect across model sizes, and Tables 8-9 compare architectural choices. These are single-variable manipulations.",
     26         "source": "opus"
     27       },
     28       "generalization_bounded": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "The title says 'Improving Open Language Models at a Practical Size' which is broad. Results are primarily on English benchmarks (Section 3.1: 'primarily-English data', 'not trained specifically for state-of-the-art multilingual capabilities'). The abstract's claim of 'state-of-the-art' is not bounded to English or specific domains.",
     32         "source": "opus"
     33       },
     34       "alternative_explanations_discussed": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "No discussion of alternative explanations for the observed improvements. The distillation gains could partly be due to data quality differences, teacher model selection, or other confounds. Section 6.1 acknowledges models are 'likely in the same Pareto curve' but does not explore alternatives for the distillation gains.",
     38         "source": "opus"
     39       },
     40       "proxy_outcome_distinction": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "The paper uses benchmark scores as proxies for 'performance' and 'capabilities' without discussing the gap between benchmark results and real-world utility. The abstract claims models are 'state-of-the-art' based on automated benchmarks, without acknowledging these are proxies for actual language understanding and generation quality.",
     44         "source": "opus"
     45       }
     46     },
     47     "limitations_and_scope": {
     48       "limitations_section_present": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "There is no dedicated limitations section. Section 9 (Discussion and Conclusion) mentions 'many limitations to these models' including factuality, robustness, reasoning, and alignment in a single sentence, but this is not a substantive limitations discussion.",
     52         "source": "opus"
     53       },
     54       "threats_to_validity_specific": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No specific threats to validity are discussed. The brief mention in Section 9 of limitations is entirely generic ('future research is required to investigate and improve factuality, robustness to adversarial attacks, reasoning, and alignment').",
     58         "source": "opus"
     59       },
     60       "scope_boundaries_stated": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Section 3.1 states 'Our models are not multimodal and are not trained specifically for state-of-the-art multilingual capabilities.' Section 1 frames the work as primarily about improving small model performance. The safety section notes 'all Gemma 2 users should conduct rigorous safety testing specific to their use case.'",
     64         "source": "opus"
     65       }
     66     },
     67     "conflicts_of_interest": {
     68       "funding_disclosed": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No funding disclosure or acknowledgments section listing grants or sponsors. The work is by Google DeepMind but there is no explicit funding statement.",
     72         "source": "opus"
     73       },
     74       "affiliations_disclosed": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The paper clearly states 'Gemma Team, Google DeepMind' as the author affiliation. The extensive contributor list identifies Google DeepMind employees.",
     78         "source": "opus"
     79       },
     80       "funder_independent_of_outcome": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "Google DeepMind funds this research and has a direct commercial interest in demonstrating Gemma 2's strong performance. The funder is not independent of the outcome.",
     84         "source": "opus"
     85       },
     86       "financial_interests_declared": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "No competing interests or financial interests statement is included. All authors are Google DeepMind employees with potential equity/financial interests in demonstrating strong model performance.",
     90         "source": "opus"
     91       }
     92     },
     93     "scope_and_framing": {
     94       "key_terms_defined": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Knowledge distillation is defined with equations (Section 3.2), architectural terms (GQA, sliding window attention, logit soft-capping, RMSNorm) are explained, and 'memorization' is explicitly defined with a footnote clarifying the restricted meaning used.",
     98         "source": "haiku"
     99       },
    100       "intended_contribution_clear": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The introduction clearly states the contribution: a new family of open LLMs (2B, 9B, 27B) using knowledge distillation and architectural improvements (local-global attention, GQA) to exceed prior performance at practical scales.",
    104         "source": "haiku"
    105       },
    106       "engagement_with_prior_work": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The paper cites and contextualizes prior work on scaling laws (Hoffmann et al.), knowledge distillation (Hinton et al.), architectural innovations (Beltagy, Ainslie), and contemporary open models (LLaMA-3, Mistral), clearly positioning contributions relative to existing work.",
    110         "source": "haiku"
    111       }
    112     }
    113   },
    114   "type_checklist": {
    115     "empirical": {
    116       "artifacts": {
    117         "code_released": {
    118           "applies": true,
    119           "answer": false,
    120           "justification": "No source code or training code repository is provided. The paper releases model weights but not training or evaluation code.",
    121           "source": "opus"
    122         },
    123         "data_released": {
    124           "applies": true,
    125           "answer": false,
    126           "justification": "Training data is not released. The paper states data comes from 'web documents, code, and science articles' but provides no dataset download or detailed composition.",
    127           "source": "opus"
    128         },
    129         "environment_specified": {
    130           "applies": true,
    131           "answer": false,
    132           "justification": "No environment specifications, requirements files, or dependency lists are provided. Hardware is described (TPU types) but no software environment details.",
    133           "source": "opus"
    134         },
    135         "reproduction_instructions": {
    136           "applies": true,
    137           "answer": false,
    138           "justification": "No step-by-step reproduction instructions are provided. The paper describes architecture and training at a high level but lacks sufficient detail to reproduce results.",
    139           "source": "opus"
    140         }
    141       },
    142       "statistical_methodology": {
    143         "confidence_intervals_or_error_bars": {
    144           "applies": true,
    145           "answer": false,
    146           "justification": "Main benchmark results in Tables 12-13 report only point estimates with no confidence intervals or error bars. The Chatbot Arena table (Table 14) includes 95% CIs for Elo scores, but the core pre-training and post-training evaluations lack uncertainty quantification.",
    147           "source": "opus"
    148         },
    149         "significance_tests": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "No statistical significance tests are used despite numerous comparative claims (e.g., 'outperforms Qwen1.5 32B'). All comparisons are based on raw score differences.",
    153           "source": "opus"
    154         },
    155         "effect_sizes_reported": {
    156           "applies": true,
    157           "answer": true,
    158           "justification": "Results are reported with baseline context throughout (e.g., Table 6: 67.7 vs 60.3 for distilled vs from-scratch; Table 13 provides full comparison tables with absolute scores for all models). The reader can compute effect magnitudes.",
    159           "source": "opus"
    160         },
    161         "sample_size_justified": {
    162           "applies": true,
    163           "answer": false,
    164           "justification": "No justification for the number of benchmarks chosen, the number of ablation configurations, or the sample sizes in human evaluations (100 participants for persuasion studies, 500 multi-turn scenarios). No power analysis.",
    165           "source": "opus"
    166         },
    167         "variance_reported": {
    168           "applies": true,
    169           "answer": false,
    170           "justification": "Table 11 reports standard deviations of MMLU across formatting variations, and some persuasion tables include ± figures. However, the core benchmark results in Tables 12-13 report single-run numbers with no variance across seeds or runs.",
    171           "source": "opus"
    172         }
    173       },
    174       "evaluation_design": {
    175         "baselines_included": {
    176           "applies": true,
    177           "answer": true,
    178           "justification": "Extensive baselines are included: Gemma 1, Mistral 7B, LLaMA-3 8B/70B, Qwen1.5 32B, and multiple Gemini models for safety evaluations (Tables 12-14, 19-25).",
    179           "source": "opus"
    180         },
    181         "baselines_contemporary": {
    182           "applies": true,
    183           "answer": true,
    184           "justification": "Baselines include LLaMA-3 (2024), Qwen1.5 (2024), GPT-4o (2024), Claude 3.5 Sonnet, and other contemporary models. These were state-of-the-art at time of publication.",
    185           "source": "opus"
    186         },
    187         "ablation_study": {
    188           "applies": true,
    189           "answer": true,
    190           "justification": "Section 5 provides ablations: distillation vs from-scratch (Table 6), distillation impact across model sizes (Table 7), GQA vs MHA (Table 8), wide vs deep (Table 9), sliding window size (Table 10), and formatting impact (Table 11).",
    191           "source": "opus"
    192         },
    193         "multiple_metrics": {
    194           "applies": true,
    195           "answer": true,
    196           "justification": "Evaluation spans 17+ benchmarks (MMLU, GSM8K, ARC-c, HellaSwag, Winogrande, MATH, HumanEval, MBPP, etc.) plus human evaluations (Chatbot Arena Elo, safety win rates, instruction following, multi-turn satisfaction).",
    197           "source": "opus"
    198         },
    199         "human_evaluation": {
    200           "applies": true,
    201           "answer": true,
    202           "justification": "Multiple human evaluations: LMSYS Chatbot Arena (Table 14), side-by-side preference studies (Table 15), multi-turn scenarios with 500 conversations rated by humans (Table 16), and extensive persuasion studies with human participants (Tables 22-25).",
    203           "source": "opus"
    204         },
    205         "held_out_test_set": {
    206           "applies": true,
    207           "answer": true,
    208           "justification": "Standard public benchmarks are used as test sets. Section 3.1 states they 'decontaminate evaluation sets from our pre-training data mixture.' Human evaluations use 'held-out collections of single-turn prompts' and 'a diverse, held-out set of 500 scenarios.'",
    209           "source": "opus"
    210         },
    211         "per_category_breakdown": {
    212           "applies": true,
    213           "answer": true,
    214           "justification": "Table 13 provides per-benchmark breakdowns across 17 benchmarks. Safety evaluations (Table 18) break down across multiple safety benchmarks. Memorization is broken down by data source (Figure 1).",
    215           "source": "opus"
    216         },
    217         "failure_cases_discussed": {
    218           "applies": true,
    219           "answer": true,
    220           "justification": "Section 9 acknowledges limitations in 'factuality, robustness to adversarial attacks, reasoning, and alignment.' Safety evaluations explicitly test for dangerous capabilities and report where the model fails (e.g., 0/10 end-to-end self-proliferation challenges, 0/13 Hack the Box). The persuasion studies show mixed results.",
    221           "source": "opus"
    222         },
    223         "negative_results_reported": {
    224           "applies": true,
    225           "answer": true,
    226           "justification": "Several negative or neutral results: GQA shows minimal performance change vs MHA (Table 8), the 2B model is less format-robust than larger models (Table 11), Web of Lies shows Gemma 2 is 'significantly weaker than a human baseline at persuading participants of the incorrect answer' (Table 25), and Money Talks shows 'no significant difference' from baselines (Table 24).",
    227           "source": "opus"
    228         }
    229       },
    230       "setup_transparency": {
    231         "model_versions_specified": {
    232           "applies": true,
    233           "answer": false,
    234           "justification": "Baseline models are often specified only by name without exact versions: 'gpt4o-2024-05-13' is specified for human evals, but many baselines use generic names like 'LLaMA-3 70B', 'Mistral 7B', 'Qwen1.5 32B' without exact checkpoint identifiers.",
    235           "source": "opus"
    236         },
    237         "prompts_provided": {
    238           "applies": true,
    239           "answer": false,
    240           "justification": "No actual prompts used for benchmark evaluation are provided. Table 5 shows the dialogue formatting template, but the actual evaluation prompts for benchmarks are not included.",
    241           "source": "opus"
    242         },
    243         "hyperparameters_reported": {
    244           "applies": true,
    245           "answer": true,
    246           "justification": "Architecture hyperparameters are detailed in Tables 1-3 (model dimensions, layers, heads, attention spans, vocabulary size, sharding). Soft-cap values are specified (50.0 for attention, 30.0 for final layer). Training token counts are stated. However, optimizer settings (learning rate, schedule) are not provided.",
    247           "source": "opus"
    248         },
    249         "scaffolding_described": {
    250           "applies": false,
    251           "answer": false,
    252           "justification": "No agentic scaffolding is used. The paper evaluates base language models and instruction-tuned variants, not agent systems.",
    253           "source": "opus"
    254         },
    255         "data_preprocessing_documented": {
    256           "applies": true,
    257           "answer": false,
    258           "justification": "Section 3.1 gives only a high-level description of data filtering: 'reduce the risk of unwanted or unsafe utterances, filter out certain personal information, decontaminate evaluation sets.' No specific filtering criteria, counts, or pipeline stages are documented.",
    259           "source": "opus"
    260         }
    261       },
    262       "data_integrity": {
    263         "raw_data_available": {
    264           "applies": true,
    265           "answer": false,
    266           "justification": "No raw data is available. Training data is not released, benchmark evaluation outputs are not provided, and human evaluation raw data is not shared.",
    267           "source": "opus"
    268         },
    269         "data_collection_described": {
    270           "applies": true,
    271           "answer": false,
    272           "justification": "Training data collection is described only vaguely: 'a variety of data sources, including web documents, code, and science articles.' No details on specific sources, time period, or collection methodology.",
    273           "source": "opus"
    274         },
    275         "recruitment_methods_described": {
    276           "applies": true,
    277           "answer": true,
    278           "justification": "For human persuasion studies, participants were recruited through Prolific (Section 8.4). LMSYS Chatbot Arena uses public blind evaluations. The multi-turn evaluation used 'human raters' tasked with specific scenarios.",
    279           "source": "opus"
    280         },
    281         "data_pipeline_documented": {
    282           "applies": true,
    283           "answer": false,
    284           "justification": "The data pipeline from raw collection to final training data is not documented. Section 3.1 mentions filtering and decontamination but provides no counts, stages, or specific criteria.",
    285           "source": "opus"
    286         }
    287       },
    288       "contamination": {
    289         "training_cutoff_stated": {
    290           "applies": true,
    291           "answer": false,
    292           "justification": "No explicit training data cutoff date is stated. Token counts are given (13T for 27B, 8T for 9B, 2T for 2B) but the temporal range of the training data is not specified.",
    293           "source": "opus"
    294         },
    295         "train_test_overlap_discussed": {
    296           "applies": true,
    297           "answer": true,
    298           "justification": "Section 3.1 states they 'decontaminate evaluation sets from our pre-training data mixture,' indicating train/test overlap was addressed, though the specific decontamination method is not detailed.",
    299           "source": "opus"
    300         },
    301         "benchmark_contamination_addressed": {
    302           "applies": true,
    303           "answer": false,
    304           "justification": "While decontamination is mentioned, the paper does not discuss whether specific benchmarks (e.g., HumanEval from 2021, MMLU from 2020) were available before training data collection. The decontamination method is not described in detail.",
    305           "source": "opus"
    306         }
    307       },
    308       "human_studies": {
    309         "pre_registered": {
    310           "applies": true,
    311           "answer": false,
    312           "justification": "The paper includes human participant studies (persuasion studies on Prolific, multi-turn evaluations) but no pre-registration is mentioned.",
    313           "source": "opus"
    314         },
    315         "irb_or_ethics_approval": {
    316           "applies": true,
    317           "answer": false,
    318           "justification": "No IRB or ethics board approval is mentioned for the human participant studies (Prolific persuasion studies, human preference evaluations).",
    319           "source": "opus"
    320         },
    321         "demographics_reported": {
    322           "applies": true,
    323           "answer": false,
    324           "justification": "No demographics reported for any human participants: not for Prolific participants in persuasion studies, not for human raters in preference evaluations, not for Chatbot Arena voters.",
    325           "source": "opus"
    326         },
    327         "inclusion_exclusion_criteria": {
    328           "applies": true,
    329           "answer": false,
    330           "justification": "No inclusion or exclusion criteria stated for human participants in any of the evaluation studies.",
    331           "source": "opus"
    332         },
    333         "randomization_described": {
    334           "applies": true,
    335           "answer": false,
    336           "justification": "No randomization procedure described for the human evaluation studies. Chatbot Arena uses 'blind side by side evaluations' but the assignment procedure is not described.",
    337           "source": "opus"
    338         },
    339         "blinding_described": {
    340           "applies": true,
    341           "answer": true,
    342           "justification": "Table 14 states models were evaluated through 'blind side by side evaluations by human raters.' The Hidden Agenda study (Section 8.4) describes participants being told they interact with a 'generic chatbot' without knowing the model's secret goal.",
    343           "source": "opus"
    344         },
    345         "attrition_reported": {
    346           "applies": true,
    347           "answer": false,
    348           "justification": "No attrition or dropout information reported for any of the human studies. Sample sizes are stated (100 participants for persuasion, 500 scenarios for multi-turn) but no dropout information.",
    349           "source": "opus"
    350         }
    351       },
    352       "cost_and_practicality": {
    353         "inference_cost_reported": {
    354           "applies": true,
    355           "answer": false,
    356           "justification": "No inference cost or latency reported for any model size. GQA is noted to be 'faster at inference time' but no actual timing numbers are given.",
    357           "source": "opus"
    358         },
    359         "compute_budget_stated": {
    360           "applies": true,
    361           "answer": true,
    362           "justification": "Training infrastructure is detailed in Table 3 (chip counts and configurations). Token counts are specified (13T, 8T, 2T). Carbon footprint is estimated at 1247.61 tCO2eq (Section 3.4). However, total GPU/TPU hours are not explicitly stated.",
    363           "source": "opus"
    364         }
    365       },
    366       "experimental_rigor": {
    367         "seed_sensitivity_reported": {
    368           "applies": true,
    369           "answer": false,
    370           "justification": "No results reported across multiple random seeds. All benchmark results appear to be single-run evaluations.",
    371           "source": "opus"
    372         },
    373         "number_of_runs_stated": {
    374           "applies": true,
    375           "answer": false,
    376           "justification": "The number of experimental runs is not stated for any benchmark evaluation. It is unclear whether results are single-run or averaged.",
    377           "source": "opus"
    378         },
    379         "hyperparameter_search_budget": {
    380           "applies": true,
    381           "answer": false,
    382           "justification": "Section 4 mentions 'tuned hyperparameters' and 'different hyperparameters' for model merging but does not report the search budget or number of configurations tried.",
    383           "source": "opus"
    384         },
    385         "best_config_selection_justified": {
    386           "applies": true,
    387           "answer": false,
    388           "justification": "Section 3.1 mentions 'The final data mixture was determined through ablations similar to the approach in Gemini 1.0' and Section 4 mentions merging models with different hyperparameters, but the selection process is not documented.",
    389           "source": "opus"
    390         },
    391         "multiple_comparison_correction": {
    392           "applies": true,
    393           "answer": false,
    394           "justification": "No multiple comparison corrections applied despite comparisons across 17+ benchmarks and multiple model pairs.",
    395           "source": "opus"
    396         },
    397         "self_comparison_bias_addressed": {
    398           "applies": true,
    399           "answer": false,
    400           "justification": "Google DeepMind evaluates its own Gemma 2 models against competitors without acknowledging potential self-comparison bias. Some baseline numbers are taken from other sources (HuggingFace leaderboard, LLaMA-3 blogpost) but others are run by the authors.",
    401           "source": "opus"
    402         },
    403         "compute_budget_vs_performance": {
    404           "applies": true,
    405           "answer": false,
    406           "justification": "The paper notes the 27B model is 'trained on 2/3rds less data' than LLaMA-3 70B but does not systematically compare performance as a function of compute budget across models.",
    407           "source": "opus"
    408         },
    409         "benchmark_construct_validity": {
    410           "applies": true,
    411           "answer": false,
    412           "justification": "No discussion of whether the benchmarks used actually measure the capabilities claimed. The paper uses standard benchmarks without questioning their validity.",
    413           "source": "opus"
    414         },
    415         "scaffold_confound_addressed": {
    416           "applies": false,
    417           "answer": false,
    418           "justification": "No scaffolding is involved — the paper evaluates base and instruction-tuned models directly on benchmarks.",
    419           "source": "opus"
    420         }
    421       },
    422       "data_leakage": {
    423         "temporal_leakage_addressed": {
    424           "applies": true,
    425           "answer": false,
    426           "justification": "No discussion of temporal leakage despite using benchmarks (HumanEval 2021, MMLU 2020) that predate the model's training. The training data cutoff is not even stated.",
    427           "source": "opus"
    428         },
    429         "feature_leakage_addressed": {
    430           "applies": true,
    431           "answer": false,
    432           "justification": "No discussion of whether evaluation setups leak information. Few-shot examples are used in benchmarks but no analysis of whether this introduces leakage.",
    433           "source": "opus"
    434         },
    435         "non_independence_addressed": {
    436           "applies": true,
    437           "answer": false,
    438           "justification": "No discussion of whether training and test data share structural similarities despite training on web data that likely overlaps with benchmark source materials.",
    439           "source": "opus"
    440         },
    441         "leakage_detection_method": {
    442           "applies": true,
    443           "answer": true,
    444           "justification": "Section 3.1 states they 'decontaminate evaluation sets from our pre-training data mixture,' indicating a concrete decontamination pipeline was applied, though its specific method is not detailed.",
    445           "source": "opus"
    446         }
    447       }
    448     }
    449   },
    450   "claims": [
    451     {
    452       "claim": "Knowledge distillation improves 2B model average benchmark performance by 7.4 percentage points over training from scratch (67.7% vs 60.3%)",
    453       "evidence": "Table 6 shows controlled comparison of 2B model trained from scratch vs. with distillation from a 7B model, both on 500B tokens—10× the compute-optimal budget",
    454       "supported": "strong"
    455     },
    456     {
    457       "claim": "Gemma 2 27B is competitive with LLaMA-3 70B despite being 2.5× smaller and trained on 2/3 less data",
    458       "evidence": "Table 12 shows Gemma 2 27B at 75.2% MMLU vs. LLaMA-3 70B at 79.2%, and 74.0% vs. 76.9% on GSM8K, with comparable ARC-C and HellaSwag scores",
    459       "supported": "moderate"
    460     },
    461     {
    462       "claim": "Gemma 2 instruction-tuned models achieve top open-model rankings on LMSYS Chatbot Arena at all parameter scales",
    463       "evidence": "Table 14 shows Gemma 2 27B at Elo 1218 outranking LLaMA-3 70B at 1206, Gemma 2 9B at 1187 comparable to GPT-4-0314 at 1186, Gemma 2 2B at 1126 above GPT-3.5-Turbo at 1116",
    464       "supported": "strong"
    465     },
    466     {
    467       "claim": "Gemma 2 memorizes significantly less training data than comparable prior models",
    468       "evidence": "Figure 1 shows Gemma 2 memorization rates below 0.1% (log scale) across all model sizes; approximate memorization rate lower than exact memorization rate of Gemma 1",
    469       "supported": "strong"
    470     },
    471     {
    472       "claim": "Deeper 9B architecture (46 layers) outperforms wider 9B architecture by 1.2pp at same parameter count",
    473       "evidence": "Table 9 shows deep 9B at 52.0% vs. wide 9B at 50.8% average over 4 benchmarks, described as 'consistent across benchmarks'",
    474       "supported": "moderate"
    475     },
    476     {
    477       "claim": "Gemma 2 models produce safer outputs than GPT-4o on held-out safety prompts regardless of model size",
    478       "evidence": "Table 15 shows Gemma 2 win rates of 53–57.8% on safety prompts against GPT-4o baseline; evaluation uses company's own safety policy definitions and human raters",
    479       "supported": "weak"
    480     }
    481   ],
    482   "methodology_tags": [
    483     "benchmark-eval",
    484     "empirical"
    485   ],
    486   "key_findings": "Gemma 2 demonstrates that knowledge distillation is an effective training objective for small LLMs, with the 2B model achieving a 7.4pp improvement over training from scratch on the same token budget. The 27B model achieves near-parity with LLaMA-3 70B on standard benchmarks despite being 2.5× smaller, and the instruction-tuned variants rank highest among open models on the LMSYS Chatbot Arena. Safety evaluations reveal low memorization rates (<0.1%) and limited dangerous capabilities on CTF and self-proliferation tasks. However, human participant studies on persuasion and social capabilities lack key methodological safeguards—no IRB disclosure, no participant demographics, and no pre-registration—limiting interpretability of those findings.",
    487   "red_flags": [
    488     {
    489       "flag": "Self-evaluation by developer",
    490       "detail": "Google DeepMind evaluates its own commercial models with no independent third-party verification; the organization has direct financial interest in favorable results, particularly for Chatbot Arena rankings."
    491     },
    492     {
    493       "flag": "Missing training hyperparameters",
    494       "detail": "Learning rate, batch size, warmup schedule, and weight decay are entirely absent, making it impossible to reproduce training even with equivalent hardware and data."
    495     },
    496     {
    497       "flag": "No IRB for deceptive human studies",
    498       "detail": "The hidden agenda study deliberately deceives participants about the model's goal (covertly persuading them to click links, run code, find personal information) with no mention of IRB approval or ethics review."
    499     },
    500     {
    501       "flag": "No training data cutoff stated",
    502       "detail": "No temporal cutoff for training data is disclosed, making it impossible to assess whether the model was trained on benchmark examples released before data collection ended."
    503     },
    504     {
    505       "flag": "Overbroad state-of-the-art claims",
    506       "detail": "'Best performance for their size' and 'state-of-the-art' are asserted without bounding to the specific benchmarks tested or acknowledging that comparisons exclude models with non-permissive licenses."
    507     },
    508     {
    509       "flag": "No inference latency despite GQA motivation",
    510       "detail": "GQA is adopted explicitly for inference speed benefits, but no latency or throughput numbers are reported, making the practical advantage unverifiable."
    511     }
    512   ],
    513   "cited_papers": [
    514     {
    515       "title": "Training Compute-Optimal Large Language Models (Hoffmann et al., 2022)",
    516       "relevance": "Foundational scaling laws used to contextualize training token budgets and justify over-training beyond the compute-optimal regime via distillation"
    517     },
    518     {
    519       "title": "Distilling the Knowledge in a Neural Network (Hinton et al., 2015)",
    520       "relevance": "Core technique used to train Gemma 2 2B and 9B models; the paper frames its key contribution as applying distillation at scale for open LLMs"
    521     },
    522     {
    523       "title": "LLaMA 3 Model Card (AI@Meta, 2024)",
    524       "relevance": "Primary comparison baseline throughout the paper; the 70B model is the main competitor for establishing Gemma 2 27B's competitive position"
    525     },
    526     {
    527       "title": "Chatbot Arena: An Open Platform for Evaluating LLMs by Human Preference (Chiang et al., 2024)",
    528       "relevance": "Human evaluation platform on which Gemma 2 achieves its top open-model ranking claim; central to post-training evaluation"
    529     },
    530     {
    531       "title": "Evaluating Frontier Models for Dangerous Capabilities (Phuong et al., 2024)",
    532       "relevance": "Provides the full methodology for CTF, self-proliferation, and capability evaluations; this paper defers to it for methodological details"
    533     },
    534     {
    535       "title": "Scalable Extraction of Training Data from Production Language Models (Nasr et al., 2023)",
    536       "relevance": "Motivates and informs the memorization evaluation methodology used in Section 7"
    537     },
    538     {
    539       "title": "GQA: Training Generalized Multi-Query Transformer Models from Multi-Head Checkpoints (Ainslie et al., 2023)",
    540       "relevance": "Grouped-query attention mechanism adopted in Gemma 2 architecture for inference efficiency"
    541     },
    542     {
    543       "title": "Longformer: The Long-Document Transformer (Beltagy et al., 2020)",
    544       "relevance": "Local sliding window attention mechanism alternated with global attention in Gemma 2 architecture"
    545     }
    546   ],
    547   "engagement_factors": {
    548     "practical_relevance": {
    549       "score": 3,
    550       "justification": "Open-weight models released for public use; the 2B and 9B variants are deployable on consumer hardware and directly usable by practitioners immediately."
    551     },
    552     "surprise_contrarian": {
    553       "score": 1,
    554       "justification": "Knowledge distillation for LLMs is well-established; competitive performance vs. larger models is the primary result, which is incremental confirmation rather than a contrarian finding."
    555     },
    556     "fear_safety": {
    557       "score": 2,
    558       "justification": "Evaluations of persuasion, CBRN knowledge, and self-proliferation directly engage AI risk concerns; the deceptive 'hidden agenda' study raises questions about model misuse potential."
    559     },
    560     "drama_conflict": {
    561       "score": 1,
    562       "justification": "Competitive framing against Meta LLaMA and OpenAI GPT-4 provides a mild competitive narrative, but no major controversy or disputed claims."
    563     },
    564     "demo_ability": {
    565       "score": 3,
    566       "justification": "Models are released on HuggingFace and Google's platforms and can be used immediately without any barrier to access."
    567     },
    568     "brand_recognition": {
    569       "score": 3,
    570       "justification": "Google DeepMind with explicit Gemma/Gemini branding; extremely high recognition in both research and practitioner communities."
    571     }
    572   },
    573   "hn_data": {
    574     "threads": [
    575       {
    576         "hn_id": "41421591",
    577         "title": "Inductive or deductive? Rethinking the fundamental reasoning abilities of LLMs",
    578         "points": 107,
    579         "comments": 169,
    580         "url": "https://news.ycombinator.com/item?id=41421591",
    581         "created_at": "2024-09-02T00:49:06Z"
    582       },
    583       {
    584         "hn_id": "15289917",
    585         "title": "Benefits of Napping in Healthy Adults (2009) [pdf]",
    586         "points": 88,
    587         "comments": 38,
    588         "url": "https://news.ycombinator.com/item?id=15289917",
    589         "created_at": "2017-09-20T00:43:51Z"
    590       },
    591       {
    592         "hn_id": "30180281",
    593         "title": "Computational Thinking and Thinking about Computing (2008)",
    594         "points": 4,
    595         "comments": 1,
    596         "url": "https://news.ycombinator.com/item?id=30180281",
    597         "created_at": "2022-02-02T17:01:25Z"
    598       },
    599       {
    600         "hn_id": "24385172",
    601         "title": "Benefits of napping: nap length, time of day, age, and experience with napping",
    602         "points": 4,
    603         "comments": 0,
    604         "url": "https://news.ycombinator.com/item?id=24385172",
    605         "created_at": "2020-09-05T17:15:40Z"
    606       },
    607       {
    608         "hn_id": "44790511",
    609         "title": "The Space of AI: Real-World Lessons on AI's Impact on Developers",
    610         "points": 3,
    611         "comments": 0,
    612         "url": "https://news.ycombinator.com/item?id=44790511",
    613         "created_at": "2025-08-04T19:40:00Z"
    614       },
    615       {
    616         "hn_id": "40656984",
    617         "title": "Large Language Models' Detection of Political Orientation in Newspapers",
    618         "points": 2,
    619         "comments": 0,
    620         "url": "https://news.ycombinator.com/item?id=40656984",
    621         "created_at": "2024-06-12T11:49:20Z"
    622       },
    623       {
    624         "hn_id": "22864688",
    625         "title": "Benefits of Napping in Healthy Adults",
    626         "points": 2,
    627         "comments": 0,
    628         "url": "https://news.ycombinator.com/item?id=22864688",
    629         "created_at": "2020-04-14T11:24:01Z"
    630       },
    631       {
    632         "hn_id": "41278284",
    633         "title": "Inductive or Deductive? Rethinking the Fundamental Reasoning Abilities of LLMs",
    634         "points": 1,
    635         "comments": 0,
    636         "url": "https://news.ycombinator.com/item?id=41278284",
    637         "created_at": "2024-08-17T21:50:29Z"
    638       },
    639       {
    640         "hn_id": "41260958",
    641         "title": "Y Social: An LLM-Powered Social Media Digital Twin",
    642         "points": 1,
    643         "comments": 0,
    644         "url": "https://news.ycombinator.com/item?id=41260958",
    645         "created_at": "2024-08-15T22:07:50Z"
    646       },
    647       {
    648         "hn_id": "44858350",
    649         "title": "The Space of AI: Real-World Lessons on AI's Impact on Developers",
    650         "points": 1,
    651         "comments": 0,
    652         "url": "https://news.ycombinator.com/item?id=44858350",
    653         "created_at": "2025-08-10T21:14:15Z"
    654       }
    655     ],
    656     "top_points": 107,
    657     "total_points": 213,
    658     "total_comments": 208
    659   }
    660 }
	ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs