scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (30384B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Gemma 2: Improving Open Language Models at a Practical Size",
      6     "authors": [
      7       "Gemma Team, Google DeepMind"
      8     ],
      9     "year": 2024,
     10     "venue": "arXiv.org",
     11     "arxiv_id": "2408.00118",
     12     "doi": "10.48550/arXiv.2408.00118"
     13   },
     14   "checklist": {
     15     "claims_and_evidence": {
     16       "abstract_claims_supported": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "All abstract claims (distillation training for 2B/9B, architectural modifications, competitive-with-larger-models performance, model release) are substantiated by ablation tables and benchmark evaluations in Sections 5-6.",
     20         "source": "haiku"
     21       },
     22       "causal_claims_justified": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "Section 5 provides dedicated ablation studies for each key causal claim: distillation vs. from-scratch (Tables 6-7), GQA vs. MHA (Table 8), wide vs. deep networks (Table 9) — all controlled comparisons.",
     26         "source": "haiku"
     27       },
     28       "generalization_bounded": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "The paper claims 'best performance for their size' broadly without bounding to the tested benchmark suite; the claim appears in both abstract and conclusion without acknowledging that untested tasks or non-English settings may differ.",
     32         "source": "haiku"
     33       },
     34       "alternative_explanations_discussed": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The performance gains from distillation are attributed entirely to the training objective change; no discussion of whether joint architectural improvements (GQA, local-global attention, deeper networks) confound the distillation benefit in the final released models.",
     38         "source": "haiku"
     39       },
     40       "proxy_outcome_distinction": {
     41         "applies": true,
     42         "answer": true,
     43         "justification": "The paper uses multiple evaluation types (academic benchmarks, Chatbot Arena Elo, internal human preference studies) and does not conflate benchmark scores with broader capability claims beyond 'performance on benchmarks.'",
     44         "source": "haiku"
     45       }
     46     },
     47     "limitations_and_scope": {
     48       "limitations_section_present": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No dedicated limitations section exists; the only limitations acknowledgment is a single sentence in the conclusion: 'there are still many limitations to these models, and future research is required to investigate and improve factuality, robustness to adversarial attacks, reasoning, and alignment.'",
     52         "source": "haiku"
     53       },
     54       "threats_to_validity_specific": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "The vague conclusion sentence lists broad research directions rather than specific validity threats (e.g., no discussion of benchmark saturation, human evaluator calibration, or inter-rater reliability for internal preference studies).",
     58         "source": "haiku"
     59       },
     60       "scope_boundaries_stated": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "The paper does not explicitly state what the results do not show; for instance, the primarily English training is mentioned briefly in Section 3.1 but is not framed as a scope limitation on the performance claims.",
     64         "source": "haiku"
     65       }
     66     },
     67     "conflicts_of_interest": {
     68       "funding_disclosed": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No explicit funding disclosure statement appears in the paper; it is implicitly Google-funded work but there is no formal acknowledgment of this as a potential conflict.",
     72         "source": "haiku"
     73       },
     74       "affiliations_disclosed": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The paper is explicitly attributed to 'Gemma Team, Google DeepMind' and the contributions section lists Google DeepMind employees throughout.",
     78         "source": "haiku"
     79       },
     80       "funder_independent_of_outcome": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "Google DeepMind both funds and creates the Gemma models being evaluated; all internal human preference evaluations and safety benchmarks are conducted by the same organization with direct commercial interest in the results.",
     84         "source": "haiku"
     85       },
     86       "financial_interests_declared": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "No competing interests statement or declaration of financial interests (patents, equity, consulting) appears anywhere in the paper.",
     90         "source": "haiku"
     91       }
     92     },
     93     "scope_and_framing": {
     94       "key_terms_defined": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Key technical terms are defined: knowledge distillation is formally specified (Section 3.2 with the loss equation), architectural terms (GQA, sliding window attention, logit soft-capping) are explained with parameter values.",
     98         "source": "haiku"
     99       },
    100       "intended_contribution_clear": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The contribution is explicitly stated in the introduction: new Gemma 2 family using distillation to exceed compute-optimal training, along with architectural improvements, demonstrating best-in-class performance at 2B/9B/27B scale.",
    104         "source": "haiku"
    105       },
    106       "engagement_with_prior_work": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The paper situates distillation relative to Hinton et al. (2015), references scaling law theory from Hoffmann et al. (2022) to motivate the approach, and explicitly compares against LLaMA-3, Mistral, and Qwen families.",
    110         "source": "haiku"
    111       }
    112     }
    113   },
    114   "type_checklist": {
    115     "empirical": {
    116       "artifacts": {
    117         "code_released": {
    118           "applies": true,
    119           "answer": false,
    120           "justification": "Model weights are released but training source code is not; the paper says 'We release all our models to the community' referring to weights only, and no code repository is linked.",
    121           "source": "haiku"
    122         },
    123         "data_released": {
    124           "applies": true,
    125           "answer": true,
    126           "justification": "All capability evaluations use standard public benchmarks (MMLU, GSM8K, ARC, HumanEval, etc.) used unmodified; training data is not released but is not needed to replicate benchmark evaluations.",
    127           "source": "haiku"
    128         },
    129         "environment_specified": {
    130           "applies": true,
    131           "answer": false,
    132           "justification": "Training infrastructure (TPU types, chip counts) is described in Section 3.3 but no software environment specification (requirements, Docker, JAX version) is provided for reproducibility.",
    133           "source": "haiku"
    134         },
    135         "reproduction_instructions": {
    136           "applies": true,
    137           "answer": false,
    138           "justification": "No step-by-step instructions for reproducing training or evaluation results are provided; the paper is a technical report, not a reproducibility artifact.",
    139           "source": "haiku"
    140         }
    141       },
    142       "statistical_methodology": {
    143         "confidence_intervals_or_error_bars": {
    144           "applies": true,
    145           "answer": false,
    146           "justification": "Main capability benchmark tables (Tables 12, 13) have no error bars or CIs; Elo scores in Table 14 include 95% CIs but these are a minority of reported results.",
    147           "source": "haiku"
    148         },
    149         "significance_tests": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "No statistical significance tests are run for comparative benchmark claims; human study tables report ± values as bootstrapped CIs but no hypothesis tests are applied to the main capability comparisons.",
    153           "source": "haiku"
    154         },
    155         "effect_sizes_reported": {
    156           "applies": true,
    157           "answer": true,
    158           "justification": "Percentage-point improvements are reported throughout (e.g., distillation lifts 2B average from 60.3 to 67.7; 9B improves by 'up to 10%' on some benchmarks), giving concrete effect magnitudes.",
    159           "source": "haiku"
    160         },
    161         "sample_size_justified": {
    162           "applies": true,
    163           "answer": false,
    164           "justification": "The 500 multi-turn scenarios and 100-participant persuasion studies are not statistically justified or powered; no rationale for these sample sizes is given.",
    165           "source": "haiku"
    166         },
    167         "variance_reported": {
    168           "applies": true,
    169           "answer": false,
    170           "justification": "No variance across training runs or benchmark evaluations is reported for the main capability tables; Table 11 reports format-sensitivity standard deviations but this is a single secondary analysis.",
    171           "source": "haiku"
    172         }
    173       },
    174       "evaluation_design": {
    175         "baselines_included": {
    176           "applies": true,
    177           "answer": true,
    178           "justification": "Multiple baseline models are included throughout: LLaMA-3 8B/70B, Mistral 7B, Qwen1.5 32B/34B, Gemma 1 variants, GPT-4o, Claude-3.5-Sonnet (in Arena table).",
    179           "source": "haiku"
    180         },
    181         "baselines_contemporary": {
    182           "applies": true,
    183           "answer": true,
    184           "justification": "Baselines include LLaMA-3 (released April 2024), GPT-4o-2024-05-13, Claude-3.5-Sonnet — all contemporary at the paper's 2024 submission date.",
    185           "source": "haiku"
    186         },
    187         "ablation_study": {
    188           "applies": true,
    189           "answer": true,
    190           "justification": "Section 5 provides five dedicated ablation experiments: distillation vs. from-scratch, distillation benefit vs. model size, GQA vs. MHA, wide vs. deep networks, and sliding window size sensitivity.",
    191           "source": "haiku"
    192         },
    193         "multiple_metrics": {
    194           "applies": true,
    195           "answer": true,
    196           "justification": "Evaluation covers academic benchmarks (MMLU, GSM8K, ARC, HumanEval, MATH, DROP, BBH), human preference (Chatbot Arena Elo, internal preference studies), safety benchmarks, memorization rates, and capability evaluations (CTF, persuasion).",
    197           "source": "haiku"
    198         },
    199         "human_evaluation": {
    200           "applies": true,
    201           "answer": true,
    202           "justification": "Human evaluation is conducted via LMSYS Chatbot Arena (blind pairwise), internal side-by-side human preference studies (Table 15), multi-turn scenario evaluation with raters (Table 16), and Prolific participant studies for persuasion (Tables 22-25).",
    203           "source": "haiku"
    204         },
    205         "held_out_test_set": {
    206           "applies": true,
    207           "answer": true,
    208           "justification": "Standard benchmark test sets (MMLU, ARC, etc.) are used; Section 3.1 explicitly states evaluation sets are decontaminated from pre-training data.",
    209           "source": "haiku"
    210         },
    211         "per_category_breakdown": {
    212           "applies": true,
    213           "answer": true,
    214           "justification": "Results are broken down by benchmark domain (QA, reasoning, math, coding), by safety policy type (Table 18), by data source for memorization (Figure 1), and by model size.",
    215           "source": "haiku"
    216         },
    217         "failure_cases_discussed": {
    218           "applies": true,
    219           "answer": false,
    220           "justification": "No failure cases or error analysis are discussed; the paper notes low performance on PrimeVul (near-chance) and failed end-to-end CTF challenges but does not analyze these as failure modes.",
    221           "source": "haiku"
    222         },
    223         "negative_results_reported": {
    224           "applies": true,
    225           "answer": true,
    226           "justification": "Negative results are reported: Gemma 2 shows near-chance performance on PrimeVul code vulnerability detection (Table 20), fails all end-to-end self-proliferation challenges (Table 21), and shows no significant persuasion improvement over baseline on money donation (Table 24).",
    227           "source": "haiku"
    228         }
    229       },
    230       "setup_transparency": {
    231         "model_versions_specified": {
    232           "applies": true,
    233           "answer": true,
    234           "justification": "Model architecture is fully specified (Table 1: layer counts, hidden dimensions, attention heads, KV heads, vocab size) for all three model sizes; training token counts and teacher model sizes are specified.",
    235           "source": "haiku"
    236         },
    237         "prompts_provided": {
    238           "applies": true,
    239           "answer": false,
    240           "justification": "No actual evaluation prompts are provided; benchmark evaluations reference standard few-shot settings (e.g., '5-shot MMLU') but the specific prompts used are not shown.",
    241           "source": "haiku"
    242         },
    243         "hyperparameters_reported": {
    244           "applies": true,
    245           "answer": false,
    246           "justification": "Post-training hyperparameters (learning rate, batch size, SFT/RLHF steps, reward model details) are not disclosed; only architectural hyperparameters (Table 1) and soft-cap values are given.",
    247           "source": "haiku"
    248         },
    249         "scaffolding_described": {
    250           "applies": true,
    251           "answer": false,
    252           "justification": "CTF and self-proliferation evaluations involve agentic scaffolding but the paper defers entirely to Phuong et al. (2024): 'We refer the reader to Phuong et al. (2024) for full methodological details of these studies.'",
    253           "source": "haiku"
    254         },
    255         "data_preprocessing_documented": {
    256           "applies": true,
    257           "answer": false,
    258           "justification": "Section 3.1 says 'We use the same data filtering techniques as Gemma 1' and 'similar to Gemma 1' for data mixtures, delegating all specifics to the prior paper rather than documenting them here.",
    259           "source": "haiku"
    260         }
    261       },
    262       "data_integrity": {
    263         "raw_data_available": {
    264           "applies": true,
    265           "answer": false,
    266           "justification": "Training data is not released; raw human evaluation outputs (individual rater judgments, Arena conversation logs) are not made available.",
    267           "source": "haiku"
    268         },
    269         "data_collection_described": {
    270           "applies": true,
    271           "answer": false,
    272           "justification": "Training data collection is summarized as 'web documents, code, and science articles' with details deferred to Gemma 1; no new collection procedure is described in this paper.",
    273           "source": "haiku"
    274         },
    275         "recruitment_methods_described": {
    276           "applies": true,
    277           "answer": true,
    278           "justification": "Human persuasion study participants were recruited via Prolific (explicitly stated in Section 8.4 persuasion sub-sections); sample sizes of 100 participants are reported.",
    279           "source": "haiku"
    280         },
    281         "data_pipeline_documented": {
    282           "applies": true,
    283           "answer": false,
    284           "justification": "No end-to-end data pipeline from training data collection through filtering, tokenization, to model training is documented in a reproducible way in this paper.",
    285           "source": "haiku"
    286         }
    287       },
    288       "contamination": {
    289         "training_cutoff_stated": {
    290           "applies": true,
    291           "answer": false,
    292           "justification": "No training data cutoff date is explicitly stated; the paper only mentions 13T/8T/2T token counts without specifying the temporal range of the training data.",
    293           "source": "haiku"
    294         },
    295         "train_test_overlap_discussed": {
    296           "applies": true,
    297           "answer": true,
    298           "justification": "Section 3.1 explicitly states: 'decontaminate evaluation sets from our pre-training data mixture' — this directly addresses train/test overlap.",
    299           "source": "haiku"
    300         },
    301         "benchmark_contamination_addressed": {
    302           "applies": true,
    303           "answer": true,
    304           "justification": "The decontamination step in Section 3.1 removes benchmark examples from pre-training data, directly addressing benchmark contamination.",
    305           "source": "haiku"
    306         }
    307       },
    308       "human_studies": {
    309         "pre_registered": {
    310           "applies": true,
    311           "answer": false,
    312           "justification": "No pre-registration is mentioned for the Prolific persuasion studies or internal human preference evaluations.",
    313           "source": "haiku"
    314         },
    315         "irb_or_ethics_approval": {
    316           "applies": true,
    317           "answer": false,
    318           "justification": "Despite involving human participants in deception studies (hidden agenda, web of lies), no IRB or ethics approval is mentioned anywhere in the paper.",
    319           "source": "haiku"
    320         },
    321         "demographics_reported": {
    322           "applies": true,
    323           "answer": false,
    324           "justification": "Human participants are described only by count (100 participants for persuasion studies); no age, gender, or other demographic information is reported.",
    325           "source": "haiku"
    326         },
    327         "inclusion_exclusion_criteria": {
    328           "applies": true,
    329           "answer": false,
    330           "justification": "No inclusion or exclusion criteria for Prolific participant recruitment are stated.",
    331           "source": "haiku"
    332         },
    333         "randomization_described": {
    334           "applies": true,
    335           "answer": false,
    336           "justification": "No randomization procedure is described for assignment of participants to conditions in the persuasion studies.",
    337           "source": "haiku"
    338         },
    339         "blinding_described": {
    340           "applies": true,
    341           "answer": false,
    342           "justification": "While Chatbot Arena uses blind pairwise comparison, the internal Prolific studies (charm offensive, hidden agenda) do not describe blinding of participants to the model's identity.",
    343           "source": "haiku"
    344         },
    345         "attrition_reported": {
    346           "applies": true,
    347           "answer": false,
    348           "justification": "No dropout or attrition rate is reported for the human participant studies.",
    349           "source": "haiku"
    350         }
    351       },
    352       "cost_and_practicality": {
    353         "inference_cost_reported": {
    354           "applies": true,
    355           "answer": false,
    356           "justification": "No inference cost or latency figures are provided; GQA is described as 'faster at inference time' but no quantitative inference benchmarks are reported.",
    357           "source": "haiku"
    358         },
    359         "compute_budget_stated": {
    360           "applies": true,
    361           "answer": true,
    362           "justification": "Section 3.3 details training infrastructure (TPUv4/v5e/v5p configurations, chip counts) and Section 3.4 reports total carbon footprint of 1247.61 tCO2eq for pre-training.",
    363           "source": "haiku"
    364         }
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "Knowledge distillation improves 2B model performance from 60.3 to 67.7 average on 3 benchmarks compared to training from scratch on the same token budget (500B tokens).",
    371       "evidence": "Table 6 direct ablation comparison: from scratch 60.3 vs. distilled 67.7.",
    372       "supported": "strong"
    373     },
    374     {
    375       "claim": "Gemma 2 27B is competitive with LLaMA-3 70B (2.5× larger, trained on 2/3 more data) on the HuggingFace benchmark suite.",
    376       "evidence": "Table 12: Gemma-2 27B averages slightly below LLaMA-3 70B on 5 HuggingFace benchmarks (MMLU 75.2 vs. 79.2, GSM8K 74.0 vs. 76.9) but outperforms Qwen1.5 32B.",
    377       "supported": "moderate"
    378     },
    379     {
    380       "claim": "Gemma 2 IT models set a new state-of-the-art for open-weight models on the LMSYS Chatbot Arena, with Gemma 2 27B achieving Elo 1218 surpassing LLaMA-3 70B (Elo 1206).",
    381       "evidence": "Table 14: Elo scores with 95% CIs show Gemma-2-27b-it at 1218 (+4/-3) vs. llama-3-70b-instruct at 1206 (+2/-2).",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "Gemma 2 memorizes significantly less training data than prior models, with exact memorization rates below 0.1%.",
    386       "evidence": "Figure 1 shows Gemma 2 memorization rates below 0.1% on a log scale, substantially lower than Gemma 1 and PaLM 2 Small at comparable sizes.",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "Gemma 2 IT models produce safer outputs than GPT-4o on a held-out safety prompt set, winning the safety comparison regardless of model size.",
    391       "evidence": "Table 15: Gemma 2 IT 2B safety win/tie/loss against GPT4o is 53%/9%/38%; paper states 'Gemma 2 models produce safer, more appropriate prompts... than GPT4o.' Internal evaluation by the same lab.",
    392       "supported": "weak"
    393     },
    394     {
    395       "claim": "Deeper 9B networks outperform wider 9B networks of equivalent parameter count across benchmarks.",
    396       "evidence": "Table 9: Deep 9B scores 52.0 vs. wide 9B 50.8 average over 4 benchmarks — a 1.2pp gain described as 'consistent across benchmarks.'",
    397       "supported": "moderate"
    398     }
    399   ],
    400   "methodology_tags": [
    401     "benchmark-eval",
    402     "empirical"
    403   ],
    404   "key_findings": "Gemma 2 demonstrates that knowledge distillation from a large teacher model is more effective than extended next-token prediction training for small models, yielding a 7.4pp gain for 2B models on 3 benchmarks. The resulting 9B and 27B instruction-tuned models achieve top-tier LMSYS Chatbot Arena Elo scores among open-weight models, with the 27B outperforming LLaMA-3 70B despite being 2.5× smaller. Memorization rates are substantially reduced compared to prior Gemma and PaLM models. Safety capability evaluations show the model has limited dangerous capabilities (fails all end-to-end self-proliferation tasks, near-chance on advanced vulnerability detection) while performing competitively on persuasion tasks in human participant studies.",
    405   "red_flags": [
    406     {
    407       "flag": "Self-evaluation: company evaluates own model",
    408       "detail": "All internal human preference, safety, and persuasion evaluations are conducted by Google DeepMind, the organization with direct commercial interest in the outcome. No independent third-party evaluation."
    409     },
    410     {
    411       "flag": "No error bars on main benchmark results",
    412       "detail": "Core capability benchmark tables (Tables 12, 13) report single point estimates with no variance, confidence intervals, or indication of run-to-run variability — making it impossible to assess statistical significance of claimed improvements."
    413     },
    414     {
    415       "flag": "No limitations section",
    416       "detail": "A single vague sentence in the conclusion acknowledges limitations without specifics; no dedicated section, no threats-to-validity, no discussion of known weaknesses beyond broad future research directions."
    417     },
    418     {
    419       "flag": "Human studies lack IRB disclosure and demographic reporting",
    420       "detail": "Prolific participant studies involve deception (hidden agenda task: participants are misled about the chatbot's goal), yet no ethics approval, pre-registration, or participant demographics are reported."
    421     },
    422     {
    423       "flag": "Post-training hyperparameters undisclosed",
    424       "detail": "SFT and RLHF procedures are described qualitatively but key hyperparameters (learning rate, epochs, reward model specifics, data mixture ratios) are withheld, preventing any reproduction of the instruction-tuned models."
    425     },
    426     {
    427       "flag": "Architectural and distillation changes conflated in final models",
    428       "detail": "Ablations test components individually on smaller proxies, but the released 9B and 27B models combine all changes simultaneously, making it impossible to attribute performance gains in the final models to any single modification."
    429     }
    430   ],
    431   "cited_papers": [
    432     {
    433       "title": "Gemma: Open Models Based on Gemini Research and Technology",
    434       "relevance": "Direct predecessor; training methodology, data filtering, and architecture baseline for Gemma 2 comparisons."
    435     },
    436     {
    437       "title": "Chatbot Arena: An Open Platform for Evaluating LLMs by Human Preference",
    438       "relevance": "Primary human evaluation platform used for IT model assessment; Elo-based ranking methodology."
    439     },
    440     {
    441       "title": "Training Compute-Optimal Large Language Models (Chinchilla)",
    442       "relevance": "Provides the scaling law theory that motivates over-training with distillation as a way to exceed compute-optimal training."
    443     },
    444     {
    445       "title": "Evaluating Frontier Models for Dangerous Capabilities",
    446       "relevance": "Provides methodology for assurance evaluations (CTF, self-proliferation, CBRN) used in Section 8.4."
    447     },
    448     {
    449       "title": "Distilling the Knowledge in a Neural Network (Hinton et al., 2015)",
    450       "relevance": "Foundational knowledge distillation method applied to replace next-token prediction for small model training."
    451     },
    452     {
    453       "title": "GQA: Training Generalized Multi-Query Transformer Models from Multi-Head Checkpoints",
    454       "relevance": "Grouped-query attention mechanism adopted in Gemma 2 architecture."
    455     },
    456     {
    457       "title": "Longformer: The Long-Document Transformer",
    458       "relevance": "Local sliding window attention mechanism used for alternating attention layers in Gemma 2."
    459     },
    460     {
    461       "title": "Evaluating Language-Model Agents on Realistic Autonomous Tasks",
    462       "relevance": "Self-proliferation evaluation framework referenced in dangerous capability assessment."
    463     },
    464     {
    465       "title": "Scalable Extraction of Training Data from (Production) Language Models",
    466       "relevance": "Memorization evaluation methodology used in Section 7."
    467     },
    468     {
    469       "title": "LLaMA 3 Model Card (AI@Meta, 2024)",
    470       "relevance": "Primary open-model competitor used as baseline throughout capability evaluations."
    471     }
    472   ],
    473   "engagement_factors": {
    474     "practical_relevance": {
    475       "score": 3,
    476       "justification": "Released model weights spanning 2B-27B parameters directly usable by practitioners on consumer hardware; addresses the 'practical size' need for deployment."
    477     },
    478     "surprise_contrarian": {
    479       "score": 2,
    480       "justification": "The finding that a 9B distilled model matches GPT-4-0314 on Chatbot Arena challenges expectations about the capability gap between proprietary and open models."
    481     },
    482     "fear_safety": {
    483       "score": 2,
    484       "justification": "Includes dangerous capability evaluations (CTF, self-proliferation, persuasion/deception studies with human participants) and reports measurable persuasion success rates."
    485     },
    486     "drama_conflict": {
    487       "score": 1,
    488       "justification": "Implicit competition with Meta (LLaMA-3) and OpenAI (GPT-4) but framed as collaborative 'open models for the community' — low controversy angle."
    489     },
    490     "demo_ability": {
    491       "score": 3,
    492       "justification": "Models are immediately usable via HuggingFace and Google AI Studio; the 2B model runs on phones, making direct user experimentation trivial."
    493     },
    494     "brand_recognition": {
    495       "score": 3,
    496       "justification": "Google DeepMind with explicit comparisons to GPT-4o, Claude-3.5-Sonnet, and LLaMA-3 — maximum brand recognition context."
    497     }
    498   },
    499   "hn_data": {
    500     "threads": [
    501       {
    502         "hn_id": "41421591",
    503         "title": "Inductive or deductive? Rethinking the fundamental reasoning abilities of LLMs",
    504         "points": 107,
    505         "comments": 169,
    506         "url": "https://news.ycombinator.com/item?id=41421591",
    507         "created_at": "2024-09-02T00:49:06Z"
    508       },
    509       {
    510         "hn_id": "15289917",
    511         "title": "Benefits of Napping in Healthy Adults (2009) [pdf]",
    512         "points": 88,
    513         "comments": 38,
    514         "url": "https://news.ycombinator.com/item?id=15289917",
    515         "created_at": "2017-09-20T00:43:51Z"
    516       },
    517       {
    518         "hn_id": "30180281",
    519         "title": "Computational Thinking and Thinking about Computing (2008)",
    520         "points": 4,
    521         "comments": 1,
    522         "url": "https://news.ycombinator.com/item?id=30180281",
    523         "created_at": "2022-02-02T17:01:25Z"
    524       },
    525       {
    526         "hn_id": "24385172",
    527         "title": "Benefits of napping: nap length, time of day, age, and experience with napping",
    528         "points": 4,
    529         "comments": 0,
    530         "url": "https://news.ycombinator.com/item?id=24385172",
    531         "created_at": "2020-09-05T17:15:40Z"
    532       },
    533       {
    534         "hn_id": "44790511",
    535         "title": "The Space of AI: Real-World Lessons on AI's Impact on Developers",
    536         "points": 3,
    537         "comments": 0,
    538         "url": "https://news.ycombinator.com/item?id=44790511",
    539         "created_at": "2025-08-04T19:40:00Z"
    540       },
    541       {
    542         "hn_id": "40656984",
    543         "title": "Large Language Models' Detection of Political Orientation in Newspapers",
    544         "points": 2,
    545         "comments": 0,
    546         "url": "https://news.ycombinator.com/item?id=40656984",
    547         "created_at": "2024-06-12T11:49:20Z"
    548       },
    549       {
    550         "hn_id": "22864688",
    551         "title": "Benefits of Napping in Healthy Adults",
    552         "points": 2,
    553         "comments": 0,
    554         "url": "https://news.ycombinator.com/item?id=22864688",
    555         "created_at": "2020-04-14T11:24:01Z"
    556       },
    557       {
    558         "hn_id": "41278284",
    559         "title": "Inductive or Deductive? Rethinking the Fundamental Reasoning Abilities of LLMs",
    560         "points": 1,
    561         "comments": 0,
    562         "url": "https://news.ycombinator.com/item?id=41278284",
    563         "created_at": "2024-08-17T21:50:29Z"
    564       },
    565       {
    566         "hn_id": "41260958",
    567         "title": "Y Social: An LLM-Powered Social Media Digital Twin",
    568         "points": 1,
    569         "comments": 0,
    570         "url": "https://news.ycombinator.com/item?id=41260958",
    571         "created_at": "2024-08-15T22:07:50Z"
    572       },
    573       {
    574         "hn_id": "44858350",
    575         "title": "The Space of AI: Real-World Lessons on AI's Impact on Developers",
    576         "points": 1,
    577         "comments": 0,
    578         "url": "https://news.ycombinator.com/item?id=44858350",
    579         "created_at": "2025-08-10T21:14:15Z"
    580       }
    581     ],
    582     "top_points": 107,
    583     "total_points": 213,
    584     "total_comments": 208
    585   }
    586 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs