scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28354B)
      1 {
      2   "paper": {
      3     "title": "LaMDA: Language Models for Dialog Applications",
      4     "authors": ["Romal Thoppilan", "Daniel De Freitas", "Jamie Hall", "Noam Shazeer", "Apoorv Kulshreshtha"],
      5     "year": 2022,
      6     "venue": "arXiv",
      7     "arxiv_id": "2201.08239"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "methodology_tags": ["benchmark-eval"],
     12   "key_findings": "LaMDA demonstrates that fine-tuning with modest amounts of crowdworker-annotated data (<0.001% of pre-training data) significantly improves dialog quality, safety, and factual groundedness beyond what model scaling alone achieves. The 137B parameter model fine-tuned for groundedness achieves 73.2% groundedness and 65% citation accuracy by learning to call external information retrieval tools. Safety fine-tuning is effective where scaling is not—safety scores barely improve with model size alone but reach 95.2% with fine-tuning. Domain grounding via preconditioning enables role-specific applications (education, music recommendation) with high role consistency (>84%).",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "No source code or repository URL is provided in the paper."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "The crowdworker-annotated datasets (SSI, safety, groundedness) and the pre-training dataset (Infiniset) are not released. The MTB evaluation dataset references Adiwardana et al. but no new data is made public."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "The paper mentions 1024 TPU-v3 chips, Lingvo framework, and GSPMD but does not provide a reproducible environment specification (no requirements.txt, no software versions beyond framework names)."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No step-by-step reproduction instructions are provided. The paper describes the approach at a high level but lacks the detail needed for independent reproduction."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "Results in Figures 4, 5 and Table 28 are reported as point estimates without confidence intervals or error bars."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "No statistical significance tests are reported. Claims of improvement (e.g., PT vs LaMDA) are based on comparing point estimates."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "Results are reported with baseline context, e.g., sensibleness from 80.2% (PT 137B) to 92.3% (LaMDA 137B), groundedness from 57.9% to 73.2%, allowing readers to assess magnitude of improvement (Table 28, Figure 4)."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No justification is given for the number of crowdworker-annotated dialogs (6.4K, 8K, 4K) or evaluation set sizes (1477 MTB dialogs, 1458 safety turns, 784 groundedness turns). No power analysis is discussed."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. Results appear to be single-run numbers."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The pre-trained model (PT) serves as baseline, with comparisons at 2B, 8B, and 137B scales, and intermediate fine-tuning stages (FT quality-safety). Human crowdworker performance is also included as a reference (Figures 4, 5)."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "The paper compares primarily against its own PT variants and human baselines. No comparison against contemporary dialog systems (e.g., Meena, BlenderBot) on the same metrics is provided, despite extensive discussion of these systems in related work."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Figure 5 shows ablation across fine-tuning stages: PT → FT quality-safety → FT groundedness (LaMDA), isolating the contribution of each fine-tuning component."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Multiple metrics are used: sensibleness, specificity, interestingness (SSI composite), safety, groundedness, informativeness, citation accuracy, helpfulness, and role consistency."
     83       },
     84       "human_evaluation": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "All primary metrics are based on human crowdworker evaluation. Each response is labeled by 3-5 crowdworkers with consensus determined by majority voting (Sections 4 and 5)."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Quality is evaluated on the Mini-Turing Benchmark (MTB) dataset of 1477 dialogs. Safety evaluation uses a holdout sample of 1166 dialogs. Groundedness evaluation uses 784 turns from Dinan et al. (Section 5)."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Results are broken down by model size (2B, 8B, 137B), fine-tuning stage, and individual metrics (sensibleness, specificity, interestingness separately rather than just SSI average). Table 28 provides full breakdown."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Section 9 discusses multiple failure modes: factual errors, repetitive pledging, premature conversation ending, fabricated details. Tables 15-16 show specific grounding failures. The paper acknowledges cherry-picked examples and discusses where groundedness still fails (~30% of Mount Everest responses)."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper reports that scaling alone does not improve safety (Section 7, Figure 4). It also reports that LaMDA Music misses recommendations in ~9% and provides broken links in ~7% of responses (Section 8). Tables 15-16 show reasoning failures."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Abstract claims about scaling alone being insufficient for safety/groundedness, and fine-tuning with annotated data improving these metrics, are supported by Figures 4 and 5 and Table 28."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The paper makes causal claims about fine-tuning improving metrics. The ablation design (PT → FT quality-safety → LaMDA) with controlled single-variable manipulation at each stage provides adequate support for these claims."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper explicitly bounds its claims: 'This is not the final version of LaMDA. Rather this is just a recipe for generating LaMDAs' (Section 9). Safety objectives are stated as 'developed for a U.S. societal context' (Section 9.3). The title itself also restricts the scope to dialog applications."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Section 9 extensively discusses alternative explanations and confounds: crowdworker quality as a weak baseline, demographic non-representativeness, long-tail safety threats, anthropomorphization risks, and cultural context limitations."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "The paper carefully defines its metrics (SSI, safety, groundedness) and distinguishes them from broader concepts. For safety: 'mitigating safety risks does not guarantee complete reliability' (Section 9). For groundedness: 'grounding in known sources does not guarantee factual accuracy' (Section 1). The gap between proxy and outcome is explicitly acknowledged."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "This is a first-party paper describing their own model. Architecture details are fully specified: 137B parameters, 64 layers, dmodel=8192, dff=65536, h=128, decoder-only Transformer with relative attention and gated-GELU (Section 3, Table 27)."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "The fine-tuning format is explicitly provided with examples: 'What's up? RESPONSE not much. SENSIBLE 1' (Section 6.1). Preconditioning prompts for domain grounding are shown in Tables 3 and 4. The discriminative and generative fine-tuning format is fully specified."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Table 27 provides hyperparameters for all model sizes. Decoding uses top-k (k=40) sampling with 16 candidate responses. Sensibleness weight is 3x specificity and interestingness in ranking (Section 6.1). Training used 256K tokens per batch."
    152       },
    153       "scaffolding_described": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "The toolset (TS) scaffolding is described in detail: information retrieval system, calculator, and translator. The research loop (Base → Research queries → TS → grounded response) is fully documented with examples in Section 6.2 and Figure 3."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Pre-training data composition is documented in Appendix E (50% dialog, 12.5% C4, 12.5% code, 12.5% Wikipedia, 6.25% English web, 6.25% non-English). Tokenization uses SentencePiece with 32K BPE vocabulary. Fine-tuning data filtering is described: 2.5M turns filtered to 800K safe/quality turns using LaMDA discriminators (Section 6.1)."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section 9 'Discussion and limitations' is extensive (3+ pages) with dedicated subsections on bias (9.1), adversarial data (9.2), safety as metric (9.3), appropriateness (9.4), cultural responsiveness (9.5), and impersonation (9.6)."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Specific threats discussed include: crowdworker population overrepresented in 25-34 age group (Section 9), crowdworker quality as a weak baseline due to limited financial incentives (Section 7), safety objectives developed for U.S. context only (Section 9.3), adversarial testing finding common problems but not rare ones (Section 9.2)."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "The paper explicitly states: 'This is not the final version of LaMDA' (Section 9), safety objectives are 'for a U.S. societal context' (Section 9.3), groundedness progress is 'limited to simple questions of fact' (Section 9), and the recipe should not be taken as production-ready."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "The crowdworker-annotated datasets, pre-training data, and evaluation data are not publicly available."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section 5 and Appendix A.2 describe data collection in detail: dialog lengths (5-10 or 14-30 turns), three conversation types (natural, sensitive, adversarial), annotation procedures (3-5 crowdworkers per response, majority voting), and consent procedures."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Appendix A.3 describes demographics of crowdworker pools. Appendix A.2 notes participants include 'a mix of employees, employee volunteers and crowdworkers.' Consent forms, optional demographic surveys, and recruitment goals (demographically diverse) are described."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "The pipeline from collection to analysis is documented: conversation generation → annotation → consensus labeling (majority voting) → fine-tuning/evaluation. Table 1 summarizes dataset sizes. Appendix A.4 provides safety annotation data distribution (Table 10)."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No explicit funding statement is provided. All authors are listed under 'Google' but no funding source or grant numbers are mentioned."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "All authors are listed as affiliated with Google. The paper is clearly a Google research paper evaluating a Google system."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "Google, as the employer of all authors and developer of LaMDA, has a direct commercial interest in the model performing well. The funder is not independent of the outcome."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests statement or financial interest disclosure is provided in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "The paper does not state a training data cutoff date. Pre-training data composition is described (Appendix E) but the time range of data collection is not specified."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No discussion of whether evaluation data (MTB, WoW) could overlap with the 1.56T word pre-training corpus. The pre-training set includes public web and dialog data which could contain evaluation examples."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "The Mini-Turing Benchmark and Wizard of Wikipedia evaluation sets were published before LaMDA's training. No contamination analysis is performed."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": true,
    245         "answer": false,
    246         "justification": "No pre-registration is mentioned for any of the crowdworker studies."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": true,
    250         "answer": false,
    251         "justification": "No IRB or ethics board approval is mentioned. Consent forms are described but no institutional review is cited."
    252       },
    253       "demographics_reported": {
    254         "applies": true,
    255         "answer": true,
    256         "justification": "Appendix A.3 provides detailed crowdworker demographics: gender, age group, ethnicity, education, LGBTQ+ identification, disability status for both conversation collection (Table 8) and safety annotation (Table 9) pools."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": true,
    260         "answer": false,
    261         "justification": "The paper mentions 'all of them were from the U.S.' and 'special attention was paid to pursue a representative set of voices' but does not specify concrete inclusion/exclusion criteria for crowdworker selection."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "This is not an experimental study comparing conditions with randomized participant assignment. Crowdworkers annotate model outputs; there is no treatment/control randomization."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "Blinding is not applicable to this study design — crowdworkers evaluate dialog responses in context, not a controlled experimental comparison."
    272       },
    273       "attrition_reported": {
    274         "applies": true,
    275         "answer": false,
    276         "justification": "No attrition information is reported. The paper states final counts of crowdworkers and dialogs but does not mention how many started vs. finished or dropout rates."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "No inference cost or latency is reported for the fine-tuned LaMDA model, despite the multi-step generation pipeline (generate → filter → rank → optionally query TS)."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": true,
    288         "justification": "Section 10 reports detailed compute: 1024 TPU-v3 chips for 57.7 days, 3.55E+23 total FLOPS, 451 MWh energy, 26 tCO2e carbon footprint. Table 27 provides training details for all model sizes."
    289       }
    290     },
    291     "experimental_rigor": {
    292       "seed_sensitivity_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "No multi-seed results are reported. All results appear to be from single training runs."
    296       },
    297       "number_of_runs_stated": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "The number of experimental runs is not stated. Results appear to be single-run."
    301       },
    302       "hyperparameter_search_budget": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "No hyperparameter search budget is reported. The sensibleness weight of 3x was 'found to work well' (Section 6.1) but no search process is described."
    306       },
    307       "best_config_selection_justified": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "The paper states the 3x sensibleness weight 'was found to work well for all metrics' (Section 6.1) but does not describe how this was selected or what alternatives were tried."
    311       },
    312       "multiple_comparison_correction": {
    313         "applies": false,
    314         "answer": false,
    315         "justification": "No statistical tests are performed, so multiple comparison correction is not applicable."
    316       },
    317       "self_comparison_bias_addressed": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "Google employees evaluate Google's own LaMDA system. The self-comparison bias (comparing PT vs fine-tuned variants of their own model) is not acknowledged."
    321       },
    322       "compute_budget_vs_performance": {
    323         "applies": true,
    324         "answer": true,
    325         "justification": "Performance is shown as a function of model size (2B, 8B, 137B) in Figure 4, providing performance-compute scaling curves. Section 10 quantifies total compute."
    326       },
    327       "benchmark_construct_validity": {
    328         "applies": true,
    329         "answer": true,
    330         "justification": "Section 4 extensively discusses metric design rationale — why sensibleness alone is insufficient (GenericBot scores 70%), why specificity was added, why interestingness was needed. Section 9.3-9.4 discuss limitations of the safety metric as a construct. The paper questions its own evaluation constructs."
    331       },
    332       "scaffold_confound_addressed": {
    333         "applies": true,
    334         "answer": true,
    335         "justification": "The paper explicitly separates the model from the toolset (TS) scaffold: Figure 5 shows results with and without the groundedness fine-tuning that uses TS, isolating the scaffold's contribution."
    336       }
    337     },
    338     "data_leakage": {
    339       "temporal_leakage_addressed": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "No discussion of temporal leakage between pre-training data and evaluation benchmarks (MTB, WoW datasets)."
    343       },
    344       "feature_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "No discussion of whether the evaluation setup leaks information. The toolset (TS) used at inference could provide answers to groundedness evaluation questions, but this is by design rather than leakage."
    348       },
    349       "non_independence_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of whether pre-training data (which includes public dialog and web text) overlaps with or is structurally similar to evaluation data."
    353       },
    354       "leakage_detection_method": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No leakage detection or prevention method is applied."
    358       }
    359     }
    360   },
    361   "claims": [
    362     {
    363       "claim": "Model scaling alone improves dialog quality but shows limited improvement on safety and groundedness.",
    364       "evidence": "Figure 4 shows safety scores barely change across 2B/8B/137B PT models (84.8→87.5→88.0) while sensibleness improves (76.6→79.1→80.2). Table 28 provides full numbers.",
    365       "supported": "strong"
    366     },
    367     {
    368       "claim": "Fine-tuning with crowdworker-annotated data (<0.001% of pre-training data) significantly improves all metrics.",
    369       "evidence": "Table 28: PT 137B vs LaMDA 137B shows sensibleness 80.2→92.3, safety 88.0→95.2, groundedness 57.9→73.2. Section 9 quantifies fine-tuning data as <0.001% of pre-training.",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "LaMDA achieves 73.2% groundedness and 65% citation accuracy through tool-augmented fine-tuning.",
    374       "evidence": "Section 7 and Table 28 report these numbers. Figure 5 shows the progression from PT to FT quality-safety to LaMDA.",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "LaMDA surpasses crowdworker quality on interestingness.",
    379       "evidence": "Figure 4 shows LaMDA 137B exceeds 'Human' baseline on interestingness. However, the paper acknowledges crowdworkers 'are not extensively trained and were not incentivized to generate high-quality responses' (Section 7).",
    380       "supported": "weak"
    381     },
    382     {
    383       "claim": "LaMDA applications are significantly more helpful than PT applications in domain grounding.",
    384       "evidence": "Table 5: LaMDA Everest 65% helpful vs PT 18%; LaMDA Music 57% vs PT 31%. Based on 600 dialog turns evaluated by crowdworkers (Section 8).",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "LaMDA's total carbon footprint is 21.2x smaller than GPT-3.",
    389       "evidence": "Section 10: LaMDA 26 tCO2e vs GPT-3's estimated footprint. Attributed to more optimized energy mix (0.056 vs 0.429 kg CO2e/kWh).",
    390       "supported": "moderate"
    391     }
    392   ],
    393   "red_flags": [
    394     {
    395       "flag": "Company evaluating its own product",
    396       "detail": "All authors are Google employees evaluating Google's LaMDA system. No independent evaluation is included. The paper's positive framing of LaMDA's capabilities is not counterbalanced by external assessment."
    397     },
    398     {
    399       "flag": "Cherry-picked qualitative examples",
    400       "detail": "The paper explicitly acknowledges examples are 'real, albeit cherry-picked' (Table 3 caption). Qualitative examples in Tables 3, 4, 17-26 are selected to showcase capabilities."
    401     },
    402     {
    403       "flag": "Weak human baseline",
    404       "detail": "The paper claims LaMDA surpasses human performance on interestingness, but acknowledges crowdworkers were not incentivized or trained to generate high-quality responses (Section 7). This makes the 'human' baseline artificially weak."
    405     },
    406     {
    407       "flag": "No statistical significance testing",
    408       "detail": "All comparisons between model variants and against human baselines rely on point estimate comparisons with no confidence intervals, significance tests, or variance reporting."
    409     },
    410     {
    411       "flag": "No contamination analysis",
    412       "detail": "The 1.56T word pre-training corpus includes public web and dialog data that could contain evaluation benchmark examples (MTB, WoW). No decontamination or overlap analysis is performed."
    413     }
    414   ],
    415   "cited_papers": [
    416     {
    417       "title": "Language models are few-shot learners",
    418       "authors": ["Tom B. Brown"],
    419       "year": 2020,
    420       "relevance": "GPT-3 paper establishing large language model scaling and few-shot capabilities, a key baseline for LaMDA's positioning."
    421     },
    422     {
    423       "title": "Towards a human-like open-domain chatbot",
    424       "authors": ["Daniel Adiwardana"],
    425       "year": 2020,
    426       "arxiv_id": "2001.09977",
    427       "relevance": "Meena paper that introduced SSA metric and dialog quality scaling laws that LaMDA directly builds upon."
    428     },
    429     {
    430       "title": "Scaling laws for neural language models",
    431       "authors": ["Jared Kaplan"],
    432       "year": 2020,
    433       "arxiv_id": "2001.08361",
    434       "relevance": "Foundational work on neural scaling laws that motivated LaMDA's scaling experiments."
    435     },
    436     {
    437       "title": "Retrieval-augmented generation for knowledge-intensive NLP tasks",
    438       "authors": ["Patrick Lewis"],
    439       "year": 2020,
    440       "relevance": "RAG architecture for augmenting LMs with retrieval, directly related to LaMDA's groundedness approach."
    441     },
    442     {
    443       "title": "Ethical and social risks of harm from language models",
    444       "authors": ["Laura Weidinger"],
    445       "year": 2021,
    446       "arxiv_id": "2112.04359",
    447       "relevance": "Comprehensive risk taxonomy for LLMs that informed LaMDA's safety objectives."
    448     },
    449     {
    450       "title": "Recipes for building an open-domain chatbot",
    451       "authors": ["Stephen Roller"],
    452       "year": 2020,
    453       "arxiv_id": "2004.13637",
    454       "relevance": "BlenderBot paper on dialog model training recipes, a direct comparison point for LaMDA's approach."
    455     },
    456     {
    457       "title": "WebGPT: Browser-assisted question-answering with human feedback",
    458       "authors": ["Reiichiro Nakano"],
    459       "year": 2021,
    460       "arxiv_id": "2112.09332",
    461       "relevance": "Contemporaneous work on augmenting LMs with web search for grounded responses, comparable to LaMDA's TS approach."
    462     },
    463     {
    464       "title": "Process for adapting language models to society (PALMS) with values-targeted datasets",
    465       "authors": ["Irene Solaiman", "Christy Dennison"],
    466       "year": 2021,
    467       "relevance": "Safety fine-tuning approach using values-targeted datasets, directly compared to LaMDA's safety methodology."
    468     },
    469     {
    470       "title": "Retrieval augmentation reduces hallucination in conversation",
    471       "authors": ["Kurt Shuster"],
    472       "year": 2021,
    473       "arxiv_id": "2104.07567",
    474       "relevance": "Demonstrates that retrieval augmentation reduces hallucination in dialog, directly supporting LaMDA's groundedness approach."
    475     },
    476     {
    477       "title": "Measuring attribution in natural language generation models",
    478       "authors": ["Hannah Rashkin"],
    479       "year": 2021,
    480       "arxiv_id": "2112.12870",
    481       "relevance": "AIS framework for evaluating attribution in language model outputs, related to LaMDA's groundedness evaluation."
    482     },
    483     {
    484       "title": "Recipes for safety in open-domain chatbots",
    485       "authors": ["Jing Xu"],
    486       "year": 2020,
    487       "arxiv_id": "2010.07079",
    488       "relevance": "Safety training approaches for dialog models including separate safety layers, directly related to LaMDA's safety fine-tuning."
    489     },
    490     {
    491       "title": "Carbon emissions and large neural network training",
    492       "authors": ["David Patterson"],
    493       "year": 2021,
    494       "arxiv_id": "2104.10350",
    495       "relevance": "Framework for estimating carbon footprint of LLM training, methodology used for LaMDA's energy analysis in Section 10."
    496     }
    497   ]
    498 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs