scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (28202B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "LaMDA: Language Models for Dialog Applications",
      6     "authors": [
      7       "R. Thoppilan",
      8       "Daniel De Freitas",
      9       "Jamie Hall",
     10       "Noam Shazeer",
     11       "Apoorv Kulshreshtha",
     12       "et al."
     13     ],
     14     "year": 2022,
     15     "venue": "arXiv.org",
     16     "arxiv_id": "2201.08239",
     17     "doi": null
     18   },
     19   "checklist": {
     20     "claims_and_evidence": {
     21       "abstract_claims_supported": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "All abstract claims — 137B parameters, 1.56T word pretraining, fine-tuning improves safety/groundedness — are directly supported by results in Figure 4, Table 28, and Sections 6–8.",
     25         "source": "haiku"
     26       },
     27       "causal_claims_justified": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "Causal claims about fine-tuning's effect are supported by controlled ablations (PT vs. FT quality-safety vs. LaMDA) on the same base model across three sizes, providing adequate evidence for causal attribution within the ML context.",
     31         "source": "haiku"
     32       },
     33       "generalization_bounded": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "The paper bounds results to English dialog, notes the model is not production-ready ('This is not the final version of LaMDA'), and explicitly discusses US-centric safety objectives throughout Section 9.",
     37         "source": "haiku"
     38       },
     39       "alternative_explanations_discussed": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The paper does not discuss whether improvements could stem from data quality differences, crowdworker priming effects, or benchmark familiarity rather than the fine-tuning mechanism; only the intended causal explanation is presented.",
     43         "source": "haiku"
     44       },
     45       "proxy_outcome_distinction": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "Section 4 explicitly acknowledges that the SSI metrics (sensibleness, specificity, interestingness) are proxies for dialog quality and discusses their limitations; Section 9.3 separately addresses safety as a metric vs. safety as a concept.",
     49         "source": "haiku"
     50       }
     51     },
     52     "limitations_and_scope": {
     53       "limitations_section_present": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Section 9 ('Discussion and limitations') spans seven subsections covering bias, adversarial data limitations, safety metric constraints, cultural responsiveness, appropriateness, and impersonation risks.",
     57         "source": "haiku"
     58       },
     59       "threats_to_validity_specific": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Specific threats include: crowdworker pool overrepresented in 25-34 age group, safety objectives are US-centric (Section 9.5), crowdworkers not extensively trained, and the human baseline is weak due to low incentives (Section 7).",
     63         "source": "haiku"
     64       },
     65       "scope_boundaries_stated": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The paper explicitly states results are limited to English dialog, safety objectives apply to US societal context, and frames LaMDA as a research recipe not a production system.",
     69         "source": "haiku"
     70       }
     71     },
     72     "conflicts_of_interest": {
     73       "funding_disclosed": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "No funding statement appears in the paper; while all authors are Google employees, there is no formal funding disclosure or grant acknowledgment.",
     77         "source": "haiku"
     78       },
     79       "affiliations_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "The paper header lists 'Google' as the institutional affiliation for all authors, making the commercial affiliation unambiguous.",
     83         "source": "haiku"
     84       },
     85       "funder_independent_of_outcome": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "All authors are Google employees evaluating Google's proprietary LaMDA product; the organization conducting the evaluation is not independent of the system being evaluated.",
     89         "source": "haiku"
     90       },
     91       "financial_interests_declared": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No competing interests statement, patent declarations, or financial interest disclosures appear anywhere in the paper.",
     95         "source": "haiku"
     96       }
     97     },
     98     "scope_and_framing": {
     99       "key_terms_defined": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Section 4 formally defines sensibleness, specificity, interestingness, safety, groundedness, informativeness, citation accuracy, helpfulness, and role consistency with explicit operationalizations.",
    103         "source": "haiku"
    104       },
    105       "intended_contribution_clear": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper clearly states its contribution: demonstrating that LaMDA fine-tuning with annotated data and external knowledge tools yields significant improvements in safety and factual grounding for dialog.",
    109         "source": "haiku"
    110       },
    111       "engagement_with_prior_work": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Section 2 substantively positions LaMDA against Meena, BlenderBot, GPT-3, RAG, WebGPT, and RETRO, explaining methodological similarities and differences rather than merely listing citations.",
    115         "source": "haiku"
    116       }
    117     }
    118   },
    119   "type_checklist": {
    120     "empirical": {
    121       "artifacts": {
    122         "code_released": {
    123           "applies": true,
    124           "answer": false,
    125           "justification": "No source code is released; LaMDA is a Google proprietary model with no open-source release mentioned in the paper.",
    126           "source": "haiku"
    127         },
    128         "data_released": {
    129           "applies": true,
    130           "answer": false,
    131           "justification": "The fine-tuning datasets (6.4K quality dialogs, 8K safety dialogs, 4K groundedness dialogs) collected specifically for this work are not publicly released.",
    132           "source": "haiku"
    133         },
    134         "environment_specified": {
    135           "applies": true,
    136           "answer": false,
    137           "justification": "Training used TPU-V3 chips and the Lingvo framework, but no Dockerfile, requirements.txt, or version-pinned environment specification is provided.",
    138           "source": "haiku"
    139         },
    140         "reproduction_instructions": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "No step-by-step reproduction instructions are included; architectural details and hyperparameters are described but not at a level enabling reproduction without proprietary infrastructure.",
    144           "source": "haiku"
    145         }
    146       },
    147       "statistical_methodology": {
    148         "confidence_intervals_or_error_bars": {
    149           "applies": true,
    150           "answer": false,
    151           "justification": "All results in Figure 4, Figure 5, and Table 28 are reported as point estimates without confidence intervals or error bars.",
    152           "source": "haiku"
    153         },
    154         "significance_tests": {
    155           "applies": true,
    156           "answer": false,
    157           "justification": "No statistical significance tests are reported for any comparative claims between PT and LaMDA conditions.",
    158           "source": "haiku"
    159         },
    160         "effect_sizes_reported": {
    161           "applies": true,
    162           "answer": true,
    163           "justification": "Table 28 reports absolute percentage values for all metrics across conditions (e.g., safety from 88.0% PT 137B to 95.2% LaMDA 137B), providing effect size context with baseline comparison.",
    164           "source": "haiku"
    165         },
    166         "sample_size_justified": {
    167           "applies": true,
    168           "answer": false,
    169           "justification": "The evaluation dataset sizes (1477 MTB dialogs, 1458 safety turns, 784 groundedness turns) are stated but not justified with power analysis or sample size rationale.",
    170           "source": "haiku"
    171         },
    172         "variance_reported": {
    173           "applies": true,
    174           "answer": false,
    175           "justification": "No variance, standard deviation, or inter-rater reliability statistics are reported for main evaluation results; majority voting is used but agreement rates between raters are not quantified.",
    176           "source": "haiku"
    177         }
    178       },
    179       "evaluation_design": {
    180         "baselines_included": {
    181           "applies": true,
    182           "answer": true,
    183           "justification": "Pre-trained models (PT) at 2B, 8B, and 137B parameter scales serve as baselines, and human crowdworker performance with and without IR tools is used as a reference upper bound.",
    184           "source": "haiku"
    185         },
    186         "baselines_contemporary": {
    187           "applies": true,
    188           "answer": true,
    189           "justification": "Meena (2020) and GPT-3 (2020) are used as comparative references; these were competitive state-of-the-art dialog and language models at the time of publication.",
    190           "source": "haiku"
    191         },
    192         "ablation_study": {
    193           "applies": true,
    194           "answer": true,
    195           "justification": "Figure 5 ablates PT vs. FT quality-safety vs. LaMDA (full fine-tuning), isolating the contribution of quality/safety fine-tuning from groundedness fine-tuning.",
    196           "source": "haiku"
    197         },
    198         "multiple_metrics": {
    199           "applies": true,
    200           "answer": true,
    201           "justification": "Six foundation metrics are reported: sensibleness, specificity, interestingness, safety, groundedness, and informativeness, plus citation accuracy, helpfulness, and role consistency for domain applications.",
    202           "source": "haiku"
    203         },
    204         "human_evaluation": {
    205           "applies": true,
    206           "answer": true,
    207           "justification": "Crowdworker human evaluation is the primary evaluation method for all metrics, with 5 raters per SSI response and 3 raters per safety/groundedness response using majority voting.",
    208           "source": "haiku"
    209         },
    210         "held_out_test_set": {
    211           "applies": true,
    212           "answer": true,
    213           "justification": "Safety evaluation uses a holdout adversarial dataset (1166 dialogs), SSI uses the MTB benchmark (1477 dialogs), and groundedness uses WoW dataset contexts (784 turns), all separate from training data.",
    214           "source": "haiku"
    215         },
    216         "per_category_breakdown": {
    217           "applies": true,
    218           "answer": true,
    219           "justification": "Results are broken down by model size (2B, 8B, 137B), fine-tuning stage, and application domain (Everest vs. Music in Table 5), enabling granular comparison.",
    220           "source": "haiku"
    221         },
    222         "failure_cases_discussed": {
    223           "applies": true,
    224           "answer": true,
    225           "justification": "Section 9 and Tables 11–26 discuss and show failure modes: factually incorrect statements (Table 16: Gagarin moon rock error), unsafe PT responses (Table 11), domain application failures (Table 6), and broken links (~7% in music app).",
    226           "source": "haiku"
    227         },
    228         "negative_results_reported": {
    229           "applies": true,
    230           "answer": true,
    231           "justification": "The paper reports that scaling alone does not significantly improve safety (Figure 4), LaMDA produces ~30% ungrounded responses in domain applications (Section 8), and complex reasoning remains unsolved (Section 9).",
    232           "source": "haiku"
    233         }
    234       },
    235       "setup_transparency": {
    236         "model_versions_specified": {
    237           "applies": true,
    238           "answer": true,
    239           "justification": "Exact model architecture details are provided: 137B non-embedding parameters, 64 layers, dmodel=8192, dff=65536, h=128, relative attention, gated-GELU, with full hyperparameter table (Table 27).",
    240           "source": "haiku"
    241         },
    242         "prompts_provided": {
    243           "applies": true,
    244           "answer": true,
    245           "justification": "Crowdworker instructions are provided in full (Appendix A.2, B), domain preconditioning prompts are shown in dialog tables, and the fine-tuning input/output format is described with concrete examples.",
    246           "source": "haiku"
    247         },
    248         "hyperparameters_reported": {
    249           "applies": true,
    250           "answer": true,
    251           "justification": "Table 27 provides per-model hyperparameters (layers, units, heads, training steps, chips, training time), and Section 3 specifies top-k (k=40) sampling and batch size (256K tokens).",
    252           "source": "haiku"
    253         },
    254         "scaffolding_described": {
    255           "applies": true,
    256           "answer": true,
    257           "justification": "Section 6.2 and Figure 3 describe the Base→Research model pipeline in detail, including toolset routing logic, how output direction (TS vs. user) is determined, and the maximum query loop constraint.",
    258           "source": "haiku"
    259         },
    260         "data_preprocessing_documented": {
    261           "applies": true,
    262           "answer": true,
    263           "justification": "Appendix E documents pretraining data composition (50% dialog forums, 12.5% C4, etc.), SentencePiece tokenization, and Section 5 details fine-tuning data collection procedures.",
    264           "source": "haiku"
    265         }
    266       },
    267       "data_integrity": {
    268         "raw_data_available": {
    269           "applies": true,
    270           "answer": false,
    271           "justification": "Raw crowdworker annotation data (48K safety turns, 121K quality turns, 40K groundedness turns) is not publicly released.",
    272           "source": "haiku"
    273         },
    274         "data_collection_described": {
    275           "applies": true,
    276           "answer": true,
    277           "justification": "Section 5 and Appendix A.2 provide detailed descriptions of crowdworker data collection including dialog generation protocols (natural/sensitive/adversarial), annotation task design, and UI screenshots (Figures 6–9).",
    278           "source": "haiku"
    279         },
    280         "recruitment_methods_described": {
    281           "applies": true,
    282           "answer": true,
    283           "justification": "Appendix A.2 describes participant recruitment (mix of employees, volunteers, and vendor-supplied crowdworkers) and Appendix A.3 provides detailed demographic distributions for both crowdworker pools.",
    284           "source": "haiku"
    285         },
    286         "data_pipeline_documented": {
    287           "applies": true,
    288           "answer": true,
    289           "justification": "The pipeline from conversation collection through annotation to discriminator fine-tuning is described in Sections 5–6, including filtering of 2.5M pre-training turns to 800K high-quality turns using LaMDA discriminators.",
    290           "source": "haiku"
    291         }
    292       },
    293       "contamination": {
    294         "training_cutoff_stated": {
    295           "applies": true,
    296           "answer": false,
    297           "justification": "The paper does not state a training data cutoff date; pretraining data is described as 'public dialog data and web documents' without temporal bounds.",
    298           "source": "haiku"
    299         },
    300         "train_test_overlap_discussed": {
    301           "applies": true,
    302           "answer": false,
    303           "justification": "The WoW evaluation dataset (published 2019) and MTB benchmark (2020) likely appear in the 1.56T-word pretraining corpus, but potential train/test overlap is never acknowledged or discussed.",
    304           "source": "haiku"
    305         },
    306         "benchmark_contamination_addressed": {
    307           "applies": true,
    308           "answer": false,
    309           "justification": "Both primary evaluation benchmarks (WoW 2019, MTB 2020) predate the 2022 paper and could therefore have been present in LaMDA's pretraining data (no training cutoff is stated); no discussion of whether benchmark examples appeared in training data is provided.",
    310           "source": "haiku"
    311         }
    312       },
    313       "human_studies": {
    314         "pre_registered": {
    315           "applies": true,
    316           "answer": false,
    317           "justification": "No pre-registration is mentioned for any of the crowdworker studies.",
    318           "source": "haiku"
    319         },
    320         "irb_or_ethics_approval": {
    321           "applies": true,
    322           "answer": false,
    323           "justification": "No IRB or ethics board approval is mentioned; Appendix A.2 describes consent forms for participants but no formal ethics review process.",
    324           "source": "haiku"
    325         },
    326         "demographics_reported": {
    327           "applies": true,
    328           "answer": true,
    329           "justification": "Tables 8 and 9 (Appendix A.3) provide detailed demographic breakdowns by gender, age, ethnicity, education, disability, and sexual orientation for both crowdworker pools.",
    330           "source": "haiku"
    331         },
    332         "inclusion_exclusion_criteria": {
    333           "applies": true,
    334           "answer": true,
    335           "justification": "Participants were required to be US-based, complete 5–10 exchanges per session, use English, and consent to participation; Appendix A.2 documents these criteria explicitly.",
    336           "source": "haiku"
    337         },
    338         "randomization_described": {
    339           "applies": true,
    340           "answer": false,
    341           "justification": "No randomization of dialog or response assignment to crowdworkers is described; how annotation tasks were distributed across the rater pool is not explained.",
    342           "source": "haiku"
    343         },
    344         "blinding_described": {
    345           "applies": true,
    346           "answer": false,
    347           "justification": "The paper does not describe whether crowdworkers evaluating model responses were blind to which model condition (PT vs. LaMDA) generated the response.",
    348           "source": "haiku"
    349         },
    350         "attrition_reported": {
    351           "applies": true,
    352           "answer": false,
    353           "justification": "No crowdworker attrition or dropout rates are reported for any of the annotation tasks.",
    354           "source": "haiku"
    355         }
    356       },
    357       "cost_and_practicality": {
    358         "inference_cost_reported": {
    359           "applies": true,
    360           "answer": false,
    361           "justification": "Section 10 reports training cost and carbon footprint but no inference cost, latency, or serving resource requirements are provided.",
    362           "source": "haiku"
    363         },
    364         "compute_budget_stated": {
    365           "applies": true,
    366           "answer": true,
    367           "justification": "Section 10 provides detailed compute budget: 1024 TPU-V3 chips for 57.7 days, 451 MWh energy, 26 tCO2e carbon footprint, with comparison to GPT-3 training costs.",
    368           "source": "haiku"
    369         }
    370       }
    371     }
    372   },
    373   "claims": [
    374     {
    375       "claim": "Fine-tuning with crowdworker-annotated data significantly improves safety and groundedness beyond what model scaling alone achieves",
    376       "evidence": "Table 28: PT 137B safety 88.0% vs. LaMDA 137B 95.2%; groundedness 57.9% vs. 73.2%; Figure 4 shows safety plateauing across 2B–137B PT scaling",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "Model scaling alone improves quality metrics (SSI) but has negligible effect on safety",
    381       "evidence": "Figure 4 shows safety across PT 2B/8B/137B as 84.8/87.5/88.0%, minimal improvement, while sensibleness improves 76.6→80.2%; Section 7 explicitly states this finding",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "Enabling LaMDA to consult external knowledge tools achieves 73.2% groundedness and 65% citation accuracy",
    386       "evidence": "Table 28 reports LaMDA 137B groundedness at 73.2% and informativeness at 62.3%; Section 7 reports 65% citation accuracy for the FT groundedness model",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "LaMDA domain applications are significantly more helpful than pre-trained model applications",
    391       "evidence": "Table 5: LaMDA Everest 65% vs. PT Everest 18% helpful; LaMDA Music 57% vs. PT Music 31%, across 600 crowdworker-evaluated dialog turns",
    392       "supported": "strong"
    393     },
    394     {
    395       "claim": "Significant quality improvement is achievable by fine-tuning on less than 0.001% of the pre-training data volume",
    396       "evidence": "Section 9 states this explicitly; 6.4K quality + 8K safety dialogs used for fine-tuning vs. 1.56T word pretraining corpus; no formal analysis of this ratio is provided",
    397       "supported": "moderate"
    398     },
    399     {
    400       "claim": "LaMDA 137B narrows the gap to human-level sensibleness, achieving 92.3% vs. human crowdworker baseline",
    401       "evidence": "Table 28 confirms 92.3% sensibleness; however Section 7 acknowledges the human baseline is weak (low-incentive crowdworkers), undermining the comparison",
    402       "supported": "weak"
    403     }
    404   ],
    405   "methodology_tags": [
    406     "benchmark-eval",
    407     "observational",
    408     "case-study"
    409   ],
    410   "key_findings": "LaMDA demonstrates that scaling pre-trained dialog models alone is insufficient for safety and factual grounding: safety shows minimal improvement (84.8%→88.0%) as parameters scale 2B→137B, while fine-tuning raises it to 95.2%. Fine-tuning with less than 0.001% of pre-training data volume achieves significant gains across quality, safety, and groundedness. Enabling the model to consult external tools (information retrieval, calculator, translator) achieves 73.2% groundedness and 65% citation accuracy for factual dialog claims. Domain-specific preconditioning yields role-consistent agents with dramatically higher helpfulness than pre-training alone (65% vs. 18% for the Everest education application), demonstrating the power of modest prompt-based adaptation.",
    411   "red_flags": [
    412     {
    413       "flag": "No error bars or significance tests",
    414       "detail": "All main results in Table 28 and Figures 4–5 are single point estimates with no confidence intervals, standard deviations, or statistical significance tests for any comparative claim."
    415     },
    416     {
    417       "flag": "Self-evaluation by product team",
    418       "detail": "All 60+ authors are Google employees evaluating Google's proprietary LaMDA system with no independent external evaluation, creating a strong unacknowledged conflict of interest."
    419     },
    420     {
    421       "flag": "Weak human baseline presented without adequate caveat",
    422       "detail": "The paper acknowledges that the crowdworker baseline is weak ('crowdworkers are not extensively trained and were not incentivized'), yet LaMDA 'exceeding human level' on interestingness is presented as a headline result without sufficient qualification."
    423     },
    424     {
    425       "flag": "Benchmark contamination unaddressed",
    426       "detail": "Both primary evaluation benchmarks (WoW 2019, MTB 2020) predate LaMDA's training and were likely in the 1.56T-word pretraining corpus; potential data leakage is never acknowledged."
    427     },
    428     {
    429       "flag": "Cherry-picked qualitative examples",
    430       "detail": "Section 8 and Tables 3–4 explicitly note examples are 'cherry-picked' without systematic analysis of representative or failure-mode distributions."
    431     },
    432     {
    433       "flag": "No code or data release",
    434       "detail": "Despite 60+ authors and Google's infrastructure, no model weights, training data, fine-tuning datasets, or evaluation tools are released, making independent replication impossible."
    435     }
    436   ],
    437   "cited_papers": [
    438     {
    439       "title": "Towards a Human-like Open-Domain Chatbot (Meena/Adiwardana et al. 2020)",
    440       "relevance": "Primary dialog baseline; introduces SSA metric that LaMDA extends; main scale comparison reference for parameters and training data size"
    441     },
    442     {
    443       "title": "Language Models are Few-Shot Learners (GPT-3/Brown et al. 2020)",
    444       "relevance": "Primary scaling comparison; LaMDA compares parameter count, training FLOPs, energy use, and carbon footprint against GPT-3"
    445     },
    446     {
    447       "title": "Scaling Laws for Neural Language Models (Kaplan et al. 2020)",
    448       "relevance": "Theoretical foundation for LaMDA's scaling experiments; motivates 2B/8B/137B parameter comparison"
    449     },
    450     {
    451       "title": "Recipes for Building an Open-Domain Chatbot (BlenderBot/Roller et al. 2020)",
    452       "relevance": "Contemporary dialog model baseline; LaMDA compares fine-tuning for interestingness and safety filtering approaches"
    453     },
    454     {
    455       "title": "Retrieval Augmentation Reduces Hallucination in Conversation (Shuster et al. 2021)",
    456       "relevance": "Direct predecessor for LaMDA's groundedness approach; establishes that retrieval reduces hallucination in dialog systems"
    457     },
    458     {
    459       "title": "Ethical and Social Risks of Harm from Language Models (Weidinger et al. 2021)",
    460       "relevance": "Framework for 21 risk categories informing LaMDA's safety objective design; cited as comprehensive risk landscape reference"
    461     },
    462     {
    463       "title": "WebGPT: Browser-assisted Question-answering with Human Feedback (Nakano et al. 2021)",
    464       "relevance": "Closely related grounding approach; LaMDA compares its post-generation grounding vs. WebGPT's browser-interaction paradigm"
    465     },
    466     {
    467       "title": "Wizard of Wikipedia: Knowledge-Powered Conversational Agents (Dinan et al. 2019)",
    468       "relevance": "Source of 784-turn groundedness evaluation dataset; methodological predecessor for knowledge-grounded dialog evaluation"
    469     },
    470     {
    471       "title": "Internet-Augmented Dialogue Generation (Komeili et al. 2021)",
    472       "relevance": "Direct comparison paper with similar search-augmented dialog approach; LaMDA distinguishes its post-generation grounding from Komeili's pre-generation encoding"
    473     }
    474   ],
    475   "engagement_factors": {
    476     "practical_relevance": {
    477       "score": 3,
    478       "justification": "LaMDA directly influenced Google Bard/Gemini and established the fine-tuning + tool-use paradigm now standard in production dialog systems."
    479     },
    480     "surprise_contrarian": {
    481       "score": 2,
    482       "justification": "The empirical demonstration that scaling alone fails to improve safety — contradicting naive 'scaling solves alignment' assumptions — was notable and widely cited."
    483     },
    484     "fear_safety": {
    485       "score": 2,
    486       "justification": "The paper directly addresses AI safety for dialog (toxicity, bias, misinformation) and LaMDA subsequently became the center of the 'sentient AI' controversy involving a Google engineer."
    487     },
    488     "drama_conflict": {
    489       "score": 2,
    490       "justification": "LaMDA became publicly controversial when a Google engineer claimed the model was sentient, generating significant media coverage and AI consciousness debates far beyond the paper's scope."
    491     },
    492     "demo_ability": {
    493       "score": 2,
    494       "justification": "LaMDA itself is not publicly accessible, but the grounding examples (real-time stock prices, Wikipedia citations) and successor products (Bard) make capabilities demonstrable."
    495     },
    496     "brand_recognition": {
    497       "score": 3,
    498       "justification": "Google Research paper with 60+ authors including Noam Shazeer, Quoc Le, and Ray Kurzweil, presenting a direct predecessor to Google's production AI assistant products."
    499     }
    500   },
    501   "hn_data": {
    502     "threads": [
    503       {
    504         "hn_id": "30315604",
    505         "title": "Joke written by an AI: “A basic program walked into a bar ”",
    506         "points": 309,
    507         "comments": 136,
    508         "url": "https://news.ycombinator.com/item?id=30315604",
    509         "created_at": "2022-02-12T19:39:00Z"
    510       },
    511       {
    512         "hn_id": "31991217",
    513         "title": "Open-Source LaMDA Model",
    514         "points": 27,
    515         "comments": 0,
    516         "url": "https://news.ycombinator.com/item?id=31991217",
    517         "created_at": "2022-07-05T17:35:01Z"
    518       },
    519       {
    520         "hn_id": "30057882",
    521         "title": "LaMDA: Language Models for Dialog Applications",
    522         "points": 9,
    523         "comments": 0,
    524         "url": "https://news.ycombinator.com/item?id=30057882",
    525         "created_at": "2022-01-24T14:23:01Z"
    526       },
    527       {
    528         "hn_id": "30021052",
    529         "title": "A Brief Analysis of the Apollo Guidance Computer [pdf]",
    530         "points": 5,
    531         "comments": 0,
    532         "url": "https://news.ycombinator.com/item?id=30021052",
    533         "created_at": "2022-01-21T09:13:06Z"
    534       }
    535     ],
    536     "top_points": 309,
    537     "total_points": 350,
    538     "total_comments": 136
    539   }
    540 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs