ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (30591B)


      1 {
      2   "paper": {
      3     "title": "PaLM: Scaling Language Modeling with Pathways",
      4     "authors": ["Aakanksha Chowdhery", "Sharan Narang", "Jacob Devlin", "Maarten Bosma", "Gaurav Mishra", "Adam Roberts", "Paul Barham", "Hyung Won Chung", "Charles Sutton", "Sebastian Gehrmann"],
      5     "year": 2022,
      6     "venue": "Journal of Machine Learning Research",
      7     "arxiv_id": "2204.02311"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "methodology_tags": ["benchmark-eval"],
     12   "key_findings": "PaLM, a 540B parameter dense Transformer trained on 780B tokens using Pathways across 6144 TPU v4 chips, achieves state-of-the-art few-shot results on 28/29 English NLP benchmarks, outperforms average human performance on BIG-bench (in aggregate on 58 common tasks, and on 65% of individual tasks), and matches or exceeds finetuned SOTA on reasoning tasks when combined with chain-of-thought prompting. The paper documents discontinuous improvements from scale on ~25% of BIG-bench tasks, demonstrates strong multilingual and code capabilities despite limited non-English training data (~22%), and provides comprehensive analysis of memorization, data contamination, bias, and toxicity.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "No source code or model weights were released. The paper references internal infrastructure (JAX, T5X, Pathways) but provides no public repository for reproducing the experiments."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "The training dataset is not released. It is described as based on LaMDA and GLaM datasets with modifications. Evaluation uses public benchmarks (BIG-bench, SuperGLUE, etc.) which are available, but the training data itself is proprietary."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "The paper states training used JAX, T5X, and TPU v4 chips, but provides no dependency versions, requirements files, or sufficient detail to recreate the software environment."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No step-by-step reproduction instructions are provided. The training setup is described at a high level (Section 5) but without scripts or commands to replicate."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "Results are reported as point estimates across all benchmark tables (Tables 4, 6, 10, 12, etc.) with no confidence intervals or error bars."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "No statistical significance tests are reported despite numerous claims that PaLM 'outperforms' prior models. Comparisons rest solely on differences between point estimates."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "The paper consistently reports absolute performance numbers with baselines for context (e.g., Table 4 shows prior SOTA alongside PaLM scores, Table 10 shows GSM8K accuracy of 58% vs prior 55%). Percentage improvements and absolute differences are calculable from the comprehensive tables."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No justification is given for the number of evaluation examples used. For the multilingual NLG tasks, the paper mentions uniformly sampling 5,000 test examples (Section 6.6) but provides no justification for this choice."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "Section 6.8 and Figure 17 show variance across checkpoints for a few tasks, revealing significant variation on WebQuestions. However, main results (Tables 4, 6, 10, 12) report single-run numbers with no variance or standard deviation."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Extensive baselines are included: GPT-3, GLaM, Gopher, Chinchilla, Megatron-Turing NLG, LaMDA, Codex, T5, and others across all evaluation sections. Table 4 compares against 6 prior large LMs."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Baselines include Chinchilla (2022), Gopher (2021), Megatron-Turing NLG (2022), and GLaM (2021), which were contemporary state-of-the-art at the time. The paper also compares with Chinchilla in Section 13."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The paper trains 8B, 62B, and 540B models identically to study scale effects. Chain-of-thought vs direct prompting ablation is shown in Figure 10. Calculator vs no-calculator ablation for GSM8K (Table 10). Few-shot vs finetuning comparison (Table 8). Training longer ablation in Appendix F."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Multiple metrics are used: EM and F1 for QA tasks, BLEU for translation, ROUGE-2/ROUGE-L/BLEURT-20 for generation (Section 6.6), pass@1 and pass@k for code tasks, accuracy for classification. Different tasks use appropriate metrics."
     83       },
     84       "human_evaluation": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "BIG-bench includes human performance baselines (both average and best) from crowdworkers (Section 6.2). The paper compares PaLM against these human baselines across 150 tasks. Winogender uses human performance baseline of 95.9% (Section 10.1.1)."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Results are reported on standard test splits. Section 6.1 uses the same splits as Du et al. (2021) and Brown et al. (2020). Table 9 reports SuperGLUE test set results. MMLU uses test set (Table 6). Section 8 analyzes data contamination."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Extensive per-task breakdowns: Table 4 (29 individual NLP tasks), Table 6 (MMLU by category), Figure 5 (individual BIG-bench tasks), Figure 7 (24 BIG-bench Lite tasks), Table 12 (individual code tasks), Table 14 (translation by language pair)."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Section 6.2 discusses tasks where PaLM fails (navigate, mathematical_induction). Figure 9 categorizes GSM8K errors. Figure 6 shows 35% of BIG-bench tasks where humans outperform PaLM. Section 10 discusses bias/toxicity failures. Table 43 lists tasks where humans significantly outperform PaLM."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Several negative results: parallel layers showed quality degradation at 8B scale (Section 2); 1-shot CoQA is worse than prior SOTA (Table 4); training instability spikes occurred ~20 times (Section 5.1); toxicity increases with scale (Section 10.2); navigate and mathematical_induction show flat scaling (Figure 5c)."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Abstract claims of SOTA on hundreds of benchmarks, outperforming human BIG-bench average, and breakthrough reasoning performance are all supported by Table 4, Figure 3, and Section 6.3 respectively."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Causal claims about scaling effects are supported by controlled comparisons: three model sizes trained identically on the same data (Section 2.1). Chain-of-thought ablation (Table 10) isolates prompting technique from model scale. The paper is appropriately cautious, noting confounds between factors (Section 13)."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Section 13 explicitly discusses open questions about scaling trade-offs. Section 10.3 acknowledges fairness evaluations are limited to English. Section 11 discusses limitations of benchmark evaluations. The paper avoids overly broad claims, consistently qualifying results to specific benchmarks and settings."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Section 13 discusses alternative scaling strategies (training smaller models longer, as Chinchilla does). Section 8 addresses data contamination as an alternative explanation. Section 7 discusses memorization. Section 5.1 discusses training instability. The paper acknowledges training corpus quality as a confound."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "The paper reports specific benchmark metrics (accuracy, F1, BLEU, pass@k) without overclaiming broader capabilities. Section 11 and Appendix E explicitly discuss that benchmark performance may not reflect real-world capabilities. Section 6.4 Discussion acknowledges the gap between pass@k and functional correctness."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "PaLM model sizes (8B, 62B, 540B) are fully specified with architecture details in Table 1. For comparison models, specific versions are cited (GPT-3 175B, Codex 12B, Gopher 280B, etc.). The paper uses 'code-davinci-001' for Codex API (Section 6.4)."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Chain-of-thought prompts are shown in Figures 1, 8, and Section 9. Few-shot exemplars for joke explanation and logical inference are provided. Code task prompts are shown in Figure 11. Translation and summarization prompts are described in Section 6.6 with actual text."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Section 5 provides comprehensive hyperparameters: learning rate (10^-2 with decay), optimizer (Adafactor), momentum (β1=0.9), gradient clipping (1.0), batch size schedule, sequence length (2048), weight initialization, dropout settings, and finetuning hyperparameters (Section 6.1.2). Code inference uses nucleus sampling p=0.95, temperature 0.8 (Section 6.4)."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding is used. PaLM is evaluated via direct prompting (few-shot) and finetuning, not through an agentic pipeline."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Section 3 describes data preprocessing: web page quality filtering with classifier, code deduplication via Levenshtein distance, programming language filtering by extension, license filtering for code. Table 2 gives mixing proportions. Appendix D provides a datasheet. SentencePiece tokenization is detailed in Section 2."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section 10.3 'Limitations' discusses limitations of bias analysis. Section 11 'Ethical Considerations' substantively discusses risks and limitations. Section 13 'Open Questions in Scaling' discusses what the study doesn't resolve."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Specific threats discussed: fairness analyses limited to English only (Section 10.3), bias benchmarks may not be portable to non-Western contexts (citing Sambasivan et al. 2021), benchmark contamination analyzed per-task (Section 8), checkpoint variance affecting results (Section 6.8, Figure 17)."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 10.3 explicitly states fairness evaluations are limited to English. Section 13 states that conclusions about compute-optimal scaling cannot be drawn without ablation studies. Section 11 acknowledges benchmarks may not capture real-world performance. The paper explicitly does not claim to resolve optimal scaling (Section 13)."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "Neither the training data nor raw evaluation outputs are released. Only aggregated benchmark results are reported."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section 3 describes training data collection: web pages filtered by quality classifier, books, Wikipedia, news, social media conversations, and GitHub code filtered by license and programming language. Table 2 gives proportions. Appendix D provides a detailed datasheet."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants were recruited for this study. BIG-bench human baselines were collected by the BIG-bench collaboration, not by the PaLM authors. Data sources are standard public corpora/benchmarks."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "The training data pipeline is documented: source selection, quality filtering, deduplication, license filtering for code, language distribution (Table 29), and mixing proportions (Table 2). The datasheet in Appendix D provides additional pipeline details."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No explicit funding disclosure. All authors are from Google Research, implying corporate funding, but no formal funding statement is provided."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "All authors are listed as Google Research. Some are noted as Alphabet X or 'work done while at Google.' The affiliation is clearly stated on the first page."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "Google has a direct commercial interest in demonstrating the capabilities of large language models trained on its infrastructure (TPU v4, Pathways). The funder is not independent of the outcome."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests statement or financial interest disclosure is provided. Google employees evaluating Google's infrastructure and models have potential financial interests that are not declared."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": true,
    229         "justification": "Appendix D states data was collected 2019-2021. Section 11 mentions 'spanning from very old texts to late 2021.' This establishes the training data timeframe."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": true,
    234         "justification": "Section 8 provides thorough data contamination analysis: categorizes 29 benchmarks into four contamination categories, reports clean vs full subset performance for 10 partially contaminated datasets (Table 18), and performs similar analysis for translation (Table 19)."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": true,
    239         "justification": "Section 8 addresses contamination comprehensively: splits datasets into contaminated/clean subsets using 8-gram overlap, reports performance on clean subsets showing similar or better performance (Tables 18-19). BIG-bench canary string verified absent from training data (Section 6.2). Reasoning evaluation sets checked via n-gram overlap (Section 6.3.1)."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human subjects study was conducted. BIG-bench human baselines were collected by the BIG-bench collaboration separately."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human subjects study was conducted by the PaLM authors."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human subjects study was conducted by the PaLM authors."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human subjects study was conducted by the PaLM authors."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human subjects study was conducted by the PaLM authors."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human subjects study was conducted by the PaLM authors."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human subjects study was conducted by the PaLM authors."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "No inference cost or latency figures are reported for the evaluation runs despite the enormous model size (540B parameters)."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": true,
    288         "justification": "Appendix B reports compute usage: PaLM 540B trained on 6144 TPU v4 chips for 1200 hours plus 3072 chips for 336 hours, totaling 2.56 × 10^24 FLOPs (29,600 PetaFLOP/s-days). Environmental impact: 271.43 tCO2e. Table 22 provides FLOP counts for all three model sizes."
    289       }
    290     },
    291     "experimental_rigor": {
    292       "seed_sensitivity_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "Section 5 mentions bitwise determinism from checkpoints, but no multi-seed experiments are reported. Section 6.8 shows checkpoint variance (Figure 17) but this is variation across training checkpoints, not across random seeds."
    296       },
    297       "number_of_runs_stated": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "Results appear to be single-run. The paper does not state the number of runs for evaluation. Memorization analysis uses greedy decoding (Section 7). Toxicity analysis uses 25 continuations per prompt (Section 10.2), but main benchmark results are single-run."
    301       },
    302       "hyperparameter_search_budget": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "No hyperparameter search budget is reported. The paper mentions architecture choices (parallel layers, SwiGLU, multi-query attention) and some ablations, but does not report how many configurations were tried or the compute spent on search."
    306       },
    307       "best_config_selection_justified": {
    308         "applies": true,
    309         "answer": true,
    310         "justification": "Architecture choices are justified through ablation experiments (Section 2: parallel layers quality-neutral at 62B, multi-query attention quality-neutral). Finetuning checkpoints selected by validation performance (Section 6.6.1). Few-shot prompts follow prior work setups."
    311       },
    312       "multiple_comparison_correction": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "No multiple comparison correction is applied despite comparing across 29+ NLP benchmarks, 150 BIG-bench tasks, and multiple other evaluation suites. No statistical tests are performed at all."
    316       },
    317       "self_comparison_bias_addressed": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "The paper does not acknowledge the bias of Google evaluating its own model. Comparisons against other models use published numbers from those models' papers, but no independent evaluation is conducted."
    321       },
    322       "compute_budget_vs_performance": {
    323         "applies": true,
    324         "answer": true,
    325         "justification": "Section 13 and Figure 24 compare performance as a function of total training FLOP count across PaLM, Chinchilla, and Gopher. Table 21 provides FLOP counts alongside performance. The discussion explicitly addresses compute-efficiency trade-offs."
    326       },
    327       "benchmark_construct_validity": {
    328         "applies": true,
    329         "answer": true,
    330         "justification": "Section 11 discusses that 'benchmark evaluations are often deficient in construct validity' (citing Raji et al. 2021). Section 6.4 Discussion acknowledges that pass@k based on small test suites can overestimate performance. The paper questions what benchmarks actually measure."
    331       },
    332       "scaffold_confound_addressed": {
    333         "applies": false,
    334         "answer": false,
    335         "justification": "No scaffolding is used. PaLM is evaluated through direct prompting and finetuning, not through agentic scaffolds."
    336       }
    337     },
    338     "data_leakage": {
    339       "temporal_leakage_addressed": {
    340         "applies": true,
    341         "answer": true,
    342         "justification": "Section 8 explicitly addresses temporal issues: training data was collected 2019-2021, and BIG-bench was not available on the Internet at data collection time. Section 6.2 verifies that the BIG-bench canary string is absent from the training data. Reasoning datasets were verified clean via n-gram overlap."
    343       },
    344       "feature_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "The paper does not discuss whether the few-shot prompting setup or evaluation harnesses provide information that would not be available in real usage scenarios."
    348       },
    349       "non_independence_addressed": {
    350         "applies": true,
    351         "answer": true,
    352         "justification": "Section 8 performs 8-gram overlap analysis between training data and evaluation sets, categorizing benchmarks into contamination types (wholesale, constructed from web, context on web, no significant overlap). Tables 18-19 compare clean vs full subset performance."
    353       },
    354       "leakage_detection_method": {
    355         "applies": true,
    356         "answer": true,
    357         "justification": "Section 8 uses 8-gram overlap analysis to detect contamination. Section 6.2 checks for BIG-bench canary string in training data. Section 6.3.1 verifies no n-gram overlap between training corpus and reasoning evaluation sets. Section 6.2 spot-checks model inputs/outputs for information leakage from gold labels."
    358       }
    359     }
    360   },
    361   "claims": [
    362     {
    363       "claim": "PaLM 540B achieves state-of-the-art few-shot results on 28 out of 29 widely evaluated English NLP benchmarks",
    364       "evidence": "Table 4 shows PaLM 540B outperforming prior SOTA on 28/29 tasks in few-shot setting, compared against GLaM, GPT-3, Megatron-Turing NLG, Gopher, Chinchilla, and LaMDA.",
    365       "supported": "strong"
    366     },
    367     {
    368       "claim": "PaLM 540B 5-shot outperforms average human performance on BIG-bench",
    369       "evidence": "Figure 3-left shows PaLM 540B 5-shot aggregate score exceeding average human performance on 58 common tasks. Figure 6 shows PaLM outperforms average humans on 65% of individual tasks.",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "Chain-of-thought prompting with PaLM 540B outperforms finetuned SOTA on GSM8K",
    374       "evidence": "Table 10: PaLM 540B + chain-of-thought + calculator achieves 58% vs prior SOTA of 55% (GPT-3 + finetuning + chain-of-thought + calculator + verifier).",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "Approximately 25% of BIG-bench tasks show discontinuous improvements from scale",
    379       "evidence": "Section 6.2 defines discontinuity metric and reports 25% of 150 tasks had discontinuity > +10% and 15% had > +20%. Examples shown in Figure 5b (english_proverbs, logical_sequence).",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "PaLM achieves 46.2% model FLOPs utilization, significantly higher than prior work",
    384       "evidence": "Table 3 compares MFU: PaLM 46.2% vs GPT-3 21.3%, Gopher 32.5%, Megatron-Turing NLG 30.2%. Appendix B provides detailed calculation.",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "Scaling improvements have not plateaued for large language models",
    389       "evidence": "Figure 3 shows log-linear scaling on BIG-bench. Table 4 shows consistent improvements from 62B to 540B. Section 13 compares against Chinchilla showing continued improvements with compute.",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "PaLM outperforms supervised translation baselines on German-English and Romanian-English",
    394       "evidence": "Table 14 shows PaLM 540B few-shot BLEU of 47.5 (de-en) vs finetuned SOTA 41.2, and 43.8 (ro-en) vs 39.1. However, these supervised baselines may be outdated as noted by the authors.",
    395       "supported": "moderate"
    396     }
    397   ],
    398   "red_flags": [
    399     {
    400       "flag": "Company evaluating its own product",
    401       "detail": "All authors are Google employees evaluating a Google model trained on Google infrastructure (TPU v4, Pathways). Google has direct commercial interest in demonstrating scaling benefits and infrastructure capabilities. No independent evaluation is included."
    402     },
    403     {
    404       "flag": "No statistical significance testing",
    405       "detail": "Despite hundreds of comparisons across dozens of benchmarks, no statistical significance tests, confidence intervals, or error bars are reported for the main results. Claims of 'outperforming' are based on comparing point estimates."
    406     },
    407     {
    408       "flag": "Single-run results",
    409       "detail": "Main benchmark results appear to be single-run evaluations. Section 6.8 shows significant variance across checkpoints for some tasks (WebQuestions in Figure 17), suggesting reported results may be sensitive to which training checkpoint is evaluated."
    410     },
    411     {
    412       "flag": "Potentially outdated translation baselines",
    413       "detail": "The paper acknowledges that supervised translation baselines 'might be outdated' (Section 6.5) when claiming PaLM outperforms supervised models on some language pairs."
    414     }
    415   ],
    416   "cited_papers": [
    417     {
    418       "title": "Language Models are Few-Shot Learners",
    419       "authors": ["Tom Brown", "Benjamin Mann", "Nick Ryder"],
    420       "year": 2020,
    421       "relevance": "GPT-3 paper establishing the few-shot evaluation paradigm for large language models that PaLM extends."
    422     },
    423     {
    424       "title": "Scaling Laws for Neural Language Models",
    425       "authors": ["Jared Kaplan", "Sam McCandlish", "Tom Henighan"],
    426       "year": 2020,
    427       "arxiv_id": "2001.08361",
    428       "relevance": "Establishes power-law scaling relationships for LMs that PaLM's discontinuous improvements challenge."
    429     },
    430     {
    431       "title": "Training Compute-Optimal Large Language Models",
    432       "authors": ["Jordan Hoffmann", "Sebastian Borgeaud", "Arthur Mensch"],
    433       "year": 2022,
    434       "arxiv_id": "2203.15556",
    435       "relevance": "Chinchilla paper arguing for training smaller models longer, directly compared against PaLM in Section 13."
    436     },
    437     {
    438       "title": "Chain of Thought Prompting Elicits Reasoning in Large Language Models",
    439       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    440       "year": 2022,
    441       "arxiv_id": "2201.11903",
    442       "relevance": "Chain-of-thought prompting technique that PaLM combines with scale to achieve breakthrough reasoning results."
    443     },
    444     {
    445       "title": "Evaluating Large Language Models Trained on Code",
    446       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    447       "year": 2021,
    448       "arxiv_id": "2107.03374",
    449       "relevance": "Codex/HumanEval paper establishing code generation benchmarks that PaLM is evaluated on."
    450     },
    451     {
    452       "title": "Program Synthesis with Large Language Models",
    453       "authors": ["Jacob Austin", "Augustus Odena", "Maxwell Nye"],
    454       "year": 2021,
    455       "arxiv_id": "2108.07732",
    456       "relevance": "MBPP benchmark and code synthesis evaluation methodology used in PaLM's code evaluation."
    457     },
    458     {
    459       "title": "Scaling Language Models: Methods, Analysis & Insights from Training Gopher",
    460       "authors": ["Jack W. Rae", "Sebastian Borgeaud", "Trevor Cai"],
    461       "year": 2021,
    462       "arxiv_id": "2112.11446",
    463       "relevance": "Gopher paper providing baselines and evaluation methodology that PaLM extends."
    464     },
    465     {
    466       "title": "GLaM: Efficient Scaling of Language Models with Mixture-of-Experts",
    467       "authors": ["Nan Du", "Yanping Huang", "Andrew M. Dai"],
    468       "year": 2021,
    469       "arxiv_id": "2112.06905",
    470       "relevance": "GLaM sparse model providing baselines for PaLM evaluation; PaLM's training dataset is based on GLaM's."
    471     },
    472     {
    473       "title": "Quantifying Memorization Across Neural Language Models",
    474       "authors": ["Nicholas Carlini", "Daphne Ippolito", "Matthew Jagielski"],
    475       "year": 2022,
    476       "arxiv_id": "2202.07646",
    477       "relevance": "Memorization analysis methodology applied to PaLM in Section 7."
    478     },
    479     {
    480       "title": "Finetuned Language Models Are Zero-Shot Learners",
    481       "authors": ["Jason Wei", "Maarten Bosma", "Vincent Y. Zhao"],
    482       "year": 2022,
    483       "relevance": "FLAN instruction tuning approach compared against PaLM's few-shot capabilities."
    484     },
    485     {
    486       "title": "Measuring Massive Multitask Language Understanding",
    487       "authors": ["Dan Hendrycks", "Collin Burns", "Steven Basart"],
    488       "year": 2021,
    489       "relevance": "MMLU benchmark used to evaluate PaLM's multitask knowledge capabilities."
    490     },
    491     {
    492       "title": "Training Verifiers to Solve Math Word Problems",
    493       "authors": ["Karl Cobbe", "Vineet Kosaraju", "Mohammad Bavarian"],
    494       "year": 2021,
    495       "arxiv_id": "2110.14168",
    496       "relevance": "GSM8K benchmark and verifier approach that PaLM's chain-of-thought prompting outperforms."
    497     }
    498   ]
    499 }

Impressum · Datenschutz