scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (31361B)
      1 {
      2   "paper": {
      3     "title": "Beyond the Imitation Game: Quantifying and extrapolating the capabilities of language models",
      4     "authors": [
      5       "Aarohi Srivastava",
      6       "Abhinav Rastogi",
      7       "Abhishek Rao",
      8       "Abu Awal Md Shoeb",
      9       "Abubakar Abid",
     10       "Adam Fisch",
     11       "Adam R. Brown",
     12       "Adam Santoro",
     13       "Aditya Gupta",
     14       "Adrià Garriga-Alonso"
     15     ],
     16     "year": 2022,
     17     "venue": "Transactions on Machine Learning Research (TMLR)",
     18     "arxiv_id": "2206.04615"
     19   },
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "The BIG-bench benchmark is released as a GitHub repository, referenced throughout (Section 2: 'The Beyond the Imitation Game benchmark (BIG-bench) GitHub repository includes: ... Code that implements the benchmark API'). The benchmark code, task definitions, and evaluation infrastructure are publicly available."
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The BIG-bench benchmark data (204 tasks) is released via the GitHub repository. Section 2 states the repository includes task definitions and evaluation results, and Section 6.1 notes the release of 'analysis Colab notebooks, as well as data files containing scores and logs of model interactions on each task.'"
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "While the paper describes a Python-based API (Section 2.1) and references Colab notebooks, no requirements.txt, Dockerfile, or detailed environment setup with library versions is provided in the paper itself."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "Section 2.1 describes the BIG-bench API in detail, including how tasks interact with models, the two task types (JSON and programmatic), and the available methods (generate_text, cond_log_prob). The GitHub repository is referenced for 'instructions on how to contribute' and evaluate models. Section 6.1 mentions release of 'analysis Colab notebooks.'"
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "Figure 5 caption states: 'Error bars are obtained by 10-bootstrap sampling over evaluated tasks.' Bootstrap error bars are shown for calibration metrics across model scales."
     48       },
     49       "significance_tests": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The paper makes numerous comparative claims (e.g., sparse models outperform dense, bias increases with scale) but does not report any statistical significance tests such as p-values, t-tests, or confidence intervals for these comparisons. Differences are presented by visual comparison of curves."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "The paper provides quantitative effect sizes with context. For example, Section 3.3 states sparse models achieve 'a roughly twofold improvement in inference cost' and 'about a tenfold improvement in the FLOP-matched parameter count needed to reach a given calibration score.' Section 3.6 states the 128B model finds it 'over 22 times more likely that a white boy will grow up to be a good doctor than that a Native American girl will.'"
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "The benchmark consists of 204 tasks contributed by 450 authors, but there is no explicit justification for why this number of tasks was chosen, no power analysis, and no discussion of whether this sample is sufficient for the aggregate analyses performed."
     63       },
     64       "variance_reported": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "Results are generally reported as single runs per model per task. While bootstrap error bars appear in Figure 5 (across tasks, not across runs), there is no reporting of variance across experimental runs, seeds, or repeated evaluations of the same model on the same tasks."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Human expert rater baselines are provided (Section 2.3.2): 'a team of human expert raters performed all tasks in order to provide a strong baseline.' Both mean and max human rater scores are reported. Random chance baselines are also shown on multiple figures."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "The paper evaluates multiple contemporary model families: GPT-3 series (Brown et al., 2020), Google-internal BIG-G models, BIG-G sparse models, and PaLM (Chowdhery et al., 2022). These were state-of-the-art models at the time of publication."
     80       },
     81       "ablation_study": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "While not a traditional ablation, the paper systematically varies model size across six orders of magnitude, compares dense vs. sparse architectures, and examines the effect of shot count (0-3 shot). Section 3.5 investigates sensitivity to task formatting (with/without multiple choice options, different phrasings of cause_and_effect)."
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Multiple metrics are used: exact_string_match, multiple_choice_grade, expected_calibration_error, multiple_choice_brier_score, BLEU, BLEURT, ROUGE, case_insensitive_str_match, and normalized preferred metrics (Section 2.1). Section 3.4.2 explicitly discusses how different metrics reveal different patterns."
     90       },
     91       "human_evaluation": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Section 2.3.2 describes human expert raters who completed all tasks: 'we employed a team of expert raters to complete the tasks submitted to BIG-bench.' Mean and max human scores are reported. Expert evaluation is further discussed in Appendix E."
     95       },
     96       "held_out_test_set": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Section 2.4 describes the canary string mechanism to prevent data contamination, and Section 2.3.1 states: 'The data used to train all models (except PaLM) was collected before the BIG-bench repository was created. Direct leakage of BIG-bench tasks into models reported in this paper is therefore impossible.' The benchmark tasks serve as held-out test data."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Extensive per-task breakdowns are provided: Figure 4 shows per-task BIG-bench Lite performance, Figure 7 shows tasks grouped by linearity and breakthroughness scores, Appendix D shows performance by keyword, and Section 3.6 breaks down bias results by category (gender, religion, race/ethnicity, nationality)."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Section 3.5 ('Even large language models are brittle') extensively discusses failure cases. Section 4.1 analyzes chess failures qualitatively. Section 4.2 describes how models output nonsensical text at small scales. Section 3.7 discusses failures on non-English and low-resource languages."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Multiple negative results are reported: social bias increases with scale (Section 3.6), models remain far below human performance (Section 3.1), performance on low-resource languages fails to improve with scale (Section 3.7), models are brittle to phrasing changes (Section 3.5), and tasks with lowest linearity show performance degrading with scale (Figure 7c)."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The abstract's claims are well-supported: (1) model performance improves with scale but is poor in absolute terms (Figure 1, Section 3.1), (2) calibration improves with scale (Figure 5, Section 3.2), (3) performance is similar across model classes with benefits from sparsity (Figure 6, Section 3.3), (4) breakthrough vs. gradual scaling behavior (Figure 7, Section 3.4), (5) social bias increases with scale in ambiguous contexts but can be improved with prompting (Figures 12-13, Section 3.6)."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The paper is generally careful about causal language. It uses hedged language like 'One possible explanation for the breakthrough phenomenon' (Section 6.1) and 'We speculate that models perform poorly on the other versions because...' (Section 3.5). The systematic variation of model size while holding other factors constant provides reasonable support for scale-related claims. Ablation-like comparisons (dense vs. sparse, with/without choice options) use controlled manipulation."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "The paper explicitly bounds its generalizations. Section 2.3.2 cautions about interpreting human baselines. Section 3.4.1 states observations are 'anecdotal.' Section 6.1 identifies specific limitations 'that we believe will require new approaches.' The Broader Impact Statement acknowledges BIG-bench 'certainly has holes in what is covered.'"
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": true,
    136         "justification": "Section 3.4.2 discusses alternative explanations for breakthrough behavior (metric artifacts vs. genuine capability jumps). Section 3.5 considers that models may perform poorly on reformulated tasks because those are 'dissimilar from their training distribution.' Section 3.7 considers whether Swahili performance indicates 'general understanding of Swahili or instead memorization of proverbs.' Section 4.2 considers training data recency as an explanation for element naming errors."
    137       }
    138     },
    139     "setup_transparency": {
    140       "model_versions_specified": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "Models are described by architecture family and parameter count (e.g., 'BIG-G' with 13 sizes, 'GPT' corresponding to 'GPT-3 model series in Brown et al. (2020)'). Appendix A provides detailed architecture hyperparameters. However, specific model version identifiers, snapshot dates, or API versions are not provided. The GPT models are referenced generically as 'OpenAI GPT models corresponding to the GPT-3 model series.'"
    144       },
    145       "prompts_provided": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "Full prompt examples are provided for multiple tasks throughout the paper: cause_and_effect task formats (Section 3.5), checkmate_in_one (Section 4.1), periodic_elements (Section 4.2), simple_arithmetic_json_multiple_choice (Section 3.5), and emoji_movie (Figure 9c). The BIG-bench repository contains all task definitions with their prompts."
    149       },
    150       "hyperparameters_reported": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Section 2.3.1 states: 'All model outputs were sampled greedily (with zero temperature), unless otherwise noted.' Temperature=1, top-k=40 is also tested (Figure App.1). Appendix A (Tables App.1 and App.2) provides detailed model architecture hyperparameters including layers, dimensions, training steps, and warmup steps."
    154       },
    155       "scaffolding_described": {
    156         "applies": false,
    157         "answer": false,
    158         "justification": "No agentic scaffolding is used. Models are evaluated via direct prompting with the BIG-bench API (generate_text, cond_log_prob methods). Some programmatic tasks involve multiple query rounds, but this is part of the benchmark specification, not an agentic scaffold."
    159       },
    160       "data_preprocessing_documented": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "Section 2.1 documents the task types (JSON and programmatic), how performance is evaluated (comparing generated outputs to targets using standard metrics), and the scoring normalization procedure (Section 3.1). Section 2.2 describes the BBL selection process. Section 2.3.1 describes sampling procedures. The normalized preferred metric calculation is explicitly defined."
    164       }
    165     },
    166     "limitations_and_scope": {
    167       "limitations_section_present": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "The Broader Impact Statement (after Section 6.2) discusses limitations extensively: 'BIG-bench certainly has holes in what is covered' and discusses gaps in coverage. Section 6.1 explicitly lists limitations requiring new approaches. Section 2.3.2 has a detailed discussion of limitations of human baseline interpretation."
    171       },
    172       "threats_to_validity_specific": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "Multiple specific threats are discussed: Section 3.4.3 warns that 'capabilities and trajectories across scale of language models are much more subjective than we would like to believe.' Section 2.3.2 notes human rater scores are confounded by time controls, task changes, and non-representative demographics. Section 6.1 identifies specific capability gaps (long contexts, episodic memory, recurrent computation, multi-modal grounding)."
    176       },
    177       "scope_boundaries_stated": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "The Broader Impact Statement states: 'BIG-bench certainly has holes in what is covered' and that 'we have explicitly limited BIG-bench's scope' regarding domains beyond text. Section 2.1 acknowledges 'BIG-bench is designed for evaluating pure language models and has limits to cover multi-modal capabilities.' Section 6.1 lists specific things models cannot do that BIG-bench may not fully capture."
    181       }
    182     },
    183     "data_integrity": {
    184       "raw_data_available": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "Section 6.1 states they 'release analysis Colab notebooks, as well as data files containing scores and logs of model interactions on each task.' The benchmark tasks and all evaluation results are available in the GitHub repository."
    188       },
    189       "data_collection_described": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "Section 1.3 describes the collection process: 'The benchmark was developed openly on GitHub, with contributors adding tasks by way of GitHub pull requests. Peer review of the proposed tasks was performed by discussion on the pull request.' Section 7 provides detailed author contributions. The review criteria for task acceptance are referenced."
    193       },
    194       "recruitment_methods_described": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "The paper describes how task contributors were recruited: through an open call on GitHub (Section 1.3), with authorship offered as incentive. Section 7 details core contributor roles. For human raters, Section 2.3.2 and Appendix E describe the evaluation team. However, the demographic representativeness of raters is flagged as a limitation."
    198       },
    199       "data_pipeline_documented": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "The pipeline from task submission through peer review to benchmark inclusion is described (Section 1.3). The evaluation pipeline is documented: model querying via API → metric computation → normalization (Section 3.1). The filtering/debugging of tasks post-merge is described in Section 7.1."
    203       }
    204     },
    205     "conflicts_of_interest": {
    206       "funding_disclosed": {
    207         "applies": true,
    208         "answer": false,
    209         "justification": "No funding section or grant acknowledgments appear in the paper. The Acknowledgments section thanks individuals for discussion and feedback but does not mention any funding sources. Given the scale of the work and involvement of Google and OpenAI employees, funding disclosure would be expected."
    210       },
    211       "affiliations_disclosed": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "Author affiliations are provided throughout Section 7.2 for each task contribution, identifying institutions including Google, OpenAI, and many universities. Section 7.1 identifies core contributors and their affiliations. The involvement of Google and OpenAI employees in evaluating their own models is implicit from the affiliations."
    215       },
    216       "funder_independent_of_outcome": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "Google employees trained the BIG-G models and are core contributors to the benchmark (Section 7.1). OpenAI employees contributed GPT evaluations. Both companies have commercial interest in language model capabilities. No discussion of whether funders are independent of outcomes is provided. Since no funding is disclosed but Google and OpenAI resources were clearly used, the funder is not independent of the outcome."
    220       },
    221       "financial_interests_declared": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "No competing interests or financial interests statement appears in the paper. Given that multiple authors are employees of Google and OpenAI — companies with direct commercial interest in language model capabilities — this is a notable omission."
    225       }
    226     },
    227     "contamination": {
    228       "training_cutoff_stated": {
    229         "applies": true,
    230         "answer": true,
    231         "justification": "Section 2.3.1 states: 'The data used to train all models (except PaLM) was collected before the BIG-bench repository was created. Direct leakage of BIG-bench tasks into models reported in this paper is therefore impossible.' While not giving exact cutoff dates, the temporal relationship between training data and benchmark creation is clearly established."
    232       },
    233       "train_test_overlap_discussed": {
    234         "applies": true,
    235         "answer": true,
    236         "justification": "Section 2.3.1 explicitly addresses this: 'Direct leakage of BIG-bench tasks into models reported in this paper is therefore impossible. Indirect leakage is possible, since many tasks use text available on the internet.' Section 2.4 describes the canary string mechanism for filtering BIG-bench data from future training corpora. The training_on_test_set task provides post-hoc diagnosis."
    237       },
    238       "benchmark_contamination_addressed": {
    239         "applies": true,
    240         "answer": true,
    241         "justification": "Multiple contamination countermeasures are described: (1) temporal separation — models trained before benchmark creation (Section 2.3.1), (2) canary string GUID for corpus filtering (Section 2.4), (3) training_on_test_set task for post-hoc detection (Section 2.4), (4) acknowledgment that 'indirect leakage is possible' with mitigation strategies."
    242       }
    243     },
    244     "human_studies": {
    245       "pre_registered": {
    246         "applies": true,
    247         "answer": false,
    248         "justification": "No pre-registration is mentioned for the human evaluation study. The benchmark was developed as an open collaboration, but the specific evaluation protocol with human raters was not pre-registered."
    249       },
    250       "irb_or_ethics_approval": {
    251         "applies": true,
    252         "answer": false,
    253         "justification": "No IRB or ethics board approval is mentioned despite employing human raters to perform benchmark tasks over a period of weeks."
    254       },
    255       "demographics_reported": {
    256         "applies": true,
    257         "answer": false,
    258         "justification": "Section 2.3.2 acknowledges that 'The demographics and backgrounds of the expert raters are not necessarily representative of the population in general' but does not actually report demographic information about the raters beyond listing their names (Section 7.1)."
    259       },
    260       "inclusion_exclusion_criteria": {
    261         "applies": true,
    262         "answer": false,
    263         "justification": "No inclusion or exclusion criteria for human raters are stated. The paper lists the names of raters (Section 7.1) and mentions they include core authors and an 'expert evaluation team' but does not describe how they were selected or what qualifications were required."
    264       },
    265       "randomization_described": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "This is not an experimental study comparing human conditions; humans serve as a baseline. There are no treatment/control groups requiring randomization."
    269       },
    270       "blinding_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "Blinding is not applicable here — humans performed benchmark tasks as a baseline comparison, not as evaluators of model outputs in a blinded experiment."
    274       },
    275       "attrition_reported": {
    276         "applies": true,
    277         "answer": false,
    278         "justification": "Section 2.3.2 mentions that tasks were subsampled and performed over weeks with time controls, but does not report how many raters started vs. finished, which raters completed which tasks, or any attrition information."
    279       }
    280     },
    281     "cost_and_practicality": {
    282       "inference_cost_reported": {
    283         "applies": true,
    284         "answer": false,
    285         "justification": "Despite evaluating models across six orders of magnitude of scale on 204 tasks, no inference cost, latency, or computational cost per evaluation is reported. Section 2.2 acknowledges 'full evaluation is computationally expensive' but does not quantify this."
    286       },
    287       "compute_budget_stated": {
    288         "applies": true,
    289         "answer": false,
    290         "justification": "No total compute budget is stated for the evaluation or for training the BIG-G models. Appendix A provides model architecture details and training steps but not GPU hours, wall-clock time, or total compute cost."
    291       }
    292     }
    293   },
    294   "claims": [
    295     {
    296       "claim": "Model performance and calibration both improve with scale, but models perform poorly in absolute terms compared with human raters.",
    297       "evidence": "Figure 1 shows aggregate performance across all models. Section 3.1: 'even the strongest models perform poorly overall when compared with expert human rater performance.' The best performing language models achieved a normalized preferred metric score of less than 20 out of 100. Figure 5 shows calibration improves with scale.",
    298       "supported": "strong"
    299     },
    300     {
    301       "claim": "Performance is remarkably similar across model classes (GPT, BIG-G), with benefits from sparsity.",
    302       "evidence": "Section 3.3 and Figure 6: 'BIG-G sparse models perform better on BIG-bench tasks than BIG-G dense models, achieving a roughly twofold improvement in inference cost.' Sparse models show 'about a tenfold improvement in the FLOP-matched parameter count needed to reach a given calibration score.'",
    303       "supported": "strong"
    304     },
    305     {
    306       "claim": "Tasks that improve gradually commonly involve knowledge/memorization, while breakthrough tasks involve multiple steps or components, or brittle metrics.",
    307       "evidence": "Section 3.4.1: highest linearity tasks are 'knowledge-based' (qa_wikidata, linguistic_mappings). Breakthrough tasks are 'composite in nature' (modified_arithmetic, codenames). Section 3.4.2 shows metrics can create apparent breakthroughs. Figure 7 illustrates both categories.",
    308       "supported": "moderate"
    309     },
    310     {
    311       "claim": "Social bias typically increases with scale in ambiguous contexts, but can be improved with prompting.",
    312       "evidence": "Section 3.6 and Figure 12 show aggregate bias increasing with scale across six tasks. Figure 13 shows bias decreasing with scale in unambiguous contexts and with pro-social prompts. The 128B model finds it 'over 22 times more likely that a white boy will grow up to be a good doctor than that a Native American girl will.'",
    313       "supported": "strong"
    314     },
    315     {
    316       "claim": "Breakthrough behavior is often an artifact of metric choice rather than a genuine discontinuity in capability.",
    317       "evidence": "Section 3.4.2 demonstrates that smoother metrics (BLEU, log probability) reveal gradual progress hidden by exact_string_match. Figure 8 shows log probabilities improve smoothly for breakthrough tasks. Figure 9 shows emoji_movie appears gradual with multiple_choice_grade but abrupt with exact_string_match.",
    318       "supported": "strong"
    319     },
    320     {
    321       "claim": "Large language models are brittle — sensitive to precise phrasing in unintuitive ways.",
    322       "evidence": "Section 3.5: Figure 10 shows models perform substantially better on multiple-choice questions when choices are NOT included in the input. Figure 11 shows cause_and_effect performance varies dramatically across three task formulations (from chance to near-perfect).",
    323       "supported": "strong"
    324     }
    325   ],
    326   "methodology_tags": [
    327     "benchmark-eval"
    328   ],
    329   "key_findings": "BIG-bench introduces a 204-task benchmark evaluated across dense and sparse language models spanning six orders of magnitude. Model performance improves with scale but remains far below human expert baselines (best model scores below 20/100 normalized). Scaling behavior is task-dependent: knowledge-based tasks improve linearly while composite tasks show breakthrough behavior, though this is often a metric artifact rather than a genuine capability discontinuity. Social bias generally increases with model scale in ambiguous contexts but can be mitigated through prompting, and sparse models provide roughly 2x efficiency gains over dense models at equivalent performance.",
    330   "red_flags": [
    331     {
    332       "flag": "Company evaluating own models",
    333       "detail": "Google employees trained the BIG-G models and are core contributors to the benchmark. OpenAI employees contributed GPT evaluation code. Both companies have commercial interest in demonstrating language model capabilities, yet no conflict of interest statement appears in the paper."
    334     },
    335     {
    336       "flag": "No statistical significance tests",
    337       "detail": "Despite numerous comparative claims about model performance differences (dense vs. sparse, across model sizes, bias trends), no statistical significance tests are reported. Conclusions are drawn from visual inspection of curves."
    338     },
    339     {
    340       "flag": "Human baseline limitations",
    341       "detail": "The human evaluation baseline has significant confounders acknowledged but not fully addressed: time controls (30 min to 2 hours per session), task subsampling, non-representative demographics, and some raters were paper co-authors. These factors likely cause human performance to be underestimated, making the human-model gap comparison less reliable."
    342     },
    343     {
    344       "flag": "No compute costs reported",
    345       "detail": "Evaluating 204 tasks across models spanning six orders of magnitude is computationally expensive (acknowledged in Section 2.2), but no compute budget, inference costs, or training costs are reported. This hinders reproducibility assessments."
    346     }
    347   ],
    348   "cited_papers": [
    349     {
    350       "title": "Language Models are Few-Shot Learners",
    351       "authors": ["Tom Brown"],
    352       "year": 2020,
    353       "arxiv_id": "2005.14165",
    354       "relevance": "Foundational GPT-3 paper; the GPT models evaluated in BIG-bench correspond to this model series."
    355     },
    356     {
    357       "title": "Evaluating Large Language Models Trained on Code",
    358       "authors": ["Mark Chen"],
    359       "year": 2021,
    360       "arxiv_id": "2107.03374",
    361       "relevance": "Codex/HumanEval paper demonstrating LLM code generation capabilities, directly relevant to code generation benchmarking."
    362     },
    363     {
    364       "title": "PaLM: Scaling Language Modeling with Pathways",
    365       "authors": ["Aakanksha Chowdhery"],
    366       "year": 2022,
    367       "arxiv_id": "2204.02311",
    368       "relevance": "PaLM evaluation on BIG-bench Lite is included in the paper; demonstrates state-of-the-art model performance and reduced brittleness at scale."
    369     },
    370     {
    371       "title": "On the Dangers of Stochastic Parrots: Can Language Models Be Too Big?",
    372       "authors": ["Emily M. Bender"],
    373       "year": 2021,
    374       "relevance": "Discusses socially harmful effects of large language models, directly relevant to the survey's coverage of AI safety and bias."
    375     },
    376     {
    377       "title": "On the Opportunities and Risks of Foundation Models",
    378       "authors": ["Rishi Bommasani"],
    379       "year": 2021,
    380       "arxiv_id": "2108.07258",
    381       "relevance": "Comprehensive report on foundation models covering capabilities, applications, and societal impact — key context for understanding LLM evaluation landscape."
    382     },
    383     {
    384       "title": "Scaling Laws for Neural Language Models",
    385       "authors": ["Jared Kaplan"],
    386       "year": 2020,
    387       "arxiv_id": "2001.08361",
    388       "relevance": "Establishes scaling laws for language model performance, foundational to BIG-bench's analysis of how capabilities change with model size."
    389     },
    390     {
    391       "title": "Measuring Massive Multitask Language Understanding",
    392       "authors": ["Dan Hendrycks"],
    393       "year": 2021,
    394       "arxiv_id": "2009.03300",
    395       "relevance": "MMLU benchmark with 57 tasks; a contemporary benchmark that BIG-bench aims to complement with greater diversity and difficulty."
    396     },
    397     {
    398       "title": "Predictability and Surprise in Large Generative Models",
    399       "authors": ["Deep Ganguli"],
    400       "year": 2022,
    401       "arxiv_id": "2202.07785",
    402       "relevance": "Discusses breakthrough/emergent capabilities in language models, directly motivating BIG-bench's study of scaling behavior and breakthroughness."
    403     },
    404     {
    405       "title": "Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity",
    406       "authors": ["William Fedus"],
    407       "year": 2021,
    408       "arxiv_id": "2101.03961",
    409       "relevance": "Introduces the sparse transformer architecture evaluated in BIG-bench as BIG-G sparse, relevant to efficient model scaling."
    410     },
    411     {
    412       "title": "Training Compute-Optimal Large Language Models",
    413       "authors": ["Jordan Hoffmann"],
    414       "year": 2022,
    415       "arxiv_id": "2203.15556",
    416       "relevance": "Chinchilla scaling analysis suggesting optimal compute allocation; evaluated on BIG-bench and informing discussion of scaling trends."
    417     },
    418     {
    419       "title": "Dynabench: Rethinking Benchmarking in NLP",
    420       "authors": ["Douwe Kiela"],
    421       "year": 2021,
    422       "arxiv_id": "2104.14337",
    423       "relevance": "Dynamic benchmarking platform using human-and-model-in-the-loop; complementary approach to BIG-bench's static benchmark design."
    424     },
    425     {
    426       "title": "LaMDA: Language Models for Dialog Applications",
    427       "authors": ["Romal Thoppilan"],
    428       "year": 2022,
    429       "arxiv_id": "2201.08239",
    430       "relevance": "BIG-G models are based on LaMDA architectures; the paper discusses LaMDA's safety techniques as important for future models."
    431     }
    432   ]
    433 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs