scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (33298B)
      1 {
      2   "paper": {
      3     "title": "LiveBench: A Challenging, Contamination-Limited LLM Benchmark",
      4     "authors": [
      5       "Colin White",
      6       "Samuel Dooley",
      7       "Manley Roberts",
      8       "Arka Pal",
      9       "Benjamin Feuer",
     10       "Siddhartha Jain",
     11       "Ravid Shwartz-Ziv",
     12       "Neel Jain",
     13       "Khalid Saifullah",
     14       "Sreemanti Dey",
     15       "Shubh-Agrawal",
     16       "Sandeep Singh Sandha",
     17       "Siddartha Naidu",
     18       "Chinmay Hegde",
     19       "Yann LeCun",
     20       "Tom Goldstein",
     21       "Willie Neiswanger",
     22       "Micah Goldblum"
     23     ],
     24     "year": 2024,
     25     "venue": "ICLR 2025",
     26     "arxiv_id": "2406.19314",
     27     "doi": "10.48550/arXiv.2406.19314"
     28   },
     29   "checklist": {
     30     "artifacts": {
     31       "code_released": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "The paper provides a GitHub repository at https://github.com/livebench/livebench and states in Section 6: 'we open-source the leaderboard, all questions, all code to run API and open-source models, all model outputs for 40 models, and all code to score the models.'"
     35       },
     36       "data_released": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "All questions, model answers, and the leaderboard are publicly released. Section 6 states 'every part of the project is available publicly: https://livebench.ai/' and the benchmark is available on HuggingFace at https://huggingface.co/livebench. The only exception is 1/6 of questions withheld for one month."
     40       },
     41       "environment_specified": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "The paper mentions using FastChat for chat templates and bfloat16 for open-source models, but does not provide requirements.txt, Dockerfile, or detailed dependency specifications. Section 3 mentions setup details but lacks reproducible environment specifications."
     45       },
     46       "reproduction_instructions": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "Section 6 (Reproducibility Statement) states 'The readme in the above link gives instructions to download all parts of the project and to score new models.' The paper also describes experimental setup in detail (Section 3)."
     50       }
     51     },
     52     "statistical_methodology": {
     53       "confidence_intervals_or_error_bars": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Figure 1 and Figure 5 display 95% bootstrap confidence intervals for all models. However, the main results tables (Table 1, Table 2) report only point estimates without CIs."
     57       },
     58       "significance_tests": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No significance tests are used for model comparisons. The paper makes claims like 'o1-preview-2024-09-12 substantially outperforms all other models' based on comparing point estimates without any formal statistical tests. Correlation analyses report Pearson coefficients with standard errors but no p-values."
     62       },
     63       "effect_sizes_reported": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "The paper reports absolute percentage differences with baseline context (e.g., 'o1-preview 6% better than all other models'), per-category scores, and Pearson correlation coefficients (0.91 and 0.88 with ChatBot Arena and Arena-Hard). Tables 5 and 6 report correlation coefficients with standard errors."
     67       },
     68       "sample_size_justified": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No justification given for why 40-100 questions per task or 1000 total questions is sufficient. Section 2 states the tasks 'loosely aim for an overall 30-70% success rate on the top models' but provides no power analysis or formal justification for these choices."
     72       },
     73       "variance_reported": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "All experiments use temperature 0 for deterministic single-turn evaluation. No multiple runs, no seed sensitivity analysis. The bootstrap confidence intervals in Figure 1 capture question-sampling variance, not experimental run variance."
     77       }
     78     },
     79     "evaluation_design": {
     80       "baselines_included": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The paper compares 40 models including proprietary models (GPT-4, Claude, Gemini) and open-source models (Llama, Qwen, Mixtral, etc.). Additionally, Section 3.3 compares LiveBench against ChatBot Arena and Arena-Hard benchmarks."
     84       },
     85       "baselines_contemporary": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Evaluated models include state-of-the-art models at the time of publication: o1-preview, claude-3-5-sonnet, gemini-exp-1121, gpt-4o. Table 4 lists 80+ model versions across multiple evaluation rounds."
     89       },
     90       "ablation_study": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "The paper does not ablate components of the benchmark design (e.g., removing task types, varying question difficulty distributions, comparing scoring methods). Correlation analyses in Section 3.2 show relationships between categories but do not constitute ablation."
     94       },
     95       "multiple_metrics": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Different tasks use different appropriate metrics: pass@1 for coding, accuracy for multiple-choice, F1 for table join, Levenshtein distance-based score for plot unscrambling, edit distance for olympiad, and a two-component instruction-following score (prompt-level + instruction-level accuracy)."
     99       },
    100       "human_evaluation": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "The paper's design explicitly avoids human evaluation: 'scores answers automatically according to objective ground-truth values, without the use of LLM judges' (Section 1). While manual inspection of scoring functions is mentioned (Appendix A.4), no human evaluation of model outputs is performed."
    104       },
    105       "held_out_test_set": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Models are evaluated zero-shot with no tuning on benchmark data. Additionally, 1/6 of questions are kept private each month. Section A.6 describes that 'the public leaderboard always has 1/6 questions that are private and completely contamination-free.' Model hyperparameters are taken from model cards, not tuned on LiveBench."
    109       },
    110       "per_category_breakdown": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Tables 1 and 2 provide per-category breakdowns (Coding, Data Analysis, Instruction Following, Language, Math, Reasoning) for all 40 models. Tables 5 and 6 provide per-category and per-task correlation analyses."
    114       },
    115       "failure_cases_discussed": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Table 7 shows each model's relative worst task. Section 3.2 and Figure 3 identify outlier models. Section 3.3 and Appendix A.2 discuss LLM judging failures. The paper discusses specific weaknesses (e.g., phi models strong in reasoning but weak in language)."
    119       },
    120       "negative_results_reported": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "Appendix A.2 reports that LLM judges have high error rates (21-46%) on hard math and reasoning questions. Section 3.4 reports the benchmark is becoming harder over time (1.2% drop). Section A.7 acknowledges some questions may be contaminated."
    124       }
    125     },
    126     "claims_and_evidence": {
    127       "abstract_claims_supported": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Abstract claims are supported: (1) frequently-updated questions from recent sources — demonstrated in Sections 2.1-2.6 and A.6; (2) automatic objective scoring — Section 2 and Appendix A.4; (3) diverse tasks — 18 tasks across 6 categories; (4) 'top models achieving below 70% accuracy' — Table 1 shows o1-preview at 64.7%."
    131       },
    132       "causal_claims_justified": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper's causal-adjacent claims are appropriately hedged. 'Likely due to the known bias from using gpt-4 itself as the LLM judge' (Section 3.3) is hedged with 'likely.' 'We hypothesize that the strong performance of some models... may be due to having an output style that is preferred by humans' uses 'hypothesize' and 'may be.'"
    136       },
    137       "generalization_bounded": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "The paper bounds its claims to the specific tasks and models tested. Section 5 explicitly states limitations: no non-English tasks, ground-truth scoring cannot handle open-ended tasks, prompt formatting biases. The 'first benchmark' claim is qualified to require all three desiderata simultaneously."
    141       },
    142       "alternative_explanations_discussed": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "Section 3.3 discusses alternative explanations for benchmark score differences (LLM judge bias, output style preferences). Section A.7 distinguishes between test set contamination and task contamination. Section 3.2 discusses why certain categories correlate."
    146       },
    147       "proxy_outcome_distinction": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "The paper measures specific task accuracies and reports them as such (pass@1, accuracy, F1). It does not overclaim beyond the measured metrics. Section 5 acknowledges that ground-truth scoring 'cannot be used for certain use cases, such as write a travel guide to Hawaii in which it is hard to define a ground truth.'"
    151       }
    152     },
    153     "setup_transparency": {
    154       "model_versions_specified": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "All 40+ models are identified with specific version strings throughout (e.g., 'gpt-4o-2024-08-06', 'claude-3-5-sonnet-20240620', 'o1-preview-2024-09-12'). Table 4 provides full model names with citations and version identifiers."
    158       },
    159       "prompts_provided": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Example prompts with actual text are provided for multiple tasks throughout the paper: Math Competitions (Section 2.1, Appendix A.3.1), Zebra Puzzles (Section 2.3), Typos (Section 2.6), Web of Lies (Appendix A.3.3), Connections (Appendix A.3.6), Spatial (Appendix A.3.3). Full code including prompts is released."
    163       },
    164       "hyperparameters_reported": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 3 states: 'we perform single-turn evaluation with temperature 0, unless otherwise noted in the model card. All models run with their respective templates from our updated version of FastChat. We run all open-source models with bfloat16.'"
    168       },
    169       "scaffolding_described": {
    170         "applies": false,
    171         "answer": false,
    172         "justification": "No agentic scaffolding is used. This is a direct single-turn evaluation benchmark where models receive a question and produce an answer."
    173       },
    174       "data_preprocessing_documented": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Each task's question generation and data processing pipeline is described in detail in Sections 2.1-2.6 and Appendix A.3. For example, math competition questions describe specific modifications made; coding completion describes how solutions are truncated; typos task describes how misspellings are injected."
    178       }
    179     },
    180     "limitations_and_scope": {
    181       "limitations_section_present": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "Section 5 'Conclusions, Limitations, and Future Work' includes a dedicated paragraph discussing limitations: lack of non-English tasks, inability to handle open-ended tasks with ground-truth scoring, biases from prompt formatting across model families."
    185       },
    186       "threats_to_validity_specific": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Section 5 discusses specific threats: 'there are still biases due to certain LLM families favoring certain prompt types.' Section A.7 discusses specific contamination risks for certain tasks. Appendix A.4 addresses the concern that automated grading adds an instruction-following component to non-IF tasks."
    190       },
    191       "scope_boundaries_stated": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "Section 5 states specific things the benchmark does NOT cover: 'we hope to add non-English language tasks,' 'ground truth scoring... cannot be used for certain use cases.' Section A.7 explicitly states what LiveBench does and does not guard against regarding contamination types."
    195       }
    196     },
    197     "data_integrity": {
    198       "raw_data_available": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Section 6: 'we open-source the leaderboard, all questions, all code to run API and open-source models, all model outputs for 40 models, and all code to score the models.' All raw model outputs and questions are available at livebench.ai and HuggingFace."
    202       },
    203       "data_collection_described": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "Each task's data collection is documented: math competitions from AMC12/AIME/SMC/USAMO/IMO with specific dates; coding from LiveCodeBench and LeetCode; instruction following from The Guardian API; language from ArXiv/NYT/IMDb; data analysis from Kaggle/Socrata. Sections 2.1-2.6 and Appendix A.3 provide details."
    207       },
    208       "recruitment_methods_described": {
    209         "applies": false,
    210         "answer": false,
    211         "justification": "No human participants. Data sources are publicly available competitions, news articles, datasets, and paper abstracts."
    212       },
    213       "data_pipeline_documented": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "The full pipeline from data sources to questions to evaluation to scoring is documented across Sections 2 and Appendix A.3-A.5. Table 13 summarizes number of questions and data sources per task. Table 15 provides token statistics."
    217       }
    218     },
    219     "conflicts_of_interest": {
    220       "funding_disclosed": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "The paper states 'Sponsored by Abacus.AI' in the author footnote on page 1. Author affiliations are listed as '1 Abacus.AI, 2 NYU, 3 Nvidia, 4 UMD, 5 USC, 6 Columbia.'"
    224       },
    225       "affiliations_disclosed": {
    226         "applies": true,
    227         "answer": true,
    228         "justification": "All author affiliations are listed: Abacus.AI, NYU, Nvidia, UMD, USC, Columbia. Table 4 identifies Dracarys models as from Abacus.AI (https://huggingface.co/abacusai/). The connection between sponsor and evaluated models is traceable."
    229       },
    230       "funder_independent_of_outcome": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "The paper is sponsored by Abacus.AI, whose models (Dracarys2-72B-Instruct, Dracarys-72B-Instruct, Dracarys-Llama-3.1-70B-Instruct) are evaluated in the benchmark. Abacus.AI has a financial interest in their models performing well on the benchmark they sponsor."
    234       },
    235       "financial_interests_declared": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "No formal competing interests or conflict of interest statement is present in the paper. The Abacus.AI sponsorship is noted, but there is no declaration of whether authors hold equity, patents, or other financial interests related to the evaluated products."
    239       }
    240     },
    241     "contamination": {
    242       "training_cutoff_stated": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "While the paper discusses contamination extensively and designs around training cutoffs conceptually, it does not state the specific training data cutoff dates for each of the 40 evaluated models. Section 1 mentions 'after the training cutoff date of the LLM' generically."
    246       },
    247       "train_test_overlap_discussed": {
    248         "applies": true,
    249         "answer": true,
    250         "justification": "Section 1 extensively discusses train/test overlap. Section A.7 provides a detailed analysis of two types of contamination (test set contamination vs. task contamination). The paper acknowledges that 'a limited fraction of LiveBench is likely contaminated on all recent LLMs' for some coding questions from November 2023."
    251       },
    252       "benchmark_contamination_addressed": {
    253         "applies": true,
    254         "answer": true,
    255         "justification": "Contamination mitigation is the paper's core design principle: frequently-updated questions from recent sources, 1/6 of questions kept private each month, questions become harder over time. Section A.7 explicitly discusses which questions may be contaminated and the paper's mitigation strategies."
    256       }
    257     },
    258     "human_studies": {
    259       "pre_registered": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study. The paper evaluates LLMs on automated benchmark tasks."
    263       },
    264       "irb_or_ethics_approval": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants. The paper evaluates LLMs on benchmark tasks with synthetic and publicly available data."
    268       },
    269       "demographics_reported": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "inclusion_exclusion_criteria": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       },
    279       "randomization_described": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in this study."
    283       },
    284       "blinding_described": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "No human participants in this study."
    288       },
    289       "attrition_reported": {
    290         "applies": false,
    291         "answer": false,
    292         "justification": "No human participants in this study."
    293       }
    294     },
    295     "cost_and_practicality": {
    296       "inference_cost_reported": {
    297         "applies": true,
    298         "answer": true,
    299         "justification": "Table 16 provides per-model API costs for running LiveBench (e.g., o1-preview ≈$47.87, claude-3-haiku ≈$0.90). Table 15 provides mean input/output token counts per task."
    300       },
    301       "compute_budget_stated": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "Per-model API costs are provided (Table 16) but the total computational budget is not stated. No GPU hours reported for running open-source models. Section 2.7 mentions costs are 'easily within the computational budgets of the authors' institutions' without quantifying."
    305       }
    306     },
    307     "experimental_rigor": {
    308       "seed_sensitivity_reported": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "No seed sensitivity analysis is performed. All models are evaluated at temperature 0 (deterministic), producing a single output per question. No analysis of how results vary across seeds or sampling parameters."
    312       },
    313       "number_of_runs_stated": {
    314         "applies": true,
    315         "answer": true,
    316         "justification": "Section 3 states 'we perform single-turn evaluation with temperature 0' — this effectively specifies one deterministic run per question per model."
    317       },
    318       "hyperparameter_search_budget": {
    319         "applies": true,
    320         "answer": true,
    321         "justification": "The paper explicitly states that model hyperparameters are taken from model cards and example code (Appendix A.5): 'we match this setup in LiveBench's code.' No hyperparameter search is performed, and this is clearly documented."
    322       },
    323       "best_config_selection_justified": {
    324         "applies": true,
    325         "answer": true,
    326         "justification": "No configuration selection is performed — default model configurations from model cards are used consistently (Appendix A.5). This eliminates selection bias."
    327       },
    328       "multiple_comparison_correction": {
    329         "applies": false,
    330         "answer": false,
    331         "justification": "The paper does not perform formal statistical hypothesis tests requiring multiple comparison correction. Model rankings are based on point estimates and correlation analyses, not significance tests."
    332       },
    333       "self_comparison_bias_addressed": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "The benchmark is designed and sponsored by Abacus.AI, whose Dracarys models are evaluated (ranking 14th/40 at 50.1%). There is no discussion of potential bias from the benchmark authors evaluating models from their sponsoring organization."
    337       },
    338       "compute_budget_vs_performance": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "The paper does not present performance as a function of compute budget or model size in a systematic way. While models range from 0.5B to 405B parameters, no compute-normalized comparisons are provided."
    342       },
    343       "benchmark_construct_validity": {
    344         "applies": true,
    345         "answer": true,
    346         "justification": "Section 3.2 analyzes correlations between LiveBench categories and tasks to assess internal validity. Section 3.3 compares with ChatBot Arena and Arena-Hard to assess external validity. The paper discusses what ground-truth scoring can and cannot capture (Section 5)."
    347       },
    348       "scaffold_confound_addressed": {
    349         "applies": false,
    350         "answer": false,
    351         "justification": "No scaffolding is involved. Models are evaluated in direct single-turn mode with standardized inference via FastChat."
    352       }
    353     },
    354     "data_leakage": {
    355       "temporal_leakage_addressed": {
    356         "applies": true,
    357         "answer": true,
    358         "justification": "Temporal leakage is the paper's core concern. Questions are based on recently-released sources (math competitions, arXiv papers, news articles, datasets), and 1/6 are withheld privately each month. Section A.7 details the temporal anti-contamination strategy."
    359       },
    360       "feature_leakage_addressed": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "The paper does not discuss whether the evaluation prompts or task formatting could leak answer information. For example, multiple-choice questions with modified answer orders could still provide structural hints. No analysis of prompt-level feature leakage is provided."
    364       },
    365       "non_independence_addressed": {
    366         "applies": true,
    367         "answer": true,
    368         "justification": "Section A.7 distinguishes between test set contamination (data duplication) and task contamination (distribution similarity), noting that 'the data still ends up being highly similar to the kind of data likely seen in the pretraining set.' The paper acknowledges this as a separate concern from exact data overlap."
    369       },
    370       "leakage_detection_method": {
    371         "applies": true,
    372         "answer": true,
    373         "justification": "The paper uses temporal splits as a leakage prevention method: questions from recently-released sources after model training cutoffs, plus 1/6 private questions at any time. Section A.7 describes their approach. They also reference Roberts et al. (2024) on contamination detection methods."
    374       }
    375     }
    376   },
    377   "scan_version": 3,
    378   "active_modules": ["experimental_rigor", "data_leakage"],
    379   "claims": [
    380     {
    381       "claim": "LiveBench is the first benchmark combining frequently-updated questions, objective ground-truth scoring, and diverse challenging tasks.",
    382       "evidence": "The benchmark design is described in Section 2 with 18 tasks across 6 categories, all using objective scoring. LiveCodeBench is acknowledged as related but limited to coding only (Section 4).",
    383       "supported": "strong"
    384     },
    385     {
    386       "claim": "o1-preview-2024-09-12 achieves the highest overall LiveBench score at 64.7%, 6 percentage points ahead of all other models.",
    387       "evidence": "Table 1 shows o1-preview at 64.7%, with the next best models (claude-3-5-sonnet variants) at 58.5% and 58.2%.",
    388       "supported": "strong"
    389     },
    390     {
    391       "claim": "LiveBench has 0.91 and 0.88 Pearson correlation with ChatBot Arena and Arena-Hard respectively.",
    392       "evidence": "Section 3.3 reports these correlations based on models common to both benchmarks. Figure 4 and Figure 6 visualize the comparisons.",
    393       "supported": "strong"
    394     },
    395     {
    396       "claim": "LLM judges have high error rates (21-46%) on challenging math and reasoning questions.",
    397       "evidence": "Table 8 shows GPT-4-Turbo judging error rates: 38% on AMC12, 21% on AIME, 35% on SMC, 42-46% on Zebra Puzzles. However, the study is described as 'preliminary' and uses only one judge model (Appendix A.2).",
    398       "supported": "moderate"
    399     },
    400     {
    401       "claim": "Model rankings remain consistent across monthly benchmark updates with >0.997 rank correlation.",
    402       "evidence": "Section 3.4 states 'The rank correlation between the original and first update, and the first and second update, are both > 0.997.'",
    403       "supported": "strong"
    404     },
    405     {
    406       "claim": "The best open-source models (Llama-3.1-405b, Qwen2.5-72b) outperform GPT-4-Turbo.",
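    407       "evidence": "Table 2 shows meta-llama-3.1-405b at 51.1%, above gpt-4-turbo at 49.6%. The Qwen entry cited from the table, qwen2.5-coder-32b, scores 45.0%, which is below gpt-4-turbo; qwen2.5-72b-instruct is not in the displayed table (likely evaluated in another version), so only the Llama half of the claim is directly supported by the displayed results.",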
    408       "supported": "moderate"
    409     }
    410   ],
    411   "methodology_tags": ["benchmark-eval"],
    412   "key_findings": "LiveBench introduces a contamination-resistant LLM benchmark with 18 tasks across 6 categories (math, coding, reasoning, language, instruction following, data analysis) using objective ground-truth scoring rather than LLM or human judges. Among 40 evaluated models, o1-preview achieves the highest score (64.7%) while no model exceeds 70%. The benchmark correlates well with ChatBot Arena (r=0.91) and Arena-Hard (r=0.88) but reveals systematic differences attributable to judging biases. LLM judges are shown to have 21-46% error rates on hard math and reasoning tasks, supporting the case for ground-truth scoring.",
    413   "red_flags": [
    414     {
    415       "flag": "Sponsor conflict of interest",
    416       "detail": "The paper is sponsored by Abacus.AI, whose models (e.g., Dracarys2-72B-Instruct) are evaluated in the benchmark, ranking 14th out of 40 models at 50.1%. This conflict is not explicitly acknowledged, even though Table 4 attributes the Dracarys models to Abacus.AI."
    417     },
    418     {
    419       "flag": "No formal significance tests",
    420       "detail": "Model comparisons across 40 models are based entirely on point estimates. Claims like 'substantially outperforms' lack statistical tests. Bootstrap CIs are shown in figures but not used for formal pairwise comparisons."
    421     },
    422     {
    423       "flag": "Single deterministic run only",
    424       "detail": "All evaluations use temperature 0 with a single pass. While this ensures reproducibility, it means no analysis of how results might vary with different sampling parameters, and the bootstrap CIs capture only question-sampling variance."
    425     },
    426     {
    427       "flag": "Acknowledged partial contamination",
    428       "detail": "Section A.7 acknowledges that 'a limited fraction of LiveBench is likely contaminated on all recent LLMs' — specifically some coding questions from November 2023 and minimally-modified AMC questions. This undermines the contamination-free framing."
    429     }
    430   ],
    431   "cited_papers": [
    432     {
    433       "title": "To the cutoff... and beyond? A longitudinal perspective on LLM data contamination",
    434       "authors": ["Manley Roberts", "Himanshu Thakur", "Christine Herlihy", "Colin White", "Samuel Dooley"],
    435       "year": 2024,
    436       "relevance": "Key study on LLM data contamination showing performance drops after training cutoffs, directly motivating LiveBench's design."
    437     },
    438     {
    439       "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code",
    440       "authors": ["Naman Jain", "King Han", "Alex Gu"],
    441       "year": 2024,
    442       "arxiv_id": "2403.07974",
    443       "relevance": "Related contamination-free benchmark for coding tasks; LiveBench uses its coding questions and extends the approach to multiple domains."
    444     },
    445     {
    446       "title": "Chatbot Arena: An open platform for evaluating LLMs by human preference",
    447       "authors": ["Wei-Lin Chiang", "Lianmin Zheng"],
    448       "year": 2024,
    449       "arxiv_id": "2403.04132",
    450       "relevance": "Major human-judged LLM benchmark that LiveBench compares against, finding r=0.91 correlation but also identifying bias differences."
    451     },
    452     {
    453       "title": "From live data to high-quality benchmarks: The Arena-Hard pipeline",
    454       "authors": ["Tianle Li", "Wei-Lin Chiang", "Evan Frick"],
    455       "year": 2024,
    456       "relevance": "LLM-judge benchmark that LiveBench compares against (r=0.88), with discussion of GPT-4 self-preference bias in judging."
    457     },
    458     {
    459       "title": "Challenging Big-Bench tasks and whether chain-of-thought can solve them",
    460       "authors": ["Mirac Suzgun", "Nathan Scales"],
    461       "year": 2023,
    462       "arxiv_id": "2210.09261",
    463       "relevance": "Big-Bench Hard benchmark from which LiveBench derives and creates harder versions of the Web of Lies task."
    464     },
    465     {
    466       "title": "Instruction-following evaluation for large language models",
    467       "authors": ["Jeffrey Zhou", "Tianjian Lu", "Swaroop Mishra"],
    468       "year": 2023,
    469       "arxiv_id": "2311.07911",
    470       "relevance": "IFEval benchmark that inspired LiveBench's instruction following tasks with verifiable constraints."
    471     },
    472     {
    473       "title": "Generalization or memorization: Data contamination and trustworthy evaluation for large language models",
    474       "authors": ["Yihong Dong", "Xue Jiang"],
    475       "year": 2024,
    476       "arxiv_id": "2402.15938",
    477       "relevance": "Study on data contamination in LLM evaluation, documenting how benchmark leakage inflates performance."
    478     },
    479     {
    480       "title": "A careful examination of large language model performance on grade school arithmetic",
    481       "authors": ["Hugh Zhang", "Jeff Da"],
    482       "year": 2024,
    483       "arxiv_id": "2405.00332",
    484       "relevance": "Demonstrates that several models have overfit to GSM8K benchmark, supporting the need for contamination-resistant benchmarks."
    485     },
    486     {
    487       "title": "Functional benchmarks for robust evaluation of reasoning performance, and the reasoning gap",
    488       "authors": ["Saurabh Srivastava"],
    489       "year": 2024,
    490       "arxiv_id": "2402.19450",
    491       "relevance": "Modifies the MATH dataset by changing problem numbers to test LLM generalization, finding performance declines."
    492     },
    493     {
    494       "title": "Evaluation data contamination in LLMs: how do we measure it and (when) does it matter?",
    495       "authors": ["Aaditya K Singh"],
    496       "year": 2024,
    497       "arxiv_id": "2411.03923",
    498       "relevance": "Comprehensive study on measuring and understanding the impact of evaluation data contamination in LLMs."
    499     },
    500     {
    501       "title": "Proving test set contamination in black box language models",
    502       "authors": ["Yonatan Oren", "Nicole Meister"],
    503       "year": 2023,
    504       "arxiv_id": "2310.17623",
    505       "relevance": "Methods for detecting test set contamination in black-box LLMs, relevant to benchmark integrity assessment."
    506     },
    507     {
    508       "title": "Training on the test model: Contamination in ranking distillation",
    509       "authors": ["Vishakha Suresh Kalal", "Andrew Parry"],
    510       "year": 2024,
    511       "arxiv_id": "2411.02284",
    512       "relevance": "Demonstrates contamination effects in ranking model distillation, expanding the scope of contamination concerns."
    513     }
    514   ],
    515   "engagement_factors": {
    516     "practical_relevance": {
    517       "score": 3,
    518       "justification": "Immediately usable benchmark with open-source code, public leaderboard at livebench.ai, and monthly updates — practitioners can evaluate their models today."
    519     },
    520     "surprise_contrarian": {
    521       "score": 1,
    522       "justification": "Confirms the widely known problem of benchmark contamination rather than challenging conventional wisdom; the LLM judge error rate findings are modestly surprising."
    523     },
    524     "fear_safety": {
    525       "score": 0,
    526       "justification": "No AI safety or security concerns raised; the paper is about evaluation methodology."
    527     },
    528     "drama_conflict": {
    529       "score": 1,
    530       "justification": "Implicitly criticizes Chatbot Arena and Arena-Hard for judging biases and suggests some model rankings may be inflated by contamination, but frames this constructively."
    531     },
    532     "demo_ability": {
    533       "score": 3,
    534       "justification": "Live leaderboard at livebench.ai, open-source code on GitHub, and HuggingFace dataset — anyone can run it on their own models."
    535     },
    536     "brand_recognition": {
    537       "score": 2,
    538       "justification": "Author list includes Yann LeCun (NYU/Meta) and Tom Goldstein (UMD); the paper was published at ICLR 2025 and evaluates all major model families."
    539     }
    540   }
    541 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs