ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (34503B)


      1 {
      2   "paper": {
      3     "title": "LiveBench: A Challenging, Contamination-Limited LLM Benchmark",
      4     "authors": [
      5       "Colin White",
      6       "Samuel Dooley",
      7       "Manley Roberts",
      8       "Arka Pal",
      9       "Benjamin Feuer",
     10       "Siddhartha Jain",
     11       "Ravid Shwartz-Ziv",
     12       "Neel Jain",
     13       "Khalid Saifullah",
     14       "Sreemanti Dey",
     15       "Shubh-Agrawal",
     16       "Sandeep Singh Sandha",
     17       "Siddartha Naidu",
     18       "Chinmay Hegde",
     19       "Yann LeCun",
     20       "Tom Goldstein",
     21       "Willie Neiswanger",
     22       "Micah Goldblum"
     23     ],
     24     "year": 2024,
     25     "venue": "International Conference on Learning Representations (ICLR 2025)",
     26     "arxiv_id": "2406.19314"
     27   },
     28   "scan_version": 3,
     29   "active_modules": ["experimental_rigor", "data_leakage"],
     30   "methodology_tags": ["benchmark-eval"],
     31   "key_findings": "LiveBench is the first LLM benchmark combining frequently-updated questions from recent sources, objective ground-truth scoring without LLM judges, and diverse tasks across six categories. Top models achieve below 70% accuracy, with o1-preview leading at 64.7%. The benchmark correlates well with ChatBot Arena (r=0.91) and Arena-Hard (r=0.88), while revealing that LLM judges have high error rates (21-46%) on challenging math and reasoning questions. Monthly question updates maintain stable model rankings (rank correlation >0.997) while increasing difficulty over time.",
     32   "checklist": {
     33     "artifacts": {
     34       "code_released": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "The paper states 'Our codebase is available at https://github.com/livebench/livebench' (Section 1) and 'we open-source the leaderboard, all questions, all code to run API and open-source models, all model outputs for 40 models, and all code to score the models' (Section 6)."
     38       },
     39       "data_released": {
     40         "applies": true,
     41         "answer": true,
     42         "justification": "All questions and model answers are released: 'We release all questions, code, and model answers' (Section 1). The benchmark is also available on HuggingFace at https://huggingface.co/livebench (Appendix B.2)."
     43       },
     44       "environment_specified": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper specifies key environment details: 'All models run with their respective templates from our updated version of FastChat' and 'We run all open-source models with bfloat16' (Section 3). All code is released on GitHub with a README containing setup instructions (Section 6)."
     48       },
     49       "reproduction_instructions": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "Section 6 states: 'The readme in the above link gives instructions to download all parts of the project and to score new models.' The paper also describes experimental setup in detail (Section 3)."
     53       }
     54     },
     55     "statistical_methodology": {
     56       "confidence_intervals_or_error_bars": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Figure 1 and Figure 5 show '95% bootstrap confidence intervals' for all model scores. Standard errors are reported for correlation analyses in Tables 5 and 6."
     60       },
     61       "significance_tests": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "The paper claims models outperform others (e.g., 'o1-preview-2024-09-12 substantially outperforms all other models') based on comparing raw scores without any formal significance tests. Pearson correlations are reported with standard errors but no hypothesis tests are conducted."
     65       },
     66       "effect_sizes_reported": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "The paper reports absolute scores and percentage differences with baseline context throughout. For example, 'o1-preview-2024-09-12 performs the best overall, 6% better than all other models' (Section 3.1). Tables 1-2 provide full score breakdowns enabling comparison."
     70       },
     71       "sample_size_justified": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No justification is given for the benchmark size of 1000 questions, the number of questions per task (40-100), or the number of models evaluated (40). No power analysis is discussed."
     75       },
     76       "variance_reported": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Bootstrap confidence intervals are shown in Figures 1 and 5. Standard errors are reported for correlation coefficients in Tables 5 and 6. Table 15 reports mean and standard deviation for token counts."
     80       }
     81     },
     82     "evaluation_design": {
     83       "baselines_included": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "The paper compares LiveBench against two prominent existing benchmarks: ChatBot Arena and Arena-Hard (Section 3.3, Figures 4 and 6). It also compares many models against each other."
     87       },
     88       "baselines_contemporary": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Comparisons are made against contemporary benchmarks (ChatBot Arena, Arena-Hard) and the paper evaluates state-of-the-art models including o1-preview, claude-3.5-sonnet, and gemini-1.5-pro (Table 1)."
     92       },
     93       "ablation_study": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "LiveBench has 18 tasks across 6 categories but no ablation study examines how removing tasks or categories affects the benchmark's discriminative power. Per-category correlation analysis (Figure 2) is provided but is not a proper ablation."
     97       },
     98       "multiple_metrics": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Multiple metrics are used: per-task scores, per-category averages, overall LiveBench score, Pearson correlation coefficients, rank correlations across updates, and pass@1 for coding tasks."
    102       },
    103       "human_evaluation": {
    104         "applies": true,
    105         "answer": false,
    106         "justification": "The paper explicitly avoids human evaluation as a design choice: LiveBench 'scores answers automatically according to objective ground-truth values' (Abstract). No human evaluation of the benchmark quality or model outputs is conducted."
    107       },
    108       "held_out_test_set": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The benchmark questions serve as the test set, and models are not tuned on them. Additionally, 'Each month, we do not release the new questions until one month later, so that the public leaderboard always has 1/6 questions that are private' (Section 2.7)."
    112       },
    113       "per_category_breakdown": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Tables 1 and 2 provide per-category breakdowns (Coding, Data Analysis, Instruction Following, Language, Math, Reasoning) for all 40 models. Figure 2 shows per-task correlations."
    117       },
    118       "failure_cases_discussed": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "Table 7 shows the relative worst task for each model. Section 3.2 identifies outlier models. The LLM judging ablation (Tables 8-9) shows where automated judging fails. Section 3.3 discusses models that are 'disproportionately stronger' or weaker in specific categories."
    122       },
    123       "negative_results_reported": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The paper reports that LLM judges have high error rates (21-46%) on hard math and reasoning tasks (Tables 8-9, Section 3.3). They acknowledge 'the results are not definitive' for their judging ablation. They also note residual contamination in some tasks (Appendix A.7)."
    127       }
    128     },
    129     "claims_and_evidence": {
    130       "abstract_claims_supported": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The abstract claims (frequently-updated questions, objective ground-truth scoring, wide variety of tasks, top models below 70%) are all supported by the experimental results. o1-preview achieves 64.7%, the highest score, confirming below 70%."
    134       },
    135       "causal_claims_justified": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "Most claims are descriptive or correlational. Causal-sounding claims are appropriately hedged: 'likely due to the known bias from using gpt-4 itself as the LLM judge' (Section 3.3). The LLM judging experiment (Appendix A.2) provides evidence for the claim that LLM judges struggle with hard questions they cannot solve."
    139       },
    140       "generalization_bounded": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "The Limitations section (Section 5) explicitly bounds the scope: 'there are still additions from which it would benefit... non-English language tasks... ground truth scoring... cannot be used for certain use cases... biases due to certain LLM families favoring certain prompt types.'"
    144       },
    145       "alternative_explanations_discussed": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "Section 3.3 discusses alternative explanations for benchmark differences: 'We hypothesize that the strong performance of some models such as the gemini-1.5 models on ChatBot Arena compared to LiveBench may be due to having an output style that is preferred by humans.' Appendix A.7 discusses different forms of contamination."
    149       },
    150       "proxy_outcome_distinction": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "The paper measures accuracy on specific tasks and calls it 'LiveBench score' without overclaiming. It does not frame task accuracy as measuring 'intelligence' or 'general capability' — claims match the granularity of measurements."
    154       }
    155     },
    156     "setup_transparency": {
    157       "model_versions_specified": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Specific model versions with snapshots are used throughout: 'gpt-4-0125-preview', 'claude-3-5-sonnet-20240620', 'o1-preview-2024-09-12', etc. Table 4 lists all 80+ model versions with citations."
    161       },
    162       "prompts_provided": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Example prompts are provided verbatim for multiple tasks throughout Sections 2.1-2.6 and Appendix A.3 (math competitions, zebra puzzles, typos, Web of Lies v2, spatial reasoning, connections). All code including prompts is released on GitHub."
    166       },
    167       "hyperparameters_reported": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Section 3 states: 'we perform single-turn evaluation with temperature 0, unless otherwise noted in the model card' and 'We run all open-source models with bfloat16.' Appendix A.5 describes how model hyperparameters are set to match model documentation."
    171       },
    172       "scaffolding_described": {
    173         "applies": false,
    174         "answer": false,
    175         "justification": "No agentic scaffolding is used. LiveBench performs direct single-turn model evaluation without tools, agents, or multi-step workflows."
    176       },
    177       "data_preprocessing_documented": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Detailed descriptions of question generation and processing are provided for each task category (Sections 2.1-2.6 and Appendix A.3), including how math questions are modified, how typos are injected, how tables are sampled, and how coding questions are truncated."
    181       }
    182     },
    183     "limitations_and_scope": {
    184       "limitations_section_present": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "Section 5 is titled 'Conclusions, Limitations, and Future Work' and contains a substantive 'Limitations and Future Work' subsection discussing specific shortcomings."
    188       },
    189       "threats_to_validity_specific": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "Specific threats are discussed: lack of non-English tasks, inability to evaluate open-ended questions with ground-truth scoring, prompt biases favoring certain LLM families (Section 5). Appendix A.7 discusses residual contamination in coding questions from November 2023."
    193       },
    194       "scope_boundaries_stated": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "Section 5 states specific boundaries: 'we hope to add non-English language tasks', 'ground truth scoring... cannot be used for certain use cases, such as write a travel guide to Hawaii', and 'there are still biases due to certain LLM families favoring certain prompt types.'"
    198       }
    199     },
    200     "data_integrity": {
    201       "raw_data_available": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "Section 6 states: 'we open-source the leaderboard, all questions, all code to run API and open-source models, all model outputs for 40 models, and all code to score the models.' Raw model outputs and questions are publicly available."
    205       },
    206       "data_collection_described": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "Detailed data collection procedures are described for each task: math competition sources (AMC12, AIME, SMC, USAMO, IMO), coding sources (LiveCodeBench, LeetCode), Kaggle/Socrata datasets, Guardian articles, ArXiv abstracts, and IMDb/Wikipedia synopses (Sections 2.1-2.6)."
    210       },
    211       "recruitment_methods_described": {
    212         "applies": false,
    213         "answer": false,
    214         "justification": "No human participants are involved. Data sources are publicly available benchmarks, competitions, news articles, and datasets."
    215       },
    216       "data_pipeline_documented": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "The full pipeline from data sources to final questions is documented for each task, including question generation, modification, scoring methodology (Sections 2.1-2.6, Appendix A.3), and the monthly update process (Section 2.7, Appendix A.6)."
    220       }
    221     },
    222     "conflicts_of_interest": {
    223       "funding_disclosed": {
    224         "applies": true,
    225         "answer": true,
    226         "justification": "The footnote on page 1 states 'Sponsored by Abacus.AI' and lists author affiliations including Abacus.AI, NYU, Nvidia, UMD, USC, and Columbia."
    227       },
    228       "affiliations_disclosed": {
    229         "applies": true,
    230         "answer": true,
    231         "justification": "Author affiliations are listed on page 1. Multiple first authors are from Abacus.AI, and the paper evaluates Dracarys models (an Abacus.AI product) alongside other models."
    232       },
    233       "funder_independent_of_outcome": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "Abacus.AI sponsored the research and multiple first authors are Abacus.AI employees. The benchmark evaluates Abacus.AI's own Dracarys models, which rank 14th overall (50.1%). The funder has a direct interest in the benchmark's outcomes and in how their models are ranked."
    237       },
    238       "financial_interests_declared": {
    239         "applies": true,
    240         "answer": false,
    241         "justification": "No competing interests or financial interests statement is included in the paper. Given that Abacus.AI employees designed the benchmark and evaluate their own products, a formal disclosure would be expected."
    242       }
    243     },
    244     "contamination": {
    245       "training_cutoff_stated": {
    246         "applies": true,
    247         "answer": false,
    248         "justification": "The paper does not state the training data cutoff dates for the 40 models evaluated. It discusses contamination conceptually and uses temporal prevention (recent data), but individual model cutoffs are not listed."
    249       },
    250       "train_test_overlap_discussed": {
    251         "applies": true,
    252         "answer": true,
    253         "justification": "The entire paper is motivated by train/test overlap concerns. Appendix A.7 explicitly discusses: 'we acknowledge that LiveBench does not fully satisfy [test set contamination]: while nearly all questions are from June 2024 or more recent, there are some coding questions from November 2023.' The benchmark design uses temporal prevention."
    254       },
    255       "benchmark_contamination_addressed": {
    256         "applies": true,
    257         "answer": true,
    258         "justification": "Contamination is the paper's central concern. They address it through: (1) using recently-released data sources, (2) monthly question updates, (3) keeping 1/6 questions private, (4) modifying existing benchmark tasks to reduce memorization, and (5) discussing different types of contamination in Appendix A.7."
    259       }
    260     },
    261     "human_studies": {
    262       "pre_registered": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants are involved in this benchmark evaluation study."
    266       },
    267       "irb_or_ethics_approval": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants are involved in this benchmark evaluation study."
    271       },
    272       "demographics_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants are involved in this benchmark evaluation study."
    276       },
    277       "inclusion_exclusion_criteria": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants are involved in this benchmark evaluation study."
    281       },
    282       "randomization_described": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants are involved in this benchmark evaluation study."
    286       },
    287       "blinding_described": {
    288         "applies": false,
    289         "answer": false,
    290         "justification": "No human participants are involved in this benchmark evaluation study."
    291       },
    292       "attrition_reported": {
    293         "applies": false,
    294         "answer": false,
    295         "justification": "No human participants are involved in this benchmark evaluation study."
    296       }
    297     },
    298     "cost_and_practicality": {
    299       "inference_cost_reported": {
    300         "applies": true,
    301         "answer": true,
    302         "justification": "Table 16 provides approximate costs in USD for running LiveBench on each API model (e.g., o1-preview at ~$47.87, claude-3-haiku at ~$0.90). Table 15 provides mean and standard deviation of input/output tokens per task."
    303       },
    304       "compute_budget_stated": {
    305         "applies": true,
    306         "answer": false,
    307         "justification": "Per-model API costs are listed in Table 16 but total computational budget is not stated. GPU hours for running open-source models are not reported. The paper only mentions that evaluation is 'easily within the computational budgets of the authors' institutions' (Section 2.7)."
    308       }
    309     },
    310     "experimental_rigor": {
    311       "seed_sensitivity_reported": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "Results are from single runs with temperature 0. While temperature 0 reduces stochasticity, no sensitivity analysis across runs or seeds is reported. Some API implementations are not perfectly deterministic even at temperature 0."
    315       },
    316       "number_of_runs_stated": {
    317         "applies": true,
    318         "answer": true,
    319         "justification": "Section 3 states 'we perform single-turn evaluation with temperature 0,' implying a single run per model per question. Bootstrap confidence intervals are computed over questions, not multiple runs."
    320       },
    321       "hyperparameter_search_budget": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "The paper states prompts are 'tailored for each category and task' (Section 2) but does not report how many prompt variants were tried or the budget spent on prompt design. Model hyperparameters are set from documentation, but prompt engineering effort is unreported."
    325       },
    326       "best_config_selection_justified": {
    327         "applies": true,
    328         "answer": true,
    329         "justification": "All models are evaluated with standardized settings (temperature 0, documented chat templates) and all 40 models' full results are reported in Table 2. No selective reporting of configurations."
    330       },
    331       "multiple_comparison_correction": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "The paper compares 40 models across 18 tasks and reports many correlation coefficients (Tables 5-6) without any correction for multiple comparisons. Claims like 'X substantially outperforms Y' are made from raw score comparisons."
    335       },
    336       "self_comparison_bias_addressed": {
    337         "applies": true,
    338         "answer": false,
    339         "justification": "Abacus.AI employees designed the benchmark and evaluate their own Dracarys models (which rank 14th/40 overall). The potential bias of benchmark designers evaluating their own models is not acknowledged or discussed."
    340       },
    341       "compute_budget_vs_performance": {
    342         "applies": true,
    343         "answer": false,
    344         "justification": "The paper evaluates models ranging from 0.5B to 405B parameters but does not report performance as a function of compute budget. phi-3.5-moe with 6.6B active parameters is noted to outperform larger models, but no systematic compute-performance analysis is provided."
    345       },
    346       "benchmark_construct_validity": {
    347         "applies": true,
    348         "answer": true,
    349         "justification": "The paper extensively discusses what LiveBench measures versus alternatives. Section 3.3 compares with ChatBot Arena and Arena-Hard, analyzing where rankings diverge and hypothesizing why. The paper discusses biases of LLM judging and human judging approaches. Figure 2 analyzes inter-task correlations to assess category independence."
    350       },
    351       "scaffold_confound_addressed": {
    352         "applies": false,
    353         "answer": false,
    354         "justification": "No scaffolding is involved. LiveBench evaluates models directly via single-turn prompts without agents, tools, or multi-step workflows."
    355       }
    356     },
    357     "data_leakage": {
    358       "temporal_leakage_addressed": {
    359         "applies": true,
    360         "answer": true,
    361         "justification": "Temporal leakage is the paper's central concern. All benchmark questions use data from November 2023 or later, and the benchmark is updated monthly with new questions. Section 1 cites evidence of temporal contamination in other benchmarks (e.g., Codeforces performance dropping after training cutoff)."
    362       },
    363       "feature_leakage_addressed": {
    364         "applies": true,
    365         "answer": false,
    366         "justification": "The paper does not discuss whether the evaluation setup (prompt format, answer structure, or context) could leak answer information to the models. Appendix A.4 discusses parsing fairness but not feature leakage in the Kapoor & Narayanan sense."
    367       },
    368       "non_independence_addressed": {
    369         "applies": true,
    370         "answer": false,
    371         "justification": "The paper does not discuss potential non-independence between questions (e.g., multiple questions from the same math competition, questions from the same Kaggle dataset, or structural similarities between questions within a task)."
    372       },
    373       "leakage_detection_method": {
    374         "applies": true,
    375         "answer": true,
    376         "justification": "The paper uses concrete prevention methods: temporal splits (using data after training cutoffs), keeping 1/6 questions private each month, modifying questions from existing benchmarks (rearranging answer choices, changing prose), and not releasing question generation code. These are concrete anti-leakage measures, though detection methods (canary strings, membership inference) are not used."
    377       }
    378     }
    379   },
    380   "claims": [
    381     {
    382       "claim": "LiveBench is the first benchmark that combines frequently-updated questions from recent sources, objective ground-truth scoring without LLM judges, and diverse task coverage across six categories.",
    383       "evidence": "Section 1 describes the three desiderata and claims novelty. The benchmark includes 18 tasks across math, coding, reasoning, language, instruction following, and data analysis, with questions from recent sources and automated scoring.",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "o1-preview-2024-09-12 performs the best overall, 6% better than all other models, with no current model achieving higher than 70% accuracy.",
    388       "evidence": "Table 1 shows o1-preview at 64.7% overall, with the next-best model (claude-3-5-sonnet-20241022) at 58.5%. All models are below 70%.",
    389       "supported": "strong"
    390     },
    391     {
    392       "claim": "LLM judges cannot accurately evaluate challenging math and reasoning questions, with error rates of 21-46%.",
    393       "evidence": "Tables 8-9 (Appendix A.2) show GPT-4-Turbo judge error rates of 21.4-42.0% on AMC12, AIME, SMC, and Zebra puzzles. The authors acknowledge this is a 'preliminary study' and 'the results are not definitive.'",
    394       "supported": "moderate"
    395     },
    396     {
    397       "claim": "LiveBench correlates well with ChatBot Arena (r=0.91) and Arena-Hard (r=0.88) but reveals meaningful differences between benchmarks.",
    398       "evidence": "Section 3.3 reports the correlation coefficients. Figures 4 and 6 show the comparisons and identify outlier models (e.g., GPT-4 models performing better on Arena-Hard, likely due to LLM judge bias).",
    399       "supported": "strong"
    400     },
    401     {
    402       "claim": "Model rankings remain stable across monthly benchmark updates (rank correlation >0.997) while the benchmark becomes harder over time.",
    403       "evidence": "Section 3.4 reports rank correlations >0.997 between updates and a 1.2% drop in median/mean scores across two updates.",
    404       "supported": "moderate"
    405     },
    406     {
    407       "claim": "Math competition performance (math_comp) is the greatest indicator of overall model performance.",
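     408       "evidence": "Table 6 shows math_comp has the second-highest task correlation with overall LiveBench score (r=0.9035, after web_of_lies_v2 at r=0.9136). Section 3.2 states math_comp 'correlates the highest with average LiveBench performance.' The table therefore ranks math_comp second rather than first, so the 'greatest indicator' wording is only partly borne out.",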
    409       "supported": "moderate"
    410     },
    411     {
    412       "claim": "The best-performing open-source models (Llama-3.1-405b and Qwen2.5-72b) outperform GPT-4-turbo.",
     413       "evidence": "Table 2 shows meta-llama-3.1-405b-instruct-turbo at 51.1% and qwen2.5-coder-32b-instruct at 45.0%, while gpt-4-turbo-2024-04-09 scores 49.6%. Only the Llama result therefore exceeds gpt-4-turbo. The Qwen half of the claim cannot be checked against this table: qwen2.5-72b-instruct is not shown (qwen2.5-coder-32b-instruct is shown instead, and it scores below gpt-4-turbo).",
    414       "supported": "moderate"
    415     }
    416   ],
    417   "red_flags": [
    418     {
    419       "flag": "Undisclosed conflict of interest",
    420       "detail": "Multiple first authors are from Abacus.AI, the paper's sponsor. Abacus.AI's Dracarys models are evaluated and rank 14th/40 (50.1%), but no competing interests statement is provided. The benchmark designers evaluating their own product creates potential for favorable task/prompt selection."
    421     },
    422     {
    423       "flag": "No significance tests for model comparisons",
    424       "detail": "Claims like 'o1-preview substantially outperforms all other models' and 'the best-performing open-source models outperform gpt-4-turbo' are based on comparing raw point estimates. While bootstrap CIs are shown in figures, no formal significance tests support the comparative claims in the text."
    425     },
    426     {
    427       "flag": "Residual contamination acknowledged",
    428       "detail": "Appendix A.7 acknowledges: 'there are some coding questions from November 2023, and the AMC questions have only undergone a low level of modification from their November 2023 version. Therefore, a limited fraction of LiveBench is likely contaminated on all recent LLMs.' This undermines the central contamination-resistance claim for some tasks."
    429     },
    430     {
    431       "flag": "Preliminary LLM judging study presented as evidence",
    432       "detail": "The LLM judging ablation (Appendix A.2) is described as 'preliminary' with 'not definitive' results, using only one judge model (GPT-4-Turbo) and two evaluated models. Despite this, it is used to support the paper's central argument against LLM judging."
    433     }
    434   ],
    435   "cited_papers": [
    436     {
    437       "title": "To the cutoff... and beyond? A longitudinal perspective on LLM data contamination",
    438       "authors": ["Manley Roberts", "Himanshu Thakur", "Christine Herlihy", "Colin White", "Samuel Dooley"],
    439       "year": 2024,
    440       "relevance": "Provides evidence of test set contamination in LLM benchmarks, including performance drops after training cutoff dates on Codeforces."
    441     },
    442     {
    443       "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code",
    444       "authors": ["Naman Jain", "King Han", "Alex Gu"],
    445       "year": 2024,
    446       "arxiv_id": "2403.07974",
    447       "relevance": "Contamination-free code evaluation benchmark that LiveBench draws coding tasks from; shares the temporal-freshness approach."
    448     },
    449     {
    450       "title": "Chatbot Arena: An open platform for evaluating LLMs by human preference",
    451       "authors": ["Wei-Lin Chiang", "Lianmin Zheng"],
    452       "year": 2024,
    453       "arxiv_id": "2403.04132",
    454       "relevance": "Major human-preference LLM evaluation platform that LiveBench compares against (r=0.91 correlation)."
    455     },
    456     {
    457       "title": "From live data to high-quality benchmarks: The Arena-Hard pipeline",
    458       "authors": ["Tianle Li", "Wei-Lin Chiang", "Evan Frick"],
    459       "year": 2024,
    460       "relevance": "LLM-judge based benchmark that LiveBench compares against (r=0.88 correlation); demonstrates LLM judge biases."
    461     },
    462     {
    463       "title": "Challenging big-bench tasks and whether chain-of-thought can solve them",
    464       "authors": ["Mirac Suzgun", "Nathan Scales", "Nathanael Schärli"],
    465       "year": 2023,
    466       "arxiv_id": "2210.09261",
    467       "relevance": "Big-Bench Hard benchmark whose Web of Lies task LiveBench extends with a harder v2 version."
    468     },
    469     {
    470       "title": "Instruction-following evaluation for large language models",
    471       "authors": ["Jeffrey Zhou", "Tianjian Lu", "Swaroop Mishra"],
    472       "year": 2023,
    473       "arxiv_id": "2311.07911",
    474       "relevance": "IFEval benchmark for verifiable instruction following; LiveBench adapts its instruction set with live prompts from Guardian articles."
    475     },
    476     {
    477       "title": "Generalization or memorization: Data contamination and trustworthy evaluation for large language models",
    478       "authors": ["Yihong Dong", "Xue Jiang", "Huanyu Liu"],
    479       "year": 2024,
    480       "arxiv_id": "2402.15938",
    481       "relevance": "Analyzes data contamination and its effects on trustworthy LLM evaluation."
    482     },
    483     {
    484       "title": "A careful examination of large language model performance on grade school arithmetic",
    485       "authors": ["Hugh Zhang", "Jeff Da", "Dean Lee"],
    486       "year": 2024,
    487       "arxiv_id": "2405.00332",
    488       "relevance": "Demonstrates evidence that several models have overfit to the GSM8K benchmark, motivating contamination-resistant evaluation."
    489     },
    490     {
    491       "title": "Evaluation data contamination in LLMs: how do we measure it and (when) does it matter?",
    492       "authors": ["Aaditya K Singh", "Muhammed Yusuf Kocyigit"],
    493       "year": 2024,
    494       "arxiv_id": "2411.03923",
    495       "relevance": "Discusses measurement approaches for evaluation data contamination in LLMs."
    496     },
    497     {
    498       "title": "Length-controlled AlpacaEval: A simple way to debias automatic evaluators",
    499       "authors": ["Yann Dubois", "Balázs Galambosi", "Percy Liang", "Tatsunori B Hashimoto"],
    500       "year": 2024,
    501       "arxiv_id": "2404.04475",
    502       "relevance": "Addresses LLM judge biases including verbosity preference in automatic evaluation, relevant to LiveBench's argument against LLM judging."
    503     },
    504     {
    505       "title": "Functional benchmarks for robust evaluation of reasoning performance, and the reasoning gap",
    506       "authors": ["Saurabh Srivastava", "Anto PV", "Shashank Menon"],
    507       "year": 2024,
    508       "arxiv_id": "2402.19450",
    509       "relevance": "Modifies MATH dataset by changing numbers and finds performance declines, showing contamination effects on math benchmarks."
    510     },
    511     {
    512       "title": "MMLU-Pro: A more robust and challenging multi-task language understanding benchmark",
    513       "authors": ["Yubo Wang", "Xueguang Ma", "Ge Zhang"],
    514       "year": 2024,
    515       "arxiv_id": "2406.01574",
    516       "relevance": "Contemporary static LLM benchmark used on the HuggingFace Open LLM Leaderboard; context for LiveBench's dynamic approach."
    517     },
    518     {
    519       "title": "Data contamination quiz: A tool to detect and estimate contamination in large language models",
    520       "authors": ["Shahriar Golchin", "Mihai Surdeanu"],
    521       "year": 2023,
    522       "arxiv_id": "2311.06233",
    523       "relevance": "Tool for detecting data contamination in LLMs, relevant to benchmark contamination assessment."
    524     },
    525     {
    526       "title": "Proving test set contamination in black box language models",
    527       "authors": ["Yonatan Oren", "Nicole Meister", "Niladri Chatterji"],
    528       "year": 2023,
    529       "arxiv_id": "2310.17623",
    530       "relevance": "Provides methods for proving test set contamination in black-box LLMs."
    531     }
    532   ],
    533   "engagement_factors": {
    534     "practical_relevance": {
    535       "score": 3,
    536       "justification": "LiveBench is immediately usable as a fully open-source benchmark with pip-installable code, leaderboard, and monthly updates — practitioners can evaluate any model today."
    537     },
    538     "surprise_contrarian": {
    539       "score": 1,
    540       "justification": "The contamination problem is well known; the paper confirms rather than challenges existing beliefs, though the LLM judging failure rates on hard questions add mild surprise."
    541     },
    542     "fear_safety": {
    543       "score": 0,
    544       "justification": "No AI risk or security concerns are raised; this is a benchmark methodology paper."
    545     },
    546     "drama_conflict": {
    547       "score": 2,
    548       "justification": "Implicitly argues that popular benchmarks (ChatBot Arena, Arena-Hard, MMLU) are compromised by contamination or LLM judge bias, and shows GPT-4 judges are biased toward their own outputs."
    549     },
    550     "demo_ability": {
    551       "score": 3,
    552       "justification": "Fully open-source on GitHub with all code, questions, and model outputs; a live leaderboard at livebench.ai; and instructions to evaluate new models."
    553     },
    554     "brand_recognition": {
    555       "score": 2,
    556       "justification": "Published at ICLR 2025, evaluates GPT-4/o1, Claude, Gemini; co-authored by Yann LeCun (NYU/Meta). Abacus.AI is less prominent but the models evaluated are household names."
    557     }
    558   }
    559 }

Impressum · Datenschutz