scan-v5.json (22941B)
1 { 2 "scan_version": 5, 3 "paper_type": "benchmark-creation", 4 "paper": { 5 "title": "LiveBench: A Challenging, Contamination-Limited LLM Benchmark", 6 "authors": [ 7 "Colin White", 8 "Samuel Dooley", 9 "Manley Roberts", 10 "Arka Pal", 11 "Benjamin Feuer", 12 "Siddhartha Jain", 13 "Ravid Shwartz-Ziv", 14 "Neel Jain", 15 "Khalid Saifullah", 16 "Sreemanti Dey", 17 "Shubh Agrawal", 18 "Sandeep Singh Sandha", 19 "Siddartha Naidu", 20 "Chinmay Hegde", 21 "Yann LeCun", 22 "Tom Goldstein", 23 "Willie Neiswanger", 24 "Micah Goldblum" 25 ], 26 "year": 2024, 27 "venue": "ICLR 2025", 28 "arxiv_id": "2406.19314", 29 "doi": "10.48550/arXiv.2406.19314" 30 }, 31 "checklist": { 32 "claims_and_evidence": { 33 "abstract_claims_supported": { 34 "applies": true, 35 "answer": true, 36 "justification": "All abstract claims are supported: top models achieve below 70% accuracy (Table 1/2), six task categories are demonstrated, monthly updates are described with completion details in Section 2.7, and all code/questions/answers are released at livebench.ai.", 37 "source": "haiku" 38 }, 39 "causal_claims_justified": { 40 "applies": true, 41 "answer": true, 42 "justification": "The causal-adjacent claims (e.g., 'gpt-4 models perform better on Arena-Hard likely due to known bias from using gpt-4 as the LLM judge') are appropriately hedged with 'likely due to' and 'hypothesize,' and the ablation in Appendix A.2 provides supporting evidence for LLM judge failure on hard tasks.", 43 "source": "haiku" 44 }, 45 "generalization_bounded": { 46 "applies": true, 47 "answer": true, 48 "justification": "The paper explicitly uses 'contamination-limited' rather than 'contamination-free' in the actual paper title (vs. 
arXiv metadata), and Appendix A.7 explicitly acknowledges that November 2023 coding questions and lightly-modified AMC questions may be contaminated on recent LLMs.", 49 "source": "haiku" 50 }, 51 "alternative_explanations_discussed": { 52 "applies": true, 53 "answer": true, 54 "justification": "The paper considers multiple explanations for cross-benchmark performance differences, including LLM judge bias (gpt-4 judging GPT-4), human preference for verbose outputs, and model output style advantages on ChatBot Arena.", 55 "source": "haiku" 56 }, 57 "proxy_outcome_distinction": { 58 "applies": true, 59 "answer": true, 60 "justification": "The paper is careful about what each task measures; the limitations section explicitly acknowledges that ground-truth scoring cannot cover open-ended tasks ('write a travel guide to Hawaii'), distinguishing benchmark-measurable capabilities from broader general intelligence.", 61 "source": "haiku" 62 } 63 }, 64 "limitations_and_scope": { 65 "limitations_section_present": { 66 "applies": true, 67 "answer": true, 68 "justification": "Section 5 includes a dedicated 'Limitations and Future Work' paragraph listing specific gaps: no non-English tasks, ground-truth scoring inapplicable to open-ended tasks, and prompt-type biases across LLM families.", 69 "source": "haiku" 70 }, 71 "threats_to_validity_specific": { 72 "applies": true, 73 "answer": true, 74 "justification": "Specific threats are identified: some November 2023 coding questions are likely contaminated on recent models (Appendix A.7), prompt format biases may advantage certain LLM families, and the Olympiad task notes results were from June 2024 before newer models were released.", 75 "source": "haiku" 76 }, 77 "scope_boundaries_stated": { 78 "applies": true, 79 "answer": true, 80 "justification": "Explicit scope boundaries are stated: benchmark cannot evaluate open-ended responses, currently English-only, does not cover all LLM capability types, and the private-question pool 
only partially prevents contamination of the monthly-updated questions.", 81 "source": "haiku" 82 } 83 }, 84 "conflicts_of_interest": { 85 "funding_disclosed": { 86 "applies": true, 87 "answer": true, 88 "justification": "Funding is disclosed in a footnote on page 1: 'Sponsored by Abacus.AI.' Authors are primarily affiliated with Abacus.AI (first affiliation listed).", 89 "source": "haiku" 90 }, 91 "affiliations_disclosed": { 92 "applies": true, 93 "answer": true, 94 "justification": "All author affiliations are disclosed: Abacus.AI, NYU, Nvidia, UMD, USC, and Columbia are listed on the title page.", 95 "source": "haiku" 96 }, 97 "funder_independent_of_outcome": { 98 "applies": true, 99 "answer": false, 100 "justification": "Abacus.AI sponsors the work and the lead authors are Abacus.AI employees; Abacus.AI's own models ('dracarys2-72b-instruct') are included in the evaluation without explicit disclosure of this conflict, creating a potential evaluation bias.", 101 "source": "haiku" 102 }, 103 "financial_interests_declared": { 104 "applies": true, 105 "answer": false, 106 "justification": "There is no competing interests or financial interests statement in the paper; only the sponsorship acknowledgment appears in a footnote, with no declaration of equity, patents, or consulting relationships.", 107 "source": "haiku" 108 } 109 }, 110 "scope_and_framing": { 111 "key_terms_defined": { 112 "applies": true, 113 "answer": true, 114 "justification": "Key terms are defined precisely: Appendix A.7 distinguishes two definitions of 'contamination' (test set contamination vs. 
task contamination/train-test distribution similarity), and each of the 18 tasks is defined with its measurement methodology.", 115 "source": "haiku" 116 }, 117 "intended_contribution_clear": { 118 "applies": true, 119 "answer": true, 120 "justification": "The contribution is stated explicitly and repeatedly: LiveBench is the first benchmark combining (1) frequently-updated questions from recent sources, (2) automatic objective ground-truth scoring, and (3) diverse task coverage across six categories.", 121 "source": "haiku" 122 }, 123 "engagement_with_prior_work": { 124 "applies": true, 125 "answer": true, 126 "justification": "Section 4 systematically compares LiveBench to prior benchmarks (ChatBot Arena, Arena-Hard, AlpacaEval, SEAL, LiveCodeBench, Omni-MATH, Open LLM Leaderboard) and explains specifically how LiveBench addresses their limitations rather than merely listing citations.", 127 "source": "haiku" 128 } 129 } 130 }, 131 "type_checklist": { 132 "benchmark-creation": { 133 "construct_design": { 134 "construct_validity_argued": { 135 "applies": true, 136 "answer": true, 137 "justification": "The paper argues why each task measures its claimed capability: math competitions test problem-solving with diverse question types, coding tasks use execution-based pass@1, and the Web of Lies v2 expansion with red herrings tests logical deduction rather than pattern matching.", 138 "source": "haiku" 139 }, 140 "difficulty_distribution_characterized": { 141 "applies": true, 142 "answer": true, 143 "justification": "Each task targets a 30-70% success rate on top models, difficulty variants are explicitly created for the Olympiad task (10/50/80% masking, Table 10-11), and bootstrap confidence intervals are reported for all 40 models in Figure 1/Figure 5.", 144 "source": "haiku" 145 }, 146 "ceiling_floor_effects_checked": { 147 "applies": true, 148 "answer": true, 149 "justification": "Ceiling effects are explicitly addressed: no model exceeds 70% accuracy, and the 
maintenance policy (Section 2.7) replaces the easiest tasks first precisely to prevent ceiling effects as models improve. Floor effects are not explicitly analyzed for the weakest models (phi-3-mini scores ~21%).", 150 "source": "haiku" 151 }, 152 "human_baseline_included": { 153 "applies": true, 154 "answer": false, 155 "justification": "No human baseline is reported for any task. The math competition questions are known human competitions, but the paper does not report human accuracy on the LiveBench versions of these tasks, making it impossible to assess where human performance falls relative to LLMs.", 156 "source": "haiku" 157 }, 158 "scoring_rubric_justified": { 159 "applies": true, 160 "answer": true, 161 "justification": "Scoring rubrics are justified throughout: pass@1 for coding (tests complete solution correctness), Levenshtein distance for plot unscrambling (captures partial ordering quality), regex-based permissive grading for math to avoid penalizing format variations, and the grading methodology appendix (A.4) justifies why LLM judges are avoided.", 162 "source": "haiku" 163 } 164 }, 165 "robustness": { 166 "contamination_resistance_designed": { 167 "applies": true, 168 "answer": true, 169 "justification": "Contamination resistance is the primary design goal: questions are sourced from post-June 2024 publications, new questions are withheld for one month, and the generation code is modified with each update so that distribution shifts make pretraining on old questions less useful.", 170 "source": "haiku" 171 }, 172 "temporal_robustness_discussed": { 173 "applies": true, 174 "answer": true, 175 "justification": "Section 2.7 and Appendix A.6 provide a detailed maintenance plan: monthly updates replacing 1/6 of questions, priority given to oldest and easiest tasks, sustainability assessed in terms of computational burden (~50 models × 200 questions/month), and two completed updates demonstrating the plan works.", 176 "source": "haiku" 177 }, 178 
"failure_modes_discussed": { 179 "applies": true, 180 "answer": true, 181 "justification": "Appendix A.7 explicitly identifies failure modes: some November 2023 coding questions and lightly-modified AMC questions may be contaminated; task contamination (distribution similarity) is discussed as a distinct remaining risk; Section 5 notes open-ended tasks cannot be covered.", 182 "source": "haiku" 183 }, 184 "baseline_implementations_provided": { 185 "applies": true, 186 "answer": true, 187 "justification": "The paper releases all questions, model answers for 40 models, and full codebase at https://github.com/livebench/livebench; Table 2 provides complete results for all 40 models across all 6 categories for reproducibility.", 188 "source": "haiku" 189 } 190 }, 191 "documentation": { 192 "dataset_documentation_complete": { 193 "applies": true, 194 "answer": true, 195 "justification": "Table 13 provides data sources for all 18 tasks, Table 15 provides token statistics, the paper includes a full datasheet (referenced at GitHub), and each task's collection methodology and preprocessing steps are described in Appendix A.3.", 196 "source": "haiku" 197 }, 198 "licensing_and_access_clear": { 199 "applies": true, 200 "answer": true, 201 "justification": "The repository is under Apache License 2.0 (Section B.1), the dataset is on HuggingFace, the leaderboard is at livebench.ai, and the README provides download instructions; community contributions are explicitly welcomed.", 202 "source": "haiku" 203 }, 204 "intended_use_specified": { 205 "applies": true, 206 "answer": true, 207 "justification": "The intended use (evaluating LLM capabilities across diverse tasks in a contamination-resistant way) is clearly stated, and limitations section specifies what the benchmark cannot evaluate (open-ended tasks, non-English capabilities), bounding appropriate conclusions.", 208 "source": "haiku" 209 } 210 } 211 } 212 }, 213 "claims": [ 214 { 215 "claim": "LiveBench is the first benchmark 
combining frequently-updated questions from recent sources, objective ground-truth scoring, and diverse task coverage across six categories.", 216 "evidence": "Abstract and Section 1 compare against prior benchmarks (ChatBot Arena uses human judges, Arena-Hard uses LLM judges, LiveCodeBench is coding-only), and Section 2 demonstrates the implementation of all three desiderata.", 217 "supported": "strong" 218 }, 219 { 220 "claim": "No current model achieves higher than 70% accuracy on LiveBench, demonstrating the benchmark is genuinely challenging.", 221 "evidence": "Table 1 and Table 2 show o1-preview at 64.7% as the top performer; Figure 5 shows all 40 models below 70%.", 222 "supported": "strong" 223 }, 224 { 225 "claim": "LLM judges have unacceptably high error rates (21-46%) when evaluating hard math and reasoning questions.", 226 "evidence": "Table 8 shows GPT-4-Turbo judging its own outputs on AMC12 with 38% error, AIME with 21.4%, SMC with 35.3%, Zebra puzzles with 42%. Table 9 shows judge-assigned scores diverge significantly from ground-truth scores.", 227 "supported": "strong" 228 }, 229 { 230 "claim": "LiveBench has 0.91 and 0.88 Pearson correlation with ChatBot Arena and Arena-Hard respectively, showing it captures similar model ranking information.", 231 "evidence": "Figure 4 and Section 3.3 report these correlations; Figure 6 shows scatter plots with best-fit lines for both comparisons.", 232 "supported": "strong" 233 }, 234 { 235 "claim": "Monthly question updates maintain consistent model rankings (rank correlation >0.997 between updates) while making the benchmark harder over time.", 236 "evidence": "Section 3.4 reports rank correlation >0.997 between original and first update, and first and second update; median/mean scores dropped ~1.2% across updates.", 237 "supported": "strong" 238 }, 239 { 240 "claim": "Some models (gpt-4-0125-preview, gpt-4-turbo) perform substantially better on Arena-Hard than LiveBench, likely due to LLM judge bias.", 241 
"evidence": "Figure 4 shows these models as notable outliers above the best-fit line in Arena-Hard vs. LiveBench comparison; the paper hypothesizes this is because GPT-4 judges favor GPT-4 outputs.", 242 "supported": "moderate" 243 }, 244 { 245 "claim": "LLM performance on Codeforces drops significantly after training cutoff dates, providing evidence of benchmark contamination in prior work.", 246 "evidence": "Paper cites Roberts et al. (2024) and Jain et al. (2024) for this claim; it is not demonstrated directly in this paper but used as motivation.", 247 "supported": "moderate" 248 } 249 ], 250 "methodology_tags": [ 251 "benchmark-eval", 252 "benchmark-creation", 253 "observational" 254 ], 255 "key_findings": "LiveBench introduces a contamination-resistant LLM benchmark by sourcing questions from recent (post-June 2024) information and updating them monthly, using only objective ground-truth scoring to eliminate LLM judge biases. No model exceeds 70% accuracy across 18 tasks in 6 categories, demonstrating persistent difficulty. LLM judges show 21-46% error rates on hard math and reasoning tasks, empirically validating the design choice to avoid them. Monthly question updates maintain rank-stable leaderboard consistency (Spearman r>0.997) while gradually increasing difficulty as models improve.", 256 "red_flags": [ 257 { 258 "flag": "Title contamination overclaim", 259 "detail": "The arXiv/registry title says 'Contamination-Free' but the actual published paper uses 'Contamination-Limited'; Appendix A.7 explicitly acknowledges November 2023 coding questions and lightly-modified AMC questions are likely contaminated in recent models." 260 }, 261 { 262 "flag": "No human baseline", 263 "detail": "The paper never reports human accuracy on any task, making it impossible to interpret what fraction of human performance any LLM achieves or whether the benchmark difficulty distribution is appropriate." 
264 }, 265 { 266 "flag": "Funder evaluates own models", 267 "detail": "Abacus.AI sponsors the benchmark and includes its own 'Dracarys' model family in the leaderboard without explicit disclosure of this conflict; evaluation infrastructure is controlled by the funder." 268 }, 269 { 270 "flag": "No contamination empirical test", 271 "detail": "The paper argues the benchmark resists contamination by design but provides no empirical test comparing model performance before/after training on LiveBench data, leaving the core claim unvalidated." 272 } 273 ], 274 "cited_papers": [ 275 { 276 "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code", 277 "relevance": "Direct predecessor for coding tasks; also uses temporal splits for contamination resistance — LiveBench adopts 78 problems from LCB and uses the same anti-contamination philosophy." 278 }, 279 { 280 "title": "Chatbot Arena: An Open Platform for Evaluating LLMs by Human Preference", 281 "relevance": "Primary comparison benchmark; LiveBench achieves 0.91 correlation with Arena while avoiding human judging biases that Arena is subject to." 282 }, 283 { 284 "title": "From Live Data to High-Quality Benchmarks: The Arena-Hard Pipeline", 285 "relevance": "Second comparison benchmark; LiveBench shows some models (GPT-4) score substantially higher on Arena-Hard due to LLM judge bias." 286 }, 287 { 288 "title": "Challenging Big-Bench Tasks and Whether Chain-of-Thought Can Solve Them", 289 "relevance": "Source of Web of Lies task; LiveBench creates a harder v2 version after models reached near-100% on the original." 290 }, 291 { 292 "title": "Instruction-Following Evaluation for Large Language Models (IFEval)", 293 "relevance": "Source of instruction-following task design; LiveBench uses 16 of IFEval's 25 instructions with live news article prompts instead of static prompts." 
294 }, 295 { 296 "title": "Measuring Mathematical Problem Solving with the MATH Dataset (AMPS/MATH)", 297 "relevance": "Source of synthetic math question generation approach; LiveBench creates AMPS_Hard by sampling from harder distributions than original AMPS." 298 }, 299 { 300 "title": "To the Cutoff... and Beyond? A Longitudinal Perspective on LLM Data Contamination", 301 "relevance": "Key motivation paper showing LLM performance on Codeforces drops after training cutoff, providing empirical evidence that contamination inflates pre-cutoff benchmark scores." 302 }, 303 { 304 "title": "Investigating Data Contamination in Modern Benchmarks for Large Language Models", 305 "relevance": "Background on contamination measurement methods that motivate LiveBench's design choices." 306 }, 307 { 308 "title": "A Survey on Evaluation of Large Language Models", 309 "relevance": "Comprehensive review referenced for situating LiveBench within the broader LLM evaluation literature." 310 } 311 ], 312 "engagement_factors": { 313 "practical_relevance": { 314 "score": 3, 315 "justification": "LiveBench is immediately usable by practitioners — live leaderboard at livebench.ai, Apache-licensed codebase, and actively maintained with monthly updates including dozens of frontier models." 316 }, 317 "surprise_contrarian": { 318 "score": 2, 319 "justification": "The empirical finding that LLM judges have 21-46% error rates on hard math/reasoning tasks directly challenges the widespread assumption that LLM-as-judge is reliable for hard benchmarks." 320 }, 321 "fear_safety": { 322 "score": 0, 323 "justification": "The paper raises no AI safety or risk concerns; it is focused on evaluation methodology improvement." 324 }, 325 "drama_conflict": { 326 "score": 1, 327 "justification": "Implicitly challenges major benchmarks (Arena-Hard, ChatBot Arena) by showing GPT-4 family models are inflated on those due to judge bias, but the critique is measured and not confrontational." 
328 }, 329 "demo_ability": { 330 "score": 3, 331 "justification": "The benchmark is live and publicly accessible at livebench.ai with a real-time leaderboard; anyone can evaluate a new model using the released codebase." 332 }, 333 "brand_recognition": { 334 "score": 2, 335 "justification": "Yann LeCun (Meta/NYU) and Tom Goldstein are prominent co-authors; the benchmark evaluates flagship models from OpenAI, Anthropic, Google, and Meta, lending credibility and visibility." 336 } 337 }, 338 "hn_data": { 339 "threads": [ 340 { 341 "hn_id": "39458363", 342 "title": "Neural Network Diffusion", 343 "points": 223, 344 "comments": 86, 345 "url": "https://news.ycombinator.com/item?id=39458363" 346 }, 347 { 348 "hn_id": "39870037", 349 "title": "GenAI Detection Tools, Adversarial Techniques, Implications in Higher Education", 350 "points": 12, 351 "comments": 2, 352 "url": "https://news.ycombinator.com/item?id=39870037" 353 }, 354 { 355 "hn_id": "45222339", 356 "title": "Analog In-Memory Computing Attention Mechanism for Fast LLMs", 357 "points": 4, 358 "comments": 0, 359 "url": "https://news.ycombinator.com/item?id=45222339" 360 }, 361 { 362 "hn_id": "40617373", 363 "title": "Wavefront Threading Enables Effective High-Level Synthesis", 364 "points": 4, 365 "comments": 0, 366 "url": "https://news.ycombinator.com/item?id=40617373" 367 }, 368 { 369 "hn_id": "40840737", 370 "title": "The Remarkable Robustness of LLMs: Stages of Inference?", 371 "points": 2, 372 "comments": 0, 373 "url": "https://news.ycombinator.com/item?id=40840737" 374 }, 375 { 376 "hn_id": "44994135", 377 "title": "LNS-Madam: Low-Precision Training in Log Using Multiplicative Weight Update", 378 "points": 2, 379 "comments": 1, 380 "url": "https://news.ycombinator.com/item?id=44994135" 381 }, 382 { 383 "hn_id": "44381834", 384 "title": "Thought Anchors: Which LLM Reasoning Steps Matter?", 385 "points": 2, 386 "comments": 0, 387 "url": "https://news.ycombinator.com/item?id=44381834" 388 }, 389 { 390 "hn_id": 
"39879057", 391 "title": "GenAI Detection Tools, Adversarial Techniques and Implications in Higher Ed", 392 "points": 2, 393 "comments": 0, 394 "url": "https://news.ycombinator.com/item?id=39879057" 395 }, 396 { 397 "hn_id": "36702130", 398 "title": "Scaling datasets lead to more hateful content:(", 399 "points": 1, 400 "comments": 1, 401 "url": "https://news.ycombinator.com/item?id=36702130" 402 }, 403 { 404 "hn_id": "44503713", 405 "title": "Praise: Enhancing Product Descriptions with LLM-Driven Structured Insights", 406 "points": 1, 407 "comments": 0, 408 "url": "https://news.ycombinator.com/item?id=44503713" 409 } 410 ], 411 "top_points": 223, 412 "total_points": 253, 413 "total_comments": 90 414 } 415 }