scan.json (20958B)
1 { 2 "paper": { 3 "title": "From Crowdsourced Data to High-Quality Benchmarks: Arena-Hard and BenchBuilder Pipeline", 4 "authors": ["Tianle Li", "Wei-Lin Chiang", "Evan Frick", "Lisa Dunlap", "Tianhao Wu", "Banghua Zhu", "Joseph E. González", "Ion Stoica"], 5 "year": 2024, 6 "venue": "arXiv", 7 "arxiv_id": "2406.11939" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": true, 14 "justification": "GitHub repository provided: https://github.com/lmarena/arena-hard-auto (Section 1, footnote 1)." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "Arena-Hard-Auto benchmark of 500 prompts is released as part of the open-source repository. The source data (Chatbot Arena, WildChat-1M) are publicly available datasets." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "No requirements.txt, Dockerfile, or detailed environment setup section found in the paper. Only mentions of specific models and APIs used." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "No step-by-step reproduction instructions are provided in the paper. The pipeline is described at a high level but specific commands or scripts to replicate results are not included." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": true, 36 "justification": "95% confidence intervals are computed via 100 rounds of bootstrapping on judgment results (Section 6.1). Confidence intervals shown in Figure 5 and used throughout the metrics." 37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "No formal significance tests (p-values, t-tests, etc.) are used. Comparisons between benchmarks rely on the proposed metrics (separability, agreement) without statistical tests of whether differences are significant." 
42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "Effect sizes are provided in context: e.g., '3x higher separation' compared to MT-Bench, specific percentage differences in separability (87.4% vs 22.6%), agreement (90.9% vs 26.6%), and Brier scores (0.069 vs 0.09)." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "No justification for why 500 prompts were chosen, why 250 clusters were sampled, or why 20 models were used for evaluation. These are stated but not justified." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": true, 56 "justification": "Variance is captured through bootstrapped confidence intervals (100 rounds of bootstrapping). Confidence intervals are shown in Figure 5 and used in the separability metric." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "Comparisons against MT-Bench, AlpacaEval 2.0 LC, and Chatbot Arena (Table 1). Random baselines also compared (Table 2, Appendix Table 7)." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "MT-Bench (2023), AlpacaEval 2.0 LC (2024), and Chatbot Arena are all contemporary and widely-used LLM benchmarks at time of publication." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": true, 73 "justification": "Several ablation-like analyses: quality score threshold effects (Figure 3), random vs. curated baselines (Table 2, Table 7), different annotators (Llama-3-70B vs GPT-4-Turbo, Table 8), different judges (Table 4), and style control effects (Table 5)." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "Multiple metrics used: Separability, Confidence Agreement, Spearman Correlation, Kendall Tau Correlation, and Brier Score (Table 1)." 
79 }, 80 "human_evaluation": { 81 "applies": true, 82 "answer": true, 83 "justification": "Chatbot Arena human preference rankings serve as the ground truth against which Arena-Hard-Auto is validated. The entire framework is evaluated by how well it aligns with human judgments." 84 }, 85 "held_out_test_set": { 86 "applies": false, 87 "answer": false, 88 "justification": "This is a benchmark construction paper, not a model training paper. The concept of held-out test sets does not apply in the traditional sense." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Topic cluster analysis provided (Figure 4), per-model breakdowns in multiple tables, per-quality-score breakdowns (Figure 3), and per-judge breakdowns (Table 4)." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "Stylistic biases (length bias, self-bias) are discussed as failure modes in Sections 6.5 and 6.6. Table 5 shows how style manipulation can game the benchmark without style control." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "Self-bias of GPT-4-Turbo judge is reported (Section 6.6, Appendix Table 10) showing it favors OpenAI models. Claude-3-Opus and Llama-3-70B perform worse as judges (Table 4)." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "Abstract claims of '3x higher separation' supported by Table 1 (87.4% vs 22.6%), '98.6% correlation' supported by Table 3/Appendix Table 9, '$20 cost' stated in Section 4.3/Table 1." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": true, 115 "justification": "The paper makes causal claims about the quality score filtering improving benchmark quality. This is supported by controlled comparisons: curated vs. 
random baselines (Table 2, Table 7), and ablations across quality score thresholds (Figure 3)." 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": true, 120 "justification": "Limitations section (Section 7) explicitly states the benchmark 'currently lacks evaluation for multi-turn and non-English interactions' and acknowledges the seven qualities 'may not fully capture the range of possible attributes, potentially skewing towards prompts in technical domains.'" 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": true, 125 "justification": "The paper discusses stylistic bias as an alternative explanation for benchmark scores (Section 6.5), self-bias of judges (Section 6.6), and tests whether results hold under style control (Table 3)." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": true, 132 "justification": "Specific model versions provided throughout: gpt-4-0314, gpt-4-1106-preview, gpt-4-turbo-2024-04-09, claude-3-opus-20240229, claude-3-sonnet-20240229, gemini-1.5-pro-0514, llama-3-70b-instruct, etc. (footnote 4, Table 4)." 133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": true, 137 "justification": "Prompt templates for the judge are stated to be in Section C (Appendix). Style control system prompts are provided in Table 6 (e.g., 'You are a helpful assistant who thoroughly explains things with as much detail as possible.')." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": false, 142 "justification": "No temperature, top-p, or other sampling parameters reported for LLM API calls used in judging or annotation." 143 }, 144 "scaffolding_described": { 145 "applies": false, 146 "answer": false, 147 "justification": "No agentic scaffolding is used. The pipeline is a data curation workflow, not an agent system." 
148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": true, 152 "justification": "Data pipeline documented: 200,000 initial prompts from Chatbot Arena, filtering of duplicates/multi-turn/non-English, clustering into 4,000 topics, quality scoring, threshold filtering (score <6 removed, cluster mean <5 removed), sampling 2 per cluster from 250 clusters (Section 4.2)." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": true, 159 "justification": "Dedicated Section 7 'Limitations' discusses biases in the pipeline, lack of multi-turn and non-English coverage, and potential skew toward technical domains." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": true, 164 "justification": "Specific threats identified: seven quality criteria may skew toward technical domains, lack of multi-turn data in crowdsourced sources, primary language proficiency of authors limiting non-English evaluation." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 7 explicitly states what is NOT covered: multi-turn interactions, non-English evaluation. The benchmark is bounded to single-turn English queries." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": true, 176 "justification": "The benchmark prompts are released via the GitHub repository. Source datasets (Chatbot Arena conversations, WildChat-1M) are publicly available." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "Data sourced from Chatbot Arena (200,000 prompts) and WildChat-1M (150,000 queries). Collection via crowdsourced live platforms described in Sections 4.1-4.2." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants recruited for this study. 
Data comes from existing crowdsourced platforms (Chatbot Arena, WildChat). Human preferences used as ground truth are from the existing Chatbot Arena platform." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": true, 191 "justification": "Full pipeline documented: embedding with text-embedding-3-small, UMAP dimensionality reduction, HDBSCAN clustering into 4,000 topics, LLM quality scoring on 7 criteria, threshold filtering, balanced sampling across clusters (Sections 4.1-4.2, Figure 2)." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": false, 198 "justification": "No funding or acknowledgments section found in the paper." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "All authors listed with UC Berkeley affiliation. Authors are also affiliated with the LMSYS Chatbot Arena project, which is the source of the benchmark data." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding disclosed, so independence cannot be assessed. The authors operate the Chatbot Arena platform whose data and rankings are used as ground truth, creating a potential conflict." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests or financial interests statement found in the paper." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": true, 219 "answer": false, 220 "justification": "The paper evaluates multiple LLMs on the benchmark but does not state training data cutoff dates for any of the models used." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": true, 224 "answer": false, 225 "justification": "No discussion of whether Arena-Hard-Auto prompts (sourced from Chatbot Arena) could overlap with training data of the evaluated models.
Chatbot Arena conversations are public and could be in training sets." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": true, 229 "answer": true, 230 "justification": "Section 2 discusses benchmark leakage as a motivation. The paper argues BenchBuilder enables 'continuous benchmark updates' to address contamination risk of static benchmarks. However, contamination of Arena-Hard-Auto itself is not directly tested." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants recruited for this study. Human preference data comes from existing Chatbot Arena platform." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants recruited for this study." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants recruited for this study." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants recruited for this study." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants recruited for this study." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants recruited for this study." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants recruited for this study." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": true, 274 "justification": "Evaluation cost per model reported as $20 (Table 1, abstract). Pipeline annotation cost reported as ~$500 with GPT-4-Turbo or ~$45 with Llama-3-70B (Section 4.3)." 
275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "API costs are reported but total computational budget (GPU hours for embeddings, clustering, etc.) is not stated." 280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "Arena-Hard-Auto achieves 98.6% correlation with Chatbot Arena human preference rankings.", 286 "evidence": "Table 3 shows 98.6% Confidence Agreement and 98.6% Spearman Correlation with style-controlled Chatbot Arena English Hard Prompts ranking. Table 1 shows 93.2% Spearman and 90.9% Confidence Agreement with overall Chatbot Arena.", 287 "supported": "strong" 288 }, 289 { 290 "claim": "Arena-Hard-Auto provides 3x higher separation of model performances compared to MT-Bench.", 291 "evidence": "Table 1: Arena-Hard-Auto separability 87.4% vs MT-Bench 22.6% (3.87x). Figure 5 visually demonstrates the difference.", 292 "supported": "strong" 293 }, 294 { 295 "claim": "Arena-Hard-Auto evaluation costs only $20 per model.", 296 "evidence": "Table 1 lists eval cost per model as $20. Section 4.3 provides cost breakdown for the pipeline annotation step.", 297 "supported": "strong" 298 }, 299 { 300 "claim": "Style control effectively neutralizes gaming via response length or markdown manipulation.", 301 "evidence": "Table 5 shows that with style control, the 'detail' variant of Llama-3.1-70B no longer outperforms the base model, whereas without style control it gains ~9 points.", 302 "supported": "strong" 303 }, 304 { 305 "claim": "BenchBuilder generalizes to different data sources (WildChat).", 306 "evidence": "Table 2 shows Wild-Hard-Auto (from WildChat) achieves 86.7% separability and 88.6% confidence agreement vs 75.6% and 36.4% for random baseline.", 307 "supported": "moderate" 308 } 309 ], 310 "methodology_tags": ["benchmark-eval"], 311 "key_findings": "The paper introduces BenchBuilder, an automated pipeline for curating LLM benchmarks from crowdsourced data, and Arena-Hard-Auto, a 500-prompt benchmark. 
Arena-Hard-Auto achieves 87.4% separability (vs 22.6% for MT-Bench) and up to 98.6% agreement with human preferences at $20 per model evaluation. The paper also proposes style control methods that effectively mitigate length and formatting biases in LLM-as-a-judge evaluation, and demonstrates the pipeline generalizes across data sources (Chatbot Arena, WildChat).", 312 "red_flags": [ 313 { 314 "flag": "Authors evaluate their own platform", 315 "detail": "The authors operate LMSYS Chatbot Arena, whose rankings serve as ground truth for validating Arena-Hard-Auto. This circular dependency means the benchmark is validated against a system the authors control. High agreement could partly reflect shared biases in data sourcing and evaluation methodology." 316 }, 317 { 318 "flag": "Contamination of Arena-Hard-Auto not directly tested", 319 "detail": "While benchmark leakage is discussed as motivation, the paper does not test whether Arena-Hard-Auto prompts (sourced from public Chatbot Arena conversations) appear in training data of the evaluated models. Because Chatbot Arena conversations are publicly available and can be scraped into training corpora, this is a real risk." 320 }, 321 { 322 "flag": "No hyperparameters for LLM calls", 323 "detail": "Temperature, top-p, and other sampling parameters for the judge and annotator LLM calls are not reported, despite these being known to significantly affect output." 324 } 325 ], 326 "cited_papers": [ 327 { 328 "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena", 329 "authors": ["Lianmin Zheng", "Wei-Lin Chiang", "Ying Sheng"], 330 "year": 2023, 331 "relevance": "Foundational work on LLM-as-a-judge evaluation methodology that Arena-Hard-Auto builds upon." 332 }, 333 { 334 "title": "Chatbot Arena: An Open Platform for Evaluating LLMs by Human Preference", 335 "authors": ["Wei-Lin Chiang", "Lianmin Zheng", "Ying Sheng"], 336 "year": 2024, 337 "relevance": "The crowdsourced human evaluation platform whose data and rankings serve as ground truth for this work."
338 }, 339 { 340 "title": "Length-Controlled AlpacaEval: A Simple Way to Debias Automatic Evaluators", 341 "authors": ["Yann Dubois", "Balázs Galambosi", "Percy Liang", "Tatsunori B. Hashimoto"], 342 "year": 2024, 343 "arxiv_id": "2404.04475", 344 "relevance": "Key baseline benchmark and introduces length bias control methods that Arena-Hard-Auto extends." 345 }, 346 { 347 "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?", 348 "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig"], 349 "year": 2024, 350 "relevance": "Prominent task-based LLM benchmark for code, relevant to the survey's scope of AI coding evaluation." 351 }, 352 { 353 "title": "BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions", 354 "authors": ["Terry Yue Zhuo"], 355 "year": 2024, 356 "arxiv_id": "2406.15877", 357 "relevance": "Contemporary code generation benchmark relevant to evaluating LLM programming capabilities." 358 }, 359 { 360 "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code", 361 "authors": ["Naman Jain"], 362 "year": 2024, 363 "arxiv_id": "2403.07974", 364 "relevance": "Live benchmark addressing contamination in code evaluation, directly relevant to benchmark methodology quality." 365 }, 366 { 367 "title": "AgentBench: Evaluating LLMs as Agents", 368 "authors": ["Xiao Liu"], 369 "year": 2023, 370 "relevance": "Benchmark for evaluating LLMs in agentic settings, relevant to the survey's coverage of agent evaluation." 371 }, 372 { 373 "title": "Evaluating Large Language Models Trained on Code", 374 "authors": ["Mark Chen"], 375 "year": 2021, 376 "arxiv_id": "2107.03374", 377 "relevance": "Introduces HumanEval benchmark for code generation, foundational to LLM code evaluation methodology." 
378 }, 379 { 380 "title": "NLP Evaluation in Trouble: On the Need to Measure LLM Data Contamination for Each Benchmark", 381 "authors": ["Oscar Sainz"], 382 "year": 2023, 383 "relevance": "Directly addresses benchmark contamination measurement, a key methodological concern in the survey." 384 }, 385 { 386 "title": "WildBench: Benchmarking LLMs with Challenging Tasks from Real Users in the Wild", 387 "authors": ["Bill Yuchen Lin"], 388 "year": 2024, 389 "arxiv_id": "2406.04770", 390 "relevance": "Contemporary benchmark using real user queries for LLM evaluation, comparable approach to Arena-Hard-Auto." 391 } 392 ] 393 }