scan.json (29202B)
1 { 2 "paper": { 3 "title": "Chatbot Arena: An Open Platform for Evaluating LLMs by Human Preference", 4 "authors": [ 5 "Wei-Lin Chiang", 6 "Lianmin Zheng", 7 "Ying Sheng", 8 "Anastasios N. Angelopoulos", 9 "Tianle Li", 10 "Dacheng Li", 11 "Banghua Zhu", 12 "Hao Zhang", 13 "Michael I. Jordan", 14 "Joseph E. Gonzalez", 15 "Ion Stoica" 16 ], 17 "year": 2024, 18 "venue": "arXiv", 19 "arxiv_id": "2403.04132" 20 }, 21 "checklist": { 22 "artifacts": { 23 "code_released": { 24 "applies": true, 25 "answer": true, 26 "justification": "The paper states the demo is publicly available at https://chat.lmsys.org and commits to making 'data and code available, ensuring that this platform is open-source and open-accessible' (Section 1). The platform is a working, accessible system." 27 }, 28 "data_released": { 29 "applies": true, 30 "answer": true, 31 "justification": "The paper states 'We will publicly release a human preference dataset with over 100K pairwise votes collected from Chatbot Arena' (Section 1, contribution bullet 3). A prior dataset release (LMSYS-Chat-1M) is also referenced. The commitment is to a specific dataset with specified size." 32 }, 33 "environment_specified": { 34 "applies": true, 35 "answer": false, 36 "justification": "No environment specifications, requirements.txt, Dockerfile, or dependency listings are provided in the paper. The platform is described at a high level but technical environment details for reproduction are absent." 37 }, 38 "reproduction_instructions": { 39 "applies": true, 40 "answer": false, 41 "justification": "No step-by-step reproduction instructions are provided. The statistical methodology is described mathematically, but there are no scripts, commands, or README-like instructions for replicating the ranking computation or the platform setup." 
42 } 43 }, 44 "statistical_methodology": { 45 "confidence_intervals_or_error_bars": { 46 "applies": true, 47 "answer": true, 48 "justification": "The paper extensively discusses confidence intervals for Bradley-Terry coefficients, including both sandwich and bootstrap methods (Section 5, Figure 5, Figure 6). Multiplicity-corrected intervals are shown in Figure 5 with rank ranges (e.g., '#1', '#3-7')." 49 }, 50 "significance_tests": { 51 "applies": true, 52 "answer": true, 53 "justification": "The paper develops a formal hypothesis testing framework for anomalous user detection using Fisher's combination test with Bonferroni correction (Section 5.1, Equation 10). The approximate ranking framework provides statistical guarantees via confidence sets (Equation 8)." 54 }, 55 "effect_sizes_reported": { 56 "applies": true, 57 "answer": true, 58 "justification": "Effect sizes are reported in context: 'random needs 6,800 samples and adaptive needs 4,400 samples' for the same precision, meaning 'the random baseline requires 54% and 5% more data' (Section 7.1). Agreement rates between crowd users and experts are reported as percentages (72%-83%, Table 3)." 59 }, 60 "sample_size_justified": { 61 "applies": true, 62 "answer": false, 63 "justification": "The expert validation study uses 160 battles (Section 6.3), but no justification is given for why 160 was chosen or whether this sample size is adequate for the claims made about vote quality. No power analysis is discussed." 64 }, 65 "variance_reported": { 66 "applies": true, 67 "answer": true, 68 "justification": "The simulation study in Section 7.1 and Appendix A reports coverage and average interval widths over 20 trials (Figure 6). The confidence intervals themselves represent uncertainty/variance in the BT coefficient estimates." 
69 } 70 }, 71 "evaluation_design": { 72 "baselines_included": { 73 "applies": true, 74 "answer": true, 75 "justification": "The active sampling method is compared against random sampling (Section 7.1, Figure 7). The Arena Bench is compared against MT-Bench (Figure 4). Expert votes serve as a baseline for validating crowd votes (Section 6.3, Table 3)." 76 }, 77 "baselines_contemporary": { 78 "applies": true, 79 "answer": true, 80 "justification": "MT-Bench (Zheng et al., 2023b) and AlpacaEval (Li et al., 2023) are contemporary baselines for LLM evaluation. The paper also compares against recent human preference datasets (Anthropic HH, OpenAssistant) in Table 1." 81 }, 82 "ablation_study": { 83 "applies": true, 84 "answer": false, 85 "justification": "No ablation study is provided. The paper does not systematically remove or modify individual components (e.g., the adaptive sampling rule, the anonymization filter, the moderation filter) to measure their individual contributions." 86 }, 87 "multiple_metrics": { 88 "applies": true, 89 "answer": true, 90 "justification": "Multiple evaluation metrics are used: BT coefficients, Elo scores, win rates (Figure 2), agreement rates (Table 3), confidence interval coverage and width (Figure 6), true positive/negative rates for anomaly detection (Table 5), and topic diversity measures (Figure 3)." 91 }, 92 "human_evaluation": { 93 "applies": true, 94 "answer": true, 95 "justification": "The entire paper is about human evaluation of LLMs. Expert validation is conducted in Section 6.3, where graduate students at UC Berkeley independently labeled 160 battles to validate crowd-sourced votes." 96 }, 97 "held_out_test_set": { 98 "applies": true, 99 "answer": true, 100 "justification": "For the simulation study evaluating confidence interval coverage (Section 7.1), synthetic data is generated separately. For the active sampling evaluation, the 213,576-point holdout set is used (Section 7.1). 
The expert validation uses a random sample of 160 battles separate from the main analysis." 101 }, 102 "per_category_breakdown": { 103 "applies": true, 104 "answer": true, 105 "justification": "Per-category breakdowns are provided: topic-level win rates (Table 2 shows GPT-4 vs Llama-2 win rates across 8 topic clusters), per-model win matrix (Figure 2), per-model vote counts (Figure 10), and per-model-pair agreement rates (Table 3)." 106 }, 107 "failure_cases_discussed": { 108 "applies": true, 109 "answer": true, 110 "justification": "Failure cases are discussed: false negatives in anomaly detection are explained ('those are from users do not always behave abnormally, making them harder to detect', Section 7.2). The 5-10% gap between crowd and expert agreement is attributed to crowd users 'making mistakes or overlooking factual errors' (Section 6.3). Appendix D.4 shows examples where there is no ground truth." 111 }, 112 "negative_results_reported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The paper reports that E-value-based approaches for anomaly detection 'did not perform well in terms of power' (Section 8, Future Directions). The anomaly detection method has significant false negative rates (Table 5). The improvement from adaptive sampling for BT score estimation is described as 'more subtle' (only 5%, Section 7.1)." 116 } 117 }, 118 "claims_and_evidence": { 119 "abstract_claims_supported": { 120 "applies": true, 121 "answer": true, 122 "justification": "The abstract claims that crowdsourced questions are 'sufficiently diverse and discriminating' (supported by Section 6.1 topic modeling and Section 6.2 discriminative analysis), that crowd votes are 'in good agreement with those of expert raters' (supported by Table 3, 72-83% agreement), and that the platform has '240K votes' (supported by Section 3.2). All claims have corresponding evidence." 
123 }, 124 "causal_claims_justified": { 125 "applies": true, 126 "answer": true, 127 "justification": "The paper's causal claims are modest and supported. The claim that adaptive sampling improves sample efficiency is demonstrated via controlled simulation where only the sampling method varies (Section 7.1, Figure 7). The ranking methodology is grounded in well-established statistical theory (Bradley-Terry model). The paper mostly describes and analyzes rather than making strong causal claims." 128 }, 129 "generalization_bounded": { 130 "applies": true, 131 "answer": true, 132 "justification": "The Limitations section (Section 8) explicitly bounds generalizations: 'user base will primarily consist of LLM hobbyists and researchers,' data 'might not accurately reflect the real-world usage of LLMs in production environments or specialized domains,' and the study 'concentrates on assessing the helpfulness of LLMs but overlooks their safety aspects.'" 133 }, 134 "alternative_explanations_discussed": { 135 "applies": true, 136 "answer": true, 137 "justification": "The paper discusses alternative explanations for the crowd-expert agreement gap: 'mostly attributed to crowd user making mistakes or overlooking factual errors' vs. cases where 'both answers can be argued as being better than the other' (Section 6.3). The limitations section discusses how user base bias and prompt distribution could affect results." 138 } 139 }, 140 "setup_transparency": { 141 "model_versions_specified": { 142 "applies": true, 143 "answer": true, 144 "justification": "Specific model versions are used throughout: 'GPT-4-Turbo', 'GPT-4-0613', 'GPT-3.5-Turbo-0613', 'Llama-2-70b-chat', 'Llama-2-13b-chat', 'claude-2.1', 'mixtral-8x7b-instruct-v0.1', etc. (Figure 2, Tables 2-4). These include version identifiers and snapshot dates." 
145 }, 146 "prompts_provided": { 147 "applies": true, 148 "answer": true, 149 "justification": "The system prompt for Arena Bench evaluation is provided in full in Appendix D.3. Example user prompts from different topic clusters are provided in Appendix D.1 and D.2. The platform's design is that users provide their own prompts (no preset prompts), which is documented." 150 }, 151 "hyperparameters_reported": { 152 "applies": true, 153 "answer": false, 154 "justification": "When GPT-4 is used as a judge (Section 6.2, 6.3, Arena Bench), no temperature or sampling parameters are reported. The topic modeling pipeline mentions specific parameters (UMAP dimension reduction to 5, HDBSCAN minimum cluster size 32) but LLM API parameters for the judge are missing." 155 }, 156 "scaffolding_described": { 157 "applies": false, 158 "answer": false, 159 "justification": "The paper does not use agentic scaffolding. Chatbot Arena is a web platform for collecting human preferences, not an agent-based system." 160 }, 161 "data_preprocessing_documented": { 162 "applies": true, 163 "answer": true, 164 "justification": "Data preprocessing is documented: keyword filtering to ensure anonymity (filtering conversations containing model names or company names), OpenAI moderation API for unsafe content flagging (3% of requests flagged), and the topic modeling pipeline with specific steps (embedding, UMAP, HDBSCAN) described in Section 6.1." 165 } 166 }, 167 "limitations_and_scope": { 168 "limitations_section_present": { 169 "applies": true, 170 "answer": true, 171 "justification": "A dedicated 'Limitations' subsection exists within Section 8 (Discussion), with substantive discussion of user base bias, prompt distribution bias, and the focus on helpfulness over safety." 
172 }, 173 "threats_to_validity_specific": { 174 "applies": true, 175 "answer": true, 176 "justification": "Specific threats are discussed: the user base 'will primarily consist of LLM hobbyists and researchers' (specific population bias), data 'predominantly comes from our online chat interface' which 'might not accurately reflect the real-world usage of LLMs in production environments or specialized domains' (specific distribution bias), and the study 'overlooks their safety aspects' (specific scope limitation)." 177 }, 178 "scope_boundaries_stated": { 179 "applies": true, 180 "answer": true, 181 "justification": "The paper explicitly states what it does not show: it does not evaluate safety, it does not claim to represent production or specialized domain usage, and it acknowledges user base may be biased toward hobbyists/researchers (Section 8). The focus is explicitly on helpfulness evaluation via pairwise comparison." 182 } 183 }, 184 "data_integrity": { 185 "raw_data_available": { 186 "applies": true, 187 "answer": true, 188 "justification": "The paper commits to publicly releasing '100K pairwise preference votes' (Section 1) and references prior data release LMSYS-Chat-1M (Zheng et al., 2023a). The commitment is specific enough (100K pairwise votes from the Arena) and the prior release demonstrates follow-through." 189 }, 190 "data_collection_described": { 191 "applies": true, 192 "answer": true, 193 "justification": "Data collection is described in detail in Section 3: the interface design (pairwise comparison, anonymous models, user-provided prompts), the time period (April 2023 to January 2024), the number of users (~90K), votes (~240K), models (50+), and languages (100+). Terms of use and consent for data release are mentioned." 
194 }, 195 "recruitment_methods_described": { 196 "applies": true, 197 "answer": false, 198 "justification": "The paper states users come to the free website but does not describe how users were recruited or made aware of the platform. The Limitations section acknowledges potential user base bias ('primarily consist of LLM hobbyists and researchers') but doesn't describe the recruitment channels (social media, blogs, academic networks, etc.) or analyze the selection bias this creates." 199 }, 200 "data_pipeline_documented": { 201 "applies": true, 202 "answer": true, 203 "justification": "The data pipeline is documented: user submits prompt to two anonymous models, votes on preference, conversations are filtered for model identity keywords and moderation (Section 3.1-3.2). The anonymization and moderation steps are described with quantitative details (3% flagged for unsafe content). Vote counts over time are shown in Figure 9." 204 } 205 }, 206 "conflicts_of_interest": { 207 "funding_disclosed": { 208 "applies": true, 209 "answer": true, 210 "justification": "Funding is disclosed in the Acknowledgments section: 'supported by sponsorship from Kaggle, MBZUAI, a16z, Together AI, Anyscale, and HuggingFace' and 'partly supported by Accenture, AMD, Google, IBM, Intel, Microsoft, Samsung SDS, SAP, Uber, and VMware.'" 211 }, 212 "affiliations_disclosed": { 213 "applies": true, 214 "answer": true, 215 "justification": "Author affiliations are listed: UC Berkeley, Stanford, UCSD. These are academic institutions, not the companies whose models are being evaluated (OpenAI, Google, Anthropic, etc.), so there is no direct product affiliation conflict." 216 }, 217 "funder_independent_of_outcome": { 218 "applies": true, 219 "answer": false, 220 "justification": "Several funders have a stake in the outcomes: Google, Microsoft, and AMD are sponsors, and models from Google (Gemini) and Microsoft-invested OpenAI (GPT-4) are evaluated on the platform. 
HuggingFace, Together AI, and Anyscale are LLM infrastructure companies that benefit from favorable LLM evaluations. This conflict is not acknowledged." 221 }, 222 "financial_interests_declared": { 223 "applies": true, 224 "answer": false, 225 "justification": "No competing interests or financial interests statement is provided. Given the extensive corporate sponsorship and the fact that the platform evaluates products from some sponsors, a financial interests declaration would be appropriate." 226 } 227 }, 228 "contamination": { 229 "training_cutoff_stated": { 230 "applies": false, 231 "answer": false, 232 "justification": "The paper evaluates LLMs via live human preference, not on a static benchmark that could be in training data. The core contribution is the platform and ranking methodology, not a benchmark evaluation of model knowledge. Contamination in the traditional sense (test data in training set) does not apply to a live pairwise comparison platform." 233 }, 234 "train_test_overlap_discussed": { 235 "applies": false, 236 "answer": false, 237 "justification": "Same reasoning as training_cutoff_stated: the paper's evaluation method uses fresh, live user prompts specifically designed to avoid the contamination problem of static benchmarks. This is explicitly positioned as an advantage over static benchmarks (Section 1)." 238 }, 239 "benchmark_contamination_addressed": { 240 "applies": false, 241 "answer": false, 242 "justification": "The paper introduces a live evaluation platform specifically to address contamination risks of static benchmarks. The prompts are user-generated in real time, so benchmark contamination in the traditional sense does not apply." 243 } 244 }, 245 "human_studies": { 246 "pre_registered": { 247 "applies": true, 248 "answer": false, 249 "justification": "The study involves human participants (90K+ users providing preference votes, and expert raters). No pre-registration is mentioned." 
250 }, 251 "irb_or_ethics_approval": { 252 "applies": true, 253 "answer": false, 254 "justification": "The platform collects data from human users who interact with the website. Users accept terms of use (Section 3.1), but no IRB or ethics board approval is mentioned." 255 }, 256 "demographics_reported": { 257 "applies": true, 258 "answer": false, 259 "justification": "No demographics are reported for the crowd users. Language distribution is reported (77% English, 5% Chinese, etc.) but no information about user experience level, geographic distribution, age, or profession. Expert raters are identified only as 'graduate students at UC Berkeley' (footnote 4)." 260 }, 261 "inclusion_exclusion_criteria": { 262 "applies": true, 263 "answer": false, 264 "justification": "No inclusion or exclusion criteria for users are stated. Anyone who visits the website and accepts terms of use can participate. While this is an open platform, the lack of any characterization of who participates and who doesn't limits the ability to assess representativeness." 265 }, 266 "randomization_described": { 267 "applies": true, 268 "answer": true, 269 "justification": "The randomization of model pairs is described: 'two anonymous models are sampled' (Section 3.1), with the adaptive sampling rule formally specified in Equation 9. The models are presented anonymously with identities 'revealed only after voting.' Position randomization for Arena Bench evaluation is also described (Appendix D.3)." 270 }, 271 "blinding_described": { 272 "applies": true, 273 "answer": true, 274 "justification": "Blinding is described: models are presented anonymously ('anonymous, randomized battles', Section 1), and 'the models' identities revealed only after voting' (Section 3.1). For expert validation, 'The experts were given the prompts and answers blindly' (Section 6.3)." 275 }, 276 "attrition_reported": { 277 "applies": true, 278 "answer": false, 279 "justification": "No attrition information is reported. 
The paper mentions '1M users visit' but only '240K votes from about 90K users,' implying significant attrition from visit to vote. However, the conversion rate and reasons for non-participation are not analyzed." 280 } 281 }, 282 "cost_and_practicality": { 283 "inference_cost_reported": { 284 "applies": true, 285 "answer": false, 286 "justification": "The paper does not report the cost of running the platform, including API costs for proprietary models, hosting costs, or the cost of expert labeling. Given the platform serves 50+ models to 90K+ users, the operational cost is significant and unreported." 287 }, 288 "compute_budget_stated": { 289 "applies": true, 290 "answer": false, 291 "justification": "No computational budget is stated. The paper does not mention GPU hours, total API spend, or hardware used for running the platform or the statistical analysis." 292 } 293 } 294 }, 295 "claims": [ 296 { 297 "claim": "Crowdsourced user prompts in Chatbot Arena are diverse, covering 600 topic clusters with a long-tail distribution where the largest cluster accounts for only 1% of prompts.", 298 "evidence": "Section 6.1 describes the topic modeling pipeline using BERTopic, identifying 600 clusters. Figure 3 shows the top-16 clusters with the largest at 1% and low inter-cluster similarity.", 299 "supported": "strong" 300 }, 301 { 302 "claim": "Arena prompts can effectively distinguish between models of different capability levels, with GPT-4's win rate against Llama-2-70b-chat ranging from 53% to 97% depending on topic.", 303 "evidence": "Table 2 shows GPT-4's win rate across 8 topic clusters, from 96.7% on 'Python Game Programming Challenge' to 53.3% on 'Movie Recommendations & Ratings'. 
Section 6.2 discusses these results.", 304 "supported": "moderate" 305 }, 306 { 307 "claim": "Crowdsourced votes show high agreement with expert evaluations (72% to 83% agreement rate).", 308 "evidence": "Table 3 shows pairwise agreement rates between crowd users, two experts, and GPT-4 judge on 160 battles. Expert-expert agreement is 79.4-89.8%, crowd-expert is 72.8-83.1%.", 309 "supported": "moderate" 310 }, 311 { 312 "claim": "The adaptive sampling algorithm improves sample efficiency: random sampling requires up to 54% more samples than adaptive sampling to achieve the same precision (i.e., adaptive sampling needs about 35% fewer samples).", 313 "evidence": "Section 7.1 reports that to estimate the win matrix to precision 0.2, random needs 6,800 samples and adaptive needs 4,400 (54% more for random). For BT score precision of 0.3, the improvement drops to 5%. Figure 7 shows interval widths.", 314 "supported": "strong" 315 }, 316 { 317 "claim": "The anomaly detection method is effective, reaching 90% true positive rate with 60-70% true negative rate.", 318 "evidence": "Table 5 shows confusion matrices at alpha=0.1 and alpha=0.3. At alpha=0.1: 13/14 predicted positive are actual positive (TPR ~93%), 24/36 predicted negative are actual negative (TNR ~67%). At alpha=0.3: 21/29 TP, 17/21 TN.", 319 "supported": "moderate" 320 }, 321 { 322 "claim": "The Bradley-Terry confidence intervals achieve proper coverage, centering around 1-alpha regardless of the number of models.", 323 "evidence": "Figure 6 shows simulation results with coverage centering around the nominal level for M=4,7,10,15,20. The simulation uses 20 trials per configuration. Appendix A provides additional comparison of bootstrap vs sandwich intervals.", 324 "supported": "strong" 325 } 326 ], 327 "methodology_tags": [ 328 "benchmark-eval", 329 "observational" 330 ], 331 "key_findings": "Chatbot Arena is a crowdsourced platform for evaluating LLMs through anonymous pairwise human preference comparisons, collecting 240K votes from 90K users across 100+ languages. 
The paper demonstrates that crowdsourced prompts are topically diverse (600 clusters) and effectively discriminate between models. Crowd votes show 72-83% agreement with expert evaluations, comparable to the 79-90% inter-expert agreement. The adaptive sampling algorithm improves sample efficiency for win matrix estimation (random sampling requires up to 54% more samples to reach the same precision), and the Bradley-Terry ranking with sandwich confidence intervals provides statistically valid model rankings.", 332 "red_flags": [ 333 { 334 "flag": "Non-independent funders not acknowledged", 335 "detail": "The project is sponsored by Google, Microsoft, and other companies whose models (Gemini, GPT-4 via Microsoft's investment in OpenAI) are evaluated on the platform. This conflict of interest is not acknowledged anywhere in the paper." 336 }, 337 { 338 "flag": "Small expert validation sample", 339 "detail": "The expert validation (Section 6.3) uses only 160 battles between just 2 model pairs (GPT-4-Turbo vs Llama-2-13b and GPT-4-Turbo vs GPT-3.5-Turbo). This limited sample and model pair selection may not generalize to the full range of model comparisons on the platform, especially for closely matched models." 340 }, 341 { 342 "flag": "Expert raters are graduate students from the authors' institution", 343 "detail": "The 'experts' used for validation are 'graduate students at UC Berkeley' (footnote 4), the same institution as most authors. This could introduce institutional bias and the experts may not be independent evaluators." 344 }, 345 { 346 "flag": "Selection bias in user population uncharacterized", 347 "detail": "The platform attracts self-selected users from the internet, likely biased toward tech-savvy LLM enthusiasts. While acknowledged in limitations, the paper presents agreement rates and rankings as credible without formally assessing how this selection bias affects the results." 
348 }, 349 { 350 "flag": "Anomaly detection evaluated on tiny sample", 351 "detail": "The anomaly detection method (Section 7.2) is evaluated on only 25 anomalous and 25 normal users, manually identified. This tiny evaluation set with subjective identification criteria makes the reported detection rates unreliable." 352 } 353 ], 354 "cited_papers": [ 355 { 356 "title": "Judging LLM-as-a-Judge with MT-bench and Chatbot Arena", 357 "authors": ["Lianmin Zheng", "Wei-Lin Chiang", "Ying Sheng"], 358 "year": 2023, 359 "relevance": "Introduces MT-Bench and the LLM-as-judge methodology, directly related to LLM evaluation methods." 360 }, 361 { 362 "title": "Training a helpful and harmless assistant with reinforcement learning from human feedback", 363 "authors": ["Yuntao Bai"], 364 "year": 2022, 365 "arxiv_id": "2204.05862", 366 "relevance": "Foundational RLHF work from Anthropic, directly relevant to human preference evaluation of LLMs." 367 }, 368 { 369 "title": "Evaluating large language models trained on code", 370 "authors": ["Mark Chen"], 371 "year": 2021, 372 "arxiv_id": "2107.03374", 373 "relevance": "Introduces HumanEval benchmark for code generation, relevant to LLM evaluation methodology and contamination concerns." 374 }, 375 { 376 "title": "Holistic evaluation of language models", 377 "authors": ["Percy Liang"], 378 "year": 2022, 379 "arxiv_id": "2211.09110", 380 "relevance": "HELM is a comprehensive LLM evaluation framework, directly relevant to benchmarking methodology." 381 }, 382 { 383 "title": "AlpacaEval: An automatic evaluator of instruction-following models", 384 "authors": ["Xuechen Li"], 385 "year": 2023, 386 "relevance": "Automatic LLM evaluation tool using model-as-judge, relevant to comparison with human preference evaluation." 
387 }, 388 { 389 "title": "Rethinking benchmark and contamination for language models with rephrased samples", 390 "authors": ["Shuo Yang", "Wei-Lin Chiang", "Lianmin Zheng"], 391 "year": 2023, 392 "arxiv_id": "2311.04850", 393 "relevance": "Addresses benchmark contamination in LLM evaluation, directly relevant to evaluation methodology quality." 394 }, 395 { 396 "title": "Proving test set contamination in black box language models", 397 "authors": ["Yonatan Oren"], 398 "year": 2023, 399 "arxiv_id": "2310.17623", 400 "relevance": "Formal methods for detecting test set contamination, relevant to LLM benchmark integrity." 401 }, 402 { 403 "title": "LMSYS-Chat-1M: A large-scale real-world LLM conversation dataset", 404 "authors": ["Lianmin Zheng", "Wei-Lin Chiang", "Ying Sheng"], 405 "year": 2023, 406 "relevance": "Large-scale LLM conversation dataset from the same platform, relevant to data collection and LLM evaluation." 407 }, 408 { 409 "title": "Dynabench: Rethinking benchmarking in NLP", 410 "authors": ["Douwe Kiela"], 411 "year": 2021, 412 "relevance": "Proposes live benchmarking with human-in-the-loop for NLP, directly relevant to the motivation and methodology of Chatbot Arena." 413 }, 414 { 415 "title": "Training language models to follow instructions with human feedback", 416 "authors": ["Long Ouyang"], 417 "year": 2022, 418 "relevance": "InstructGPT paper on RLHF methodology, foundational to the human preference evaluation paradigm." 419 }, 420 { 421 "title": "Can large language models be an alternative to human evaluations?", 422 "authors": ["Cheng-Han Chiang", "Hung-yi Lee"], 423 "year": 2023, 424 "relevance": "Explores LLM-as-judge methodology as replacement for human evaluation, directly relevant to evaluation methodology." 
425 }, 426 { 427 "title": "GPT-4 technical report", 428 "authors": ["OpenAI"], 429 "year": 2023, 430 "arxiv_id": "2303.08774", 431 "relevance": "GPT-4 is a primary model evaluated on the platform and used as a judge, relevant to LLM capability evaluation." 432 } 433 ] 434 }