scan.json (28025B)
1 { 2 "paper": { 3 "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena", 4 "authors": [ 5 "Lianmin Zheng", 6 "Wei-Lin Chiang", 7 "Ying Sheng", 8 "Siyuan Zhuang", 9 "Zhanghao Wu", 10 "Yonghao Zhuang", 11 "Zi Lin", 12 "Zhuohan Li", 13 "Dacheng Li", 14 "Eric P. Xing", 15 "Hao Zhang", 16 "Joseph E. Gonzalez", 17 "Ion Stoica" 18 ], 19 "year": 2023, 20 "venue": "NeurIPS 2023 Track on Datasets and Benchmarks", 21 "arxiv_id": "2306.05685" 22 }, 23 "scan_version": 2, 24 "active_modules": ["experimental_rigor", "data_leakage"], 25 "checklist": { 26 "artifacts": { 27 "code_released": { 28 "applies": true, 29 "answer": true, 30 "justification": "The paper provides a GitHub repository URL: https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge, referenced in the abstract and throughout." 31 }, 32 "data_released": { 33 "applies": true, 34 "answer": true, 35 "justification": "The paper states: 'We publicly release 80 MT-bench questions, 3K expert votes, and 30K conversations with human preferences for future study.' Data release is described in Appendix C.3." 36 }, 37 "environment_specified": { 38 "applies": true, 39 "answer": false, 40 "justification": "No requirements.txt, Dockerfile, or detailed environment specification is mentioned. Training uses 8x A100 GPUs but no software environment details beyond the FastChat repository." 41 }, 42 "reproduction_instructions": { 43 "applies": true, 44 "answer": false, 45 "justification": "While code is released and training details are in Appendix E, there are no step-by-step reproduction instructions for replicating the main evaluation experiments (agreement calculations, bias experiments)." 46 } 47 }, 48 "statistical_methodology": { 49 "confidence_intervals_or_error_bars": { 50 "applies": true, 51 "answer": false, 52 "justification": "Agreement rates and win rates are reported as point estimates (e.g., '85% agreement', '65.0% consistency') without confidence intervals or error bars." 
53 }, 54 "significance_tests": { 55 "applies": true, 56 "answer": false, 57 "justification": "The paper claims GPT-4 matches human agreement and that certain biases exist, but no statistical significance tests are applied. Comparisons are made by directly comparing percentages." 58 }, 59 "effect_sizes_reported": { 60 "applies": true, 61 "answer": true, 62 "justification": "The paper reports concrete effect sizes with context: 'GPT-4 favors itself with a 10% higher win rate; Claude-v1 favors itself with a 25% higher win rate' (Section 3.3). Agreement rates are given with absolute values and baselines (e.g., 85% vs random 50%)." 63 }, 64 "sample_size_justified": { 65 "applies": true, 66 "answer": false, 67 "justification": "No justification is given for the sample sizes: 80 MT-bench questions, 3K expert votes, 3K Arena sample. No power analysis or discussion of whether these sizes are sufficient for the claims made." 68 }, 69 "variance_reported": { 70 "applies": true, 71 "answer": false, 72 "justification": "No variance, standard deviation, or spread measures are reported for any of the main results (agreement rates, win rates, bias measurements)." 73 } 74 }, 75 "evaluation_design": { 76 "baselines_included": { 77 "applies": true, 78 "answer": true, 79 "justification": "Multiple LLM judges are compared (GPT-4, GPT-3.5, Claude-v1) against human evaluations, and random agreement baselines are explicitly stated (e.g., 'R = 33%', 'R = 50%' in Tables 5-6)." 80 }, 81 "baselines_contemporary": { 82 "applies": true, 83 "answer": true, 84 "justification": "The baselines include the strongest available models at the time (GPT-4, GPT-3.5, Claude-v1), which were state-of-the-art in 2023." 
85 }, 86 "ablation_study": { 87 "applies": true, 88 "answer": true, 89 "justification": "Multiple ablations are conducted: different prompt designs (default, rename, score, short), few-shot vs zero-shot, chain-of-thought vs reference-guided, single-turn vs multi-turn prompt designs (Section 3.4, 3.5, Appendix D)." 90 }, 91 "multiple_metrics": { 92 "applies": true, 93 "answer": true, 94 "justification": "Multiple metrics are used: agreement rate (with and without ties), consistency rate (position bias), win rate, per-category breakdown, and failure rate under attacks." 95 }, 96 "human_evaluation": { 97 "applies": true, 98 "answer": true, 99 "justification": "Human evaluation is central to the paper: 58 expert-level human labelers on MT-bench (3K votes) and crowdsourced human votes on Chatbot Arena (30K votes). Section 4.1." 100 }, 101 "held_out_test_set": { 102 "applies": true, 103 "answer": false, 104 "justification": "For the fine-tuned Vicuna judge (Appendix F), a validation/test split is described (20K train, 2K validation, 3K test). However, for the main MT-bench evaluation, the same 80 questions are used for both development and reporting without a held-out split." 105 }, 106 "per_category_breakdown": { 107 "applies": true, 108 "answer": true, 109 "justification": "Per-category breakdowns are provided: Table 7 shows category-wise win rates (writing, roleplay, reasoning, math, coding, extraction, STEM, humanities). Figure 20 shows category-wise scores. Table 10 shows position bias by category." 110 }, 111 "failure_cases_discussed": { 112 "applies": true, 113 "answer": true, 114 "justification": "Multiple failure cases are discussed with concrete examples: position bias (Figure 11), verbosity bias (Figure 12), math grading failures (Figure 13), reasoning failures (Figure 14), chain-of-thought failures (Figure 15), and multi-turn referencing errors (Figure 16)." 
115 }, 116 "negative_results_reported": { 117 "applies": true, 118 "answer": true, 119 "justification": "Negative results include: CoT prompt still fails on math (Table 4, Figure 15), few-shot prompts can shift bias rather than remove it (Table 12, GPT-3.5 bias shifts from first to second position), and the self-enhancement bias study is inconclusive." 120 } 121 }, 122 "claims_and_evidence": { 123 "abstract_claims_supported": { 124 "applies": true, 125 "answer": true, 126 "justification": "The abstract claims >80% agreement between GPT-4 and humans, supported by Tables 5-6 showing 85% agreement (S2 setup). Claims about biases and mitigation strategies are supported in Sections 3.3-3.4." 127 }, 128 "causal_claims_justified": { 129 "applies": true, 130 "answer": false, 131 "justification": "The paper makes causal claims about biases (e.g., 'position bias' causing different judgments) based on observational swapping experiments without formal causal analysis. The claim that RLHF training causes alignment ('these models are often trained with RLHF, they already exhibit strong human alignment') is not empirically tested." 132 }, 133 "generalization_bounded": { 134 "applies": true, 135 "answer": false, 136 "justification": "The paper generalizes broadly: 'LLM-as-a-judge is a scalable and explainable way to approximate human preferences' based on results from only 6 models on 80 questions (MT-bench) and one crowdsourced platform. The title and claims extend well beyond the tested setting." 137 }, 138 "alternative_explanations_discussed": { 139 "applies": true, 140 "answer": false, 141 "justification": "Alternative explanations are largely absent. For example, the high agreement with humans could be because GPT-4 and experts share training biases rather than GPT-4 being 'correct.' The paper does not discuss whether the expert labelers' exposure to GPT-4 judgments (Section 4.2, showing GPT-4 judgments to humans who disagreed) may have contaminated the agreement measure." 
142 }, 143 "proxy_outcome_distinction": { 144 "applies": true, 145 "answer": false, 146 "justification": "The paper measures agreement rate as a proxy for evaluation quality but does not discuss the gap between 'agreement with humans' and 'correct evaluation.' Agreement could reflect shared biases rather than correctness. The paper frames agreement as validation of LLM-as-a-judge without acknowledging this proxy gap." 147 } 148 }, 149 "setup_transparency": { 150 "model_versions_specified": { 151 "applies": true, 152 "answer": false, 153 "justification": "The paper uses 'GPT-4', 'GPT-3.5', 'Claude-v1' without specific API versions or snapshot dates. One appendix example mentions 'gpt-4-0314' but the main experiments do not consistently specify versions." 154 }, 155 "prompts_provided": { 156 "applies": true, 157 "answer": true, 158 "justification": "Full prompt templates are provided in Appendix A (Figures 5-10) for all judge variants: default pairwise, single answer grading, chain-of-thought, reference-guided, multi-turn pairwise, and reference-guided multi-turn." 159 }, 160 "hyperparameters_reported": { 161 "applies": true, 162 "answer": true, 163 "justification": "Training hyperparameters are reported in Appendix E: 'global batch size=128, learning=2e-5, epochs=3, seq length=2048.' Temperature 0.7 is stated for generating similar answers in position bias experiments (Section 3.3)." 164 }, 165 "scaffolding_described": { 166 "applies": false, 167 "answer": false, 168 "justification": "No agentic scaffolding is used. The LLM judge is a single prompt-response evaluation, not an agentic pipeline." 169 }, 170 "data_preprocessing_documented": { 171 "applies": true, 172 "answer": true, 173 "justification": "Data preprocessing is described: 'we convert the HTML back to markdown and filter out some inappropriate or low-quality samples, which results in 125K conversations after data cleaning' (Appendix E). 
PII cleaning and toxic content tagging for data release are described in Appendix C.3." 174 } 175 }, 176 "limitations_and_scope": { 177 "limitations_section_present": { 178 "applies": true, 179 "answer": true, 180 "justification": "Section 6 'Discussion' contains a 'Limitations' subsection discussing neglect of safety evaluation, combining multiple helpfulness dimensions into a single metric, and the preliminary nature of bias mitigations." 181 }, 182 "threats_to_validity_specific": { 183 "applies": true, 184 "answer": false, 185 "justification": "The limitations are relatively generic: 'emphasizes helpfulness but largely neglects safety', 'multiple dimensions...are all combined into a single metric.' No specific threats are discussed, such as the 80-question sample being too small for category-level claims or the expert population being non-representative." 186 }, 187 "scope_boundaries_stated": { 188 "applies": true, 189 "answer": false, 190 "justification": "The paper does not explicitly state what the results do NOT show. It acknowledges safety is not addressed but does not bound the generalization of agreement claims (e.g., only tested with models available in early 2023, only English, only certain question types)." 191 } 192 }, 193 "data_integrity": { 194 "raw_data_available": { 195 "applies": true, 196 "answer": true, 197 "justification": "The paper releases 80 MT-bench questions, 3K expert votes, and 30K Arena conversations with human preferences. Raw vote data is available for verification." 198 }, 199 "data_collection_described": { 200 "applies": true, 201 "answer": true, 202 "justification": "Appendix C describes the data collection process in detail: MT-bench uses a specific voting interface (Figures 17-18), Chatbot Arena uses anonymous battles (Figure 19), and consent processes are described." 
203 }, 204 "recruitment_methods_described": { 205 "applies": true, 206 "answer": true, 207 "justification": "MT-bench: 'we obtained their consent by letting them sign an application form. We pay them $20 for judging 20 questions (~$35/hr). The participants are mostly graduate students from more than ten universities' (Appendix C.1). Arena: 'any user can use this platform without registration' (Appendix C.2)." 208 }, 209 "data_pipeline_documented": { 210 "applies": true, 211 "answer": true, 212 "justification": "The pipeline from data collection to analysis is documented: question generation → model answer generation → human/LLM evaluation → agreement calculation. ShareGPT data pipeline: HTML→markdown→filtering→125K conversations (Appendix E)." 213 } 214 }, 215 "conflicts_of_interest": { 216 "funding_disclosed": { 217 "applies": true, 218 "answer": true, 219 "justification": "Acknowledgment section lists funding: 'gifts from Anyscale, Astronomer, Google, IBM, Intel, Lacework, Microsoft, MBZUAI, Samsung SDS, Uber, and VMware. Lianmin Zheng is supported by a Meta Ph.D. Fellowship.'" 220 }, 221 "affiliations_disclosed": { 222 "applies": true, 223 "answer": true, 224 "justification": "Author affiliations are clearly listed: UC Berkeley, UC San Diego, Carnegie Mellon University, Stanford, MBZUAI." 225 }, 226 "funder_independent_of_outcome": { 227 "applies": true, 228 "answer": false, 229 "justification": "Google and Microsoft both have competing LLM products (PaLM/Bard, GPT-4 via partnership). The paper evaluates GPT-4 favorably as the best judge, and these funders have potential interest in LLM evaluation outcomes. The conflict is not discussed." 230 }, 231 "financial_interests_declared": { 232 "applies": true, 233 "answer": false, 234 "justification": "No competing interests statement is present. Several funders (Google, Microsoft, Meta) have direct financial interests in LLM evaluation outcomes." 
235 } 236 }, 237 "contamination": { 238 "training_cutoff_stated": { 239 "applies": true, 240 "answer": false, 241 "justification": "The paper does not state training data cutoff dates for GPT-4, GPT-3.5, or Claude-v1. This is relevant because the models are being evaluated as judges." 242 }, 243 "train_test_overlap_discussed": { 244 "applies": true, 245 "answer": false, 246 "justification": "No discussion of whether MT-bench questions or similar evaluation tasks appeared in the training data of the judge models. GPT-4 could have been trained on evaluation-style prompts." 247 }, 248 "benchmark_contamination_addressed": { 249 "applies": true, 250 "answer": false, 251 "justification": "MT-bench uses MMLU questions as starting points (Figure 1) and the models evaluated (GPT-4, GPT-3.5) may have been trained on MMLU data. This contamination risk is not discussed." 252 } 253 }, 254 "human_studies": { 255 "pre_registered": { 256 "applies": true, 257 "answer": false, 258 "justification": "No pre-registration is mentioned for the human evaluation study with 58 expert labelers or the Chatbot Arena crowdsourced data collection." 259 }, 260 "irb_or_ethics_approval": { 261 "applies": true, 262 "answer": false, 263 "justification": "No IRB or ethics board approval is mentioned despite collecting data from 58 human labelers and thousands of Chatbot Arena users." 264 }, 265 "demographics_reported": { 266 "applies": true, 267 "answer": false, 268 "justification": "Labelers are described only as 'mostly graduate students from more than ten universities.' No detailed demographics (experience level, field, language proficiency, gender, etc.) are reported. Arena users are characterized only by '2114 unique IPs.'" 269 }, 270 "inclusion_exclusion_criteria": { 271 "applies": true, 272 "answer": false, 273 "justification": "No inclusion/exclusion criteria are stated for expert labelers beyond being graduate students, or for Arena participants beyond accepting terms of use." 
274 }, 275 "randomization_described": { 276 "applies": true, 277 "answer": true, 278 "justification": "Randomization is described: 'let each human evaluate at least 20 random multi-turn questions' (Section 4.1). In Chatbot Arena, users interact with 'two anonymous models simultaneously' with random model assignment." 279 }, 280 "blinding_described": { 281 "applies": true, 282 "answer": true, 283 "justification": "Blinding is described for Chatbot Arena: 'users can interact with two anonymous models simultaneously' and 'the identities of the models disclosed post-voting' (Section 2.3). MT-bench also uses anonymous model labels." 284 }, 285 "attrition_reported": { 286 "applies": true, 287 "answer": false, 288 "justification": "No attrition data is reported. The paper states 'A user can skip up to 5 questions' (Appendix C.1) but does not report how many skipped or how many labelers completed the full set." 289 } 290 }, 291 "cost_and_practicality": { 292 "inference_cost_reported": { 293 "applies": true, 294 "answer": false, 295 "justification": "The paper mentions that few-shot prompts make API calls '4× more expensive' (Section 3.4) but does not report actual API costs for running the LLM judge evaluations." 296 }, 297 "compute_budget_stated": { 298 "applies": true, 299 "answer": true, 300 "justification": "Training compute is partially stated: '8x A100 GPUs' and 'The longest single training run takes around 2 days' (Appendix E). Human evaluation cost is implied ($20 per 20 questions × 58 labelers)." 301 } 302 }, 303 "experimental_rigor": { 304 "seed_sensitivity_reported": { 305 "applies": true, 306 "answer": false, 307 "justification": "No seed sensitivity analysis is reported. Position bias experiments use temperature=0.7 but do not report variance across runs." 308 }, 309 "number_of_runs_stated": { 310 "applies": true, 311 "answer": false, 312 "justification": "The number of runs for LLM judge evaluations is not explicitly stated. 
Position bias tests swap positions (2 runs per pair) but the main agreement results do not state run counts." 313 }, 314 "hyperparameter_search_budget": { 315 "applies": true, 316 "answer": false, 317 "justification": "No hyperparameter search budget is reported for the judge prompts. Multiple prompt variants are tested (default, rename, score, short) but the selection process is not documented." 318 }, 319 "best_config_selection_justified": { 320 "applies": true, 321 "answer": false, 322 "justification": "The paper tests multiple prompt variants and selects 'default' with the conservative swapping approach, but does not justify this selection on a validation set. The choice appears ad hoc." 323 }, 324 "multiple_comparison_correction": { 325 "applies": false, 326 "answer": false, 327 "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable." 328 }, 329 "self_comparison_bias_addressed": { 330 "applies": true, 331 "answer": true, 332 "justification": "The paper explicitly examines self-enhancement bias in Section 3.3, noting GPT-4 favors itself with a 10% higher win rate and Claude-v1 with 25%, though concluding the study is inconclusive due to limited data." 333 }, 334 "compute_budget_vs_performance": { 335 "applies": false, 336 "answer": false, 337 "justification": "Compute differences between judge models are negligible (all are API calls with similar prompt lengths)." 338 }, 339 "benchmark_construct_validity": { 340 "applies": true, 341 "answer": true, 342 "justification": "The paper extensively discusses what MT-bench measures vs. what existing benchmarks measure (Sections 1 and 2.1). It argues that conventional benchmarks fail to capture human preferences and positions MT-bench as complementary rather than a replacement. Section 5 explicitly compares with MMLU and TruthfulQA." 
343 }, 344 "scaffold_confound_addressed": { 345 "applies": false, 346 "answer": false, 347 "justification": "No scaffolding is involved in the LLM judge setup; models are called directly via API." 348 } 349 }, 350 "data_leakage": { 351 "temporal_leakage_addressed": { 352 "applies": true, 353 "answer": false, 354 "justification": "No discussion of whether GPT-4 or other judge models were trained on evaluation-style data or MT-bench-like questions created before the training cutoff." 355 }, 356 "feature_leakage_addressed": { 357 "applies": true, 358 "answer": false, 359 "justification": "No discussion of whether the judge model's training on RLHF preference data biases it toward certain evaluation patterns that happen to align with the human labelers' training." 360 }, 361 "non_independence_addressed": { 362 "applies": true, 363 "answer": false, 364 "justification": "No discussion of whether MT-bench questions share structural similarity with GPT-4's RLHF training data, which could inflate agreement." 365 }, 366 "leakage_detection_method": { 367 "applies": true, 368 "answer": false, 369 "justification": "No concrete leakage detection or prevention method is applied." 370 } 371 } 372 }, 373 "claims": [ 374 { 375 "claim": "Strong LLM judges like GPT-4 can match human preferences with over 80% agreement, the same level as human-human agreement.", 376 "evidence": "Table 5 shows GPT-4 pairwise comparison achieves 85% agreement with humans (S2 setup) on MT-bench, while human-human agreement is 81%. Table 6 shows 87% on Chatbot Arena.", 377 "supported": "strong" 378 }, 379 { 380 "claim": "LLM judges exhibit position bias, verbosity bias, self-enhancement bias, and limited math/reasoning grading ability.", 381 "evidence": "Position bias: Table 2 (GPT-4 consistency only 65%). Verbosity: Table 3 (GPT-4 8.7% failure vs 91.3% for others). Self-enhancement: Figure 3(b) (10-25% win rate inflation). 
Math: Table 4 and Figures 13-15.", 382 "supported": "strong" 383 }, 384 { 385 "claim": "Position bias can be mitigated by swapping positions, and math grading failures can be reduced by reference-guided judging.", 386 "evidence": "Section 3.4: conservative swapping approach. Table 4: reference-guided reduces failure rate from 70% (14/20) to 15% (3/20) on math questions.", 387 "supported": "moderate" 388 }, 389 { 390 "claim": "GPT-4 single-answer grading matches pairwise comparison and human preferences well, making it a more scalable alternative.", 391 "evidence": "Tables 5-6 show GPT-4 single-answer grading achieving comparable agreement rates (85% S2 on MT-bench, 85% on Arena).", 392 "supported": "strong" 393 }, 394 { 395 "claim": "Fine-tuning on high-quality dialog datasets consistently improves MMLU performance and the improvement scales with data size.", 396 "evidence": "Table 8 shows Vicuna-7B variants improving MMLU from 35.2% (LLaMA-7B) to 47.1% (Vicuna-7B all) with increasing training data.", 397 "supported": "moderate" 398 } 399 ], 400 "methodology_tags": ["benchmark-eval"], 401 "key_findings": "Strong LLM judges (GPT-4) achieve over 80% agreement with human evaluations on both controlled (MT-bench) and crowdsourced (Chatbot Arena) settings, matching the level of human-human agreement. The paper identifies and partially mitigates key biases in LLM-as-a-judge: position bias (addressed via position swapping), verbosity bias (GPT-4 most resistant), and limited math/reasoning grading (addressed via reference-guided judging). MT-bench and traditional benchmarks like MMLU complement each other, with fine-tuning improving both human preference alignment and benchmark scores.", 402 "red_flags": [ 403 { 404 "flag": "Potential contamination of human agreement measure", 405 "detail": "When human votes differed from GPT-4, GPT-4's judgments were shown to humans, who changed their choices 34% of the time (Section 4.2). 
This feedback loop could inflate the reported human-GPT-4 agreement rate, as humans were influenced by the system being evaluated." 406 }, 407 { 408 "flag": "No statistical significance tests", 409 "detail": "Key claims about agreement rates and bias differences are made by comparing raw percentages without any significance tests, confidence intervals, or uncertainty quantification." 410 }, 411 { 412 "flag": "Self-evaluation conflict", 413 "detail": "The authors created Vicuna, and the paper evaluates Vicuna favorably as competitive with other open models. The evaluation framework itself was partly built around distinguishing Vicuna's strengths." 414 }, 415 { 416 "flag": "Small benchmark size for category-level claims", 417 "detail": "MT-bench has only 80 questions (10 per category). Category-wise conclusions (Table 7, Table 10) are drawn from very small per-category samples, making them unreliable." 418 } 419 ], 420 "cited_papers": [ 421 { 422 "title": "Training a helpful and harmless assistant with reinforcement learning from human feedback", 423 "authors": ["Yuntao Bai"], 424 "year": 2022, 425 "arxiv_id": "2204.05862", 426 "relevance": "Foundational RLHF paper for aligning LLMs with human preferences, directly relevant to the alignment methodology discussed." 427 }, 428 { 429 "title": "Evaluating large language models trained on code", 430 "authors": ["Mark Chen"], 431 "year": 2021, 432 "arxiv_id": "2107.03374", 433 "relevance": "Introduced HumanEval benchmark for code generation evaluation, one of the core-knowledge benchmarks discussed." 434 }, 435 { 436 "title": "Measuring massive multitask language understanding", 437 "authors": ["Dan Hendrycks"], 438 "year": 2020, 439 "arxiv_id": "2009.03300", 440 "relevance": "MMLU benchmark used as a comparison point for evaluating model variants in Table 8." 
441 }, 442 { 443 "title": "Holistic evaluation of language models", 444 "authors": ["Percy Liang"], 445 "year": 2022, 446 "arxiv_id": "2211.09110", 447 "relevance": "HELM benchmark framework for comprehensive LLM evaluation, positioned as insufficient for human preference assessment." 448 }, 449 { 450 "title": "AlpacaFarm: A simulation framework for methods that learn from human feedback", 451 "authors": ["Yann Dubois"], 452 "year": 2023, 453 "arxiv_id": "2305.14387", 454 "relevance": "Concurrent work on simulating human feedback for LLM training, directly related to LLM-as-a-judge methodology." 455 }, 456 { 457 "title": "Self-instruct: Aligning language model with self generated instructions", 458 "authors": ["Yizhong Wang"], 459 "year": 2022, 460 "relevance": "Instruction-following alignment method relevant to the evaluation of instruction-tuned models." 461 }, 462 { 463 "title": "Training language models to follow instructions with human feedback", 464 "authors": ["Long Ouyang"], 465 "year": 2022, 466 "relevance": "InstructGPT paper establishing RLHF methodology for instruction following, core to the LLM-as-a-judge approach." 467 }, 468 { 469 "title": "Chain of thought prompting elicits reasoning in large language models", 470 "authors": ["Jason Wei"], 471 "year": 2022, 472 "arxiv_id": "2201.11903", 473 "relevance": "Chain-of-thought prompting technique adapted for the CoT judge variant in this paper." 474 }, 475 { 476 "title": "Large language models are not fair evaluators", 477 "authors": ["Peiyi Wang"], 478 "year": 2023, 479 "arxiv_id": "2305.17926", 480 "relevance": "Concurrent work examining fairness and biases of LLMs as evaluators, directly complementary." 481 }, 482 { 483 "title": "LLaMA: Open and efficient foundation language models", 484 "authors": ["Hugo Touvron"], 485 "year": 2023, 486 "arxiv_id": "2302.13971", 487 "relevance": "Base model used for Vicuna fine-tuning and evaluated across all experiments in this paper." 488 } 489 ] 490 }