scan.json (24709B)
1 { 2 "paper": { 3 "title": "ChatEval: Towards Better LLM-based Evaluators through Multi-Agent Debate", 4 "authors": [ 5 "Chi-Min Chan", 6 "Weize Chen", 7 "Yusheng Su", 8 "Jianxuan Yu", 9 "Zhiyuan Liu", 10 "Jie Fu", 11 "Wei Xue", 12 "Shanghang Zhang" 13 ], 14 "year": 2023, 15 "venue": "arXiv preprint", 16 "arxiv_id": "2308.07201" 17 }, 18 "checklist": { 19 "artifacts": { 20 "code_released": { 21 "applies": true, 22 "answer": true, 23 "justification": "The paper provides a GitHub link: https://github.com/chanchimin/ChatEval (stated in the abstract and footnote 1). The code is built on top of the AgentVerse framework." 24 }, 25 "data_released": { 26 "applies": true, 27 "answer": true, 28 "justification": "The paper uses publicly available benchmarks: FairEval (Wang et al., 2023b) with human annotations from Wu et al. (2023), and Topical-Chat (Gopalakrishnan et al., 2019) with annotations from Mehri & Eskenazi (2020). These are standard public datasets." 29 }, 30 "environment_specified": { 31 "applies": true, 32 "answer": false, 33 "justification": "No environment specifications, requirements.txt, Dockerfile, or dependency lists are mentioned in the paper. The only setup detail is that they use OpenAI GPT models via API." 34 }, 35 "reproduction_instructions": { 36 "applies": true, 37 "answer": false, 38 "justification": "No step-by-step reproduction instructions are provided in the paper. While code is released, the paper itself does not include commands or a reproducibility guide." 39 } 40 }, 41 "statistical_methodology": { 42 "confidence_intervals_or_error_bars": { 43 "applies": true, 44 "answer": false, 45 "justification": "No confidence intervals or error bars are reported. Tables 1-4 report only point estimates for accuracy and correlation coefficients without any uncertainty quantification." 46 }, 47 "significance_tests": { 48 "applies": true, 49 "answer": false, 50 "justification": "No statistical significance tests are reported. 
The paper claims multi-agent outperforms single-agent and FairEval baselines based solely on comparing point estimates (e.g., 63.8% vs 61.3%) without any significance testing." 51 }, 52 "effect_sizes_reported": { 53 "applies": true, 54 "answer": true, 55 "justification": "The paper reports percentage improvements with baseline context: 'the multi-agent-based method improves the accuracy by 6.2% for ChatGPT and 2.5% for GPT-4' (Section 3.4), and 'ChatEval improves the average Spearman and Kendall-Tau correlation by 0.096 (16.3%) and 0.057 (10.0%)' (Section 3.5), providing enough context to assess magnitude." 56 }, 57 "sample_size_justified": { 58 "applies": true, 59 "answer": false, 60 "justification": "No justification for sample sizes. The FairEval benchmark has only 80 questions and Topical-Chat uses 60 dialogue contexts. No discussion of whether these sizes are adequate for the claims made, and no power analysis." 61 }, 62 "variance_reported": { 63 "applies": true, 64 "answer": false, 65 "justification": "No variance or standard deviation is reported across runs. Temperature is set to 0 for reproducibility, but no mention of multiple runs or variance in results." 66 } 67 }, 68 "evaluation_design": { 69 "baselines_included": { 70 "applies": true, 71 "answer": true, 72 "justification": "Multiple baselines are included: single-agent evaluation, FairEval (MEC+BPC) for open-ended QA, G-EVAL for dialogue response generation, and traditional metrics (ROUGE-L, BLEU-4, BERTScore). Human annotator performance is also shown." 73 }, 74 "baselines_contemporary": { 75 "applies": true, 76 "answer": true, 77 "justification": "Baselines include FairEval (Wang et al., 2023b) and G-EVAL (Liu et al., 2023b), both from 2023, which were contemporary and competitive at the time of writing." 78 }, 79 "ablation_study": { 80 "applies": true, 81 "answer": true, 82 "justification": "Section 4 provides multiple ablations: diverse vs. 
same role prompts (Table 3), different communication strategies (Table 4), varying role numbers (Figure 3a), and varying discussion turns (Figure 3b). Each isolates a specific component." 83 }, 84 "multiple_metrics": { 85 "applies": true, 86 "answer": true, 87 "justification": "Multiple metrics are used: Accuracy and Kappa correlation coefficient for FairEval, and Spearman and Kendall-Tau correlations for Topical-Chat, across four dimensions (naturalness, coherence, engagingness, groundedness)." 88 }, 89 "human_evaluation": { 90 "applies": true, 91 "answer": true, 92 "justification": "The paper's core evaluation is correlation with human judgments. Human annotations serve as the ground truth: three annotators for FairEval (Table 1) and human evaluators for Topical-Chat (Mehri & Eskenazi, 2020). The system's outputs are compared against these human assessments." 93 }, 94 "held_out_test_set": { 95 "applies": true, 96 "answer": false, 97 "justification": "No separation between dev and test sets is mentioned. It appears the same benchmark data was used for both tuning the system configuration (number of agents, communication strategy, discussion turns) and reporting final results." 98 }, 99 "per_category_breakdown": { 100 "applies": true, 101 "answer": true, 102 "justification": "Table 2 provides per-dimension breakdowns for Topical-Chat (naturalness, coherence, engagingness, groundedness) rather than just overall averages." 103 }, 104 "failure_cases_discussed": { 105 "applies": true, 106 "answer": false, 107 "justification": "No failure case analysis is provided. The qualitative analysis in Section 4.4 only shows a successful example where ChatEval reaches the correct consensus. No discussion of cases where the debate leads to worse outcomes." 
108 }, 109 "negative_results_reported": { 110 "applies": true, 111 "answer": true, 112 "justification": "Some negative results are reported: increasing discussion turns does not improve performance (Figure 3b, Section 4.3), simultaneous-talk strategies underperform one-by-one (Table 4), and same role prompts fail to improve over single-agent (Table 3). Also, ChatGPT multi-agent sometimes underperforms GPT-4 single-agent." 113 } 114 }, 115 "claims_and_evidence": { 116 "abstract_claims_supported": { 117 "applies": true, 118 "answer": true, 119 "justification": "The abstract claims ChatEval 'delivers superior accuracy and correlation in alignment with human assessment' and that 'diverse role prompts are essential.' Both are supported by Tables 1-2 (superior performance) and Table 3 (diverse roles necessary)." 120 }, 121 "causal_claims_justified": { 122 "applies": true, 123 "answer": true, 124 "justification": "The paper makes causal claims through ablations: diverse role prompts improve performance (Table 3, controlled comparison), multi-agent debate improves over single-agent (Tables 1-2, controlled comparison). The ablation design with controlled single-variable manipulation is adequate for these claims." 125 }, 126 "generalization_bounded": { 127 "applies": true, 128 "answer": false, 129 "justification": "The title claims 'Better LLM-based Evaluators' broadly, but results are only on two benchmarks (80 open-ended questions, 60 dialogues) with two model families (GPT-3.5-turbo, GPT-4). The paper does not sufficiently bound its generalizations to these specific settings. The conclusion states the approach 'contributes to improving the evaluation performance concerning text quality' without qualifying the narrow evaluation scope." 130 }, 131 "alternative_explanations_discussed": { 132 "applies": true, 133 "answer": false, 134 "justification": "No alternative explanations for the results are discussed. 
For instance, the paper does not consider whether the improvement could be due to prompt engineering effects rather than multi-agent debate per se, or whether majority voting alone (without debate) could achieve similar gains." 135 } 136 }, 137 "setup_transparency": { 138 "model_versions_specified": { 139 "applies": true, 140 "answer": false, 141 "justification": "The paper says 'GPT-4 and ChatGPT (GPT-3.5-turbo)' (Section 3.1) but does not specify snapshot dates or API versions (e.g., gpt-4-0613, gpt-3.5-turbo-0613). These model behaviors change across versions." 142 }, 143 "prompts_provided": { 144 "applies": true, 145 "answer": true, 146 "justification": "The full prompt template is provided in Table 6 (Appendix A), including the system message, slot structure, and chat history format. Role descriptions for all five personas are also provided in Appendix A with their complete text." 147 }, 148 "hyperparameters_reported": { 149 "applies": true, 150 "answer": false, 151 "justification": "Only temperature=0 is mentioned (Section 3.1). No other hyperparameters are reported — top-p, max tokens, presence/frequency penalty are not specified." 152 }, 153 "scaffolding_described": { 154 "applies": true, 155 "answer": true, 156 "justification": "The multi-agent scaffolding is described in detail: Section 2 describes debater agents, role specification, and three communication strategies. Appendix B provides formal algorithms (Algorithms 1-3) with pseudocode for each strategy. The chat history management, majority vote/average score extraction, and summarizer component are all documented." 157 }, 158 "data_preprocessing_documented": { 159 "applies": true, 160 "answer": false, 161 "justification": "No documentation of data preprocessing. The paper does not describe how the FairEval or Topical-Chat data was prepared for input to the system, how the annotations were formatted, or any preprocessing steps applied." 
162 } 163 }, 164 "limitations_and_scope": { 165 "limitations_section_present": { 166 "applies": true, 167 "answer": false, 168 "justification": "There is no dedicated limitations or threats-to-validity section in the paper. The conclusion (Section 6) is brief and contains no limitations discussion." 169 }, 170 "threats_to_validity_specific": { 171 "applies": true, 172 "answer": false, 173 "justification": "No specific threats to validity are discussed anywhere in the paper." 174 }, 175 "scope_boundaries_stated": { 176 "applies": true, 177 "answer": false, 178 "justification": "No explicit scope boundaries are stated. The paper does not state what the results do NOT show, what settings were not tested, or what claims the authors are NOT making." 179 } 180 }, 181 "data_integrity": { 182 "raw_data_available": { 183 "applies": true, 184 "answer": false, 185 "justification": "The raw evaluation outputs from the LLM agents are not made available. While the underlying benchmarks are public, the intermediate debate transcripts and per-question model outputs are not released for verification." 186 }, 187 "data_collection_described": { 188 "applies": true, 189 "answer": true, 190 "justification": "The paper describes that human annotations for FairEval come from Wu et al. (2023) with three annotators using majority vote (Section 3.2), and Topical-Chat annotations come from Mehri & Eskenazi (2020) with 60 dialogue contexts and 6 systems per context (Section 3.2)." 191 }, 192 "recruitment_methods_described": { 193 "applies": false, 194 "answer": false, 195 "justification": "No human participants were recruited for this study. The paper uses existing benchmark datasets with pre-existing human annotations. NA because the data source is standard benchmarks." 196 }, 197 "data_pipeline_documented": { 198 "applies": true, 199 "answer": false, 200 "justification": "The pipeline from benchmark data to final evaluation results is not documented in detail. 
How the raw annotations were processed, how majority votes were extracted from multi-agent outputs, and the exact data flow are not explained step by step." 201 } 202 }, 203 "conflicts_of_interest": { 204 "funding_disclosed": { 205 "applies": true, 206 "answer": false, 207 "justification": "No funding sources or acknowledgments section is present in the paper." 208 }, 209 "affiliations_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "Author affiliations are clearly stated: Tsinghua University, Hong Kong University of Science and Technology, and Peking University. None of the authors appear to be affiliated with OpenAI, whose models are evaluated." 213 }, 214 "funder_independent_of_outcome": { 215 "applies": true, 216 "answer": false, 217 "justification": "No funding is disclosed, so independence cannot be assessed. The absence of funding disclosure does not confirm the work is unfunded." 218 }, 219 "financial_interests_declared": { 220 "applies": true, 221 "answer": false, 222 "justification": "No competing interests or financial interests statement is present in the paper." 223 } 224 }, 225 "contamination": { 226 "training_cutoff_stated": { 227 "applies": true, 228 "answer": false, 229 "justification": "The paper does not state the training data cutoff for GPT-4 or GPT-3.5-turbo. The FairEval benchmark includes questions from Vicuna's evaluation set, which could have been in the models' training data." 230 }, 231 "train_test_overlap_discussed": { 232 "applies": true, 233 "answer": false, 234 "justification": "No discussion of whether the benchmark questions or expected answers could have been seen during model training. Both benchmarks were public before the experiments (Topical-Chat since 2019, the Vicuna-derived FairEval questions since 2023), and because the paper states neither training cutoffs nor model snapshots, overlap cannot be ruled out." 235 }, 236 "benchmark_contamination_addressed": { 237 "applies": true, 238 "answer": false, 239 "justification": "No discussion of benchmark contamination risk. 
The FairEval questions originated from the Vicuna evaluation set (Chiang et al., 2023) and Topical-Chat was published in 2019 — both were public before the experiments were run, and the paper specifies neither training cutoffs nor model snapshots that would rule out exposure." 240 } 241 }, 242 "human_studies": { 243 "pre_registered": { 244 "applies": false, 245 "answer": false, 246 "justification": "No human participants were recruited for this study. The paper uses pre-existing human annotation datasets." 247 }, 248 "irb_or_ethics_approval": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants were recruited for this study." 252 }, 253 "demographics_reported": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants were recruited for this study." 257 }, 258 "inclusion_exclusion_criteria": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants were recruited for this study." 262 }, 263 "randomization_described": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants were recruited for this study." 267 }, 268 "blinding_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants were recruited for this study." 272 }, 273 "attrition_reported": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants were recruited for this study." 277 } 278 }, 279 "cost_and_practicality": { 280 "inference_cost_reported": { 281 "applies": true, 282 "answer": false, 283 "justification": "No inference costs, API costs, or tokens consumed are reported. The multi-agent approach involves multiple LLM calls per evaluation (multiple agents x multiple rounds), but the cost is never quantified." 284 }, 285 "compute_budget_stated": { 286 "applies": true, 287 "answer": false, 288 "justification": "No total computational budget, API spend, or hardware requirements are mentioned. Given the approach multiplies LLM calls, this is a significant omission." 
289 } 290 } 291 }, 292 "claims": [ 293 { 294 "claim": "ChatEval with multi-agent debate achieves higher alignment with human preferences compared to single-agent evaluation, improving accuracy by 6.2% for ChatGPT and 2.5% for GPT-4 on FairEval.", 295 "evidence": "Table 1: ChatGPT single-agent 53.8% vs multi-agent 60.0%; GPT-4 single-agent 61.3% vs multi-agent 63.8% (Section 3.4).", 296 "supported": "moderate" 297 }, 298 { 299 "claim": "ChatEval surpasses FairEval's best results (MEC+BPC) in both ChatGPT and GPT-4 settings.", 300 "evidence": "Table 1: FairEval ChatGPT MEC+BPC 58.7% vs ChatEval 60.0%; FairEval GPT-4 MEC+BPC 62.5% vs ChatEval 63.8% (Section 3.4).", 301 "supported": "weak" 302 }, 303 { 304 "claim": "ChatEval improves average Spearman and Kendall-Tau correlation by 16.3% and 10.0% respectively over G-EVAL-4 on Topical-Chat.", 305 "evidence": "Table 2: G-EVAL-4 avg Spearman 0.588, avg Kendall-Tau 0.575 vs GPT-4 MA avg Spearman 0.684, avg Kendall-Tau 0.632 (Section 3.5).", 306 "supported": "moderate" 307 }, 308 { 309 "claim": "Diverse role prompts are essential in multi-agent debate; using the same role prompt leads to performance degradation.", 310 "evidence": "Table 3: Same role prompt achieves 53.8% accuracy (same as single-agent), while diverse role prompt achieves 60.0% (Section 4.1).", 311 "supported": "moderate" 312 }, 313 { 314 "claim": "Increasing discussion turns does not lead to performance improvement.", 315 "evidence": "Figure 3b shows no significant upward trend in accuracy or Kappa as discussion turns increase from 1 to 5 (Section 4.3).", 316 "supported": "moderate" 317 } 318 ], 319 "methodology_tags": [ 320 "benchmark-eval" 321 ], 322 "key_findings": "ChatEval proposes a multi-agent debate framework for LLM-based text evaluation, where multiple LLM agents with diverse role prompts discuss and arrive at consensus evaluations. 
On FairEval (open-ended QA), the multi-agent approach improves accuracy by 2.5-6.2 percentage points over single-agent baselines and surpasses FairEval's MEC+BPC results; on Topical-Chat (dialogue generation), it outperforms G-EVAL on correlation with human judgments. The paper finds that diverse role prompts are necessary for the debate to be effective, one-by-one communication outperforms simultaneous strategies, and increasing discussion turns provides no benefit.", 323 "red_flags": [ 324 { 325 "flag": "No significance testing on small benchmarks", 326 "detail": "Claims of improvement are based on comparing point estimates on 80 questions (FairEval) and 60 dialogue contexts (Topical-Chat) without any significance tests. The 2.5-point GPT-4 improvement on FairEval (61.3% to 63.8%) represents only 2 additional correct answers out of 80, which is well within random noise." 327 }, 328 { 329 "flag": "No cost analysis for multiplied API calls", 330 "detail": "The multi-agent approach requires N agents x T turns of LLM calls per evaluation, but costs are never reported. This is a significant practical concern for a method that claims to replace human evaluation — the cost multiplier could negate the economic advantage over human annotators." 331 }, 332 { 333 "flag": "No limitations section", 334 "detail": "The paper contains no limitations, threats to validity, or scope boundary discussion. This is a significant omission for a paper making broad claims about evaluation quality." 335 }, 336 { 337 "flag": "Configuration tuning on test data", 338 "detail": "The communication strategy, number of agents, and discussion turns appear to have been tuned on the same benchmark data used for final evaluation. No held-out test set is mentioned, raising concerns about overfitting the configuration to these specific benchmarks." 
339 }, 340 { 341 "flag": "Contamination risk unaddressed", 342 "detail": "The benchmarks (FairEval from the 2023 Vicuna evaluation set, Topical-Chat from 2019) were publicly available before the experiments, and the paper reports neither training cutoffs nor model snapshots, yet no contamination analysis is performed. If the models have seen the evaluation questions or responses during training, the correlation metrics may be inflated." 343 }, 344 { 345 "flag": "Qualitative analysis shows only success case", 346 "detail": "The qualitative analysis in Section 4.4 presents only one example where agents reach the correct consensus. No failure cases are analyzed, making it impossible to understand when and how the approach breaks down." 347 } 348 ], 349 "cited_papers": [ 350 { 351 "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena", 352 "authors": ["Lianmin Zheng", "Wei-Lin Chiang", "Ying Sheng"], 353 "year": 2023, 354 "arxiv_id": "2306.05685", 355 "relevance": "Foundational work on LLM-as-a-judge paradigm that ChatEval extends to multi-agent setting." 356 }, 357 { 358 "title": "G-EVAL: NLG Evaluation Using GPT-4 with Better Human Alignment", 359 "authors": ["Yang Liu", "Dan Iter", "Yichong Xu"], 360 "year": 2023, 361 "arxiv_id": "2303.16634", 362 "relevance": "Key baseline for LLM-based text evaluation using GPT-4, proposes probability-weighted scoring." 363 }, 364 { 365 "title": "Large Language Models are Not Fair Evaluators", 366 "authors": ["Peiyi Wang", "Lei Li", "Liang Chen"], 367 "year": 2023, 368 "arxiv_id": "2305.17926", 369 "relevance": "Identifies position bias in LLM evaluation, proposes calibration strategies that ChatEval compares against." 370 }, 371 { 372 "title": "Improving Factuality and Reasoning in Language Models through Multiagent Debate", 373 "authors": ["Yilun Du", "Shuang Li", "Antonio Torralba"], 374 "year": 2023, 375 "arxiv_id": "2305.14325", 376 "relevance": "Multi-agent debate framework for improving LLM factuality, foundational for ChatEval's approach." 
377 }, 378 { 379 "title": "Encouraging Divergent Thinking in Large Language Models through Multi-Agent Debate", 380 "authors": ["Tian Liang", "Zhiwei He", "Wenxiang Jiao"], 381 "year": 2023, 382 "arxiv_id": "2305.19118", 383 "relevance": "Multi-agent debate for improving LLM reasoning, relevant to understanding debate dynamics in LLMs." 384 }, 385 { 386 "title": "CAMEL: Communicative Agents for 'Mind' Exploration of Large Scale Language Model Society", 387 "authors": ["Guohao Li", "Hasan Abed Al Kader Hammoud"], 388 "year": 2023, 389 "arxiv_id": "2303.17760", 390 "relevance": "Role-playing cooperative agent framework relevant to multi-agent LLM collaboration." 391 }, 392 { 393 "title": "Communicative Agents for Software Development", 394 "authors": ["Chen Qian", "Xin Cong", "Cheng Yang"], 395 "year": 2023, 396 "arxiv_id": "2307.07924", 397 "relevance": "Multi-agent LLM framework for software development, demonstrates communicative agents in programming domain." 398 }, 399 { 400 "title": "Generative Agents: Interactive Simulacra of Human Behavior", 401 "authors": ["Joon Sung Park", "Joseph C O'Brien"], 402 "year": 2023, 403 "arxiv_id": "2304.03442", 404 "relevance": "Generative agents with memory systems simulating human behavior, relevant to agent architecture design." 405 }, 406 { 407 "title": "PRD: Peer Rank and Discussion Improve Large Language Model Based Evaluations", 408 "authors": ["Ruosen Li", "Teerth Patel", "Xinya Du"], 409 "year": 2023, 410 "arxiv_id": "2307.02762", 411 "relevance": "Concurrent work on multi-agent LLM evaluation using peer discussion, directly comparable to ChatEval." 412 }, 413 { 414 "title": "Large Language Models are Diverse Role-Players for Summarization Evaluation", 415 "authors": ["Ning Wu", "Ming Gong", "Linjun Shou"], 416 "year": 2023, 417 "arxiv_id": "2303.15078", 418 "relevance": "Diverse role-playing for LLM evaluation, directly inspired ChatEval's role specification design." 
419 }, 420 { 421 "title": "Benchmarking Foundation Models with Language-Model-as-an-Examiner", 422 "authors": ["Yushi Bai", "Jiahao Ying", "Yixin Cao"], 423 "year": 2023, 424 "arxiv_id": "2306.04181", 425 "relevance": "Proposes decentralized LLM evaluation for fairer assessment, relevant to LLM-as-judge methodology." 426 }, 427 { 428 "title": "Training Language Models to Follow Instructions with Human Feedback", 429 "authors": ["Long Ouyang", "Jeffrey Wu", "Xu Jiang"], 430 "year": 2022, 431 "relevance": "InstructGPT paper describing RLHF training paradigm underlying the models used in ChatEval experiments." 432 } 433 ] 434 }