scan.json (25205B)
1 { 2 "paper": { 3 "title": "Beyond Static Datasets: A Deep Interaction Approach to LLM Evaluation", 4 "authors": ["Jiatong Li", "Rui Li", "Qi Liu"], 5 "year": 2023, 6 "venue": "Under Review (arXiv preprint)", 7 "arxiv_id": "2309.04369" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": true, 14 "justification": "The paper states 'Our source code is available at https://anonymous.4open.science/r/DeepEval-112F' in the abstract. This is an anonymized repository link, which provides a working URL." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The paper uses publicly available datasets: MBPP for code generation and IWSLT 2017 for machine translation. The idiom database and public goods game are custom but the code repository likely contains them. The public datasets used are standard and referenced." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "No environment specifications, requirements.txt, Dockerfile, or dependency list is provided in the paper. There is no mention of library versions or setup instructions." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "No step-by-step reproduction instructions are provided. While the code repository is linked, the paper does not include a 'Reproducing Results' section or describe how to run the experiments." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": true, 36 "justification": "The public goods game results are presented as boxplots in Figure 4 showing the distribution across 10 runs, including quartiles and ranges. However, the other tasks (Code Review, Machine Translation, Idiom Solitaire) report only point estimates in tables without confidence intervals or error bars." 37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper makes comparative claims (e.g., 'GPT-4 reaches the state-of-the-art performance') but no statistical significance tests are applied. Differences between models are compared by eyeballing scores in tables without any p-values or formal tests." 42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": false, 46 "justification": "No effect sizes are reported. The paper presents raw scores but does not contextualize the magnitude of differences between models using any effect size measures." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "No justification is given for the sample sizes. The public goods game uses 10 repeated runs, idiom solitaire uses 30 randomly sampled idioms, and code/translation use portions of MBPP and IWSLT 2017, but no rationale is provided for these choices." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": true, 56 "justification": "The public goods game boxplots in Figure 4 show variance across 10 runs including quartiles, median, min, and max. However, for Code Review, Machine Translation, and Idiom Solitaire, only mean scores are reported without any variance or standard deviation." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": false, 63 "justification": "The paper does not compare DeepEval against any existing evaluation method (e.g., MT-Bench, AlpacaEval, standard benchmarks). It only compares LLMs against each other within its own framework, without validating whether its rankings are consistent with established evaluation approaches." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": false, 68 "justification": "No baselines are included, so there are no contemporary baselines to assess. The paper mentions MT-Bench, PandaLM, AlpacaEval, and GameEval in related work but does not compare against any of them." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": false, 73 "justification": "No ablation study is performed. The framework has multiple components (message pool, referee, synchronous algorithm, symmetric vs. asymmetric tasks) but none are individually ablated to demonstrate their contribution." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "The paper uses different metrics across tasks: payoff in public goods game, winning rate and successful hit in idiom solitaire, programmer score and reviewer score in code review, and translator score and proofreader score in machine translation." 79 }, 80 "human_evaluation": { 81 "applies": true, 82 "answer": false, 83 "justification": "No human evaluation is included. The referee/judge in DeepEval is an LLM, and the paper does not validate the LLM-judge's ratings against human judgments. This is a significant gap since the paper proposes an alternative to human-based evaluation but never validates against it." 84 }, 85 "held_out_test_set": { 86 "applies": true, 87 "answer": false, 88 "justification": "The paper does not describe a held-out test set or train/test split. MBPP and IWSLT data are used for evaluation but no mention is made of how splits were handled or whether any tuning was performed on a separate dev set." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Results are broken down per task (public goods game, idiom solitaire, code review, machine translation), per role (programmer vs. reviewer, translator vs. proofreader), and per language pair (DE-EN, EN-FR, EN-ZH) in Tables 3-6." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "Section 4.6 (Case Study) and Appendix A.2 show specific failure cases, including Claude 2 failing at idiom solitaire (producing a non-idiom response) and translation errors. Figure 5 illustrates specific failure modes." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": false, 103 "justification": "No negative results are reported. Every experiment shows the framework producing differentiated rankings. The paper does not discuss any configurations, task designs, or settings that failed or produced unreliable results." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "The abstract claims the framework can evaluate LLMs through deep interaction in real-world tasks and demonstrates effectiveness through experiments on four tasks. The results sections cover all four tasks with quantitative results. The abstract is relatively modest in its claims." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": false, 115 "justification": "The paper makes implicit causal claims such as 'GPT-4 and Claude 2 have a better decision-making ability' and 'ChatGPT is stronger than GPT-4' in Chinese tasks. These are causal attributions about model abilities based on performance in specific game settings without controlling for confounds like prompt sensitivity or game-specific strategies." 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": false, 120 "justification": "The paper claims to evaluate 'LLMs in real-world scenarios' and to be 'a general evaluation method that can be applied to a host of real-world tasks,' but tests on only four specific tasks (a game theory game, a Chinese word game, code review on MBPP, and machine translation on IWSLT). No explicit bounds are stated on what settings these results do or do not generalize to." 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": false, 125 "justification": "No alternative explanations for the results are discussed. For example, performance differences could be driven by prompt format sensitivity, system message handling, or the specific LLM used as referee/judge, but none of these confounds are addressed." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": true, 132 "justification": "Exact model versions are specified in Section 4.1.2: 'gpt-3.5-turbo-0613' for ChatGPT, 'gpt-4-0613' for GPT-4, 'claude-2' for Claude 2, and 'chat-bison-001' for PaLM." 133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": true, 137 "justification": "Full prompt texts for all four evaluation tasks are provided in Appendix A.1, including initial prompts, round-by-round prompts, and judge prompts for each task. The actual text used is given, not just descriptions." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": false, 142 "justification": "No hyperparameters are reported. Temperature, top-p, max tokens, and other sampling parameters for the LLM API calls are not specified. The public goods game uses alpha as a multiplier but LLM inference parameters are absent." 143 }, 144 "scaffolding_described": { 145 "applies": true, 146 "answer": true, 147 "justification": "The agentic scaffolding (message pool, referee, synchronous interaction algorithm) is described in detail in Section 3.3-3.4 with Algorithm 1. The workflow, message passing, and role assignment mechanisms are clearly documented." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": false, 152 "justification": "Data preprocessing is poorly documented. For machine translation, the paper says 'We split the dataset into paragraph-level segments for the test set' but does not describe the splitting criteria. For idiom solitaire, 30 idioms are 'randomly sampled' from an unspecified database. How MBPP samples were selected is not described." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": false, 159 "justification": "There is no limitations, threats to validity, or discussion section. The paper goes directly from experiments (Section 4) to conclusion (Section 5) without acknowledging any limitations." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": false, 164 "justification": "No threats to validity are discussed anywhere in the paper. There is no mention of potential issues with using LLM-as-judge, prompt sensitivity, sample size limitations, or other threats." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": false, 169 "justification": "No scope boundaries are stated. The paper does not explicitly state what settings or claims are excluded from its evaluation. The conclusion mentions future work but does not delineate what the current results do not show." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": false, 176 "justification": "Raw interaction logs, LLM outputs, and referee evaluations are not made available. Only aggregated scores are presented in tables and figures." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "The data collection procedure is described: LLMs interact through the framework with the synchronous algorithm (Algorithm 1), the referee collects and evaluates interaction histories. The process for each task is specified in Sections 3.7 and 4.1." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants are involved. The study evaluates LLMs only, using standard datasets (MBPP, IWSLT) and custom game scenarios." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": false, 191 "justification": "The pipeline from raw LLM interactions to final scores is not fully documented. For example, in code review and machine translation, the judge model scores are averaged but how invalid responses or parsing failures are handled is not described. The number of examples used from MBPP is not stated." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": false, 198 "justification": "No funding information is disclosed anywhere in the paper. There is no acknowledgments section mentioning grants or sponsors." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Author affiliations are clearly listed: all three authors are from the University of Science and Technology of China (School of Data Science and School of Computer Science). No affiliation with any LLM company being evaluated." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding is disclosed, so independence cannot be assessed. The absence of any funding disclosure is noted." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests or financial interests statement is included in the paper." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": true, 219 "answer": false, 220 "justification": "No training data cutoff dates are stated for any of the four models used. This is relevant because MBPP was published in 2021 and the models (especially GPT-4 and ChatGPT) may have seen it during training." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": true, 224 "answer": false, 225 "justification": "No discussion of potential train/test overlap. MBPP is a well-known benchmark that could be in the training data of the models tested, but this is not addressed." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": true, 229 "answer": false, 230 "justification": "MBPP was published in 2021 and is widely available online. The IWSLT 2017 dataset is also public. No discussion of whether these benchmarks could have been in the training data of the 2023-era models." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants are involved in this study. It evaluates LLMs only." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants are involved in this study." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants are involved in this study." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants are involved in this study." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants are involved in this study." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants are involved in this study." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants are involved in this study." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": false, 274 "justification": "No inference costs are reported despite the framework requiring multiple LLM API calls per evaluation (multi-round interactions for each task, repeated across multiple runs). The cost of running GPT-4, ChatGPT, Claude 2, and PaLM APIs is not mentioned." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "No computational budget is stated. The total API spend, number of API calls, or wall-clock time for the experiments is not reported." 280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "The deep interaction-based framework can evaluate LLMs in dynamic real-world scenarios where static datasets cannot.", 286 "evidence": "The paper demonstrates the framework on four tasks (public goods game, idiom solitaire, code review, machine translation) in Sections 4.2-4.5 with multi-round interactions between LLMs.", 287 "supported": "moderate" 288 }, 289 { 290 "claim": "GPT-4 has the state-of-the-art overall performance among the four tested LLMs.", 291 "evidence": "GPT-4 ranks first in public goods game mode 1 (Figure 4a), code review (Table 3, highest programmer and reviewer scores), and DE-EN/EN-FR machine translation (Tables 4-5). However, ChatGPT outperforms GPT-4 in idiom solitaire (Table 1), EN-ZH translation (Table 6), and public goods game mode 2.", 292 "supported": "moderate" 293 }, 294 { 295 "claim": "ChatGPT performs prominently in tasks related to Chinese.", 296 "evidence": "ChatGPT has the highest winning rate in idiom solitaire (Table 1, average 0.77) and the best EN-ZH translation scores (Table 6). Section 4.5 notes consistency between idiom vocabulary and Chinese translation results.", 297 "supported": "moderate" 298 }, 299 { 300 "claim": "The framework ensures fairness through anonymity and synchronous interaction.", 301 "evidence": "The fairness condition (Condition 1) and synchronous interaction algorithm (Algorithm 1) are described in Sections 3.2-3.4. However, no empirical evidence is provided that these conditions actually affect evaluation outcomes or that violations would produce different results.", 302 "supported": "weak" 303 }, 304 { 305 "claim": "The evaluation results converge with multiple repetitions (stableness condition).", 306 "evidence": "The public goods game is run 10 times with boxplots showing variance in Figure 4. However, no formal convergence analysis is provided, and for the other three tasks, convergence is not explicitly demonstrated.", 307 "supported": "weak" 308 } 309 ], 310 "methodology_tags": ["benchmark-eval"], 311 "key_findings": "The paper proposes DeepEval, a framework for evaluating LLMs through multi-round deep interactions rather than static benchmarks. Using game-theoretic principles, four evaluation tasks (public goods game, idiom solitaire, code review, machine translation) are designed to assess LLMs in interactive settings. Across these tasks, GPT-4 generally performs best overall while ChatGPT excels in Chinese-related tasks. The framework uses an LLM referee for scoring but does not validate these scores against human judgments or compare the framework's rankings against established evaluation methods.", 312 "red_flags": [ 313 { 314 "flag": "No validation against human evaluation or established benchmarks", 315 "detail": "The paper proposes an evaluation framework but never validates whether its rankings align with human judgments or existing evaluation methods like MT-Bench, AlpacaEval, or standard code generation benchmarks. The LLM-as-judge referee is used without any inter-rater reliability assessment." 316 }, 317 { 318 "flag": "No limitations section", 319 "detail": "The paper has no limitations, threats to validity, or discussion section. This is a significant omission for a paper proposing a new evaluation methodology, as there are many potential confounds (judge model bias, prompt sensitivity, small sample sizes)." 320 }, 321 { 322 "flag": "Circular evaluation concern", 323 "detail": "The referee/judge is itself an LLM (unspecified which model). If GPT-4 is used as the judge to evaluate GPT-4 and its competitors, there is a potential self-preferencing bias. The paper does not state which model serves as referee or address this concern." 324 }, 325 { 326 "flag": "No statistical significance tests", 327 "detail": "Comparative claims about which model is 'state-of-the-art' are made based on raw score comparisons without any statistical tests. Score differences in Tables 3-6 are often very small (e.g., 8.97 vs. 8.85 vs. 8.86 in code review) and could easily be within noise." 328 }, 329 { 330 "flag": "Missing hyperparameters", 331 "detail": "Temperature, top-p, and other sampling parameters are not reported for any of the four LLMs, despite these parameters significantly affecting LLM output quality and consistency." 332 }, 333 { 334 "flag": "Benchmark contamination risk unaddressed", 335 "detail": "MBPP (2021) is used for code review evaluation with 2023-era models that likely saw MBPP during training. This could inflate performance and affect the validity of the code review evaluation, but the paper does not acknowledge this risk." 336 } 337 ], 338 "cited_papers": [ 339 { 340 "title": "Program Synthesis with Large Language Models", 341 "authors": ["Jacob Austin", "Augustus Odena", "Maxwell I. Nye"], 342 "year": 2021, 343 "arxiv_id": "2108.07732", 344 "relevance": "Introduces MBPP benchmark used in the code review evaluation task, a key benchmark for LLM code generation evaluation." 345 }, 346 { 347 "title": "A Survey on Evaluation of Large Language Models", 348 "authors": ["Yupeng Chang", "Xu Wang", "Jindong Wang"], 349 "year": 2023, 350 "arxiv_id": "2307.03109", 351 "relevance": "Comprehensive survey on LLM evaluation methods, directly relevant to the survey's scope of assessing evaluation methodology quality." 352 }, 353 { 354 "title": "PaLM: Scaling Language Modeling with Pathways", 355 "authors": ["Aakanksha Chowdhery", "Sharan Narang", "Jacob Devlin"], 356 "year": 2022, 357 "arxiv_id": "2204.02311", 358 "relevance": "Describes PaLM, one of the four LLMs evaluated in the study, relevant to understanding LLM capability evaluation." 359 }, 360 { 361 "title": "AlpacaFarm: A Simulation Framework for Methods that Learn from Human Feedback", 362 "authors": ["Yann Dubois", "Xuechen Li", "Rohan Taori"], 363 "year": 2023, 364 "arxiv_id": "2305.14387", 365 "relevance": "LLM evaluation framework using simulation, relevant to automated LLM evaluation methodology." 366 }, 367 { 368 "title": "GPT-4 Technical Report", 369 "authors": ["OpenAI"], 370 "year": 2023, 371 "arxiv_id": "2303.08774", 372 "relevance": "Technical report for GPT-4, one of the main models evaluated in LLM benchmark studies." 373 }, 374 { 375 "title": "GameEval: Evaluating LLMs on Conversational Games", 376 "authors": ["Dan Qiao", "Chenfei Wu", "Yaobo Liang"], 377 "year": 2023, 378 "arxiv_id": "2308.10032", 379 "relevance": "Uses conversational games for LLM evaluation, closely related approach to DeepEval's interaction-based evaluation." 380 }, 381 { 382 "title": "PandaLM: An Automatic Evaluation Benchmark for LLM Instruction Tuning Optimization", 383 "authors": ["Yidong Wang", "Zhuohao Yu", "Zhengran Zeng"], 384 "year": 2023, 385 "arxiv_id": "2306.05087", 386 "relevance": "Automated LLM evaluation using a discriminative language model, relevant to LLM-as-judge methodology." 387 }, 388 { 389 "title": "Judging LLM-as-a-judge with MT-Bench and Chatbot Arena", 390 "authors": ["Lianmin Zheng", "Wei-Lin Chiang", "Ying Sheng"], 391 "year": 2023, 392 "arxiv_id": "2306.05685", 393 "relevance": "Foundational work on LLM-as-judge evaluation methodology, directly relevant to evaluating the validity of using LLMs as referees." 394 }, 395 { 396 "title": "PromptBench: Towards Evaluating the Robustness of Large Language Models on Adversarial Prompts", 397 "authors": ["Kaijie Zhu", "Jindong Wang", "Jiaheng Zhou"], 398 "year": 2023, 399 "arxiv_id": "2306.04528", 400 "relevance": "Evaluates adversarial robustness of LLMs, relevant to understanding LLM evaluation methodology and robustness benchmarks." 401 }, 402 { 403 "title": "Efficiently Measuring the Cognitive Ability of LLMs: An Adaptive Testing Perspective", 404 "authors": ["Yan Zhuang", "Qi Liu", "Yuting Ning"], 405 "year": 2023, 406 "arxiv_id": "2306.10512", 407 "relevance": "Adaptive testing framework for LLMs, directly relevant to efficient and dynamic LLM evaluation methodology." 408 }, 409 { 410 "title": "Synchromesh: Reliable Code Generation from Pre-trained Language Models", 411 "authors": ["Gabriel Poesia", "Alex Polozov", "Vu Le"], 412 "year": 2022, 413 "relevance": "Addresses reliable code generation from LLMs, relevant to code generation evaluation." 414 }, 415 { 416 "title": "Natural Language to Code Generation in Interactive Data Science Notebooks", 417 "authors": ["Pengcheng Yin", "Wen-Ding Li", "Kefan Xiao"], 418 "year": 2023, 419 "relevance": "Interactive code generation approach relevant to evaluating LLMs in real-world coding scenarios." 420 } 421 ] 422 }