scan.json (30504B)
1 { 2 "paper": { 3 "title": "Not All Metrics Are Guilty: Improving NLG Evaluation by Diversifying References", 4 "authors": [ 5 "Tianyi Tang", 6 "Hongyuan Lu", 7 "Yuchen Eleanor Jiang", 8 "Haoyang Huang", 9 "Dongdong Zhang", 10 "Wayne Xin Zhao", 11 "Tom Kocmi", 12 "Furu Wei" 13 ], 14 "year": 2023, 15 "venue": "arXiv", 16 "arxiv_id": "2305.15067", 17 "doi": "10.48550/arXiv.2305.15067" 18 }, 19 "scan_version": 2, 20 "active_modules": ["experimental_rigor", "data_leakage"], 21 "methodology_tags": ["benchmark-eval"], 22 "key_findings": "Diversifying references using LLMs (GPT-3.5-turbo-instruct) with diverse linguistic prompts significantly improves correlation between automatic NLG evaluation metrics and human judgments across machine translation (WMT22), text summarization (SummEval), and image captioning (PASCAL-50S). The method is compatible with both traditional metrics (BLEU, ROUGE) and LLM-based metrics (GEMBA, ChatGPT-eval), with maximum aggregation outperforming mean aggregation. Mixing ten diverse prompts covering different linguistic aspects yields better results than any single prompt type, and performance continues to improve up to ~20 references before saturating.", 23 "checklist": { 24 "artifacts": { 25 "code_released": { 26 "applies": true, 27 "answer": true, 28 "justification": "GitHub repository link provided in the abstract: https://github.com/RUCAIBox/Div-Ref. The paper states 'We release all the code and data at https://github.com/RUCAIBox/Div-Ref to facilitate research.'" 29 }, 30 "data_released": { 31 "applies": true, 32 "answer": true, 33 "justification": "They use publicly available benchmarks (WMT22, SummEval, PASCAL-50S) and release their diversified references at the GitHub repository. The paper states code and data are released." 34 }, 35 "environment_specified": { 36 "applies": true, 37 "answer": false, 38 "justification": "No mention of requirements.txt, Dockerfile, conda environment, or detailed environment setup in the paper. 
The paper only mentions specific model checkpoints used for individual metrics in Appendix A.1 but no environment specification." 39 }, 40 "reproduction_instructions": { 41 "applies": true, 42 "answer": false, 43 "justification": "No step-by-step reproduction instructions are provided in the paper. While code is released, the paper lacks a 'Reproducing Results' section or specific commands to run." 44 } 45 }, 46 "statistical_methodology": { 47 "confidence_intervals_or_error_bars": { 48 "applies": true, 49 "answer": false, 50 "justification": "All results in Figures 1-4 and Tables 3-5 report point estimates only (e.g., '77.7' system-level accuracy, '19.4' Kendall Tau). No confidence intervals, error bars, or ± notation anywhere in the paper." 51 }, 52 "significance_tests": { 53 "applies": true, 54 "answer": false, 55 "justification": "The paper claims improvements ('consistent correlation improvements', 'significant effects') based solely on comparing raw numbers without any statistical significance test. No p-values, t-tests, or bootstrap tests reported." 56 }, 57 "effect_sizes_reported": { 58 "applies": true, 59 "answer": true, 60 "justification": "Tables 3-4 report 'Average Gains' columns showing improvements in context (e.g., '+3.0' system-level, '+2.9' segment-level for Zh-En). Results show baseline and improved values side by side (e.g., BLEU from 14.5 to 19.4), providing sufficient context to assess effect magnitude." 61 }, 62 "sample_size_justified": { 63 "applies": true, 64 "answer": false, 65 "justification": "No discussion of sample sizes or power analysis. The paper uses existing benchmarks without justifying whether the number of evaluation instances is adequate for the claimed improvements." 66 }, 67 "variance_reported": { 68 "applies": true, 69 "answer": false, 70 "justification": "No variance, standard deviation, or spread measures reported anywhere. Results are single-run despite using stochastic generation (nucleus sampling with top_p=0.9). 
The paper acknowledges in limitations that 'The OpenAI API also is non-deterministic' but never measures this variance." 71 } 72 }, 73 "evaluation_design": { 74 "baselines_included": { 75 "applies": true, 76 "answer": true, 77 "justification": "Single-Ref baseline is compared against Div-Ref throughout. Ablation analysis (Section 4.3) also compares against three paraphrasing models (PEGASUS, Parrot, QCPG) and LLaMA-2-70b-chat." 78 }, 79 "baselines_contemporary": { 80 "applies": true, 81 "answer": true, 82 "justification": "Metrics include contemporary SOTA methods: COMET (2020), BLEURT (2020), GEMBA (2023), ChatGPT-eval (2023). Paraphrasing baselines include QCPG (2022) and LLaMA-2 (2023). All are recent and competitive." 83 }, 84 "ablation_study": { 85 "applies": true, 86 "answer": true, 87 "justification": "Section 4.3 presents comprehensive ablation across five dimensions: (1) diversifying model choice, (2) instruction prompt design, (3) aggregation function, (4) post-filtering effect, (5) number of diversified references. Results shown in Tables 3-5 and Figure 4." 88 }, 89 "multiple_metrics": { 90 "applies": true, 91 "answer": true, 92 "justification": "16 metrics across 5 categories evaluated: character-based (ChrF), word-based (BLEU, ROUGE variants, METEOR, CIDEr, SPICE), embedding-based (BERTScore, MoverScore), trained (BLEURT, Prism, COMET, BARTScore), and LLM-based (GEMBA, ChatGPT-eval). Listed in Table 2." 93 }, 94 "human_evaluation": { 95 "applies": true, 96 "answer": false, 97 "justification": "No human evaluation of the diversified references or the method's outputs. The 94.6% quality assessment of generated sentences was done by GPT-3.5, not humans. The paper uses existing human annotations from WMT22/SummEval/PASCAL-50S but does not conduct its own human evaluation." 
98 }, 99 "held_out_test_set": { 100 "applies": true, 101 "answer": true, 102 "justification": "Evaluation uses standard held-out benchmark test sets: WMT22 Metrics Shared Task, SummEval, and PASCAL-50S. These are established evaluation benchmarks not used for any tuning." 103 }, 104 "per_category_breakdown": { 105 "applies": true, 106 "answer": true, 107 "justification": "Results broken down by language pair (Zh-En, En-De, En-Ru) in Figure 1 and Tables 3-4, by summarization aspect (coherence, consistency, fluency, relevance) in Figure 2, and by caption difficulty setting (HC, HI, HM, MM) in Figure 3." 108 }, 109 "failure_cases_discussed": { 110 "applies": true, 111 "answer": false, 112 "justification": "No failure case analysis or examples of where Div-Ref fails to improve or hurts performance. The limitations section mentions generic domain limitations but does not analyze specific failure instances." 113 }, 114 "negative_results_reported": { 115 "applies": true, 116 "answer": true, 117 "justification": "Mean aggregation hurts performance in some settings (En-De: -2.8 average gains, Table 4). BERTScore is essentially flat in the HM setting (90.0→90.3, not a meaningful change). Multilingual diverse prompts show 'gains are also not obvious' for non-English. LLM diversifying ability 'in non-English is not as good as that in English.'" 118 } 119 }, 120 "claims_and_evidence": { 121 "abstract_claims_supported": { 122 "applies": true, 123 "answer": true, 124 "justification": "Abstract claims that diversifying references 'can significantly enhance the correlation between automatic evaluation and human evaluation' are supported by Figures 1-3 showing consistent improvements. The claim of compatibility with LLM-based evaluation is supported by GEMBA and ChatGPT-eval improvements." 
125 }, 126 "causal_claims_justified": { 127 "applies": true, 128 "answer": true, 129 "justification": "The paper's causal claim that reference diversification improves correlation is supported by controlled ablation (Section 4.3): systematically varying model, prompts, aggregation, filtering, and number of references while holding other factors constant. This single-variable manipulation constitutes adequate causal evidence for the claim." 130 }, 131 "generalization_bounded": { 132 "applies": true, 133 "answer": false, 134 "justification": "The title 'Improving NLG Evaluation by Diversifying References' implies general NLG applicability, but only three specific tasks are tested (translation, summarization, image captioning). The limitations acknowledge 'diverse prompts may fail in specialized domains, such as finance and biomedicine' but the main claims and title are not bounded to tested settings." 135 }, 136 "alternative_explanations_discussed": { 137 "applies": true, 138 "answer": false, 139 "justification": "The paper does not consider alternative explanations for why diversified references improve correlation. For instance: does maximum aggregation simply inflate scores to match human tendencies? Could the improvement be an artifact of GPT-3.5 generating references that are more stylistically similar to the hypothesis models? These alternatives are not discussed." 140 }, 141 "proxy_outcome_distinction": { 142 "applies": true, 143 "answer": true, 144 "justification": "The paper clearly measures correlation between automatic metrics and human judgments (Kendall Tau, Spearman, pairwise accuracy) and frames claims accordingly. They do not overclaim beyond 'improving correlation with human evaluation' — the proxy matches the claim granularity." 
145 } 146 }, 147 "setup_transparency": { 148 "model_versions_specified": { 149 "applies": true, 150 "answer": false, 151 "justification": "The paper specifies 'gpt-3.5-turbo-instruct' and 'gpt-3.5-turbo' but these are marketing names without version snapshots or API dates. For evaluation models, specific checkpoints are given (e.g., 'BLEURT-20', 'Unbabel/wmt22-comet-da') but the core LLM used for diversification lacks a pinned version." 152 }, 153 "prompts_provided": { 154 "applies": true, 155 "answer": true, 156 "justification": "All ten diversifying prompts are provided verbatim in Section 3.2.2. The paraphrasing prompt is given in Section 3.2.1. The filtering prompt is given in Section 4.3. These are complete and reproducible." 157 }, 158 "hyperparameters_reported": { 159 "applies": true, 160 "answer": true, 161 "justification": "Section 4.1.3 reports temperature=1, top_p=0.9 for diversification. GEMBA uses text-davinci-003 with temperature=0. ChatGPT-eval uses gpt-3.5-turbo with temperature=0. Number of diversified sentences = 10." 162 }, 163 "scaffolding_described": { 164 "applies": false, 165 "answer": false, 166 "justification": "No agentic scaffolding is used. The method is a straightforward prompt-response pipeline for generating paraphrases." 167 }, 168 "data_preprocessing_documented": { 169 "applies": true, 170 "answer": true, 171 "justification": "Section 4.1.1 describes each benchmark's structure and evaluation protocol. Section 4.1.3 describes the diversification pipeline (10 instructions, one sentence per instruction, max aggregation). Appendix A.1 specifies exact implementations and checkpoints for each metric." 172 } 173 }, 174 "limitations_and_scope": { 175 "limitations_section_present": { 176 "applies": true, 177 "answer": true, 178 "justification": "A dedicated 'Limitations' section is present after the conclusion, discussing multiple specific concerns including domain limitations, cost constraints, non-determinism, and model availability." 
179 }, 180 "threats_to_validity_specific": { 181 "applies": true, 182 "answer": true, 183 "justification": "The limitations section identifies specific threats: (1) diverse prompts may fail in specialized domains like finance and biomedicine, (2) omission of GEMBA ablation due to text-davinci-003 cost, (3) OpenAI API non-determinism affecting reproducibility, (4) risk of OpenAI removing existing models." 184 }, 185 "scope_boundaries_stated": { 186 "applies": true, 187 "answer": true, 188 "justification": "The limitations section explicitly states what was not tested: specialized domains (finance, biomedicine), the optimal cost-effectiveness trade-off between number of references and evaluation time, and that GEMBA ablation was omitted. It also identifies need for 'fine-grained prompts tailored to address the specific challenges posed by professional terminology.'" 189 } 190 }, 191 "data_integrity": { 192 "raw_data_available": { 193 "applies": true, 194 "answer": true, 195 "justification": "The benchmarks used (WMT22, SummEval, PASCAL-50S) are all publicly available. The paper releases diversified references and code at https://github.com/RUCAIBox/Div-Ref. The underlying data can be independently verified." 196 }, 197 "data_collection_described": { 198 "applies": true, 199 "answer": true, 200 "justification": "Section 4.1.1 describes each benchmark: WMT22 includes competitor model outputs rated by experts via MQM schema for 3 language pairs; SummEval has 200 summaries from 16 models on CNN/DailyMail with human ratings on 4 aspects; PASCAL-50S has 4,000 triple instances with human preferences." 201 }, 202 "recruitment_methods_described": { 203 "applies": false, 204 "answer": false, 205 "justification": "No human participants recruited. The paper uses existing benchmark data with pre-collected human annotations from prior shared tasks." 
206 }, 207 "data_pipeline_documented": { 208 "applies": true, 209 "answer": true, 210 "justification": "The pipeline is documented: (1) take benchmark references, (2) apply 10 diverse prompts via GPT-3.5-turbo-instruct, (3) generate one diversified sentence per prompt, (4) evaluate metrics with max aggregation across original + diversified references, (5) compute correlation with human scores. Section 3.2 and 4.1.3 describe this." 211 } 212 }, 213 "conflicts_of_interest": { 214 "funding_disclosed": { 215 "applies": true, 216 "answer": true, 217 "justification": "Acknowledgement section states: 'This work was partially supported by Beijing Natural Science Foundation under Grant No. L233008 and 4222027.'" 218 }, 219 "affiliations_disclosed": { 220 "applies": true, 221 "answer": true, 222 "justification": "Author affiliations are clearly listed: Renmin University of China, Microsoft Research Asia, Microsoft, CUHK, AIWaves Inc. The first author notes 'This work was done during internship at MSRA.'" 223 }, 224 "funder_independent_of_outcome": { 225 "applies": true, 226 "answer": true, 227 "justification": "The funder (Beijing Natural Science Foundation) is an academic funding agency with no financial stake in the outcome. While some authors are from Microsoft and the paper uses OpenAI models (Microsoft is an OpenAI investor), the disclosed funder itself is independent." 228 }, 229 "financial_interests_declared": { 230 "applies": true, 231 "answer": false, 232 "justification": "No competing interests statement or financial interests declaration in the paper. Authors from Microsoft Research Asia use OpenAI API products but no financial interests are declared or disclaimed." 233 } 234 }, 235 "contamination": { 236 "training_cutoff_stated": { 237 "applies": true, 238 "answer": false, 239 "justification": "The training data cutoff for gpt-3.5-turbo-instruct is not stated. 
This matters because if the model was trained on WMT22, SummEval, or PASCAL-50S data, the diversified references could be influenced by memorized benchmark content." 240 }, 241 "train_test_overlap_discussed": { 242 "applies": true, 243 "answer": false, 244 "justification": "No discussion of whether GPT-3.5 may have seen the benchmark references or hypotheses during training. WMT data and SummEval/CNN-DailyMail data are widely available online and likely in GPT-3.5's training set." 245 }, 246 "benchmark_contamination_addressed": { 247 "applies": true, 248 "answer": false, 249 "justification": "The benchmarks used (WMT22, SummEval based on CNN/DailyMail, PASCAL-50S from 2015) are all publicly available online, but the paper never states GPT-3.5's training cutoff, so which of them the model could have seen (WMT22 in particular, being the most recent) is left open. No contamination analysis is performed despite this being directly relevant to the validity of LLM-generated references." 250 }, 251 }, 252 "human_studies": { 253 "pre_registered": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in this study. The paper uses existing benchmark human annotations from prior work." 257 }, 258 "irb_or_ethics_approval": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants recruited or studied." 262 }, 263 "demographics_reported": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in this study." 267 }, 268 "inclusion_exclusion_criteria": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in this study." 272 }, 273 "randomization_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in this study." 277 }, 278 "blinding_described": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants in this study." 282 }, 283 "attrition_reported": { 284 "applies": false, 285 "answer": false, 286 "justification": "No human participants in this study." 
287 } 288 }, 289 "cost_and_practicality": { 290 "inference_cost_reported": { 291 "applies": true, 292 "answer": false, 293 "justification": "No API costs, tokens consumed, or wall-clock time reported for generating diversified references. The limitations section mentions 'the high cost of text-davinci-003' for GEMBA but never quantifies the cost of their own diversification method." 294 }, 295 "compute_budget_stated": { 296 "applies": true, 297 "answer": false, 298 "justification": "No total computational budget stated. The paper requires calling GPT-3.5 for every reference sentence across three benchmarks but provides no cost figures, GPU hours, or total API spend." 299 } 300 }, 301 "experimental_rigor": { 302 "seed_sensitivity_reported": { 303 "applies": true, 304 "answer": false, 305 "justification": "Results are not reported across multiple random seeds despite using stochastic generation (nucleus sampling with top_p=0.9). The paper acknowledges API non-determinism in limitations but never measures seed sensitivity." 306 }, 307 "number_of_runs_stated": { 308 "applies": true, 309 "answer": false, 310 "justification": "The number of experimental runs is never stated. It appears results are from a single generation pass, but this is not explicitly confirmed." 311 }, 312 "hyperparameter_search_budget": { 313 "applies": true, 314 "answer": false, 315 "justification": "No hyperparameter search budget reported. The ablation analyzes different settings (models, prompts, aggregation, number of references) but does not state total configurations tried or compute spent on this exploration." 316 }, 317 "best_config_selection_justified": { 318 "applies": true, 319 "answer": true, 320 "justification": "Section 4.3 presents ablation results for all configurations tested (Tables 3-5, Figure 4), showing why the chosen settings (GPT-3.5, diverse prompts, max aggregation, 10 references) were selected. All alternatives are reported transparently." 
321 }, 322 "multiple_comparison_correction": { 323 "applies": false, 324 "answer": false, 325 "justification": "No statistical significance tests are performed in the paper, so multiple comparison correction is not applicable. The paper compares raw metric values only." 326 }, 327 "self_comparison_bias_addressed": { 328 "applies": true, 329 "answer": false, 330 "justification": "The authors do not acknowledge the bias of evaluating their own diversification method. They generate references, evaluate them, and report improvements without discussing potential author-evaluation bias." 331 }, 332 "compute_budget_vs_performance": { 333 "applies": true, 334 "answer": false, 335 "justification": "Figure 4 shows performance vs. number of references (0-100) but does not frame this in terms of compute cost. No discussion of the compute trade-off between generating more references and the diminishing returns observed." 336 }, 337 "benchmark_construct_validity": { 338 "applies": true, 339 "answer": false, 340 "justification": "The paper does not discuss whether correlation with human judgment (Kendall Tau, Spearman) is the right measure of metric quality. The assumption that higher correlation = better metric is taken as given without examining construct validity of the meta-evaluation." 341 }, 342 "scaffold_confound_addressed": { 343 "applies": false, 344 "answer": false, 345 "justification": "No scaffolding is involved. The method is a direct prompt-response pipeline for generating paraphrases." 346 } 347 }, 348 "data_leakage": { 349 "temporal_leakage_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "No discussion of temporal leakage. GPT-3.5-turbo-instruct's training data likely includes text from WMT shared tasks, SummEval (CNN/DailyMail), and PASCAL-50S, all published years before the model's training cutoff. This could bias the diversified references." 
353 }, 354 "feature_leakage_addressed": { 355 "applies": true, 356 "answer": false, 357 "justification": "No discussion of whether the LLM having potentially memorized the reference sentences affects the quality and diversity of the generated paraphrases. If GPT-3.5 has seen the benchmarks, the 'diversified' references may be less independent than assumed." 358 }, 359 "non_independence_addressed": { 360 "applies": true, 361 "answer": false, 362 "justification": "No discussion of whether the diversified references are truly independent of the benchmark data GPT-3.5 may have been trained on." 363 }, 364 "leakage_detection_method": { 365 "applies": true, 366 "answer": false, 367 "justification": "No leakage detection or prevention method applied. No canary strings, membership inference, overlap analysis, or decontamination performed." 368 } 369 } 370 }, 371 "claims": [ 372 { 373 "claim": "Diversifying references using LLMs significantly improves correlation between automatic NLG evaluation metrics and human evaluation across translation, summarization, and captioning tasks.", 374 "evidence": "Figures 1-3 show consistent improvements across all three benchmarks: system-level pairwise accuracy improves by +3.0 on average for WMT22 translation (Figure 1a), Spearman correlation improves across all four summarization aspects (Figure 2), and accuracy improves for image captioning in most settings (Figure 3).", 375 "supported": "strong" 376 }, 377 { 378 "claim": "GPT-3.5-turbo-instruct outperforms traditional paraphrasing models (PEGASUS, Parrot, QCPG) and LLaMA-2-70b-chat for reference diversification.", 379 "evidence": "Table 3 shows GPT-3.5 achieves +2.9 average gains at segment level for Zh-En vs +2.0 (PEGASUS), +1.5 (Parrot), +1.5 (QCPG), and +1.4 (LLaMA-2). 
Tables 3-4 show consistent advantage across language pairs.", 380 "supported": "moderate" 381 }, 382 { 383 "claim": "Maximum aggregation outperforms mean aggregation for combining multiple reference scores.", 384 "evidence": "Tables 3-4 show mean aggregation drops performance in most settings (e.g., -1.1 segment-level Zh-En, -2.8 En-De). The paper explains: 'averaging multiple reference scores could introduce noise from low-quality reference scores' (Section 4.3).", 385 "supported": "moderate" 386 }, 387 { 388 "claim": "The method is compatible with LLM-based evaluation metrics, further improving their correlation with human judgment.", 389 "evidence": "GEMBA improves from 85.8 to 86.1 at system level (Figure 1a). ChatGPT-eval improves across all four summarization dimensions: coherence (+8.4), consistency (+4.7), fluency (+8.4), relevance (+6.6) (Figure 2).", 390 "supported": "strong" 391 }, 392 { 393 "claim": "94.6% of LLM-generated diversified sentences preserve the original semantics.", 394 "evidence": "Section 3.2.3 states: 'We employ another excellent GPT 3.5 to judge whether the generated sentence conveys the same meaning of given reference. The results show that 94.6% of the generated sentences are suitable.'", 395 "supported": "weak" 396 }, 397 { 398 "claim": "Mixing diverse prompts is better than using any single prompt type repeatedly.", 399 "evidence": "Table 5 shows mixing prompts yields +2.9 average gain vs individual prompts ranging from +0.4 to +2.4 on Zh-En segment-level Kendall Tau.", 400 "supported": "moderate" 401 } 402 ], 403 "red_flags": [ 404 { 405 "flag": "No statistical significance tests", 406 "detail": "All improvements are claimed based on raw number comparisons (e.g., 14.5→19.4 Kendall Tau) without any significance testing. With stochastic generation and potentially noisy metrics, some claimed improvements may not be statistically significant." 
407 }, 408 { 409 "flag": "LLM evaluating LLM outputs", 410 "detail": "The 94.6% quality figure for generated references comes from GPT-3.5 judging GPT-3.5's own outputs. This is circular evaluation — the model may judge its own outputs more favorably. No human verification of reference quality was performed." 411 }, 412 { 413 "flag": "No seed sensitivity or variance analysis", 414 "detail": "Despite using stochastic generation (nucleus sampling, top_p=0.9) and a non-deterministic API, no variance across runs is reported. The paper acknowledges this in limitations but all reported results appear to be single-run." 415 }, 416 { 417 "flag": "Contamination risk unaddressed", 418 "detail": "GPT-3.5 likely saw WMT data, CNN/DailyMail, and PASCAL captions during training. If the model memorized benchmark references, the 'diversified' references may be biased by training data rather than truly exploring the semantic space. This is never discussed." 419 }, 420 { 421 "flag": "Potential conflict of interest", 422 "detail": "Multiple authors are from Microsoft Research Asia and Microsoft. The paper relies heavily on OpenAI's GPT-3.5 (Microsoft is a major OpenAI investor). The method's value proposition depends on GPT-3.5 being effective, which aligns with Microsoft's commercial interests. This conflict is not acknowledged." 423 } 424 ], 425 "cited_papers": [ 426 { 427 "title": "Large language models are state-of-the-art evaluators of translation quality", 428 "authors": ["Tom Kocmi", "Christian Federmann"], 429 "year": 2023, 430 "arxiv_id": "2302.14520", 431 "relevance": "Demonstrates LLM-as-judge for machine translation evaluation (GEMBA metric), directly relevant to LLM evaluation capabilities." 432 }, 433 { 434 "title": "Is chatgpt a good nlg evaluator? 
a preliminary study", 435 "authors": ["Jiaan Wang", "Yunlong Liang", "Fandong Meng"], 436 "year": 2023, 437 "arxiv_id": "2303.04048", 438 "relevance": "Evaluates ChatGPT's capability as an NLG evaluator, relevant to understanding LLM evaluation reliability." 439 }, 440 { 441 "title": "Can large language models be an alternative to human evaluations?", 442 "authors": ["Cheng-Han Chiang", "Hung-yi Lee"], 443 "year": 2023, 444 "arxiv_id": "2305.01937", 445 "relevance": "Investigates whether LLMs can replace human evaluation, directly relevant to AI evaluation methodology." 446 }, 447 { 448 "title": "GPTEval: NLG evaluation using GPT-4 with better human alignment", 449 "authors": ["Yang Liu", "Dan Iter", "Yichong Xu"], 450 "year": 2023, 451 "arxiv_id": "2303.16634", 452 "relevance": "Uses GPT-4 for NLG evaluation aligned with human judgments, relevant to LLM-based evaluation approaches." 453 }, 454 { 455 "title": "Is chatgpt a good translator? yes with gpt-4 as the engine", 456 "authors": ["WX Jiao", "WX Wang", "JT Huang"], 457 "year": 2023, 458 "arxiv_id": "2301.08745", 459 "relevance": "Evaluates ChatGPT/GPT-4 translation capability, relevant to LLM capability assessment methodology." 460 }, 461 { 462 "title": "Exploring the use of large language models for reference-free text quality evaluation: A preliminary empirical study", 463 "authors": ["Yi Chen", "Rui Wang", "Haiyun Jiang"], 464 "year": 2023, 465 "arxiv_id": "2304.00723", 466 "relevance": "Explores LLMs for reference-free evaluation, relevant to understanding NLG evaluation approaches." 467 }, 468 { 469 "title": "BERTScore: Evaluating text generation with BERT", 470 "authors": ["Tianyi Zhang", "Varsha Kishore", "Felix Wu"], 471 "year": 2020, 472 "relevance": "Foundational neural evaluation metric used extensively in LLM-generated text evaluation." 
473 }, 474 { 475 "title": "Benchmarking large language models for news summarization", 476 "authors": ["Tianyi Zhang", "Faisal Ladhak", "Esin Durmus"], 477 "year": 2023, 478 "arxiv_id": "2301.13848", 479 "relevance": "LLM benchmarking study for summarization, relevant to evaluation methodology for LLM capabilities." 480 }, 481 { 482 "title": "ChatGPT as a factual inconsistency evaluator for abstractive text summarization", 483 "authors": ["Zheheng Luo", "Qianqian Xie", "Sophia Ananiadou"], 484 "year": 2023, 485 "arxiv_id": "2303.15621", 486 "relevance": "Uses ChatGPT for factual consistency evaluation, relevant to LLM-as-judge methodology." 487 }, 488 { 489 "title": "Crosslingual generalization through multitask finetuning", 490 "authors": ["Niklas Muennighoff", "Thomas Wang", "Lintang Sutawika"], 491 "year": 2022, 492 "arxiv_id": "2211.01786", 493 "relevance": "Cross-lingual LLM capabilities study, relevant to understanding multilingual LLM evaluation." 494 } 495 ] 496 }