scan.json (31091B)
1 { 2 "paper": { 3 "title": "Sample-Efficient Human Evaluation of Large Language Models via Maximum Discrepancy Competition", 4 "authors": [ 5 "Kehua Feng", 6 "Keyan Ding", 7 "Hongzhi Tan", 8 "Kede Ma", 9 "Zhihua Wang", 10 "Shuangquan Guo", 11 "Yuzhou Cheng", 12 "Ge Sun", 13 "Guozhou Zheng", 14 "Qiang Zhang", 15 "Huajun Chen" 16 ], 17 "year": 2024, 18 "venue": "arXiv", 19 "arxiv_id": "2404.08008", 20 "doi": "10.48550/arXiv.2404.08008" 21 }, 22 "scan_version": 2, 23 "active_modules": ["experimental_rigor", "data_leakage"], 24 "checklist": { 25 "artifacts": { 26 "code_released": { 27 "applies": true, 28 "answer": true, 29 "justification": "The abstract states 'Code is available at https://github.com/weiji-Feng/MAD-Eval.'" 30 }, 31 "data_released": { 32 "applies": true, 33 "answer": false, 34 "justification": "The paper explicitly mentions code availability but does not state that the 120K evolved instruction pool or human annotation data is released. Seed datasets (GSM8K, CAMEL, AlpacaEval, CodeAlpaca) are public, but the novel evolved instructions and annotation results are not explicitly made available." 35 }, 36 "environment_specified": { 37 "applies": true, 38 "answer": false, 39 "justification": "Appendix C mentions 'two NVIDIA GeForce RTX 4090 GPUs' and the use of vLLM, but no requirements.txt, Dockerfile, conda environment, or detailed library versions are provided." 40 }, 41 "reproduction_instructions": { 42 "applies": true, 43 "answer": false, 44 "justification": "While code is released and the experimental setup is described in detail (Appendix B-D), the paper does not include step-by-step reproduction instructions with commands to run." 45 } 46 }, 47 "statistical_methodology": { 48 "confidence_intervals_or_error_bars": { 49 "applies": true, 50 "answer": false, 51 "justification": "All Elo ratings in Tables 1-8 are reported as point estimates without confidence intervals or error bars, despite using 1000 bootstrap resamples for aggregation. 
The bootstrap uncertainty is not shown." 52 }, 53 "significance_tests": { 54 "applies": true, 55 "answer": true, 56 "justification": "Spearman's ρ with p-values is reported for ranking comparisons: e.g., 'Spearman's ρ = 0.965 with p = 5.93 × 10^−12' (Section 4.6), 'ρ = 0.993 with p = 2.17 × 10^−13' (Table 4)." 57 }, 58 "effect_sizes_reported": { 59 "applies": true, 60 "answer": true, 61 "justification": "Spearman's ρ values (0.965, 0.993, 0.989, 0.986) serve as effect sizes quantifying agreement with gold-standard rankings. Elo rating differences between models are also reported, providing magnitude context." 62 }, 63 "sample_size_justified": { 64 "applies": true, 65 "answer": false, 66 "justification": "No justification is given for why 13 evaluators were chosen, why K=10 instructions per pair, or why 8 LLMs were selected. No power analysis is discussed." 67 }, 68 "variance_reported": { 69 "applies": true, 70 "answer": false, 71 "justification": "Despite performing 1000 bootstrap resamples for Elo ratings, the variance or standard deviation of the bootstrap distributions is not reported. Tables show only point-estimate Elo ratings. For random sampling (Table 5), mean ρ = 0.791 with std = 0.069 is reported, but the main method's variance is absent." 72 } 73 }, 74 "evaluation_design": { 75 "baselines_included": { 76 "applies": true, 77 "answer": true, 78 "justification": "Section 4.4 compares MAD Competition against 5 alternative sampling strategies: random sampling, KL divergence, cross-entropy (Boubdir et al., 2023), Anchor Points (Vivek et al., 2023), and DiffUse (Ashury-Tahan et al., 2024). Section 4.3 compares against Chatbot Arena, AlpacaEval-2.0, and CompassRank." 79 }, 80 "baselines_contemporary": { 81 "applies": true, 82 "answer": true, 83 "justification": "Baselines include DiffUse (2024), Anchor Points (2023), and the current Chatbot Arena leaderboard. These represent contemporary state-of-the-art approaches." 
84 }, 85 "ablation_study": { 86 "applies": true, 87 "answer": true, 88 "justification": "Section 4.5 presents ablation studies on three key dimensions: semantic discrepancy measure (Table 3), sample size K (Figure 4), and diversity weight λ." 89 }, 90 "multiple_metrics": { 91 "applies": true, 92 "answer": true, 93 "justification": "The paper reports Elo ratings, Spearman's ρ rank correlations, per-task rankings, pairwise win rates (Figure 6), and inter-annotator agreement (83.39%)." 94 }, 95 "human_evaluation": { 96 "applies": true, 97 "answer": true, 98 "justification": "The paper's core contribution is a human evaluation methodology. Thirteen STEM-trained postgraduates perform 3-AFC pairwise preference tasks on 1,120 comparison pairs (Appendix D.4)." 99 }, 100 "held_out_test_set": { 101 "applies": true, 102 "answer": false, 103 "justification": "There is no separation between development and test data. The hyperparameters K and λ are tuned on the same evaluation data used for the main results. The validation against external leaderboards (Chatbot Arena, GSM8K) serves as external validation but not as a held-out test set." 104 }, 105 "per_category_breakdown": { 106 "applies": true, 107 "answer": true, 108 "justification": "Table 1 provides per-task breakdowns across four tasks (Understanding, Reasoning, Writing, Coding) with separate Elo ratings and rankings for each. Figure 6 shows per-task pairwise win rate heatmaps." 109 }, 110 "failure_cases_discussed": { 111 "applies": true, 112 "answer": true, 113 "justification": "Appendix E.4 presents extensive case studies of failure cases, including five GPT-4-Turbo counterexamples (Tables 27-31) in categories such as algorithm explanation, instruction comprehension, code constraint, and instruction adherence. Table 9 summarizes model weaknesses." 
114 }, 115 "negative_results_reported": { 116 "applies": true, 117 "answer": true, 118 "justification": "The paper reports where alternative methods fail (Tables 2 and 8), discusses discrepancies between their rankings and established leaderboards (Section 4.3), and shows that without diversity (λ=0), instructions cluster around narrow themes (Section 4.5)." 119 } 120 }, 121 "claims_and_evidence": { 122 "abstract_claims_supported": { 123 "applies": true, 124 "answer": true, 125 "justification": "The abstract claims the method 'recovers gold-standard model rankings with a handful of MAD-selected instructions.' This is supported by Spearman's ρ = 0.965 with Chatbot Arena (Table 7), ρ = 0.993 with MATH500 (Table 4), and ρ = 0.986 with Chatbot Arena Conversations (Table 5)." 126 }, 127 "causal_claims_justified": { 128 "applies": true, 129 "answer": true, 130 "justification": "The paper makes implicit causal claims about MAD Competition promoting sample efficiency. These are supported by ablation studies (Section 4.5) that systematically vary components (discrepancy measure, K, λ) and demonstrate their contribution to performance." 131 }, 132 "generalization_bounded": { 133 "applies": true, 134 "answer": true, 135 "justification": "Claims are generally bounded to the tested settings. The abstract specifies 'eight widely used LLMs across four tasks.' The Limitations section acknowledges that the method 'may become computationally prohibitive' at larger scales, and Section 4.6 notes that 'underlying data distributions may differ between our curated instruction set and the Chatbot Arena benchmark.'" 136 }, 137 "alternative_explanations_discussed": { 138 "applies": true, 139 "answer": true, 140 "justification": "Section 4.4 compares against 5 alternative sampling strategies and analyzes why they fail (KL divergence concentrates on poetry, cross-entropy favors academic tasks). 
Section 4.3 discusses why leaderboard discrepancies arise (e.g., WizardLM-13B's strong instruction-following on AlpacaEval)." 141 }, 142 "proxy_outcome_distinction": { 143 "applies": true, 144 "answer": true, 145 "justification": "The paper acknowledges that 'underlying data distributions may differ between our curated instruction set and the Chatbot Arena benchmark, this single comparison may not fully attest to real-world robustness' (Section 4.6). It validates against multiple external gold standards to assess the proxy-outcome gap." 146 } 147 }, 148 "setup_transparency": { 149 "model_versions_specified": { 150 "applies": true, 151 "answer": true, 152 "justification": "Appendix C specifies exact model versions: 'GPT-4-1106-preview', 'GPT-3.5-Turbo-1106', 'Gemini-1.0-Pro', 'WizardLM-13B-V1.2', 'Vicuna-13B-V1.5'. The 20-model experiment (Table 7) also uses versioned names like 'GPT-4o-2024-05-13'." 153 }, 154 "prompts_provided": { 155 "applies": true, 156 "answer": true, 157 "justification": "Full prompt texts are provided in the appendix: instruction evolution prompts (Tables 10-13), LLM-based evaluator prompts (Tables 14-17), and the GPT-4-Turbo semantic similarity prompt (Table 20)." 158 }, 159 "hyperparameters_reported": { 160 "applies": true, 161 "answer": true, 162 "justification": "Appendix C: temperature = 0.7, top-p = 1.0, max sequence length = 2,048 (1,024 for Qwen). Appendix A: Elo parameters τ = 400, η = 4. Section 4.5: λ = 1.0, K = 10." 163 }, 164 "scaffolding_described": { 165 "applies": false, 166 "answer": false, 167 "justification": "No agentic scaffolding is used. The method directly queries LLMs for responses and applies MAD selection." 
168 }, 169 "data_preprocessing_documented": { 170 "applies": true, 171 "answer": true, 172 "justification": "Section 4.1 and Appendix B describe the instruction pool construction in detail: 3K seed instructions per task from established datasets, 10 rounds of automated evolution using 3 closed-source models, yielding 30K evolved instructions per task (120K total)." 173 } 174 }, 175 "limitations_and_scope": { 176 "limitations_section_present": { 177 "applies": true, 178 "answer": true, 179 "justification": "A dedicated 'Limitations' section follows the Conclusion, discussing three specific limitations: brute-force search scalability, variable cognitive load on evaluators, and O(N²K) scaling for many models." 180 }, 181 "threats_to_validity_specific": { 182 "applies": true, 183 "answer": true, 184 "justification": "The Limitations section discusses specific threats: brute-force search 'may become computationally prohibitive as pools scale to millions,' variable cognitive load means 'certain pairs of responses may be inherently harder to judge,' and O(N²K) scaling 'may strain annotation budgets.' These are specific to the proposed method." 185 }, 186 "scope_boundaries_stated": { 187 "applies": true, 188 "answer": true, 189 "justification": "The paper states specific scope boundaries: 'this single comparison may not fully attest to real-world robustness' (Section 4.6), acknowledges that 'developing fully automated pipelines remains an open challenge,' and notes cognitive load and scalability limitations." 190 } 191 }, 192 "data_integrity": { 193 "raw_data_available": { 194 "applies": true, 195 "answer": false, 196 "justification": "The raw human annotation data (individual annotator judgments for 1,120 comparisons) is not stated as publicly available. Only code is explicitly released." 
197 }, 198 "data_collection_described": { 199 "applies": true, 200 "answer": true, 201 "justification": "Appendix B describes the instruction pool construction (seed collection, 10 rounds of evolution). Appendix D describes the human evaluation process in detail: evaluator selection, briefing, GUI design, annotation protocol." 202 }, 203 "recruitment_methods_described": { 204 "applies": true, 205 "answer": true, 206 "justification": "Appendix D.1 describes evaluator selection criteria: graduate-level STEM training, language proficiency requirements (NCEE ≥125, CET-6 ≥500, or native English), disciplinary foundation, and Python proficiency. Appendix D.4 states 'Thirteen graduate students with strong STEM backgrounds are recruited.'" 207 }, 208 "data_pipeline_documented": { 209 "applies": true, 210 "answer": true, 211 "justification": "The full pipeline is documented: seed instructions → instruction evolution (Appendix B) → LLM response generation → MAD selection (Algorithm 1) → human 3-AFC annotation (Appendix D) → Elo rating with bootstrapping (Section 3.3)." 212 } 213 }, 214 "conflicts_of_interest": { 215 "funding_disclosed": { 216 "applies": true, 217 "answer": true, 218 "justification": "The Acknowledgements section lists funding: National Natural Science Foundation of China (62301480 and 62302433), Zhejiang Provincial programs (2025C01097 and 2024C01135), and Hangzhou West Lake Pearl Project (TD2023017)." 219 }, 220 "affiliations_disclosed": { 221 "applies": true, 222 "answer": true, 223 "justification": "Author affiliations are clearly listed: Zhejiang University, ZJU-Hangzhou Global Scientific and Technological Innovation Center, Shanghai Electric Group Co., City University of Hong Kong, and associated research centers." 
224 }, 225 "funder_independent_of_outcome": { 226 "applies": true, 227 "answer": true, 228 "justification": "Funding comes from Chinese government research grants (NSFC, provincial programs) which have no direct financial stake in the ranking outcomes of specific LLMs." 229 }, 230 "financial_interests_declared": { 231 "applies": true, 232 "answer": false, 233 "justification": "No competing interests or financial interests statement is present in the paper." 234 } 235 }, 236 "contamination": { 237 "training_cutoff_stated": { 238 "applies": true, 239 "answer": false, 240 "justification": "The paper does not state the training data cutoff dates for any of the 8 evaluated LLMs (GPT-4-Turbo, GPT-3.5-Turbo, Gemini-Pro, or the open-source models)." 241 }, 242 "train_test_overlap_discussed": { 243 "applies": true, 244 "answer": false, 245 "justification": "The instruction pool is evolved from GSM8K, CAMEL, AlpacaEval, and CodeAlpaca — all publicly available before the models' training cutoffs. No discussion of whether evolved instructions or their source benchmarks overlap with model training data." 246 }, 247 "benchmark_contamination_addressed": { 248 "applies": true, 249 "answer": false, 250 "justification": "GSM8K (2021), CAMEL, AlpacaEval, and CodeAlpaca were all published before the evaluated models' training periods. The evolved instructions derive from these public benchmarks. No contamination analysis or discussion is provided." 251 } 252 }, 253 "human_studies": { 254 "pre_registered": { 255 "applies": true, 256 "answer": false, 257 "justification": "No pre-registration is mentioned. No link to OSF, AsPredicted, or any pre-registration platform." 258 }, 259 "irb_or_ethics_approval": { 260 "applies": true, 261 "answer": false, 262 "justification": "No IRB or ethics board approval is mentioned. Appendix D.2 describes informed consent procedures but does not reference any ethics review." 
263 }, 264 "demographics_reported": { 265 "applies": true, 266 "answer": false, 267 "justification": "Evaluators are described only as 'Thirteen graduate students with strong STEM backgrounds' (Appendix D.4). Selection criteria are detailed (D.1) but actual demographics (gender, age, years of experience, geographic distribution) are not reported." 268 }, 269 "inclusion_exclusion_criteria": { 270 "applies": true, 271 "answer": true, 272 "justification": "Appendix D.1 provides detailed inclusion criteria: language proficiency (NCEE English ≥125 and CET-6 ≥500, or native English), high-school mastery of STEM subjects, professional Python proficiency, and commitment to 2+ hour sessions." 273 }, 274 "randomization_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "This is an annotation study, not an experimental study with treatment/control conditions. All evaluators perform the same 3-AFC task; there is no randomization to different conditions." 278 }, 279 "blinding_described": { 280 "applies": true, 281 "answer": true, 282 "justification": "Figure 5 shows the evaluation GUI where responses are presented as 'A' and 'B' without revealing model identities. Evaluators select 'A win,' 'Tie,' or 'B win,' with no information about which model generated which response." 283 }, 284 "attrition_reported": { 285 "applies": true, 286 "answer": false, 287 "justification": "The paper reports 13 evaluators were recruited (Appendix D.4) but does not mention whether all completed their assigned annotations or if any dropped out." 288 } 289 }, 290 "cost_and_practicality": { 291 "inference_cost_reported": { 292 "applies": true, 293 "answer": false, 294 "justification": "No API costs, tokens consumed, or per-comparison costs are reported for running 8 LLMs on 120K instructions across 4 tasks, or for the human annotation effort." 
295 }, 296 "compute_budget_stated": { 297 "applies": true, 298 "answer": false, 299 "justification": "The paper mentions 'two NVIDIA GeForce RTX 4090 GPUs' (Appendix C) for open-source models but does not report total GPU hours, API spend, training time, or annotation costs." 300 } 301 }, 302 "experimental_rigor": { 303 "seed_sensitivity_reported": { 304 "applies": true, 305 "answer": false, 306 "justification": "Model responses are generated with temperature 0.7 (stochastic), but the paper does not report whether different random seeds produce different MAD selections or rankings. For random sampling, 3 seeds are tested (Table 5), but MAD Competition's seed sensitivity is not assessed." 307 }, 308 "number_of_runs_stated": { 309 "applies": true, 310 "answer": true, 311 "justification": "The paper states '1000 bootstrap datasets' for Elo aggregation (Section 3.3), 'at least five different students' per annotation pair (Appendix D.4), and 'three trials with seeds 657, 216, and 849' for random sampling comparison (Section 4.6)." 312 }, 313 "hyperparameter_search_budget": { 314 "applies": true, 315 "answer": true, 316 "justification": "Section 4.5 explicitly reports the configurations tested: λ ∈ {0, 0.5, 1.0, 1.5, 2.0} for diversity weight, K from 1 to 9 for sample size, and 3 semantic discrepancy measures (BERTScore, GPT-4-Turbo, text-embedding-ada-002)." 317 }, 318 "best_config_selection_justified": { 319 "applies": true, 320 "answer": true, 321 "justification": "Section 4.5 justifies K=10 (correlations exceed 0.95 for K≥5 and reach 1.0 for K≥8) and λ=1.0 (intermediate value balances discrepancy vs. diversity; λ≤0.5 clusters around narrow themes, λ=2.0 inflates ties)." 
322 }, 323 "multiple_comparison_correction": { 324 "applies": true, 325 "answer": false, 326 "justification": "The paper reports multiple Spearman's ρ correlations with p-values across different experimental conditions (Tables 4, 5, 7) but does not apply any correction for multiple comparisons (Bonferroni, Holm, etc.)." 327 }, 328 "self_comparison_bias_addressed": { 329 "applies": true, 330 "answer": false, 331 "justification": "The authors propose MAD Competition and compare it against their own implementations of 5 alternative sampling strategies. They do not acknowledge or discuss the potential bias of benchmarking their own method against baselines that they implemented themselves." 332 }, 333 "compute_budget_vs_performance": { 334 "applies": true, 335 "answer": false, 336 "justification": "DiffUse uses 630 instructions per experiment while MAD Competition uses 210, yet the paper does not present a formal compute-budget-vs-performance analysis. Table 5 compares at matched sample sizes (1K comparisons) for one experiment, but this is not systematic." 337 }, 338 "benchmark_construct_validity": { 339 "applies": true, 340 "answer": true, 341 "justification": "Section 4.3 discusses whether the evaluation actually measures what is claimed by comparing against multiple gold standards (Chatbot Arena, AlpacaEval-2.0, CompassRank) and analyzing why discrepancies arise (e.g., AlpacaEval's emphasis on instruction-following vs. human preferences)." 342 }, 343 "scaffold_confound_addressed": { 344 "applies": false, 345 "answer": false, 346 "justification": "No scaffolding is involved. LLMs are directly queried for responses without agentic scaffolding." 347 } 348 }, 349 "data_leakage": { 350 "temporal_leakage_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "The instruction pool is evolved from GSM8K (2021), CAMEL, AlpacaEval, and CodeAlpaca — all published before the models' training periods. 
No discussion of whether models may have seen these benchmark problems or their variants during training." 354 }, 355 "feature_leakage_addressed": { 356 "applies": true, 357 "answer": false, 358 "justification": "No discussion of whether the evaluation setup leaks answer information. For reasoning tasks, evolved instructions include exemplar answers (Appendix B), but it's unclear if this information affects the evaluation." 359 }, 360 "non_independence_addressed": { 361 "applies": true, 362 "answer": false, 363 "justification": "No discussion of whether evolved instructions share structural similarities with model training data, or whether the 10-round evolution process produces near-duplicates." 364 }, 365 "leakage_detection_method": { 366 "applies": true, 367 "answer": false, 368 "justification": "No leakage detection or prevention method is used. No canary strings, membership inference, temporal splits, or decontamination pipelines are employed." 369 } 370 } 371 }, 372 "claims": [ 373 { 374 "claim": "MAD Competition recovers gold-standard model rankings with only K=10 instructions per model pair", 375 "evidence": "Table 2 shows MAD Competition best matches the GSM8K golden ranking among 6 sampling strategies. Table 4 shows Spearman's ρ = 0.993 (p = 2.17e-13) with MATH500. Table 5 shows ρ = 0.986 with Chatbot Arena Conversations using 1K MAD-selected comparisons vs. ρ = 0.791 for random 1K.", 376 "supported": "strong" 377 }, 378 { 379 "claim": "Reliable rankings emerge from as few as 5 MAD-selected instructions per model pair", 380 "evidence": "Figure 4 shows Spearman's ρ exceeds 0.95 for K ≥ 5 and reaches 1.0 for K ≥ 8 across all four tasks.", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "MAD Competition significantly outperforms random sampling and four alternative strategies in ranking fidelity", 385 "evidence": "Tables 2 and 8 show MAD Competition produces the smallest ranking discrepancies from gold standards. 
Random, KL, cross-entropy, Anchor Points, and DiffUse all show notable ranking errors. Section 4.6: MAD ρ = 0.986 vs. random mean ρ = 0.791 (std = 0.069).", 386 "supported": "strong" 387 }, 388 { 389 "claim": "LLM-based evaluators (GPT-4o) can substitute for human evaluators with Spearman's ρ > 0.95", 390 "evidence": "Table 6 shows Spearman's ρ between GPT-4o and human rankings exceeds 0.95 across all four tasks and the overall ranking.", 391 "supported": "moderate" 392 }, 393 { 394 "claim": "The method scales to 20 LLMs with strong agreement (ρ = 0.965) with Chatbot Arena", 395 "evidence": "Table 7 shows the 20-model ranking with Spearman's ρ = 0.965 (p = 5.93e-12) against Chatbot Arena. However, this uses LLM-based evaluation (GPT-4o), not human evaluation.", 396 "supported": "moderate" 397 }, 398 { 399 "claim": "The diversity term in MAD Competition prevents narrow clustering and ensures broader task coverage", 400 "evidence": "Section 4.5: λ ≤ 0.5 causes clustering around a few themes (e.g., poetry), while λ = 1.0 produces diverse instruction sets. Tables 18-19 compare selected instructions with and without diversity.", 401 "supported": "moderate" 402 } 403 ], 404 "methodology_tags": ["benchmark-eval"], 405 "key_findings": "The paper introduces MAD Competition, a sample-efficient evaluation method that adaptively selects maximally discriminative instructions for pairwise LLM comparison. With only 280 human comparisons per task (K=10 per model pair for 8 models; 1,120 comparisons across the four tasks), the method achieves near-perfect rank correlation (ρ = 0.965-0.993) with established leaderboards including Chatbot Arena. The diversity-aware instruction selection explicitly prevents narrow clustering and outperforms five alternative sampling strategies. 
The method extends to 20 models and generalizes across external datasets (MATH500, Chatbot Arena Conversations).", 406 "red_flags": [ 407 { 408 "flag": "Circular evaluation design for reasoning task", 409 "detail": "The mathematical reasoning instruction pool is derived from GSM8K via instruction evolution, and the 'gold standard' for validating rankings is GSM8K accuracy itself. This circularity may inflate the reported agreement between MAD-selected rankings and the gold standard." 410 }, 411 { 412 "flag": "Complete absence of contamination analysis", 413 "detail": "All seed datasets (GSM8K, CAMEL, AlpacaEval, CodeAlpaca) were publicly available before the evaluated models' training periods. The evolved instructions derive from these. No analysis of whether models have seen these benchmarks or similar problems in training, which could undermine the validity of the discrepancy-based selection." 414 }, 415 { 416 "flag": "Small evaluator pool", 417 "detail": "Only 13 STEM postgraduates from Chinese universities serve as evaluators for all 1,120 comparisons, with inter-annotator agreement of 83.39%. This narrow demographic may not represent the broader user population, and the small pool limits generalizability of the human preference signal." 418 }, 419 { 420 "flag": "No uncertainty quantification on main results", 421 "detail": "Despite performing 1000 bootstrap resamples for Elo aggregation, no confidence intervals, error bars, or standard deviations are reported in any results table. The reader cannot assess the stability of the reported rankings." 422 }, 423 { 424 "flag": "Scaling claim based on LLM-based evaluation, not human evaluation", 425 "detail": "The 20-model experiment (Table 7) uses GPT-4o as judge, not human evaluators. The scaling claim is predicated on LLM-based evaluation, which introduces the very biases the paper argues human evaluation should address." 
426 } 427 ], 428 "cited_papers": [ 429 { 430 "title": "Chatbot Arena: An open platform for evaluating LLMs by human preference", 431 "authors": ["Wei-Lin Chiang", "Lianmin Zheng", "Ying Sheng"], 432 "year": 2024, 433 "arxiv_id": "2403.04132", 434 "relevance": "Major LLM evaluation platform using Elo-based human preference ranking; serves as gold-standard comparison for the proposed method." 435 }, 436 { 437 "title": "Judging LLM-as-a-judge with MT-Bench and Chatbot Arena", 438 "authors": ["Lianmin Zheng", "Wei-Lin Chiang", "Ying Sheng"], 439 "year": 2023, 440 "arxiv_id": "2306.05685", 441 "relevance": "Foundational work on using LLMs as evaluation judges and MT-Bench as a conversation benchmark." 442 }, 443 { 444 "title": "Humans or LLMs as the judge? A study on judgement biases", 445 "authors": ["Guiming Hardy Chen", "Shunian Chen", "Ziche Liu"], 446 "year": 2024, 447 "arxiv_id": "2402.10669", 448 "relevance": "Documents biases in LLM-based evaluation including position, format, verbosity, and self-enhancement bias." 449 }, 450 { 451 "title": "A survey on evaluation of large language models", 452 "authors": ["Yupeng Chang", "Xu Wang", "Jindong Wang"], 453 "year": 2024, 454 "relevance": "Comprehensive survey of LLM evaluation methods spanning benchmarks, automated evaluation, and human assessment." 455 }, 456 { 457 "title": "GPT-4 technical report", 458 "authors": ["OpenAI"], 459 "year": 2023, 460 "arxiv_id": "2303.08774", 461 "relevance": "Technical report for GPT-4, one of the primary models evaluated and used as evaluation infrastructure in this study." 462 }, 463 { 464 "title": "AlpacaFarm: A simulation framework for methods that learn from human feedback", 465 "authors": ["Yann Dubois", "Xuechen Li", "Rohan Taori"], 466 "year": 2023, 467 "arxiv_id": "2305.14387", 468 "relevance": "Framework using GPT-4 as LLM-as-judge for evaluating instruction-following; source of evaluation methodology." 
469 }, 470 { 471 "title": "PandaLM: An automatic evaluation benchmark for LLM instruction tuning optimization", 472 "authors": ["Yidong Wang", "Zhuohao Yu", "Zhengran Zeng"], 473 "year": 2023, 474 "arxiv_id": "2306.05087", 475 "relevance": "Open-source LLM-based pairwise comparison evaluator, alternative to proprietary model judges." 476 }, 477 { 478 "title": "Prometheus: Inducing fine-grained evaluation capability in language models", 479 "authors": ["Seungone Kim", "Jamin Shin", "Yejin Cho"], 480 "year": 2023, 481 "relevance": "Open-source LLM evaluator with fine-grained scoring rubrics for customizable evaluation criteria." 482 }, 483 { 484 "title": "Training language models to follow instructions with human feedback", 485 "authors": ["Long Ouyang", "Jeffrey Wu", "Xu Jiang"], 486 "year": 2022, 487 "relevance": "RLHF methodology for training instruction-following LLMs; foundational work for the evaluated models." 488 }, 489 { 490 "title": "Label-efficient model selection for text generation", 491 "authors": ["Shir Ashury-Tahan", "Ariel Gera", "Benjamin Sznajder"], 492 "year": 2024, 493 "arxiv_id": "2402.07891", 494 "relevance": "DiffUse method for label-efficient LLM evaluation using embedding-difference clustering; baseline comparison." 495 }, 496 { 497 "title": "Anchor points: Benchmarking models with much fewer examples", 498 "authors": ["Rajan Vivek", "Kawin Ethayarajh", "Diyi Yang"], 499 "year": 2023, 500 "arxiv_id": "2309.08638", 501 "relevance": "K-Medoids-based sample selection for efficient LLM benchmarking; baseline comparison method." 502 }, 503 { 504 "title": "An in-depth look at Gemini's language abilities", 505 "authors": ["Syeda Nahida Akter", "Zichun Yu", "Aashiq Muhamed"], 506 "year": 2023, 507 "arxiv_id": "2312.11444", 508 "relevance": "Detailed evaluation of Gemini's capabilities including HumanEval pass@1 rates referenced in this paper." 509 } 510 ] 511 }