scan.json (29584B)
1 { 2 "paper": { 3 "title": "Grading Scale Impact on LLM-as-a-Judge: Human-LLM Alignment Is Highest on 0-5 Grading Scale", 4 "authors": [ 5 "Weiyue Li", 6 "Minda Zhao", 7 "Weixuan Dong", 8 "Jiahui Cai", 9 "Yuze Wei", 10 "Michael Pocress", 11 "Yi Li", 12 "Wanyan Yuan", 13 "Xiaoyue Wang", 14 "Ruoyu Hou", 15 "Kaiyuan Lou", 16 "Wenqi Zeng", 17 "Yutong Yang", 18 "Yilun Du", 19 "Mengyu Wang" 20 ], 21 "year": 2026, 22 "venue": "arXiv", 23 "arxiv_id": "2601.03444", 24 "doi": "10.48550/arXiv.2601.03444" 25 }, 26 "scan_version": 3, 27 "active_modules": ["experimental_rigor", "data_leakage"], 28 "checklist": { 29 "artifacts": { 30 "code_released": { 31 "applies": true, 32 "answer": false, 33 "justification": "No code repository, GitHub link, or archive URL is provided anywhere in the paper or appendices." 34 }, 35 "data_released": { 36 "applies": true, 37 "answer": false, 38 "justification": "The underlying benchmarks (STS-B, MT-Bench, ToxiGen, etc.) are public, but the paper's primary data contribution — the collected human and LLM ratings across scales — is not released or linked." 39 }, 40 "environment_specified": { 41 "applies": true, 42 "answer": false, 43 "justification": "No environment specifications, dependency files, or library versions are provided. The paper does not describe the software environment used for computing ICC values or running LLM APIs." 44 }, 45 "reproduction_instructions": { 46 "applies": true, 47 "answer": false, 48 "justification": "No reproduction instructions, scripts, or step-by-step guide are provided for replicating the experiments." 49 } 50 }, 51 "statistical_methodology": { 52 "confidence_intervals_or_error_bars": { 53 "applies": true, 54 "answer": false, 55 "justification": "All ICC and nMAE values in Tables 1-6 are reported as point estimates without confidence intervals or error bars." 
56 }, 57 "significance_tests": { 58 "applies": true, 59 "answer": false, 60 "justification": "The paper claims 0-5 is the best scale and 0-10 is consistently the weakest based on comparing raw ICC/nMAE values across tables, but no significance tests (e.g., bootstrap test on ICC differences) are performed to determine whether these differences are statistically meaningful." 61 }, 62 "effect_sizes_reported": { 63 "applies": true, 64 "answer": false, 65 "justification": "The paper reports raw ICC and nMAE values but does not report formal effect sizes. Differences between scales (e.g., ICC of 0.853 vs 0.805) are stated without quantifying effect magnitude using standard measures like Cohen's d." 66 }, 67 "sample_size_justified": { 68 "applies": true, 69 "answer": false, 70 "justification": "The choice of 150 items (25 per benchmark), 12 human raters, and 6 LLMs is not justified. No power analysis or sample size rationale is given for any of these design parameters." 71 }, 72 "variance_reported": { 73 "applies": true, 74 "answer": false, 75 "justification": "Main results are single-run ICC/nMAE point estimates. The temperature ablation (Table 6) shows results at four temperatures but only for two models and without reporting variance across temperatures." 76 } 77 }, 78 "evaluation_design": { 79 "baselines_included": { 80 "applies": true, 81 "answer": true, 82 "justification": "The study compares three grading scales (0-5, 0-10, 0-100) against each other and also contrasts their ICC approach against Pearson correlation (used by Lee et al. 2025), as discussed in Section 5.4 and Figure 2." 83 }, 84 "baselines_contemporary": { 85 "applies": true, 86 "answer": true, 87 "justification": "The comparison with Lee et al. (2025) Pearson correlation approach and the use of contemporary LLM models (GPT-4o, Gemini-2.5-flash, DeepSeek-v3.2) makes the baselines current." 
88 }, 89 "ablation_study": { 90 "applies": true, 91 "answer": true, 92 "justification": "Section 5.3 presents a temperature ablation study testing robustness across T ∈ {0.1, 0.4, 0.7, 1.0} for two models, verifying that scale ordering is not an artifact of decoding strategy." 93 }, 94 "multiple_metrics": { 95 "applies": true, 96 "answer": true, 97 "justification": "The paper uses both ICC (absolute agreement) and nMAE (normalized mean absolute error) throughout, and explicitly discusses why both are informative (Section 5.4, Figure 2)." 98 }, 99 "human_evaluation": { 100 "applies": true, 101 "answer": true, 102 "justification": "The entire study centers on human evaluation — 12 graduate student annotators provide ratings across all items and scales in a fully crossed design (Sections 3.2, 4.2)." 103 }, 104 "held_out_test_set": { 105 "applies": true, 106 "answer": false, 107 "justification": "No train/test split is employed. The same 150 items are used for all analyses with no separation for tuning vs. evaluation, though the study does not involve model training or tuning." 108 }, 109 "per_category_breakdown": { 110 "applies": true, 111 "answer": true, 112 "justification": "Extensive per-benchmark breakdowns are provided in Tables 1, 3, 4a, and Appendix Tables 14-15, showing reliability and agreement for each of the six benchmarks separately." 113 }, 114 "failure_cases_discussed": { 115 "applies": true, 116 "answer": true, 117 "justification": "Section 5.4 provides error analysis with representative poorly-aligned cases (Table 16) and well-aligned cases (Table 17), analyzing why specific examples lead to high/low agreement." 118 }, 119 "negative_results_reported": { 120 "applies": true, 121 "answer": true, 122 "justification": "The paper reports that on MT-Bench, 0-10 actually outperforms 0-5 for human-LLM agreement (Table 4a: ICC 0.570 vs 0.517), contradicting the overall finding. 
The paper also reports that LLM panel reliability drops sharply on subjective benchmarks." 123 } 124 }, 125 "claims_and_evidence": { 126 "abstract_claims_supported": { 127 "applies": true, 128 "answer": true, 129 "justification": "Abstract claims about 0-5 maximizing agreement (Table 2), scale-dependent consistency drops on subjective benchmarks (Table 1), and gender subgroup differences (Table 5) are all supported by the results." 130 }, 131 "causal_claims_justified": { 132 "applies": true, 133 "answer": true, 134 "justification": "The paper claims scale choice 'shifts' human-LLM agreement. The fully crossed within-subjects design (same raters, same items, different scales, randomized order) with session spacing of ~1 week adequately supports this causal claim." 135 }, 136 "generalization_bounded": { 137 "applies": true, 138 "answer": false, 139 "justification": "The title 'Human-LLM Alignment Is Highest on 0-5 Grading Scale' presents a general finding, but the evidence comes from only 12 graduate student raters, 6 LLMs, and 6 benchmarks. While the limitations section acknowledges the graduate student sample, the title and abstract do not bound the claim to the tested setting." 140 }, 141 "alternative_explanations_discussed": { 142 "applies": true, 143 "answer": false, 144 "justification": "The paper does not substantively discuss alternative explanations for why 0-5 yields better alignment. Possible explanations (e.g., cognitive load, anchor point effects from psychometric literature, model tokenization artifacts) are not explored. The temperature ablation rules out one confounder but no others." 145 }, 146 "proxy_outcome_distinction": { 147 "applies": true, 148 "answer": true, 149 "justification": "The paper carefully distinguishes ICC from Pearson correlation (Figure 2, Section 5.4) and explains why ICC measures absolute agreement while correlation only measures relative ranking. The paper's claims match the granularity of its measurements." 
150 } 151 }, 152 "setup_transparency": { 153 "model_versions_specified": { 154 "applies": true, 155 "answer": false, 156 "justification": "The paper lists 'gpt-4o' and 'gemini-2.5-flash' without snapshot dates or API version identifiers. Per schema rules, marketing names like 'GPT-4o' without snapshot dates do not count as specified versions. Other models (Llama-3.3-70B-Instruct, Qwen3-32B, Mistral-7B-Instruct-v0.3) have more specific identifiers." 157 }, 158 "prompts_provided": { 159 "applies": true, 160 "answer": true, 161 "justification": "Full prompt templates for all six benchmarks are provided in Appendix C (Tables 8-13), including system messages and user instructions. Placeholders are for data items from public benchmarks and scale bounds, so prompts are fully reconstructable." 162 }, 163 "hyperparameters_reported": { 164 "applies": true, 165 "answer": true, 166 "justification": "Section 4.3 states: 'For all LLM judges, we set the default temperature to 1. We also use non-thinking mode for all models.' However, top-p, max tokens, and other API parameters are not reported." 167 }, 168 "scaffolding_described": { 169 "applies": false, 170 "answer": false, 171 "justification": "No agentic scaffolding is used. LLMs are prompted directly with a single judging template and return a score." 172 }, 173 "data_preprocessing_documented": { 174 "applies": true, 175 "answer": false, 176 "justification": "The paper states '150 items total, with 25 items sampled from each benchmark' but does not describe the sampling method (random, stratified, etc.). The linear mapping to [0,1] for inter-scale ICC is documented, but item selection criteria are not." 177 } 178 }, 179 "limitations_and_scope": { 180 "limitations_section_present": { 181 "applies": true, 182 "answer": true, 183 "justification": "A dedicated 'Limitations' section is present with substantive discussion of the graduate student sample, task difficulty, and generalizability constraints." 
184 }, 185 "threats_to_validity_specific": { 186 "applies": true, 187 "answer": true, 188 "justification": "The limitations section identifies specific threats: 'The human raters are all graduate students and thus do not represent the broader populations,' 'some items...are inherently difficult even for humans,' and 'uncertainty can lower agreement with LLMs and may also make ICC estimates sensitive to benchmark composition.'" 189 }, 190 "scope_boundaries_stated": { 191 "applies": true, 192 "answer": false, 193 "justification": "The limitations discuss factors that constrain generality but do not explicitly state what the results do NOT show or what claims the authors are NOT making. No equivalent of 'what the evidence does not show' statements." 194 } 195 }, 196 "data_integrity": { 197 "raw_data_available": { 198 "applies": true, 199 "answer": false, 200 "justification": "The raw rating data (individual human and LLM scores per item per scale) is not released or made available for independent verification." 201 }, 202 "data_collection_described": { 203 "applies": true, 204 "answer": true, 205 "justification": "Section 4.2 and Appendix D describe the data collection: Label Studio platform, randomized item/scale order, separate sessions spaced ~1 week apart, fractional scoring allowed, 12 annotators with informed consent." 206 }, 207 "recruitment_methods_described": { 208 "applies": true, 209 "answer": true, 210 "justification": "Appendix A states: 'Human annotators are recruited on a voluntary basis from graduate students across multiple institutions. Participation is entirely voluntary, and all annotators provide informed consent.'" 211 }, 212 "data_pipeline_documented": { 213 "applies": true, 214 "answer": false, 215 "justification": "The pipeline from raw ratings to final ICC values has unexplained steps. How were the 25 items per benchmark sampled? Were any annotations excluded? How were LLM outputs parsed? These steps are not documented." 
216 } 217 }, 218 "conflicts_of_interest": { 219 "funding_disclosed": { 220 "applies": true, 221 "answer": false, 222 "justification": "No funding source, grants, or sponsorship is mentioned anywhere in the paper. There is no acknowledgments section addressing funding." 223 }, 224 "affiliations_disclosed": { 225 "applies": true, 226 "answer": true, 227 "justification": "Author affiliations are listed: Harvard University, CMU, Stanford University, UC San Diego. The authors are not affiliated with the LLM companies whose models they evaluate." 228 }, 229 "funder_independent_of_outcome": { 230 "applies": true, 231 "answer": false, 232 "justification": "No funder is disclosed, so independence cannot be assessed. The absence of funding disclosure makes this NO." 233 }, 234 "financial_interests_declared": { 235 "applies": true, 236 "answer": false, 237 "justification": "No competing interests or financial interests statement is present in the paper." 238 } 239 }, 240 "contamination": { 241 "training_cutoff_stated": { 242 "applies": true, 243 "answer": false, 244 "justification": "No training data cutoff dates are stated for any of the six LLM judges. The models (GPT-4o, Gemini-2.5-flash, etc.) likely trained on data containing these benchmarks, but this is not discussed." 245 }, 246 "train_test_overlap_discussed": { 247 "applies": true, 248 "answer": false, 249 "justification": "The benchmarks used (STS-B from 2017, MT-Bench from 2023, etc.) predate the models and are widely available online. The potential that models memorized benchmark items or associated human ratings is not discussed." 250 }, 251 "benchmark_contamination_addressed": { 252 "applies": true, 253 "answer": false, 254 "justification": "All six benchmarks were published before the models' training periods (STS-B 2017, SummEval 2021, MT-Bench 2023, etc.). The risk that models have seen these items and associated evaluations during training is not addressed." 
255 } 256 }, 257 "human_studies": { 258 "pre_registered": { 259 "applies": true, 260 "answer": false, 261 "justification": "No pre-registration is mentioned. No link to OSF, AsPredicted, or any registry." 262 }, 263 "irb_or_ethics_approval": { 264 "applies": true, 265 "answer": false, 266 "justification": "No IRB or ethics board approval is mentioned despite involving human annotators exposed to potentially sensitive content (ToxiGen). The ethical considerations section discusses risks but does not mention IRB review." 267 }, 268 "demographics_reported": { 269 "applies": true, 270 "answer": true, 271 "justification": "Demographics reported: 12 graduate students, 6 female and 6 male, from multiple institutions (Harvard, CMU, Stanford, UC San Diego). Education level and gender distribution are specified." 272 }, 273 "inclusion_exclusion_criteria": { 274 "applies": true, 275 "answer": false, 276 "justification": "The only stated criterion is 'graduate students across multiple institutions' with voluntary participation. No formal inclusion/exclusion criteria, screening process, or eligibility requirements are described." 277 }, 278 "randomization_described": { 279 "applies": true, 280 "answer": true, 281 "justification": "Section 4.2 describes randomization: 'Human raters complete three rating blocks (one per scale) in randomized order, and within each block, item order is shuffled to reduce anchoring and fatigue effects.'" 282 }, 283 "blinding_described": { 284 "applies": false, 285 "answer": false, 286 "justification": "Blinding to the grading scale is not feasible — raters must know what scale they are using. This criterion does not apply." 287 }, 288 "attrition_reported": { 289 "applies": true, 290 "answer": false, 291 "justification": "No attrition or dropout information is reported. The paper states a 'fully crossed item-rater design' implying all 12 raters completed everything, but this is not explicitly confirmed." 
292 } 293 }, 294 "cost_and_practicality": { 295 "inference_cost_reported": { 296 "applies": true, 297 "answer": false, 298 "justification": "No API costs, token consumption, or per-item latency is reported for the LLM judging experiments, despite querying six models on all 150 items under each of the three scales." 299 }, 300 "compute_budget_stated": { 301 "applies": true, 302 "answer": false, 303 "justification": "No total computational budget, API spend, or hardware specifications are stated." 304 } 305 }, 306 "experimental_rigor": { 307 "seed_sensitivity_reported": { 308 "applies": true, 309 "answer": false, 310 "justification": "No random seed sensitivity analysis is performed. The temperature ablation (Section 5.3) explores sensitivity to a decoding parameter but not random seeds." 311 }, 312 "number_of_runs_stated": { 313 "applies": true, 314 "answer": false, 315 "justification": "The main results appear to be from a single run per model per scale. The number of runs is not explicitly stated. Only the temperature ablation section states multiple configurations." 316 }, 317 "hyperparameter_search_budget": { 318 "applies": true, 319 "answer": false, 320 "justification": "No hyperparameter search budget is reported. Temperature is set to 1 as 'default' without justifying this choice or reporting alternatives tried (the ablation is post-hoc for two models only)." 321 }, 322 "best_config_selection_justified": { 323 "applies": true, 324 "answer": true, 325 "justification": "All three scale configurations are reported and compared (Tables 2, 4a, 4b). The 'best' scale (0-5) emerges from comparing all configurations rather than selecting a single best result." 326 }, 327 "multiple_comparison_correction": { 328 "applies": true, 329 "answer": false, 330 "justification": "The paper makes numerous comparisons across 3 scales × 6 benchmarks × 6 models × 2 genders without any correction for multiple comparisons (Bonferroni, Holm, etc.)." 
331 }, 332 "self_comparison_bias_addressed": { 333 "applies": true, 334 "answer": false, 335 "justification": "The authors do not discuss potential bias from designing the evaluation framework and interpreting the results. No independent evaluation or acknowledgment of author-evaluation bias." 336 }, 337 "compute_budget_vs_performance": { 338 "applies": false, 339 "answer": false, 340 "justification": "The study measures agreement between raters, not model performance as a function of compute. Compute differences are not relevant to the research question." 341 }, 342 "benchmark_construct_validity": { 343 "applies": true, 344 "answer": false, 345 "justification": "The paper uses six established benchmarks without questioning whether they adequately test what the paper claims to evaluate — whether scale choice affects 'alignment.' No discussion of whether these benchmarks are representative of real LLM-as-a-judge use cases." 346 }, 347 "scaffold_confound_addressed": { 348 "applies": false, 349 "answer": false, 350 "justification": "No scaffolding is used. LLMs are prompted directly with judging templates." 351 } 352 }, 353 "data_leakage": { 354 "temporal_leakage_addressed": { 355 "applies": true, 356 "answer": false, 357 "justification": "All six benchmarks predate the evaluated models. The models may have seen benchmark items and associated ratings during training. This temporal concern is not addressed." 358 }, 359 "feature_leakage_addressed": { 360 "applies": true, 361 "answer": false, 362 "justification": "Not discussed. The evaluation prompts provide full task context to LLM judges, which is appropriate, but whether training exposure to these specific items creates implicit leakage is not considered." 363 }, 364 "non_independence_addressed": { 365 "applies": true, 366 "answer": false, 367 "justification": "Not discussed. 
Items from the same benchmark may share structural properties, and models may have been trained on these exact items, but independence is not analyzed." 368 }, 369 "leakage_detection_method": { 370 "applies": true, 371 "answer": false, 372 "justification": "No concrete leakage detection or prevention method is employed. No canary strings, membership inference, or decontamination is applied." 373 } 374 } 375 }, 376 "claims": [ 377 { 378 "claim": "The 0-5 grading scale yields the strongest pooled human-LLM absolute agreement (ICC = 0.853), while 0-10 is consistently the weakest (ICC = 0.805).", 379 "evidence": "Table 2 reports pooled ICC across all benchmarks: 0-5 (0.853), 0-10 (0.805), 0-100 (0.840). nMAE follows the same pattern: 0-5 (0.111), 0-10 (0.122), 0-100 (0.115).", 380 "supported": "moderate" 381 }, 382 { 383 "claim": "LLM inter-scale self-consistency drops substantially on subjective, open-ended quality benchmarks compared to objective-like benchmarks.", 384 "evidence": "Table 1 shows average 3-scale ICC of 0.944 (STS-B) and 0.949 (ToxiGen) vs. 
0.740 (MT-Bench) and 0.786 (SummEval).", 385 "supported": "strong" 386 }, 387 { 388 "claim": "Pooled LLM reliability can mask substantial benchmark heterogeneity — a 'reliability illusion.'", 389 "evidence": "Table 3 shows LLM panel ICC on 0-5 scale ranges from 0.573 (SummEval) to 0.969 (STS-B), while the pooled ICC in Table 2 is 0.944.", 390 "supported": "strong" 391 }, 392 { 393 "claim": "The scale ordering (0-5 best, 0-10 worst) is stable under temperature perturbations.", 394 "evidence": "Table 6 shows Llama and Gemini human-LLM ICC across T ∈ {0.1, 0.4, 0.7, 1.0}; the 0-5 scale is highest or competitive at all temperatures for both models.", 395 "supported": "moderate" 396 }, 397 { 398 "claim": "Both male and female rater groups show highest LLM alignment on the 0-5 scale, with the highest within-group reliability also on 0-5.", 399 "evidence": "Table 5 shows Male-LLM ICC: 0.839 (0-5), 0.751 (0-10), 0.832 (0-100); Female-LLM ICC: 0.831 (0-5), 0.805 (0-10), 0.818 (0-100). Within-group ICC is highest on 0-5 for both.", 400 "supported": "moderate" 401 }, 402 { 403 "claim": "GPT-4o achieves the strongest individual model human-LLM alignment across all scales.", 404 "evidence": "Table 4b shows GPT has the highest ICC at every scale: 0.816 (0-5), 0.760 (0-10), 0.810 (0-100).", 405 "supported": "moderate" 406 } 407 ], 408 "methodology_tags": ["benchmark-eval"], 409 "key_findings": "Across six benchmarks and six LLM judges, the 0-5 grading scale yields the highest pooled human-LLM absolute agreement (ICC = 0.853) while 0-10 is consistently the weakest. LLM inter-scale self-consistency drops sharply on subjective open-ended tasks (MT-Bench, SummEval) but remains high on objective-like tasks, revealing that pooled reliability statistics can mask substantial benchmark heterogeneity. 
The scale ordering is robust to temperature perturbations, and both gender subgroups in the annotator pool show highest alignment with LLMs on the 0-5 scale.", 410 "red_flags": [ 411 { 412 "flag": "Small annotator pool for broad claims", 413 "detail": "Only 12 graduate student annotators (from elite institutions: Harvard, CMU, Stanford, UCSD) are used to establish 'human-LLM alignment.' The title presents findings as general ('Human-LLM Alignment Is Highest on 0-5') despite this narrow, non-representative sample." 414 }, 415 { 416 "flag": "No significance testing on core claims", 417 "detail": "The central claim that 0-5 is the best scale is based on comparing raw ICC point estimates (0.853 vs 0.805 vs 0.840) without any significance tests or confidence intervals. The observed differences may not be statistically significant given the sample sizes." 418 }, 419 { 420 "flag": "Contamination risk unaddressed", 421 "detail": "All six benchmarks (STS-B 2017, ToxiGen 2022, SummEval 2021, etc.) were published well before the models' training periods. LLMs may have memorized benchmark items and associated quality judgments during training, which could inflate or distort agreement metrics." 422 }, 423 { 424 "flag": "Gender analysis from tiny subgroups", 425 "detail": "Gender-stratified analysis (Table 5) is based on only 6 raters per group. Drawing conclusions about gender differences in alignment from n=6 per group is underpowered and risks overinterpretation." 426 } 427 ], 428 "cited_papers": [ 429 { 430 "title": "Judging LLM-as-a-judge with MT-bench and Chatbot Arena", 431 "authors": ["Lianmin Zheng", "Wei-Lin Chiang", "Ying Sheng"], 432 "year": 2023, 433 "relevance": "Foundational LLM-as-a-judge framework and MT-Bench benchmark used in this study." 
434 }, 435 { 436 "title": "G-eval: NLG evaluation using GPT-4 with better human alignment", 437 "authors": ["Yang Liu", "Dan Iter", "Yichong Xu"], 438 "year": 2023, 439 "relevance": "LLM-based NLG evaluation method demonstrating GPT-4 alignment with human judgments." 440 }, 441 { 442 "title": "Evaluating the consistency of LLM evaluators", 443 "authors": ["Noah Lee", "Jiwoo Hong", "James Thorne"], 444 "year": 2025, 445 "relevance": "Directly related work on LLM judge consistency, uses Pearson correlations that this paper argues are inferior to ICC." 446 }, 447 { 448 "title": "Validating LLM-as-a-judge systems under rating indeterminacy", 449 "authors": ["Luke Guerdan", "Solon Barocas", "Kenneth Holstein"], 450 "year": 2025, 451 "arxiv_id": "2503.05965", 452 "relevance": "Examines rating indeterminacy in LLM evaluation, directly relevant to the scale sensitivity this paper studies." 453 }, 454 { 455 "title": "ChatEval: Towards better LLM-based evaluators through multi-agent debate", 456 "authors": ["Chi-Min Chan", "Weize Chen", "Yusheng Su"], 457 "year": 2023, 458 "arxiv_id": "2308.07201", 459 "relevance": "Multi-agent evaluation framework for improving LLM judge quality." 460 }, 461 { 462 "title": "Agent-as-a-judge: Evaluate agents with agents", 463 "authors": ["Mingchen Zhuge", "Changsheng Zhao", "Dylan Ashley"], 464 "year": 2024, 465 "arxiv_id": "2410.10934", 466 "relevance": "Agent-based evaluation paradigm extending LLM-as-a-judge to agentic systems." 467 }, 468 { 469 "title": "From generation to judgment: Opportunities and challenges of LLM-as-a-judge", 470 "authors": ["Dawei Li", "Bohan Jiang", "Liangjie Huang"], 471 "year": 2025, 472 "relevance": "Comprehensive survey of LLM-as-a-judge covering bias, consistency, and alignment challenges." 
473 }, 474 { 475 "title": "Are we on the right way to assessing LLM-as-a-judge?", 476 "authors": ["Yuanning Feng", "Sinan Wang", "Zhengxiang Cheng"], 477 "year": 2025, 478 "arxiv_id": "2512.16041", 479 "relevance": "Reveals significant reliability problems in top-tier LLM judges under pairwise scoring and transitivity checks." 480 }, 481 { 482 "title": "Investigating non-transitivity in LLM-as-a-judge", 483 "authors": ["Yi Xu", "Laura Ruis", "Tim Rocktäschel"], 484 "year": 2025, 485 "arxiv_id": "2502.14074", 486 "relevance": "Studies non-transitive preference cycles in LLM judge comparisons, complementary consistency concern." 487 }, 488 { 489 "title": "RocketEval: Efficient automated LLM evaluation via grading checklist", 490 "authors": ["Tianjun Wei", "Wei Wen", "Ruizhi Qiao"], 491 "year": 2025, 492 "arxiv_id": "2503.05142", 493 "relevance": "Checklist-based LLM evaluation framework aiming to reduce bias and improve robustness." 494 }, 495 { 496 "title": "GPTScore: Evaluate as you desire", 497 "authors": ["Jinlan Fu", "See Kiong Ng", "Zhengbao Jiang"], 498 "year": 2024, 499 "relevance": "LLM scoring framework for evaluation that this paper benchmarks against in terms of human alignment methodology." 500 }, 501 { 502 "title": "Benchmarking cognitive biases in large language models as evaluators", 503 "authors": ["Ryan Koo", "Minhwa Lee", "Vipul Raheja"], 504 "year": 2024, 505 "relevance": "Studies systematic cognitive biases in LLM evaluators including position and verbosity bias." 506 } 507 ], 508 "engagement_factors": { 509 "practical_relevance": { 510 "score": 2, 511 "justification": "Practitioners using LLM-as-a-judge can immediately adopt the recommendation to use 0-5 scales over 0-10 or 0-100." 512 }, 513 "surprise_contrarian": { 514 "score": 1, 515 "justification": "The finding that scale matters is somewhat surprising to those who assumed it was neutral, but not deeply contrarian." 
516 }, 517 "fear_safety": { 518 "score": 0, 519 "justification": "No AI safety or security concerns are raised by this work." 520 }, 521 "drama_conflict": { 522 "score": 0, 523 "justification": "No controversy or conflict angle; this is a straightforward psychometric study." 524 }, 525 "demo_ability": { 526 "score": 0, 527 "justification": "No code, tool, or demo is released." 528 }, 529 "brand_recognition": { 530 "score": 1, 531 "justification": "Evaluates well-known models (GPT-4o, Gemini) and comes from recognizable institutions (Harvard, Stanford, CMU) but is not from a major AI lab." 532 } 533 } 534 }