scan.json (28257B)
1 { 2 "paper": { 3 "title": "Can LLMs Replace Human Evaluators? An Empirical Study of LLM-as-a-Judge in Software Engineering", 4 "authors": ["Ruiqi Wang", "Jiyu Guo", "Cuiyun Gao", "Guodong Fan", "Chun Yong Chong", "Xin Xia"], 5 "year": 2025, 6 "venue": "Proc. ACM Softw. Eng. (ISSTA)", 7 "arxiv_id": "2502.06193", 8 "doi": "10.1145/3728963" 9 }, 10 "checklist": { 11 "artifacts": { 12 "code_released": { 13 "applies": true, 14 "answer": true, 15 "justification": "The paper states 'Our source code and data is publicly available at [46]' (Section Data Availability) and provides a GitHub URL: https://github.com/BackOnTruck/llm-judge-empirical." 16 }, 17 "data_released": { 18 "applies": true, 19 "answer": true, 20 "justification": "The replication package at [46] is described as containing both source code and data. The paper also uses publicly available datasets (CodeTransOcean, CodeXGLUE, ComplexCodeEval) and describes curating a meta-evaluation dataset of 450 scored responses that is included in the package." 21 }, 22 "environment_specified": { 23 "applies": true, 24 "answer": false, 25 "justification": "The paper mentions 'an Ubuntu 20.04 server with two Intel Xeon Platinum 8276L CPUs, four NVIDIA A100-40GB GPUs, and 256 GB RAM' and that vLLM was used, but there is no requirements.txt, Dockerfile, or detailed dependency version listing in the paper. Hardware is described but software environment is not specified with enough detail to recreate it." 26 }, 27 "reproduction_instructions": { 28 "applies": true, 29 "answer": false, 30 "justification": "The paper describes the methodology in detail (datasets, model selection, scoring procedure) but does not provide step-by-step reproduction instructions such as commands to run or a README with a reproduction guide within the paper itself. The repository may contain these, but the paper does not describe them." 
31 } 32 }, 33 "statistical_methodology": { 34 "confidence_intervals_or_error_bars": { 35 "applies": true, 36 "answer": false, 37 "justification": "The paper reports correlation coefficients (Spearman's rho, Pearson's R, Kendall's tau) as point estimates throughout Tables 3 and 6. No confidence intervals or error bars are provided for any of these metrics." 38 }, 39 "significance_tests": { 40 "applies": true, 41 "answer": true, 42 "justification": "Section 4.3 states 'We also check if the p-value of each correlation coefficient in RQ1 is smaller than 0.05 to ensure a 95% confidence interval.' Table 3 marks coefficients with p > 0.05 in red." 43 }, 44 "effect_sizes_reported": { 45 "applies": true, 46 "answer": true, 47 "justification": "The paper reports correlation coefficients themselves as effect size measures (e.g., R = 81.32, rho = 73.67) and provides absolute differences between methods (e.g., 'Δ R = 2.21, 6.03, 3.04 at maximum' for inference strategies). The correlations with baselines provide clear context for the magnitude of improvement." 48 }, 49 "sample_size_justified": { 50 "applies": true, 51 "answer": false, 52 "justification": "The sample of 50 instructions per task (150 total, 450 responses) is not justified with a power analysis or explicit rationale for why 50 is sufficient. No discussion of whether the sample size provides adequate statistical power for the correlation analyses." 53 }, 54 "variance_reported": { 55 "applies": true, 56 "answer": false, 57 "justification": "The paper does not report variance or standard deviation across experimental runs. G-Eval uses 20 inference passes and averages them, BatchEval uses 5 rounds, but no variance across these runs is reported. Results appear to be single-run numbers for most methods." 
58 } 59 }, 60 "evaluation_design": { 61 "baselines_included": { 62 "applies": true, 63 "answer": true, 64 "justification": "The paper includes five conventional metrics as baselines (BLEU, ROUGE-L, METEOR, ChrF++, CrystalBLEU) and compares them against all LLM-as-a-judge methods in Table 3." 65 }, 66 "baselines_contemporary": { 67 "applies": true, 68 "answer": true, 69 "justification": "The conventional metrics represent the standard evaluation practice in SE. The LLM-as-a-judge methods include recent approaches (G-Eval 2023, BatchEval 2024, Prometheus 2 2024) and use GPT-4o (2024-08-06 version). The baselines are appropriate for the comparison being made." 70 }, 71 "ablation_study": { 72 "applies": true, 73 "answer": true, 74 "justification": "The study design effectively ablates multiple dimensions: method category (embedding, probability, output-based), LLM size (small <50B vs large >100B in Table 5), inference strategy (vanilla vs G-Eval vs BatchEval), and SFT vs non-SFT. RQ1-RQ3 systematically vary these components." 75 }, 76 "multiple_metrics": { 77 "applies": true, 78 "answer": true, 79 "justification": "Three correlation metrics are used for individual scoring: Spearman's rho, Pearson's R, and Kendall's tau (Table 3). For pairwise comparison, Accuracy and Agreement are both reported (Table 6)." 80 }, 81 "human_evaluation": { 82 "applies": true, 83 "answer": true, 84 "justification": "Human evaluation is central to the study. Two human evaluators scored all 450 responses on two aspects plus an overall score. Inter-annotator agreement is reported (footnote 10): Spearman's rho of 83.07, 75.42, 74.20 across tasks." 85 }, 86 "held_out_test_set": { 87 "applies": true, 88 "answer": true, 89 "justification": "The paper uses test sets from existing benchmarks: 'When training, validation, and test sets are available, we only adopt the test set for our evaluation' (Section 4.1.1)." 
90 }, 91 "per_category_breakdown": { 92 "applies": true, 93 "answer": true, 94 "justification": "Results are broken down by task (code translation, code generation, code summarization) in Tables 3 and 6, by method category (Table 4), and by LLM size (Table 5). This provides granular analysis." 95 }, 96 "failure_cases_discussed": { 97 "applies": true, 98 "answer": true, 99 "justification": "Section 6.1 presents a detailed case study of a failing case (code summarization) where GPT-4o assigns a perfect score while the human score is 2.75, with analysis of why the LLM failed (verbosity bias). The paper also discusses failure modes across tasks." 100 }, 101 "negative_results_reported": { 102 "applies": true, 103 "answer": true, 104 "justification": "The paper reports substantial negative results: LLM-as-a-judge methods 'completely defeated by conventional metrics' in code summarization (Section 5.1), pairwise comparison 'fail to deliver satisfactory and consistent comparison performance' (Finding 5), inference strategies provide only 'marginal improvement' (Section 5.1)." 105 } 106 }, 107 "claims_and_evidence": { 108 "abstract_claims_supported": { 109 "applies": true, 110 "answer": true, 111 "justification": "The abstract claims output-based methods reach Pearson correlation of 81.32 and 68.51, outperforming ChrF++ at 34.23 and 64.92. These numbers match Table 3. The hedged conclusion 'can potentially replace human evaluations in certain SE tasks' is supported by the mixed task-dependent results." 112 }, 113 "causal_claims_justified": { 114 "applies": true, 115 "answer": true, 116 "justification": "The paper primarily makes correlational claims ('methods X correlate with human scores at Y'). When it makes explanatory claims (e.g., poor summarization performance due to verbosity bias), these are supported by the case study analysis in Section 6.1. The ablation-like design (varying method type, LLM size, inference strategy) supports the component-level claims." 
117 }, 118 "generalization_bounded": { 119 "applies": true, 120 "answer": false, 121 "justification": "The title 'Can LLMs Replace Human Evaluators?' and the abstract claim about replacing 'human evaluations in certain SE tasks' are broad. The study tests only 3 SE tasks with 50 instructions each, using specific datasets and a limited set of programming languages (Java, Python, C, C++). The paper does not adequately bound its findings to these specific tasks and settings; the title and framing suggest broader applicability." 122 }, 123 "alternative_explanations_discussed": { 124 "applies": true, 125 "answer": true, 126 "justification": "The paper discusses alternative explanations for observed results: poor conventional metrics in translation attributed to reference-response divergence (Section 5.1), poor LLM performance in summarization attributed to verbosity bias (Section 6.1), SFT model underperformance attributed to limited parameters and NLP-SE misalignment (Section 5.1)." 127 } 128 }, 129 "setup_transparency": { 130 "model_versions_specified": { 131 "applies": true, 132 "answer": false, 133 "justification": "For GPT-4o, a version date is given: '2024-08-06 version' (footnote 8). However, for most other models, only family names and sizes are given (e.g., 'CodeLlama-Instruct 7/13/34B', 'DeepSeek-Coder-V2-Lite 16B', 'DeepSeek-V2.5'). These are marketing names without specific checkpoint identifiers or snapshot dates. Crucially, 'DeepSeek-V2.5' and 'Codestral-v0.1' lack precise version identifiers." 134 }, 135 "prompts_provided": { 136 "applies": true, 137 "answer": false, 138 "justification": "The paper states 'The full prompts are available in our repository [46]' and 'the remaining aspects and criteria can be found in our repository' (Section 4.1.3). Only one example evaluation rubric is shown in the paper itself (readability for code summarization). The actual prompts used for LLM-as-a-judge methods are not provided in the paper text." 
139 }, 140 "hyperparameters_reported": { 141 "applies": true, 142 "answer": true, 143 "justification": "Temperature settings are reported for key methods: Vanilla uses greedy decoding (temperature 0), G-Eval uses temperature 1.0 with 20 inference passes, BatchEval uses temperature 0.2 with batch size 10 and 5 rounds (Section 4.2.4). Token generation limits are specified (3072 tokens for response generation, 1024 for judgments)." 144 }, 145 "scaffolding_described": { 146 "applies": false, 147 "answer": false, 148 "justification": "No agentic scaffolding is used. The methods are direct prompt-based evaluations (single or multi-pass inference), not agent-based workflows." 149 }, 150 "data_preprocessing_documented": { 151 "applies": true, 152 "answer": true, 153 "justification": "Section 4.1.1 describes filtering by programming language (Java, Python, C, C++), token length limits (1536/1536/1024), reselection of instructions for code summarization (reference summaries with at least 15 tokens), and manual examination. Section 4.1.2 describes dependency augmentation with GPT-4o for ComplexCodeEval. The preprocessing pipeline is well-documented." 154 } 155 }, 156 "limitations_and_scope": { 157 "limitations_section_present": { 158 "applies": true, 159 "answer": false, 160 "justification": "There is no dedicated 'Limitations' or 'Threats to Validity' section. The Discussion (Section 6) covers implications and case studies but does not systematically discuss limitations. The Conclusion (Section 7) is brief and does not contain substantive limitation discussion." 161 }, 162 "threats_to_validity_specific": { 163 "applies": true, 164 "answer": false, 165 "justification": "No specific threats to validity are discussed. There is no analysis of how the small sample size (50 per task), the choice of only two human evaluators, the restriction to specific programming languages, or the use of only GPT-4o for output-based inference strategies might affect the conclusions." 
166 }, 167 "scope_boundaries_stated": { 168 "applies": true, 169 "answer": false, 170 "justification": "The paper does not explicitly state what the results do NOT show. It does not bound the generalizability to the specific 3 tasks, 4 programming languages, or 50 instructions per task. The broad title and conclusions suggest generality beyond what was tested." 171 } 172 }, 173 "data_integrity": { 174 "raw_data_available": { 175 "applies": true, 176 "answer": true, 177 "justification": "The replication package at [46] (https://github.com/BackOnTruck/llm-judge-empirical) contains the source code and data, including the 450 scored responses and human evaluations." 178 }, 179 "data_collection_described": { 180 "applies": true, 181 "answer": true, 182 "justification": "Section 4.1 provides detailed description of how instructions were sampled from three datasets, how responses were generated using 12 LLMs with vLLM, and how human evaluation was conducted with two evaluators using 5-point rubrics on two aspects per task." 183 }, 184 "recruitment_methods_described": { 185 "applies": true, 186 "answer": false, 187 "justification": "The paper states 'Two human evaluators with expertise in the chosen programming languages are involved' (Section 4.1.3) but provides no details about who these evaluators are, how they were recruited, their background, or their relationship to the authors. This lack of detail makes it impossible to assess potential evaluator bias." 188 }, 189 "data_pipeline_documented": { 190 "applies": true, 191 "answer": true, 192 "justification": "The data pipeline is documented from instruction collection (Section 4.1.1) through response generation (Section 4.1.2) to manual evaluation (Section 4.1.3) and meta-evaluation (Section 4.3). Filtering steps are described with specific criteria (language, token length, minimum summary length)." 
193 } 194 }, 195 "conflicts_of_interest": { 196 "funding_disclosed": { 197 "applies": true, 198 "answer": false, 199 "justification": "No funding information is disclosed anywhere in the paper. There is no acknowledgments section listing grants or sponsors." 200 }, 201 "affiliations_disclosed": { 202 "applies": true, 203 "answer": true, 204 "justification": "Author affiliations are clearly listed: Harbin Institute of Technology Shenzhen, Monash University Malaysia, and Zhejiang University. None of these affiliations appear to create a conflict with the products evaluated." 205 }, 206 "funder_independent_of_outcome": { 207 "applies": true, 208 "answer": false, 209 "justification": "Since funding is not disclosed at all, it is impossible to determine whether the funder is independent of the outcome. The absence of any funding disclosure is a concern." 210 }, 211 "financial_interests_declared": { 212 "applies": true, 213 "answer": false, 214 "justification": "No competing interests statement is present in the paper. Absence of disclosure is not the same as absence of conflict." 215 } 216 }, 217 "contamination": { 218 "training_cutoff_stated": { 219 "applies": true, 220 "answer": false, 221 "justification": "The paper evaluates multiple LLMs (GPT-4o, DeepSeek-V2.5, etc.) as judges and uses LLMs to generate responses. Training cutoff dates are not stated for any of these models. The model release dates are listed in Table 1 but training data cutoffs are not." 222 }, 223 "train_test_overlap_discussed": { 224 "applies": true, 225 "answer": false, 226 "justification": "The evaluation benchmarks (CodeTransOcean, CodeXGLUE, ComplexCodeEval) are all publicly available and could be in the training data of the LLMs used. No discussion of potential train/test overlap is provided." 
227 }, 228 "benchmark_contamination_addressed": { 229 "applies": true, 230 "answer": false, 231 "justification": "CodeXGLUE was published in 2021, HumanEval in 2021, and CodeSearchNet in 2019 — all well before the release of models like GPT-4o and DeepSeek-V2.5, whose training cutoffs the paper does not state. The paper does not discuss benchmark contamination risk despite using publicly available benchmarks with models that were likely trained on them." 232 } 233 }, 234 "human_studies": { 235 "pre_registered": { 236 "applies": true, 237 "answer": false, 238 "justification": "The study involves two human evaluators scoring 450 responses. No pre-registration is mentioned." 239 }, 240 "irb_or_ethics_approval": { 241 "applies": true, 242 "answer": false, 243 "justification": "The study involves human evaluators as participants in the research process. No IRB or ethics board approval is mentioned." 244 }, 245 "demographics_reported": { 246 "applies": true, 247 "answer": false, 248 "justification": "The only characterization of the two human evaluators is that they have 'expertise in the chosen programming languages' (Section 4.1.3). No other demographics (experience level, years of experience, affiliation) are provided." 249 }, 250 "inclusion_exclusion_criteria": { 251 "applies": true, 252 "answer": false, 253 "justification": "No inclusion or exclusion criteria for selecting the human evaluators are stated. The paper does not explain why these specific evaluators were chosen or what qualifications they have beyond programming language expertise." 254 }, 255 "randomization_described": { 256 "applies": false, 257 "answer": false, 258 "justification": "This is not an experimental study with treatment/control groups assigned to human participants. The human evaluators both scored all responses, so randomization of participant assignment is not applicable."
259 }, 260 "blinding_described": { 261 "applies": true, 262 "answer": false, 263 "justification": "The paper does not describe whether human evaluators were blinded to which LLM generated each response. Since responses from different LLMs may have identifiable patterns, blinding is relevant. The paper states evaluators received 'the corresponding instruction and the reference answer along with the response' but does not mention whether the generating model identity was hidden." 264 }, 265 "attrition_reported": { 266 "applies": false, 267 "answer": false, 268 "justification": "With only two evaluators scoring all responses, attrition reporting is not applicable in the traditional sense. Both evaluators completed all evaluations." 269 } 270 }, 271 "cost_and_practicality": { 272 "inference_cost_reported": { 273 "applies": true, 274 "answer": false, 275 "justification": "The paper discusses inference cost qualitatively (G-Eval 'greatly increasing inference cost', BatchEval 'more expensive inference due to multi-round evaluation') but does not report actual costs, API spending, tokens consumed, or wall-clock time for any method." 276 }, 277 "compute_budget_stated": { 278 "applies": true, 279 "answer": false, 280 "justification": "The hardware is listed (4x NVIDIA A100-40GB, Ubuntu 20.04 server) but no total compute budget, GPU hours, API spend, or total inference time is reported." 
281 } 282 } 283 }, 284 "claims": [ 285 { 286 "claim": "Output-based LLM-as-a-judge methods reach Pearson correlation of 81.32 and 68.51 with human scores in code translation and generation respectively, outperforming ChrF++ at 34.23 and 64.92.", 287 "evidence": "Table 3 shows BatchEval achieves R=81.32 for code translation and DeepSeek-V2.5 Vanilla achieves R=68.51 for code generation, compared to ChrF++ at R=34.23 and R=64.92 respectively.", 288 "supported": "strong" 289 }, 290 { 291 "claim": "LLM-as-a-judge methods demonstrate drastically different performance across SE tasks, lacking generalizability.", 292 "evidence": "Table 3 shows the same methods that achieve high correlation in translation (R>80) fail in summarization (R<30 for all LLM methods). Finding 1 (Section 5.1) summarizes this pattern.", 293 "supported": "strong" 294 }, 295 { 296 "claim": "Inference strategies (G-Eval, BatchEval) provide only marginal improvement over vanilla output-based methods with large LLMs for individual scoring.", 297 "evidence": "Section 5.1 reports 'ΔR = 2.21, 6.03, 3.04 at maximum' for inference strategies applied to GPT-4o. Table 3 confirms small gaps between GPT-4o vanilla and G-Eval/BatchEval.", 298 "supported": "strong" 299 }, 300 { 301 "claim": "LLM-as-a-judge methods fail to deliver satisfactory and consistent pairwise comparison performance.", 302 "evidence": "Table 6 shows the best accuracy is 65.33 (code translation), with most methods near or below random baseline (33.33). 
The best-performing methods show Agreement below 25, indicating inconsistency when response order is swapped.", 303 "supported": "strong" 304 }, 305 { 306 "claim": "Output-based methods with large LLMs (>100B parameters) align well with each other despite differences in models and inference strategies.", 307 "evidence": "Table 5 shows Large-Large correlations of rho_min=83.04, rho_max=90.64 for translation and rho_min=68.63, rho_max=88.25 for generation, contrasting with Small-Small correlations below 30.", 308 "supported": "strong" 309 }, 310 { 311 "claim": "Current LLM-as-a-judge methods can potentially replace human evaluations in certain SE tasks.", 312 "evidence": "The evidence supports this only for code translation (R=81.32, near inter-annotator agreement of R=85.86) and partially for code generation (R=68.51 vs inter-annotator R=79.70). For code summarization, all methods fail. The claim is hedged ('potentially', 'certain') but still overstates the evidence given only 50 instructions per task.", 313 "supported": "moderate" 314 } 315 ], 316 "methodology_tags": ["benchmark-eval"], 317 "key_findings": "The study evaluates nine LLM-as-a-judge methods against five conventional metrics on three SE tasks (code translation, generation, summarization) using 450 manually scored LLM responses. Output-based methods with large LLMs achieve near-human alignment for code translation (Pearson R=81.32) but fail completely for code summarization (all R<30), revealing strong task dependence. Pairwise comparison performs worse than individual scoring, with high inconsistency when response order is swapped. Inference strategies like G-Eval and BatchEval provide only marginal improvement over vanilla prompting with large LLMs.", 318 "red_flags": [ 319 { 320 "flag": "Small sample size without justification", 321 "detail": "Only 50 instructions per task (150 total, 450 responses) are used. No power analysis or justification for this sample size is provided.
This is small for drawing conclusions about whether LLMs can replace human evaluators." 322 }, 323 { 324 "flag": "Only two human evaluators", 325 "detail": "The ground truth is based on only two human evaluators whose demographics, qualifications, and recruitment are not described. The inter-annotator agreement varies substantially across tasks (Pearson R from 73.74 to 85.86), and the mean of two evaluators is used as the gold standard." 326 }, 327 { 328 "flag": "No limitations or threats-to-validity section", 329 "detail": "The paper lacks a dedicated limitations section despite making broad claims about replacing human evaluators. Important limitations (small sample, limited languages, only two annotators, potential contamination) are not discussed." 330 }, 331 { 332 "flag": "Benchmark contamination not addressed", 333 "detail": "Public benchmarks (CodeXGLUE 2021, CodeSearchNet 2019) are used with models trained after their publication. No discussion of whether the judge LLMs or response-generating LLMs may have seen these benchmarks during training." 334 }, 335 { 336 "flag": "Broad title and claims relative to evidence scope", 337 "detail": "The title asks 'Can LLMs Replace Human Evaluators?' broadly, but tests only 3 tasks in 4 programming languages with 50 instructions each. The scope of evidence is narrow relative to the generality of the claims." 338 } 339 ], 340 "cited_papers": [ 341 { 342 "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena", 343 "authors": ["Lianmin Zheng", "Wei-Lin Chiang", "Ying Sheng"], 344 "year": 2023, 345 "relevance": "Foundational work on LLM-as-a-judge methodology that this paper extends to SE tasks." 346 }, 347 { 348 "title": "G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment", 349 "authors": ["Yang Liu", "Dan Iter", "Yichong Xu"], 350 "year": 2023, 351 "relevance": "Key LLM-as-a-judge method evaluated in this study; uses Chain-of-Thought for evaluation."
352 }, 353 { 354 "title": "Prometheus 2: An Open Source Language Model Specialized in Evaluating Other Language Models", 355 "authors": ["Seungone Kim", "Juyoung Suk", "Shayne Longpre"], 356 "year": 2024, 357 "arxiv_id": "2405.01535", 358 "relevance": "Open-source fine-tuned judge model evaluated in this study for SE task evaluation." 359 }, 360 { 361 "title": "Evaluating Large Language Models Trained on Code", 362 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 363 "year": 2021, 364 "arxiv_id": "2107.03374", 365 "relevance": "Introduced HumanEval and Codex; foundational benchmark for code generation evaluation." 366 }, 367 { 368 "title": "CodeBERTScore: Evaluating Code Generation with Pretrained Models of Code", 369 "authors": ["Shuyan Zhou", "Uri Alon", "Sumit Agarwal", "Graham Neubig"], 370 "year": 2023, 371 "relevance": "Code-specific evaluation metric adapted from BERTScore; relevant to automated code evaluation methods." 372 }, 373 { 374 "title": "DeepSeek-Coder-V2: Breaking the Barrier of Closed-Source Models in Code Intelligence", 375 "authors": ["DeepSeek-AI"], 376 "year": 2024, 377 "arxiv_id": "2406.11931", 378 "relevance": "One of the LLMs used both for response generation and as a judge in this study." 379 }, 380 { 381 "title": "NoFunEval: Funny How Code LMs Falter on Requirements Beyond Functional Correctness", 382 "authors": ["Manav Singhal", "Tushar Aggarwal", "Abhijeet Awasthi"], 383 "year": 2024, 384 "arxiv_id": "2401.15963", 385 "relevance": "Meta-evaluation benchmark for non-functional code quality aspects, related to evaluating code beyond pass/fail." 386 }, 387 { 388 "title": "CodeUltraFeedback: An LLM-as-a-Judge Dataset for Aligning Large Language Models to Coding Preferences", 389 "authors": ["Martin Weyssow", "Aton Kamanda", "Houari A. Sahraoui"], 390 "year": 2024, 391 "arxiv_id": "2403.09032", 392 "relevance": "Dataset for evaluating LLM alignment with human coding preferences, directly related to LLM-as-a-judge for code." 
393 }, 394 { 395 "title": "Large Language Models for Software Engineering: A Systematic Literature Review", 396 "authors": ["Xinyi Hou", "Yanjie Zhao", "Yue Liu"], 397 "year": 2023, 398 "arxiv_id": "2308.10620", 399 "relevance": "Comprehensive survey of LLMs in SE, providing context for the evaluation challenges this paper addresses." 400 }, 401 { 402 "title": "BatchEval: Towards Human-like Text Evaluation", 403 "authors": ["Peiwen Yuan", "Shaoxiong Feng", "Yiwei Li"], 404 "year": 2024, 405 "relevance": "Key inference strategy evaluated in this study; proposes multi-round scoring with diversified batching." 406 }, 407 { 408 "title": "Scaling LLM Test-Time Compute Optimally can be More Effective than Scaling Model Parameters", 409 "authors": ["Charlie Snell", "Jaehoon Lee", "Kelvin Xu", "Aviral Kumar"], 410 "year": 2024, 411 "arxiv_id": "2408.03314", 412 "relevance": "Relevant to the finding that inference-time scaling strategies provide only marginal improvement for SE evaluation." 413 }, 414 { 415 "title": "Split and Merge: Aligning Position Biases in Large Language Model based Evaluators", 416 "authors": ["Zongjie Li", "Chaozheng Wang", "Pingchuan Ma"], 417 "year": 2023, 418 "arxiv_id": "2310.01432", 419 "relevance": "Addresses position bias in LLM-as-a-judge, relevant to the inconsistency findings in pairwise comparison (RQ3)." 420 } 421 ] 422 }