scan.json (33391B)
1 { 2 "paper": { 3 "title": "Prompt perturbation and fraction facilitation sometimes strengthen Large Language Model scores", 4 "authors": ["Mike Thelwall"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2512.01330", 8 "doi": "10.48550/arXiv.2512.01330" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "Semantically equivalent prompt variations produce substantially different LLM scoring correlations with human judgments (Spearman ρ ranging from 0.176 to 0.279 for equivalent prompts on Gemma3 27b). Encouraging fractional scores, even when the gold standard uses integers, improves correlation by releasing information about model certainty. Averaging scores from diverse semantically equivalent prompts consistently outperforms any single prompt across all six LLMs tested. However, optimal prompt strategies are highly model-dependent — the best prompts for Gemma3 are among the worst for ChatGPT (Pearson r = -0.68 between their prompt-correlation profiles).", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The paper provides a GitHub URL (https://github.com/MikeThelwall/Webometric_Analyst) for the score extraction algorithms. However, this is the extraction tool, not the full experimental pipeline for prompt submission and analysis." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": false, 24 "justification": "The dataset of 2,780 articles with gold standard scores is recycled from a previous paper (Thelwall & Mohammadi, 2025) but no download link or data release is provided in this paper. The LLM output scores are also not released." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "No environment specifications are provided. 
Models are described as run via Ollama (local) or the OpenAI API, but no versions of Ollama, Python, or library dependencies are given. No requirements.txt, Dockerfile, or equivalent." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No step-by-step reproduction instructions are included. The Appendix lists all 58 prompts, and the Methods describe the general procedure, but there are no scripts, commands, or a README to reproduce the experiments." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper explicitly acknowledges that 'The correlations are averages of correlations so standard confidence interval formulae or bootstrapping do not apply.' It provides a reference CI width for illustration but does not compute proper confidence intervals for its main results." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "No statistical significance tests are used. Differences between prompt correlations are compared informally by visual inspection of figures and tables. The paper claims some differences 'seem to be too large to be explainable through natural statistical variation' without formal testing." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "Spearman correlations are reported for all prompts and models, providing magnitude context. Specific values are compared (e.g., 0.176 to 0.279 for equivalent prompts, average vs. max correlations in Table 3). Pearson correlations between models' prompt profiles are also reported (e.g., r = -0.68 for Gemma3 vs ChatGPT)." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "No power analysis or formal justification for the sample size of 2,780 articles. 
The paper notes 'there are no other datasets with individual article scores that are of a similar size or larger,' framing the size as a constraint rather than a justified choice." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "No standard deviations or variance measures are reported for the main correlation results. Between-iteration score change percentages are shown (Table 2), but these are frequency counts, not standard deviations of the Spearman correlations. Correlations are averaged over six UoAs without reporting spread." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "The standard prompt ('Score this journal article then stop. Do not include any words.') serves as the implicit baseline. All 58 prompt variations are compared against each other and against the averaging strategy, providing a systematic comparison framework." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "Six contemporary LLMs are tested: Gemma3 27b, ChatGPT 4.1-mini, Llama4 Scout, Magistral Small, Qwen3 32b, and DeepSeek R1 32b. All are 2024-2025 models and represent current state-of-the-art." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "The prompt variations systematically ablate components: adding/removing 'based on the title and abstract,' adding score options, adding fractional score permission, changing verb synonyms (score/rate/assess/grade). Table 3 shows the effect of mixing vs single prompts." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": false, 83 "justification": "Only Spearman correlation with the gold standard is used as the evaluation metric. 
The paper explicitly argues that 'rank correlations are the only appropriate indicator of value or usefulness,' but does not provide complementary metrics (e.g., mean absolute error, classification accuracy at thresholds, or agreement measures like Cohen's kappa)." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": false, 88 "justification": "No humans evaluated the LLM's output scores. The gold standard human scores were created in a prior study by a single author and serve as a fixed benchmark. The evaluation is automated comparison (Spearman correlation) between LLM scores and pre-existing human scores." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": false, 93 "justification": "All 2,780 articles are used for both identifying which prompts perform best and reporting the final correlation results. There is no held-out test set or train/validation/test split. The same data used to determine the best strategy is used to evaluate it." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Correlations are calculated separately for each of the six Units of Assessment (UoAs: Clinical Medicine, Public Health, Allied Health, Psychology, Biological Sciences, Agriculture). The reported averages are 'the average of six correlations calculated separately for each UoA.'" 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "The paper discusses several failure modes: some prompts yielded no scores (returning evaluative text instead due to 10-token output truncation), the complete absence of 1* scores across 1.7 million queries, the 'lowest score' strategy producing lower correlations, and ChatGPT's fractional prompts performing worse rather than better." 
104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "Multiple negative results are reported: the 'lowest score' prompt strategy failed (grey bar in Figure 1); 'based on the title and abstract' prompts performed worse for Gemma3; fractional prompts were counterproductive for ChatGPT; some prompts produced negligible correlations (0.027 for Llama4 Scout)." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Abstract claims about (a) testing equivalent variations, (b) averaging scores, (c) fractional scores, and (d) not drawing attention to partial input are all supported by the results (Figures 1-4, Tables 3-5). The abstract appropriately hedges with 'sometimes' and notes model-dependent results." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "The paper claims 'improvements can be obtained by' changing prompt wording. The study design is controlled single-variable manipulation: the same dataset and model are used, with only the prompt changed. This is adequate for causal claims about prompt effects on scoring correlations." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": true, 125 "justification": "The Discussion opens with 'The results are limited to a single dataset and task. The extent to which they replicate in other contexts or is a general LLM property is unclear.' The title uses 'sometimes' to hedge. 
The Conclusions explicitly state 'this study has not identified a shortcut to good prompt design.'" 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": true, 130 "justification": "The Discussion considers alternative explanations: the 3* bias may reflect system instruction wording or 'a more generic tendency to be cautiously optimistic' from instruction tuning; the absence of 1* scores may relate to 'the known issue of rare event prediction for machine learning' (Shyalika et al., 2024); and prior work on LLM positive bias (Murugadoss et al., 2025)." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "The paper explicitly states 'This is an imperfect dataset because it is subjective to a single person' and that Spearman correlation measures ranking ability rather than absolute scoring accuracy. It discusses the distinction between the proxy (correlation with one expert's scores) and the broader concept of 'research quality evaluation.'" 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": true, 142 "justification": "The ChatGPT model is precisely specified as 'gpt-4.1-mini-2025-04-14 through the API.' Other models are identified by name and parameter count: 'Gemma3 27b,' 'Llama 4 Scout,' 'Magistral Small,' 'Qwen3 32b,' 'DeepSeekR1 32b.' Note: the abstract/tables inconsistently label the ChatGPT model as '4o-mini' while the Methods say '4.1-mini.'" 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": true, 147 "justification": "All 58 user prompts are provided in full in the Appendix with their set numbers and type classifications. The system prompt is described as using 'the REF2021 Main Panel A expert reviewer instructions' with a reference to a prior paper's appendix for the exact text." 
148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": false, 152 "justification": "Max output tokens are reported (10 for Gemma3 and ChatGPT, 100 for Llama4, unlimited for reasoning models). However, temperature, top-p, and other sampling parameters are not reported for any model. The paper notes previous work tested 'temperature and other parameters' but does not state what settings were used here." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "No agentic scaffolding is used. The approach is direct prompting of LLMs with article text, with no tools, workflows, or multi-step reasoning chains." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "The data pipeline is described: 500 articles randomly sampled per UoA from REF2021, resulting in 2,780 unique articles after deduplication. Score extraction methods are described including how text responses were handled ('When the response contained text before a number, this was treated as missing data'). Separate extraction algorithms per model are noted." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": false, 169 "justification": "There is no dedicated 'Limitations' or 'Threats to Validity' section. Limitations are discussed within the Discussion and Conclusions sections, but without a titled subsection. The Discussion opens with limitation statements but transitions into general interpretation." 
170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": true, 174 "justification": "Specific threats are discussed: 'This is an imperfect dataset because it is subjective to a single person'; 'The results are limited to a single dataset and task'; 'the choice of prompts for the current study was not informed by prior research but was non-systematic and relied on intuition'; and model-dependent prompt effects (Gemma3 vs ChatGPT inversion)." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": true, 179 "justification": "The Conclusions state 'this study has not identified a shortcut to good prompt design' and that 'the best prompts for one model can be poor for another.' The Discussion states 'The extent to which they replicate in other contexts or is a general LLM property is unclear.'" 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "Neither the gold standard scores, the individual LLM responses, nor the 2,780 article titles/abstracts are released. The GitHub link is for score extraction software, not data. Verification of results requires access to the unreleased dataset from a prior study." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "The data source is clearly described: REF2021 Main Panel A, random sample of 500 per UoA, scored by the first author of the previous paper using official REF scoring criteria. LLM query submission is described with dates (ChatGPT: 12-15 November 2025), platform (Ollama for local models, API for ChatGPT), and query count (1.7 million for Gemma3)." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants in this study. 
The data consists of published journal articles from an existing national research evaluation framework (REF2021), with gold standard scores from a prior study." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "The pipeline is documented: previous dataset → 2,780 articles → 58 prompts × 10 iterations (Gemma3) or 1 iteration (other models) → score extraction via pattern matching → Spearman correlation calculation per UoA → averaging. Missing data handling is described (text before number = missing, truncated reports = missing)." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding source or acknowledgments section is present in the paper. There is no statement about the research being funded or unfunded." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": false, 213 "justification": "Author affiliations are not visible in the provided paper text. The paper lists 'Mike Thelwall' as the sole author without an institutional affiliation line. The author evaluates multiple LLMs from different companies without disclosing his affiliation." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding information is disclosed, so funder independence cannot be assessed. The paper evaluates products from Google (Gemma), OpenAI (ChatGPT), Meta (Llama), Mistral (Magistral), Alibaba (Qwen), and DeepSeek without any funding disclosure." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial interests statement is present in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "No training data cutoff dates are stated for any of the six LLMs tested. 
The test articles are from 2014-2020 (REF2021 period), and all models were trained after 2021, meaning training data likely includes these published articles." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "The 2,780 health and life science articles from 2014-2020 are published papers that would be in the training data of 2024-2025 models. The potential for models to have memorized content about these articles (citations, impact, journal prestige) is not discussed." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "Aggregate REF2021 results are publicly available. While individual article scores are not public, models could have indirect quality signals from training data (journal rankings, citation counts, author prestige). This contamination risk is not addressed." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study. The research evaluates LLM scoring of existing articles against a pre-existing gold standard." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 
273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": false, 284 "justification": "No inference costs, latency, or per-query times are reported. The paper mentions 1.7 million Gemma3 queries and ChatGPT API submission dates (12-15 November 2025) but does not report API costs, wall-clock time, or per-article processing time." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "The total query volume is stated (1.7 million for Gemma3, 58×2780 for each other model), but no GPU hours, hardware specifications, total API spend, or wall-clock time is reported. The hardware used to run Ollama models locally is not described." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": true, 296 "justification": "Each Gemma3 prompt was submitted 10 times and between-iteration score variation is systematically reported in Table 2 (e.g., 29.3% change rate for fractional prompts vs 0.6-5.4% for non-fractional). This demonstrates sensitivity to LLM stochasticity, analogous to seed sensitivity." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": true, 301 "justification": "Explicitly stated: 'Each prompt was submitted ten times to Gemma3 27b.' For other models: 'a single iteration was used rather than ten, since they were only used for comparison with Gemma3.'" 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "The paper acknowledges the prompt design 'was not informed by prior research but was non-systematic and relied on intuition.' No systematic search budget is reported for the 58 prompt variations. The prompts represent the full search space tested, but no principled design or budget is described." 
307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": true, 311 "justification": "All 58 prompts are reported transparently in the Appendix and Figures. Rather than selecting one best prompt, the paper recommends averaging across all prompts as the universally best strategy (Table 5). No cherry-picking of a single best configuration." 312 }, 313 "multiple_comparison_correction": { 314 "applies": true, 315 "answer": false, 316 "justification": "58 prompts × 6 models produces hundreds of correlation comparisons. No correction for multiple comparisons (Bonferroni, Holm, etc.) is applied. No formal statistical tests are conducted at all." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The gold standard was created by the first author of the source paper (implied to be Thelwall himself based on the self-citation pattern). The potential bias of designing prompts and evaluating them against one's own gold standard scores is not discussed." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": false, 326 "justification": "No analysis of performance as a function of compute budget. The paper does not compare whether the improvement from 10 iterations versus 1 justifies the 10× cost, nor does it analyze diminishing returns as iterations increase." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": true, 331 "justification": "The paper discusses construct validity: why Spearman correlation (not accuracy) is appropriate ('the accuracy of the score is irrelevant since the value is only in the rankings'), acknowledges the gold standard is 'imperfect' and 'subjective to a single person,' and discusses why proxy quality scores are sometimes preferable despite limitations." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "No scaffolding is involved. 
Models are prompted directly with article text." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "The test articles are from 2014-2020 and all LLMs (2024-2025 models) were trained on data that postdates these articles. Models may have internalized quality signals about these papers from their training data. This temporal overlap is not discussed." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "LLMs may have memorized information about these published articles — citation counts, journal impact factors, media coverage — that could inform scoring independently of analyzing the title and abstract. This potential feature leakage through memorized metadata is not discussed." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of whether the articles in the test set share structural similarities (same journals, same author groups, related topics within UoAs) that could affect score independence across articles." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No leakage detection or prevention method is used. No canary strings, membership inference, temporal splits, or analysis of whether models 'recognize' specific articles." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "Semantically equivalent prompt variations yield substantially different correlations with human gold standard scores for Gemma3 27b.", 365 "evidence": "Figure 1 shows Spearman correlations ranging from 0.176 to 0.279 for 'based on title and abstract' prompt variants, and wider ranges across all prompt types. 
The 'rate' vs 'grade' vs 'assess' vs 'score' synonym substitutions produce measurably different results.", 366 "supported": "strong" 367 }, 368 { 369 "claim": "Prompts encouraging fractional scores improve correlations with human judgments for Gemma3 27b, even when the gold standard uses integer scores.", 370 "evidence": "Figures 1 and 2 show fractional prompts (mean ~0.37) generally outperform non-fractional prompts (mean ~0.20-0.35). Fractional prompts produce 14-18 distinct score values vs 2-3 for integer prompts (Table 1), enabling finer ranking discrimination.", 371 "supported": "moderate" 372 }, 373 { 374 "claim": "Averaging scores from different but semantically equivalent prompts produces higher correlations than any single prompt for most prompt sets.", 375 "evidence": "Table 3 shows the 'Set mix ρ' exceeds the 'Max. ρ' in 11 out of 15 prompt sets. Table 5 shows averaging all 58 prompts exceeds the maximum single-prompt correlation for all five non-Gemma3 models tested.", 376 "supported": "strong" 377 }, 378 { 379 "claim": "The best prompts vary substantially between LLMs — the best prompts for Gemma3 are among the worst for ChatGPT.", 380 "evidence": "Figure 3 and Table 4 show a Pearson correlation of -0.68 between Gemma3 and ChatGPT standard scores for the 58 prompts. 'Based on title and abstract' prompts worked best for ChatGPT but worst for Gemma3. 
Table 4 shows most model pairs have weak inter-correlations.", 381 "supported": "strong" 382 }, 383 { 384 "claim": "Averaging across all 58 prompts is a universally effective strategy that eliminates the need to identify the best individual prompt.", 385 "evidence": "Table 5 shows the average-all-prompts correlation exceeds the maximum individual prompt correlation for ChatGPT (0.523 vs 0.517/0.448), DeepSeek R1 (0.417 vs 0.257), Llama4 (0.446 vs 0.403), Magistral (0.286 vs 0.206), and Qwen3 (0.433 vs 0.326).", 386 "supported": "moderate" 387 }, 388 { 389 "claim": "Specifying that the input is a title and abstract ('based on the title and abstract') reduces performance for Gemma3 but improves it for ChatGPT.", 390 "evidence": "Figure 1 shows 'Scores based' prompts (light blue) have lower correlations than 'Scores' prompts (orange) for Gemma3. Figure 3 shows the inverse for ChatGPT, where based prompts produce the highest correlations.", 391 "supported": "moderate" 392 } 393 ], 394 "red_flags": [ 395 { 396 "flag": "Single-person gold standard", 397 "detail": "All 2,780 gold standard scores were assigned by one person (the first author of the source paper). The paper acknowledges this is 'imperfect' and 'subjective to a single person,' but proceeds to evaluate all six LLMs against this single rater without inter-rater reliability data." 398 }, 399 { 400 "flag": "No statistical significance testing", 401 "detail": "Correlation differences are compared informally across 58 prompts × 6 models with no formal hypothesis tests, no multiple comparison correction, and no confidence intervals. The paper states that standard CI formulae and bootstrapping do not apply to averages of correlations, but does not employ an adapted alternative (e.g., resampling articles within each UoA and recomputing the averaged correlation)." 402 }, 403 { 404 "flag": "Model naming inconsistency", 405 "detail": "The abstract and tables refer to 'ChatGPT 4o-mini' while the Methods section states 'ChatGPT 4.1-mini' with API model 'gpt-4.1-mini-2025-04-14.' 
GPT-4o-mini and GPT-4.1-mini are different models, creating ambiguity about which was actually used." 406 }, 407 { 408 "flag": "No train/test separation for prompt selection", 409 "detail": "The same 2,780 articles are used to both identify which prompts and strategies work best and to report the final correlation results. No held-out validation set is used to prevent overfitting to the specific dataset when selecting the recommended strategy." 410 }, 411 { 412 "flag": "Contamination risk unaddressed", 413 "detail": "All LLMs tested (2024-2025 models) were trained on data that includes the 2014-2020 test articles. Models may have internalized quality signals (citation counts, journal prestige, author reputation) from training data, confounding the evaluation of scoring ability from title/abstract analysis alone." 414 } 415 ], 416 "cited_papers": [ 417 { 418 "title": "Large language models are inconsistent and biased evaluators", 419 "authors": ["R. Stureborg", "D. Alikaniotis", "Y. Suhara"], 420 "year": 2024, 421 "arxiv_id": "2405.01724", 422 "relevance": "Directly relevant: demonstrates LLM evaluation inconsistency and bias, and tests fractional scoring strategies that this paper extends to a new task." 423 }, 424 { 425 "title": "Chain-of-thought prompting elicits reasoning in large language models", 426 "authors": ["J. Wei", "X. Wang", "D. Schuurmans", "M. Bosma", "F. Xia", "E. Chi", "D. Zhou"], 427 "year": 2022, 428 "relevance": "Foundational prompting technique for LLMs; establishes chain-of-thought as an alternative strategy not tested in this paper." 429 }, 430 { 431 "title": "Can large language models be an alternative to human evaluations?", 432 "authors": ["C. H. Chiang", "H. Y. Lee"], 433 "year": 2023, 434 "arxiv_id": "2305.01937", 435 "relevance": "Directly addresses whether LLMs can replace human evaluation, the broader question motivating this paper's research on LLM scoring ability." 
436 }, 437 { 438 "title": "Evaluating the evaluator: Measuring LLMs' adherence to task evaluation instructions", 439 "authors": ["B. Murugadoss", "C. Poelitz", "I. Drosos", "V. Le", "N. McKenna", "C. S. Negreanu", "A. Sarkar"], 440 "year": 2025, 441 "relevance": "Studies LLM adherence to evaluation instructions, finding substantial positive scoring bias — confirms this paper's observation of LLMs avoiding low scores." 442 }, 443 { 444 "title": "G-eval: NLG evaluation using gpt-4 with better human alignment", 445 "authors": ["Y. Liu", "D. Iter", "Y. Xu", "S. Wang", "R. Xu", "C. Zhu"], 446 "year": 2023, 447 "arxiv_id": "2303.16634", 448 "relevance": "Proposes using GPT-4 for NLG evaluation with token probability weighting, a technique this paper also explores with ChatGPT." 449 }, 450 { 451 "title": "A survey on code generation with llm-based agents", 452 "authors": ["Y. Dong", "X. Jiang", "J. Qian", "T. Wang", "K. Zhang", "Z. Jin", "G. Li"], 453 "year": 2025, 454 "arxiv_id": "2508.00083", 455 "relevance": "Survey on LLM-based code generation agents, representing the broader LLM capability evaluation landscape this paper's methods could be applied to." 456 }, 457 { 458 "title": "From generation to judgment: Opportunities and challenges of LLM-as-a-judge", 459 "authors": ["D. Li", "B. Jiang", "L. Huang", "A. Beigi", "C. Zhao", "Z. Tan", "H. Liu"], 460 "year": 2025, 461 "relevance": "Comprehensive review of LLM-as-a-judge paradigm, directly relevant to this paper's investigation of LLM scoring reliability." 462 }, 463 { 464 "title": "Is LLM a reliable reviewer? A comprehensive evaluation of LLM on automatic paper reviewing tasks", 465 "authors": ["R. Zhou", "L. Chen", "K. Yu"], 466 "year": 2024, 467 "relevance": "Evaluates LLM reliability for paper reviewing, the same application domain as this paper's research quality scoring task." 468 }, 469 { 470 "title": "Unleashing the potential of prompt engineering for large language models", 471 "authors": ["B. Chen", "Z. 
Zhang", "N. Langrené", "S. Zhu"], 472 "year": 2025, 473 "doi": "10.1016/j.patter.2025.101260", 474 "relevance": "Survey of prompt engineering strategies that provides context for the prompt design decisions explored in this paper." 475 }, 476 { 477 "title": "Diversified Sampling Improves Scaling LLM inference", 478 "authors": ["T. Wang", "Z. Liu", "Y. Chen", "J. Light", "H. Chen", "X. Zhang", "W. Cheng"], 479 "year": 2025, 480 "arxiv_id": "2502.11027", 481 "relevance": "Shows diversified sampling improves LLM inference at scale, supporting this paper's finding that diverse prompt averaging outperforms single prompts." 482 }, 483 { 484 "title": "Aligning with human judgement: The role of pairwise preference in large language model evaluators", 485 "authors": ["Y. Liu", "H. Zhou", "Z. Guo", "E. Shareghi", "I. Vulić", "A. Korhonen", "N. Collier"], 486 "year": 2024, 487 "arxiv_id": "2403.16950", 488 "relevance": "Studies pairwise comparison as an alternative LLM evaluation strategy, one of the approaches mentioned but not tested in this paper." 489 } 490 ] 491 }