scan.json (24320B)
1 { 2 "paper": { 3 "title": "Metric assessment protocol in the context of answer fluctuation on MCQ tasks", 4 "authors": ["Ekaterina Goliakova", "Xavier Renard", "Marie-Jeanne Lesot", "Thibault Laugel", "Christophe Marsala", "Marcin Detyniecki"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2507.15581", 8 "doi": "10.48550/arXiv.2507.15581" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "Most existing MCQ evaluation metrics strongly correlate with full fluctuation rates (R² > 0.8), even when computed only on the original option order. Probability mass is the best single-permutation proxy for fluctuation. A novel metric, worst accuracy, achieves the highest combined correlation with both full fluctuation rates and original accuracy when computed on cyclic permutations (R² = 0.942). Continuous metrics like probability mass and Brier score show no improvement from additional permutations, while discrete metrics like sensitivity gap and partial fluctuation rates are unstable on small permutation subsets.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "No repository URL, code archive, or link to source code is provided anywhere in the paper." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The paper uses publicly available Meta evaluation datasets (HuggingFace meta-llama/Llama-3.1-8B-evals) and publicly available models. The benchmarks (ARC-C, CSQA, MMLU, AGIEval, Winogrande) are all public." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "The paper mentions using HuggingFace transformers with bfloat16 precision but provides no requirements.txt, library versions, or detailed environment specification." 
30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No step-by-step reproduction instructions, scripts, or README are provided." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "Main results in Table 1 report only R² point estimates without confidence intervals or error bars." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper claims certain metrics correlate more strongly than others but uses only R² comparisons without significance tests for the differences." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "R² values are reported throughout (Tables 1a-1c), providing magnitude of correlation effects. Specific values like R² = 0.942 for worst accuracy on cyclic permutations give clear effect sizes." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "No justification for why 10 models and 17 benchmarks were chosen, and no power analysis for whether this sample is sufficient for the correlation analyses." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": true, 61 "justification": "Section 5.3 and Figure 4 report standard deviations of metrics across 100 random permutation pairs/tuples, providing spread measures for metric stability analysis." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "The paper compares its novel worst accuracy metric against multiple existing metrics: average accuracy, PriDe, strong accuracy, sensitivity gap, probability mass, Brier score, entropy, and partial fluctuation rates." 
69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "Baselines include PriDe (Zheng et al., 2024), strong accuracy (Gupta et al., 2024), and fluctuation rates (Wei et al., 2024), all from 2024." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "The paper systematically varies the permutation subset type (original, original+inverse, cyclic, random-2, random-|L|) to assess how each metric performs under different computation budgets, functioning as an ablation over the permutation dimension." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "The entire paper compares 9 metrics. Evaluation uses R² for correlation with fluctuation rates, correlation with original accuracy, and combined correlation." 84 }, 85 "human_evaluation": { 86 "applies": false, 87 "answer": false, 88 "justification": "Human evaluation is not relevant — this paper evaluates statistical properties of metrics, not human-facing outputs." 89 }, 90 "held_out_test_set": { 91 "applies": false, 92 "answer": false, 93 "justification": "This is not a machine learning system being trained/tested; it evaluates metrics on benchmark results. No train/test split is applicable." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Results are broken down by permutation type (Tables 1a-1c), by benchmark (Figures 3-15 in appendix), and by model (Figures 5-6). Standard deviations are shown per benchmark in Figure 4." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "The paper discusses Winogrande as an outlier case where metrics like sensitivity gap and worst accuracy perform poorly due to having only 2 options (Section 5.1). The instability of sensitivity gap and partial fluctuation rates on Rrandom2 is also discussed." 
104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "The paper reports that adding permutations does NOT improve continuous metrics (probability mass, Brier score), that sensitivity gap performs very poorly on Rrandom2 (R² = 0.235), and that partial fluctuation rates lose information about original accuracy." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Abstract claims that most metrics correlate with fluctuation rates and that worst accuracy shows highest association are supported by Tables 1a-1c and Figures 3a-3b." 116 }, 117 "causal_claims_justified": { 118 "applies": false, 119 "answer": false, 120 "justification": "The paper makes correlational claims (metrics correlate with fluctuation rates) and does not claim causal relationships." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": true, 125 "justification": "Section 6 (Limitations) explicitly states bounds: only option order permutations tested, only models below 10B parameters, only next-token prediction (not text generation). The title specifies 'MCQ tasks'." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": true, 130 "justification": "The paper discusses that Winogrande's outlier behavior may be due to having only 2 options (Section 5.1), that continuous metrics' stability may explain their capped correlation (Section 5.3), and considers the impact of permutation choice on metric stability." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "The paper is explicit that R² correlation with full fluctuation rates is a proxy for actual metric quality, and that a good metric should capture both fluctuation and original accuracy. The protocol (Section 4) clearly distinguishes between what is measured and what is claimed." 
136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": true, 142 "justification": "Section 4.1 lists specific models: Llama-3.1-8B, Gemma-2-9B, Mistral-7B-v0.3, Qwen2.5-7B, R1-Distill-Llama-8B, R1-Distill-Qwen-7B, with both pre-trained and instruct-tuned versions specified." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": true, 147 "justification": "The paper uses publicly shared Meta evaluation datasets that contain 'full final prompts, including instructions, few-shot examples, their order, and option typography' (Section 4.2), with a link to the HuggingFace dataset. The prompt format is shown in Section 4.2." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": false, 152 "justification": "The paper mentions bfloat16 precision and random seed 0 for permutation sampling, but does not report temperature, top-p, or other inference hyperparameters. Since they use next-token probabilities (greedy/argmax), this is partially implicit but not explicitly stated." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "No agentic scaffolding is used. The paper evaluates next-token probabilities directly." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Section 4.2 describes using Meta's pre-formatted evaluation datasets. Appendix B provides benchmark details (number of questions, number of options). The handling of AGIEval nan options is documented (footnote 8)." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 6 'Limitations & Future Work' provides substantive discussion of four specific limitations." 
170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": true, 174 "justification": "Section 6 discusses specific threats: only option order permutations (not paraphrases, typography, label changes), only models below 10B, only next-token prediction (not text generation), and unresolved optimal permutation selection." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": true, 179 "justification": "Section 6 explicitly states what was NOT tested: other permutation types, larger models, text generation approaches. The paper also bounds scope in Section 4.1 ('models with parameter sizes below 10B')." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "The raw model outputs (per-question probabilities and answers across all permutations) are not released. Only aggregated R² results are shown." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section 4 describes the full protocol: models are run on all permutations of each benchmark question, next-token probabilities are extracted, and metrics are computed from these. The benchmarks and their sources are specified." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. Data sources are standard public benchmarks." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "The 6-step protocol (Section 4) documents the pipeline from benchmark selection through permutation generation, model inference, metric computation, and correlation analysis." 
202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding or acknowledgments section is present in the paper, despite authors having both academic (Sorbonne/CNRS) and corporate (AXA) affiliations." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliations are clearly listed: Sorbonne University/CNRS/LIP6, AXA, and Polish Academy of Sciences." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "AXA (an insurance company) co-affiliates several authors. While AXA has no direct stake in MCQ metric evaluation, the potential funding relationship is undisclosed, making independence unverifiable." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests statement is present in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "The paper evaluates pre-trained LLMs on public benchmarks (MMLU, ARC-C, etc.) but does not state training data cutoff dates for any of the 10 models." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "No discussion of whether models may have seen the benchmark questions during training, despite using well-known public benchmarks like MMLU and ARC-C." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "MMLU (2021), ARC-C (2018), CSQA (2019), Winogrande (2021), AGIEval (2024) are all public benchmarks that could be in training data of 2024-2025 models. This contamination risk is not discussed." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 
248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": false, 284 "justification": "The paper discusses computation cost conceptually (full permutations are 'extremely costly') but never reports actual GPU hours, wall-clock time, or cost per experiment." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "No total compute budget is stated despite running 10 models × 17 benchmarks × all permutations (up to 120 per question for 5-option benchmarks)." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": true, 296 "justification": "Section 5.3 analyzes sensitivity to random permutation selection using 100 random pairs and 100 random tuples, reporting standard deviations across these samples (Figure 4)." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": true, 301 "justification": "The paper states full permutations are computed (|L|! per question). 
For random analysis, '100 random pairs' and '100 random tuples of size |L|' are explicitly stated (Section 5.3)." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": false, 305 "answer": false, 306 "justification": "No hyperparameter tuning is performed; the paper evaluates metrics on model outputs without training." 307 }, 308 "best_config_selection_justified": { 309 "applies": false, 310 "answer": false, 311 "justification": "No configuration selection; all metrics are compared on the same data." 312 }, 313 "multiple_comparison_correction": { 314 "applies": true, 315 "answer": false, 316 "justification": "Multiple metrics are compared across multiple permutation types without correction for multiple comparisons." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The authors propose worst accuracy and compare it favorably against existing metrics without acknowledging potential author-evaluation bias." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": true, 326 "justification": "The entire paper framework compares metrics at different computational costs (original only vs. 2 permutations vs. |L| permutations vs. all permutations), effectively showing performance as a function of compute." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": true, 331 "justification": "Section 2 extensively discusses what MCQ benchmarks actually measure vs. what they claim, including sensitivity to permutation, typography, and format changes. The paper's core contribution is questioning whether standard metrics capture what they claim (robustness)." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "No scaffolding is involved; models are queried directly for next-token probabilities." 
337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "Models from 2024-2025 are tested on benchmarks from 2018-2024. No discussion of whether these models may have been trained on benchmark solutions." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether evaluation setup leaks information. The few-shot examples provided in prompts could potentially bias toward certain answer patterns." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of independence between training data and test benchmarks." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No leakage detection or prevention method is applied." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "Most existing MCQ metrics correlate strongly with full fluctuation rates, even when computed on the original option order only.", 365 "evidence": "Table 1a and Figure 3a show R² values of 0.833-0.893 for most metrics on Roriginal, with probability mass achieving R² = 0.893.", 366 "supported": "strong" 367 }, 368 { 369 "claim": "Worst accuracy achieves the highest combined correlation with both full fluctuation rates and original accuracy when using cyclic permutations.", 370 "evidence": "Table 1c shows worst accuracy achieves R² = 0.942 on Rcyclic for the combined target, the highest value in that row.", 371 "supported": "strong" 372 }, 373 { 374 "claim": "Continuous metrics (probability mass, Brier score) do not improve with additional permutations.", 375 "evidence": "Comparing Figures 3a and 3b, probability mass R² remains at 0.894 across Roriginal and Rcyclic. 
Table 1a confirms this stability.", 376 "supported": "strong" 377 }, 378 { 379 "claim": "Sensitivity gap and partial fluctuation rates are unstable when computed on only two permutations.", 380 "evidence": "Table 1a shows sensitivity gap drops from R² = 0.640 (Roi) to 0.235 (Rrandom2), and partial FR drops from 0.829 to 0.479. Figure 4 shows high standard deviations for these metrics.", 381 "supported": "strong" 382 }, 383 { 384 "claim": "PriDe achieves the highest correlation with original accuracy on Roi and Rcyclic.", 385 "evidence": "Table 1b shows PriDe at R² = 0.993 (Roi) and 0.994 (Rcyclic), slightly exceeding average accuracy.", 386 "supported": "strong" 387 } 388 ], 389 "red_flags": [ 390 { 391 "flag": "No contamination discussion", 392 "detail": "The paper evaluates 10 LLMs on well-known public benchmarks (MMLU, ARC-C, CSQA) without any discussion of benchmark contamination. Since the paper studies answer fluctuation patterns, contamination could systematically affect which questions show fluctuation (memorized answers may be more stable), biasing the metric comparisons." 393 }, 394 { 395 "flag": "Limited model size range", 396 "detail": "All 10 models are 7-9B parameters. The authors acknowledge this limitation but the findings may not generalize to larger models that are more commonly used in practice and may exhibit different fluctuation patterns." 397 }, 398 { 399 "flag": "No code or raw data released", 400 "detail": "Despite the paper proposing a new metric and evaluation protocol, no code or raw experimental data is released, preventing independent verification of the R² calculations." 401 } 402 ], 403 "cited_papers": [ 404 { 405 "title": "Large language models sensitivity to the order of options in multiple-choice questions", 406 "authors": ["Pouya Pezeshkpour", "Estevam Hruschka"], 407 "year": 2024, 408 "relevance": "Core prior work on MCQ option order sensitivity and sensitivity gap metric, directly evaluated in this paper." 
409 }, 410 { 411 "title": "Large Language Models Are Not Robust Multiple Choice Selectors", 412 "authors": ["Chujie Zheng", "Hao Zhou", "Fandong Meng", "Jie Zhou", "Minlie Huang"], 413 "year": 2024, 414 "relevance": "Introduces PriDe debiasing technique and demonstrates MCQ selection bias in LLMs." 415 }, 416 { 417 "title": "Unveiling selection biases: Exploring order and token sensitivity in large language models", 418 "authors": ["Sheng-Lun Wei", "Cheng-Kuang Wu", "Hen-Hsen Huang", "Hsin-Hsi Chen"], 419 "year": 2024, 420 "relevance": "Introduces fluctuation rates metric and studies label/option order sensitivity in LLMs." 421 }, 422 { 423 "title": "Changing Answer Order Can Decrease MMLU Accuracy", 424 "authors": ["Vipul Gupta", "David Pantoja", "Candace Ross", "Adina Williams", "Megan Ung"], 425 "year": 2024, 426 "relevance": "Introduces strong accuracy metric and studies answer order effects on MMLU." 427 }, 428 { 429 "title": "Quantifying Variance in Evaluation Benchmarks", 430 "authors": ["Lovish Madaan", "Aaditya K. Singh", "Rylan Schaeffer"], 431 "year": 2024, 432 "relevance": "Studies variance in LLM evaluation benchmarks and probability mass as a metric." 433 }, 434 { 435 "title": "Are Emergent Abilities of Large Language Models a Mirage?", 436 "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"], 437 "year": 2023, 438 "relevance": "Demonstrates that continuous metrics reveal different patterns than accuracy for tracking model capabilities." 439 }, 440 { 441 "title": "Do LLMs Exhibit Human-like Response Biases? A Case Study in Survey Design", 442 "authors": ["Lindia Tjuatja", "Valerie Chen", "Tongshuang Wu", "Ameet Talwalkar", "Graham Neubig"], 443 "year": 2024, 444 "relevance": "Studies LLM response biases on MCQ tasks compared to human biases, introduces entropy metric." 445 }, 446 { 447 "title": "When Benchmarks are Targets: Revealing the Sensitivity of Large Language Model Leaderboards", 448 "authors": ["Norah A. Alzahrani", "Hisham A. 
Alyahya"], 449 "year": 2024, 450 "relevance": "Studies benchmark sensitivity including option typography effects on LLM evaluation." 451 }, 452 { 453 "title": "Holistic Evaluation of Language Models", 454 "authors": ["Percy Liang", "Rishi Bommasani", "Tony Lee"], 455 "year": 2023, 456 "relevance": "Demonstrates few-shot example variation effects on LLM evaluation scores." 457 } 458 ] 459 }