scan.json (27171B)
1 { 2 "paper": { 3 "title": "SKATE, a Scalable Tournament Eval: Weaker LLMs differentiate between stronger ones using verifiable challenges", 4 "authors": ["Dewi S. W. Gould", "Bruno Mlodozeniec", "Samuel F. Brown"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2508.06111", 8 "doi": "10.48550/arXiv.2508.06111" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "checklist": { 13 "artifacts": { 14 "code_released": { 15 "applies": true, 16 "answer": true, 17 "justification": "The paper mentions 'Supplementary Material' containing the codebase and prompts (Section J, Appendix). The prompts are included in the paper, and code is referenced as supplementary." 18 }, 19 "data_released": { 20 "applies": true, 21 "answer": false, 22 "justification": "No dataset download link or repository URL is provided for the generated questions or game results. The paper references supplementary material but does not provide a URL to a data release." 23 }, 24 "environment_specified": { 25 "applies": true, 26 "answer": false, 27 "justification": "No requirements.txt, Dockerfile, or dependency specifications are mentioned. The paper does not describe the environment setup beyond mentioning Python 3 and standard built-ins for COP tasks." 28 }, 29 "reproduction_instructions": { 30 "applies": true, 31 "answer": false, 32 "justification": "No step-by-step reproduction instructions are provided. The paper describes the framework conceptually but does not give commands or scripts to replicate the experiments." 33 } 34 }, 35 "statistical_methodology": { 36 "confidence_intervals_or_error_bars": { 37 "applies": true, 38 "answer": true, 39 "justification": "TrueSkill provides uncertainty estimates (σ) shown in figures. The p(correct) scoring algorithm (Algorithm 1) explicitly computes standard deviation and terminates when σ ≤ 0.05. Figures show (μ, σ) values." 40 }, 41 "significance_tests": { 42 "applies": true, 43 "answer": false, 44 "justification": "No formal statistical significance tests are used to compare model rankings. Differences are assessed via TrueSkill μ values and visual inspection of figures, but no p-values or hypothesis tests are reported." 45 }, 46 "effect_sizes_reported": { 47 "applies": true, 48 "answer": true, 49 "justification": "TrueSkill μ scores with specific values are reported (e.g., Sonnet 4: 30.7, Sonnet 3.5: 25.7). p(correct) differences between models are quantified. Table 2 provides correlation coefficients (Pearson, Spearman)." 50 }, 51 "sample_size_justified": { 52 "applies": true, 53 "answer": false, 54 "justification": "The choice of 50 rounds is stated but not justified with any formal analysis. The choice of 6 models is not justified. The stability threshold σ* = 0.05 is described as a 'balance' but no formal justification is given." 55 }, 56 "variance_reported": { 57 "applies": true, 58 "answer": true, 59 "justification": "TrueSkill σ values are reported throughout. The p(correct) scoring includes explicit standard deviation computation (Algorithm 1). Figure 11 shows variance of p(correct) across models." 60 } 61 }, 62 "evaluation_design": { 63 "baselines_included": { 64 "applies": true, 65 "answer": true, 66 "justification": "The paper compares SKATE rankings against established benchmarks (MMLU-Pro, GPQA Diamond, HLE, BigBench Hard) in Table 2 and reports correlation." 67 }, 68 "baselines_contemporary": { 69 "applies": true, 70 "answer": true, 71 "justification": "Comparison benchmarks include recent evaluations: MMLU-Pro, GPQA Diamond, HLE, BigBench Hard. Models tested include frontier models (Sonnet 4, GPT-4o, Gemini 2.0 Flash)." 72 }, 73 "ablation_study": { 74 "applies": true, 75 "answer": true, 76 "justification": "Five augmentation strategies are tested (Section 5, Table 1, Figure 2), serving as an ablation of the information available to task-setters. The self-preferencing filter (Figure 4b) is also an ablation." 77 }, 78 "multiple_metrics": { 79 "applies": true, 80 "answer": true, 81 "justification": "Two TrueSkill scoring variants (Relative Pairwise and Absolute Pairwise, Section 4.3), p(correct) values, asking skill vs answering skill (Figure 12), and correlation with external benchmarks." 82 }, 83 "human_evaluation": { 84 "applies": true, 85 "answer": false, 86 "justification": "No human evaluation is included. All scoring is automated via code execution sandboxes and TrueSkill. No human assessment of question quality, difficulty, or ranking validity is reported." 87 }, 88 "held_out_test_set": { 89 "applies": false, 90 "answer": false, 91 "justification": "The framework generates questions dynamically during gameplay — there is no static dataset to split into train/test. The evaluation is inherently generative, not dataset-based." 92 }, 93 "per_category_breakdown": { 94 "applies": true, 95 "answer": true, 96 "justification": "Results are broken down per model (Figures 1, 3, 4, 5), per augmentation strategy (Figure 2), and per question variance level (Figure 11). Asking vs answering skill is separated (Figure 12)." 97 }, 98 "failure_cases_discussed": { 99 "applies": true, 100 "answer": true, 101 "justification": "The paper discusses models failing to generate valid questions (Haiku 3.5 fails to write questions it can answer, Figure 4b). Question examples where models score 0.0 are shown (Figure 11). Section 7 discusses limitations." 102 }, 103 "negative_results_reported": { 104 "applies": true, 105 "answer": true, 106 "justification": "Augmentation strategies had 'minimal effect on the final rankings' (Section 5). Models 'may not yet possess the advanced strategic reasoning capabilities to fully exploit' game state (Section 7). Haiku 3.5 fails the self-preferencing filter entirely." 107 } 108 }, 109 "claims_and_evidence": { 110 "abstract_claims_supported": { 111 "applies": true, 112 "answer": true, 113 "justification": "Abstract claims (weaker models score stronger ones, self-preferencing behavior, fine-grained capability differences) are all supported by specific figures and results in the paper (Figures 4, 5, 11)." 114 }, 115 "causal_claims_justified": { 116 "applies": true, 117 "answer": false, 118 "justification": "The paper claims models 'exhibit a capacity to write self-favoring questions' but the mechanism is observational — correlation between setter identity and performance advantage. No controlled experiment isolates the causal mechanism from, e.g., question-style familiarity." 119 }, 120 "generalization_bounded": { 121 "applies": true, 122 "answer": false, 123 "justification": "The abstract claims SKATE is a 'general, scalable evaluation framework' but experiments are limited to 6 models on COP tasks only. The paper acknowledges COP 'does not capture the full range of LLM capabilities' (Section 7) but the title and abstract are broader than the evidence." 124 }, 125 "alternative_explanations_discussed": { 126 "applies": true, 127 "answer": true, 128 "justification": "Section 7 discusses that negative results could 'reflect prompt design rather than true model failure', that models with code-execution tools trivially solve COP tasks, and that question diversity may be limited by shared blind spots." 129 }, 130 "proxy_outcome_distinction": { 131 "applies": true, 132 "answer": true, 133 "justification": "The paper is explicit that COP is a proxy testbed and that SKATE is general-purpose. Section 7 and Appendix C acknowledge COP's limitations and discuss the gap between COP performance and general capability evaluation." 134 } 135 }, 136 "setup_transparency": { 137 "model_versions_specified": { 138 "applies": true, 139 "answer": false, 140 "justification": "Models are identified as 'GPT-4o', 'Sonnet 3.5', 'Sonnet 4', 'Gemini 2.0 Flash', 'Haiku 3', 'Haiku 3.5'. Only GPT-4o has a date in Table 2 ('2024-11-20'). Claude and Gemini versions lack snapshot dates or API versions." 141 }, 142 "prompts_provided": { 143 "applies": true, 144 "answer": true, 145 "justification": "The full task-setting prompt for the historical performance strategy is provided in Appendix J. The paper states 'All other prompts used in this work are included in the codebase in the Supplementary Material.'" 146 }, 147 "hyperparameters_reported": { 148 "applies": true, 149 "answer": true, 150 "justification": "Temperature = 0.7 is stated (Section 5). σ* = 0.05 stability threshold, dthresh = 0.336 similarity threshold, Nrounds = 50, Nattempts = 3, pthresh = 0.55 are all specified." 151 }, 152 "scaffolding_described": { 153 "applies": true, 154 "answer": true, 155 "justification": "The game scaffolding is described in detail: round structure, question validation pipeline (verifiable, distractor-rich, unique), code-execution sandbox, augmentation strategies, retry logic (3 attempts with feedback). Section 4 and appendices cover the full scaffold." 156 }, 157 "data_preprocessing_documented": { 158 "applies": true, 159 "answer": true, 160 "justification": "Question filtering pipeline is documented: verification via code sandbox, distractor generation, uniqueness check via embedding similarity (threshold 0.336), and the adaptive scoring algorithm (Algorithm 1)." 161 } 162 }, 163 "limitations_and_scope": { 164 "limitations_section_present": { 165 "applies": true, 166 "answer": true, 167 "justification": "Section 7 (Discussion) contains substantive limitations discussion spanning multiple paragraphs, covering prompt sensitivity, tool-augmented models, narrow model set, and COP scope." 168 }, 169 "threats_to_validity_specific": { 170 "applies": true, 171 "answer": true, 172 "justification": "Specific threats: 'prompting strategy may not reliably elicit the intended game-playing behavior' (prompt validity), models with code execution trivially solve COP (construct validity), narrow model set may miss patterns, shared blind spots in COP questions." 173 }, 174 "scope_boundaries_stated": { 175 "applies": true, 176 "answer": true, 177 "justification": "The paper explicitly states: COP 'does not capture the full range of LLM capabilities', experiments cover 'a narrow set of models', restricting to MCQ 'inherently restricts the evaluation space, for example by excluding open-ended responses' (Section 3)." 178 } 179 }, 180 "data_integrity": { 181 "raw_data_available": { 182 "applies": true, 183 "answer": false, 184 "justification": "No raw data (question sets, model responses, p(correct) scores per question) is released. Supplementary material is mentioned but no URL is provided." 185 }, 186 "data_collection_described": { 187 "applies": true, 188 "answer": true, 189 "justification": "The data generation process is thoroughly described: LLMs generate COP questions, questions are validated via sandbox execution, scored via Algorithm 1, and clustered via embedding similarity. The full pipeline is documented." 190 }, 191 "recruitment_methods_described": { 192 "applies": false, 193 "answer": false, 194 "justification": "No human participants. Models are the subjects and their selection is simply described (6 frontier LLMs). Standard benchmark — NA." 195 }, 196 "data_pipeline_documented": { 197 "applies": true, 198 "answer": true, 199 "justification": "The pipeline from question generation → validation (verifiable, distractor-rich) → uniqueness filtering → scoring (Algorithm 1) → TrueSkill ranking is fully documented across Sections 3-4 and appendices." 200 } 201 }, 202 "conflicts_of_interest": { 203 "funding_disclosed": { 204 "applies": true, 205 "answer": true, 206 "justification": "Acknowledgments section states: 'We are grateful to the Supervised Program for Alignment Research for facilitating collaboration and providing financial support.'" 207 }, 208 "affiliations_disclosed": { 209 "applies": true, 210 "answer": true, 211 "justification": "Author affiliations are listed: Alan Turing Institute, University of Cambridge / Max Planck Institute, and Independent. None are affiliated with the companies whose models are evaluated." 212 }, 213 "funder_independent_of_outcome": { 214 "applies": true, 215 "answer": true, 216 "justification": "The funder (Supervised Program for Alignment Research) is an alignment research program, not a model provider. They have no direct financial stake in which model ranks highest." 217 }, 218 "financial_interests_declared": { 219 "applies": true, 220 "answer": false, 221 "justification": "No competing interests statement or financial interests declaration is present in the paper." 222 } 223 }, 224 "contamination": { 225 "training_cutoff_stated": { 226 "applies": true, 227 "answer": false, 228 "justification": "No training data cutoff dates are stated for any of the 6 models used. This matters because COP questions could overlap with training data patterns." 229 }, 230 "train_test_overlap_discussed": { 231 "applies": true, 232 "answer": true, 233 "justification": "The paper argues that dynamically generated questions reduce contamination risk compared to static benchmarks: 'Game-based evaluations provide advantages including... reduced data contamination' (Section 2). However, no formal overlap analysis is performed." 234 }, 235 "benchmark_contamination_addressed": { 236 "applies": true, 237 "answer": true, 238 "justification": "The paper's core design addresses contamination by generating novel questions each game rather than using static benchmarks. Section 2 explicitly positions this as an advantage over contamination-prone static evaluations." 239 } 240 }, 241 "human_studies": { 242 "pre_registered": { 243 "applies": false, 244 "answer": false, 245 "justification": "No human participants in this study." 246 }, 247 "irb_or_ethics_approval": { 248 "applies": false, 249 "answer": false, 250 "justification": "No human participants in this study." 251 }, 252 "demographics_reported": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants in this study." 256 }, 257 "inclusion_exclusion_criteria": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants in this study." 261 }, 262 "randomization_described": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants in this study." 266 }, 267 "blinding_described": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants in this study." 271 }, 272 "attrition_reported": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants in this study." 276 } 277 }, 278 "cost_and_practicality": { 279 "inference_cost_reported": { 280 "applies": true, 281 "answer": true, 282 "justification": "Appendix K provides token counts (~130,000 input tokens per asking model, ~150 tokens per question) and estimates scaling with N players." 283 }, 284 "compute_budget_stated": { 285 "applies": true, 286 "answer": true, 287 "justification": "Appendix K states: 'All experiments in this paper cost less than $500.'" 288 } 289 }, 290 "experimental_rigor": { 291 "seed_sensitivity_reported": { 292 "applies": true, 293 "answer": false, 294 "justification": "No results across multiple random seeds are reported. The game is run once per configuration with no seed sensitivity analysis." 295 }, 296 "number_of_runs_stated": { 297 "applies": true, 298 "answer": false, 299 "justification": "The number of game iterations is not explicitly stated for the main experiment. The augmentation strategy experiment uses 'five iterations of SKATE' but the main 6-player game appears to be a single run." 300 }, 301 "hyperparameter_search_budget": { 302 "applies": true, 303 "answer": false, 304 "justification": "No hyperparameter search budget is reported. Key parameters (σ* = 0.05, dthresh = 0.336, Nrounds = 50, temperature = 0.7) are presented without systematic search." 305 }, 306 "best_config_selection_justified": { 307 "applies": true, 308 "answer": false, 309 "justification": "The selection of historical performance as the augmentation strategy is justified qualitatively ('balance between informativeness and computational cost') but not through systematic comparison on a validation set." 310 }, 311 "multiple_comparison_correction": { 312 "applies": false, 313 "answer": false, 314 "justification": "No formal statistical tests are performed, so multiple comparison correction is not applicable." 315 }, 316 "self_comparison_bias_addressed": { 317 "applies": true, 318 "answer": true, 319 "justification": "The framework inherently addresses this: models evaluate each other rather than the authors evaluating their own system. The authors are not affiliated with any model provider." 320 }, 321 "compute_budget_vs_performance": { 322 "applies": true, 323 "answer": false, 324 "justification": "Models of very different sizes and costs are compared (Haiku 3 vs Sonnet 4) without discussing whether performance differences reflect compute differences. No matched-compute comparison is attempted." 325 }, 326 "benchmark_construct_validity": { 327 "applies": true, 328 "answer": true, 329 "justification": "Appendix C extensively discusses what COP actually tests and its relationship to broader capabilities. The paper acknowledges COP 'does not capture the full range of LLM capabilities' and discusses construct validity at length." 330 }, 331 "scaffold_confound_addressed": { 332 "applies": true, 333 "answer": true, 334 "justification": "All models use the same scaffolding (same prompt structure, same game rules, same scoring). The augmentation strategy experiments use identical strategies across models. No scaffold confound exists in the comparisons." 335 } 336 }, 337 "data_leakage": { 338 "temporal_leakage_addressed": { 339 "applies": true, 340 "answer": true, 341 "justification": "The paper's design inherently addresses temporal leakage — questions are generated fresh by LLMs during the game, not drawn from existing datasets. This is positioned as a key advantage over static benchmarks." 342 }, 343 "feature_leakage_addressed": { 344 "applies": true, 345 "answer": false, 346 "justification": "No discussion of whether MCQ distractor options could leak information about the correct answer, or whether the question format itself provides hints not present in real-world code understanding tasks." 347 }, 348 "non_independence_addressed": { 349 "applies": true, 350 "answer": false, 351 "justification": "Questions generated by the same model may share systematic patterns (style, difficulty, topic). The uniqueness filter uses embedding similarity but does not address deeper structural non-independence. Not discussed." 352 }, 353 "leakage_detection_method": { 354 "applies": true, 355 "answer": false, 356 "justification": "No formal leakage detection method is applied. The dynamic question generation is argued to reduce contamination conceptually, but no canary strings, membership inference, or n-gram overlap analysis is used." 357 } 358 } 359 }, 360 "claims": [ 361 { 362 "claim": "Weaker models can reliably score and differentiate between stronger models using SKATE-generated questions.", 363 "evidence": "Figure 5 shows four weaker models generating questions that differentiate Sonnet 3.5 and Sonnet 4 when added. Rankings remain stable across panels (a)-(d).", 364 "supported": "moderate" 365 }, 366 { 367 "claim": "LLMs exhibit self-preferencing behavior, generating questions that favor their own capabilities.", 368 "evidence": "Figure 4b shows that after filtering to questions with p(correct) > 0.55, most models perform best on their own questions (diagonal maxima in the heatmap).", 369 "supported": "moderate" 370 }, 371 { 372 "claim": "SKATE automatically surfaces fine-grained capability differences between models.", 373 "evidence": "Figure 11 shows questions with high p(correct) variance, revealing model-specific strengths (e.g., Gemini 2.0 Flash scoring 1.0 where Sonnet 3.5 scores 0.0 on a specific question).", 374 "supported": "moderate" 375 }, 376 { 377 "claim": "Rankings are stable to the sequential addition of new models.", 378 "evidence": "Figure 10 (Appendix F) shows that adding Sonnet 3.5 then Sonnet 4 (and vice versa) preserves relative TrueSkill ordering.", 379 "supported": "moderate" 380 }, 381 { 382 "claim": "SKATE correlates well with established benchmarks like MMLU-Pro and GPQA Diamond.", 383 "evidence": "Table 2 shows Spearman rank correlations of 0.92 (MMLU-Pro) and 0.90 (GPQA Diamond), but weaker correlations with HLE (0.58) and BigBench Hard (0.30).", 384 "supported": "moderate" 385 } 386 ], 387 "methodology_tags": ["benchmark-eval"], 388 "key_findings": "SKATE introduces a peer-challenge evaluation framework where LLMs compete by generating and solving verifiable code-output-prediction tasks, using TrueSkill ranking. The framework demonstrates that weaker models can differentiate stronger ones, that LLMs exhibit self-preferencing behavior in question generation, and that dynamically generated questions can surface fine-grained capability differences. Rankings show moderate-to-strong correlation with established benchmarks (Spearman 0.92 with MMLU-Pro, 0.90 with GPQA Diamond) across 6 frontier models.", 389 "red_flags": [ 390 { 391 "flag": "Very narrow model set", 392 "detail": "Only 6 models tested, all from 3 providers (Anthropic, OpenAI, Google). No open-source models. Claims of 'scalable evaluation' rest on a very small sample." 393 }, 394 { 395 "flag": "Single-run main experiment", 396 "detail": "The main 6-player game appears to be run once. Game outcomes depend on stochastic question generation, but no repeated runs are reported to assess stability of the main result." 397 }, 398 { 399 "flag": "COP-only proof of concept generalized broadly", 400 "detail": "All experiments use only code-output-prediction tasks, but claims extend to 'general, scalable evaluation.' Appendices B and C discuss other task types but none are empirically tested." 401 }, 402 { 403 "flag": "Weak correlation with some benchmarks", 404 "detail": "Spearman correlation with BigBench Hard is only 0.30, yet this is not treated as concerning. The paper selectively emphasizes the strong correlations (MMLU-Pro, GPQA Diamond)." 405 } 406 ], 407 "cited_papers": [ 408 { 409 "title": "ZeroSumEval: An Extensible Framework For Scaling LLM Evaluation with Inter-Model Competition", 410 "authors": ["Hisham A. Alyahya", "Haidar Khan", "Yazeed Alnumay", "M. Saiful Bari", "Bülent Yener"], 411 "year": 2025, 412 "arxiv_id": "2503.10673", 413 "relevance": "Directly related LLM competition-based evaluation framework with 7 static game types." 414 }, 415 { 416 "title": "Measuring the Impact of Early-2025 AI on Experienced Open-Source Developer Productivity", 417 "authors": ["Joel Becker", "Nate Rush", "Elizabeth Barnes", "David Rein"], 418 "year": 2025, 419 "arxiv_id": "2507.09089", 420 "relevance": "RCT measuring AI impact on developer productivity, cited as criticism of unrealistic benchmarks." 421 }, 422 { 423 "title": "Evaluating Large Language Models Trained on Code", 424 "authors": ["Mark Chen"], 425 "year": 2021, 426 "arxiv_id": "2107.03374", 427 "relevance": "Codex/HumanEval — foundational code generation benchmark referenced as prior work." 428 }, 429 { 430 "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?", 431 "authors": ["Carlos E. Jimenez", "John Yang"], 432 "year": 2024, 433 "arxiv_id": "2310.06770", 434 "relevance": "Major agentic coding benchmark used as comparison point for LLM evaluation approaches." 435 }, 436 { 437 "title": "AI Agents That Matter", 438 "authors": ["Sayash Kapoor", "Benedikt Stroebl", "Zachary S. Siegel", "Nitya Nadgir", "Arvind Narayanan"], 439 "year": 2024, 440 "relevance": "Key meta-research paper on benchmark validity gaps and evaluation methodology for AI agents." 441 }, 442 { 443 "title": "Red Teaming Language Models with Language Models", 444 "authors": ["Ethan Perez", "Saffron Huang"], 445 "year": 2022, 446 "arxiv_id": "2202.03286", 447 "relevance": "LLM-based automated red-teaming, foundational work on models evaluating models." 448 }, 449 { 450 "title": "Discovering Language Model Behaviors with Model-Written Evaluations", 451 "authors": ["Ethan Perez", "Sam Ringer"], 452 "year": 2022, 453 "arxiv_id": "2212.09251", 454 "relevance": "LLM-generated evaluation datasets for probing model behaviors, direct predecessor approach." 455 }, 456 { 457 "title": "Weak-to-Strong Generalization: Eliciting Strong Capabilities With Weak Supervision", 458 "authors": ["Collin Burns", "Pavel Izmailov"], 459 "year": 2023, 460 "arxiv_id": "2312.09390", 461 "relevance": "Scalable oversight work on weaker models supervising stronger ones, directly related to SKATE's weak-scores-strong claim." 462 }, 463 { 464 "title": "JuStRank: Benchmarking LLM Judges for System Ranking", 465 "authors": ["Ariel Gera", "Odellia Boni"], 466 "year": 2025, 467 "arxiv_id": "2412.09569", 468 "relevance": "Benchmarks LLM judge biases, directly motivating SKATE's verifiable-task design to avoid judge bias." 469 }, 470 { 471 "title": "BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions", 472 "authors": ["Terry Yue Zhuo"], 473 "year": 2025, 474 "arxiv_id": "2406.15877", 475 "relevance": "Code generation benchmark used in Table 2 comparison (BigBench Hard)." 476 }, 477 { 478 "title": "Automated Capability Discovery via Foundation Model Self-Exploration", 479 "authors": ["Cong Lu", "Shengran Hu", "Jeff Clune"], 480 "year": 2025, 481 "arxiv_id": "2502.07577", 482 "relevance": "LLM self-exploration for capability discovery, related approach using LLMs to generate evaluation tasks." 483 }, 484 { 485 "title": "MMLU-Pro: A More Robust and Challenging Multi-Task Language Understanding Benchmark", 486 "authors": ["Yubo Wang"], 487 "year": 2024, 488 "arxiv_id": "2406.01574", 489 "relevance": "Benchmark used for correlation analysis with SKATE rankings (Spearman 0.92)." 490 } 491 ] 492 }