scan-v5.json (26863B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Detecting Benchmark Contamination Through Watermarking", 6 "authors": [ 7 "Tom Sander", 8 "Pierre Fernandez", 9 "Saeed Mahloujifar", 10 "Alain Durmus", 11 "Chuan Guo" 12 ], 13 "year": 2025, 14 "venue": "arXiv.org", 15 "arxiv_id": "2502.17259", 16 "doi": "10.48550/arXiv.2502.17259" 17 }, 18 "checklist": { 19 "claims_and_evidence": { 20 "abstract_claims_supported": { 21 "applies": true, 22 "answer": true, 23 "justification": "All abstract claims are verified in the body: benchmark utility preservation (Fig 3a, Table 1), contamination detection effectiveness (Table 1, p-val 10^-3 for +5% on ARC-Easy), and pre-training experiments with 1B models on 10B tokens are all substantiated.", 24 "source": "haiku" 25 }, 26 "causal_claims_justified": { 27 "applies": true, 28 "answer": true, 29 "justification": "The paper uses controlled pre-training experiments where the only variable is benchmark contamination (injected between steps 2500–7500), which is adequate experimental design for causal inference about the contamination→radioactivity mechanism.", 30 "source": "haiku" 31 }, 32 "generalization_bounded": { 33 "applies": true, 34 "answer": true, 35 "justification": "Claims are bounded to multiple-choice QA benchmarks (ARC-Easy, ARC-Challenge, MMLU); the limitations section explicitly notes math/code questions pose rephrasing challenges and the method is designed primarily for unintentional contamination.", 36 "source": "haiku" 37 }, 38 "alternative_explanations_discussed": { 39 "applies": true, 40 "answer": true, 41 "justification": "Proposition 1 explicitly distinguishes between memorizing the watermark vs. artificially enhanced performance, acknowledging that a model could perform well without overfitting to watermark biases, in which case the test would fail.", 42 "source": "haiku" 43 }, 44 "proxy_outcome_distinction": { 45 "applies": true, 46 "answer": true, 47 "justification": "The paper clearly explains that the test detects watermark memorization (radioactivity) as a proxy for contamination, and Proposition 1 explicitly discusses when this proxy can and cannot be equated to actual performance inflation.", 48 "source": "haiku" 49 } 50 }, 51 "limitations_and_scope": { 52 "limitations_section_present": { 53 "applies": true, 54 "answer": true, 55 "justification": "Section 5 is titled 'Limitations & Conclusion' and contains a dedicated limitations section with two specific bullet points covering rephrasing impact and intentional evasion.", 56 "source": "haiku" 57 }, 58 "threats_to_validity_specific": { 59 "applies": true, 60 "answer": true, 61 "justification": "Specific threats are identified: (1) rephrasing may cause coherence loss in some questions (shown in Figure 6), and (2) malicious actors could rephrase questions or train only on answers conditioned on questions to bypass radioactivity detection.", 62 "source": "haiku" 63 }, 64 "scope_boundaries_stated": { 65 "applies": true, 66 "answer": true, 67 "justification": "The paper explicitly states the method is primarily designed for unintentional contamination, and demonstrates that smaller benchmarks like ARC-Challenge yield lower detection confidence at low contamination levels.", 68 "source": "haiku" 69 } 70 }, 71 "conflicts_of_interest": { 72 "funding_disclosed": { 73 "applies": true, 74 "answer": false, 75 "justification": "No funding statement appears in the paper. Authors are affiliated with Meta FAIR and École polytechnique CMAP but no explicit acknowledgment of funding sources is provided.", 76 "source": "haiku" 77 }, 78 "affiliations_disclosed": { 79 "applies": true, 80 "answer": true, 81 "justification": "Author affiliations are clearly stated on the first page: Meta FAIR and École polytechnique CMAP.", 82 "source": "haiku" 83 }, 84 "funder_independent_of_outcome": { 85 "applies": true, 86 "answer": false, 87 "justification": "Most authors are from Meta FAIR; Meta develops and trains large language models (Llama series), giving them a direct interest in contamination detection research that affects LLM evaluation credibility and defends against contamination accusations.", 88 "source": "haiku" 89 }, 90 "financial_interests_declared": { 91 "applies": true, 92 "answer": false, 93 "justification": "No competing interests statement or declaration of financial interests (patents, equity, consulting) appears anywhere in the paper.", 94 "source": "haiku" 95 } 96 }, 97 "scope_and_framing": { 98 "key_terms_defined": { 99 "applies": true, 100 "answer": true, 101 "justification": "Benchmark contamination, radioactivity, the red/green list watermarking scheme, and the statistical test (p-value, FPR, binomial null hypothesis) are all precisely defined in sections 2 and 3.", 102 "source": "haiku" 103 }, 104 "intended_contribution_clear": { 105 "applies": true, 106 "answer": true, 107 "justification": "Three explicit contributions are enumerated in the introduction: benchmark rephrasing with watermarking, extending watermark radioactivity to pre-training setup, and a new detection algorithm for different tokenizers.", 108 "source": "haiku" 109 }, 110 "engagement_with_prior_work": { 111 "applies": true, 112 "answer": true, 113 "justification": "Section 2 substantively engages with prior contamination detection methods (membership inference, context completion checks) and explains why they fail, situating the watermarking approach as a distinct solution to a gap.", 114 "source": "haiku" 115 } 116 } 117 }, 118 "type_checklist": { 119 "empirical": { 120 "artifacts": { 121 "code_released": { 122 "applies": true, 123 "answer": false, 124 "justification": "No code release is mentioned. The paper references Meta Lingua as the training library but does not release their watermarking or detection code.", 125 "source": "haiku" 126 }, 127 "data_released": { 128 "applies": true, 129 "answer": false, 130 "justification": "The watermarked benchmark versions (the novel artifact of this work) are not mentioned as being released; while original benchmarks are public, the modified watermarked versions are not.", 131 "source": "haiku" 132 }, 133 "environment_specified": { 134 "applies": true, 135 "answer": false, 136 "justification": "Hardware (A-100 GPUs) and the Meta Lingua library are mentioned, but no requirements file, Dockerfile, or package version specification is provided.", 137 "source": "haiku" 138 }, 139 "reproduction_instructions": { 140 "applies": true, 141 "answer": false, 142 "justification": "Training hyperparameters and experimental setup are described in prose but no step-by-step reproduction instructions or executable scripts are provided.", 143 "source": "haiku" 144 } 145 }, 146 "statistical_methodology": { 147 "confidence_intervals_or_error_bars": { 148 "applies": true, 149 "answer": false, 150 "justification": "Accuracy results in Table 1 and Figure 3 are single point estimates without CIs or error bars; p-values for detection are reported but are not CIs on the benchmark accuracy metrics.", 151 "source": "haiku" 152 }, 153 "significance_tests": { 154 "applies": true, 155 "answer": true, 156 "justification": "A binomial statistical test is used throughout with p-values explicitly reported for all contamination detection experiments, grounded in Proposition 1's formal proof.", 157 "source": "haiku" 158 }, 159 "effect_sizes_reported": { 160 "applies": true, 161 "answer": true, 162 "justification": "Accuracy improvements are reported as absolute percentage gains (e.g., +4.3%, +9.5%, +18.2% on ARC-Easy) relative to an uncontaminated baseline, constituting effect size reporting.", 163 "source": "haiku" 164 }, 165 "sample_size_justified": { 166 "applies": true, 167 "answer": false, 168 "justification": "Benchmark sizes (1172, 2372, 5000 questions) are noted and their impact on detection sensitivity is analyzed, but no power analysis or formal sample size justification is provided.", 169 "source": "haiku" 170 }, 171 "variance_reported": { 172 "applies": true, 173 "answer": false, 174 "justification": "Each experiment appears to be a single training run; no variance, standard deviation, or repeated-run statistics are reported for accuracy or detection metrics.", 175 "source": "haiku" 176 } 177 }, 178 "evaluation_design": { 179 "baselines_included": { 180 "applies": true, 181 "answer": true, 182 "justification": "Canary insertion (as used in BIG-bench) is included as a baseline comparison in Section 4.3 and Table 3, demonstrating the superiority of watermark radioactivity.", 183 "source": "haiku" 184 }, 185 "baselines_contemporary": { 186 "applies": true, 187 "answer": true, 188 "justification": "Canary insertion is a legitimate baseline from the contamination detection literature and the comparison fairly reflects its limitations.", 189 "source": "haiku" 190 }, 191 "ablation_study": { 192 "applies": true, 193 "answer": true, 194 "justification": "Section 4.3 provides explicit ablations on watermark strength δ, window size k, model size (135M/360M/1B), benchmark size, and rephrasing model size (8B vs 70B).", 195 "source": "haiku" 196 }, 197 "multiple_metrics": { 198 "applies": true, 199 "answer": true, 200 "justification": "Multiple metrics are used: detection p-value (log10), benchmark accuracy (absolute and delta), proportion of green tokens, and number of scored tokens after deduplication.", 201 "source": "haiku" 202 }, 203 "human_evaluation": { 204 "applies": false, 205 "answer": false, 206 "justification": "Human evaluation is not applicable; benchmark contamination detection is fully automated.", 207 "source": "haiku" 208 }, 209 "held_out_test_set": { 210 "applies": true, 211 "answer": true, 212 "justification": "OOD evaluation uses a different prompt template from the one seen during contamination, serving as a held-out evaluation that measures genuine knowledge transfer vs. template memorization.", 213 "source": "haiku" 214 }, 215 "per_category_breakdown": { 216 "applies": true, 217 "answer": true, 218 "justification": "Results are broken down by benchmark (ARC-Easy, ARC-Challenge, MMLU*), contamination level, watermark strength, window size, and model size in separate figures and tables.", 219 "source": "haiku" 220 }, 221 "failure_cases_discussed": { 222 "applies": true, 223 "answer": true, 224 "justification": "Figure 6 explicitly shows a failure case where the 8B rephrasing model fails on a math question (requiring the 70B model), and the limitations section discusses coherence loss in some rephrased questions.", 225 "source": "haiku" 226 }, 227 "negative_results_reported": { 228 "applies": true, 229 "answer": true, 230 "justification": "ARC-Challenge shows doubtful detection confidence with only 4 contaminations (log10 p = -1.2), and smaller models require more contaminations; these weak results are reported without omission.", 231 "source": "haiku" 232 } 233 }, 234 "setup_transparency": { 235 "model_versions_specified": { 236 "applies": true, 237 "answer": true, 238 "justification": "Exact model versions are specified: Llama-3.1-8B-Instruct for rephrasing, Llama-3.2-1B, Llama-3.2-3B, Llama-3.1-8B for utility evaluation, and SmolLM architectures for smaller ablation models.", 239 "source": "haiku" 240 }, 241 "prompts_provided": { 242 "applies": true, 243 "answer": true, 244 "justification": "Figure 5 provides the exact system prompt and instruction used for benchmark rephrasing, including their full text.", 245 "source": "haiku" 246 }, 247 "hyperparameters_reported": { 248 "applies": true, 249 "answer": true, 250 "justification": "Training hyperparameters are fully reported: learning rate 3×10^-3, weight decay 0.033, batch size 4, sequence length 4096, 10,000 steps, warmup 5,000 steps, gradient clip 1.0; watermarking parameters δ, k, and γ=50% are also specified.", 251 "source": "haiku" 252 }, 253 "scaffolding_described": { 254 "applies": false, 255 "answer": false, 256 "justification": "No agentic scaffolding is used; the paper trains standard language models and evaluates them directly.", 257 "source": "haiku" 258 }, 259 "data_preprocessing_documented": { 260 "applies": true, 261 "answer": true, 262 "justification": "The contamination injection procedure is precisely described: every 5000/#contaminations steps between steps 2500 and 7500, a benchmark batch replaces a DCLM batch, formatted as 'Question: {Q}\\nAnswer: {A}'.", 263 "source": "haiku" 264 } 265 }, 266 "data_integrity": { 267 "raw_data_available": { 268 "applies": true, 269 "answer": false, 270 "justification": "Trained model weights and watermarked benchmark files are not released; raw experimental outputs cannot be independently verified.", 271 "source": "haiku" 272 }, 273 "data_collection_described": { 274 "applies": true, 275 "answer": true, 276 "justification": "Training data (DCLM corpus) is identified by citation, benchmark subsets used (random 5000 MMLU questions, full ARC sets) are described, and the contamination injection procedure is detailed.", 277 "source": "haiku" 278 }, 279 "recruitment_methods_described": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants are involved; the study uses pre-existing benchmarks and trains models from scratch.", 283 "source": "haiku" 284 }, 285 "data_pipeline_documented": { 286 "applies": true, 287 "answer": true, 288 "justification": "The full pipeline from benchmark selection → watermark embedding → contaminated pre-training → evaluation is documented with specific parameters, step numbers, and formatting templates.", 289 "source": "haiku" 290 } 291 }, 292 "contamination": { 293 "training_cutoff_stated": { 294 "applies": false, 295 "answer": false, 296 "justification": "This paper trains models from scratch specifically to test contamination detection; it is not evaluating pre-existing model capabilities on benchmarks, so training cutoff is not applicable.", 297 "source": "haiku" 298 }, 299 "train_test_overlap_discussed": { 300 "applies": false, 301 "answer": false, 302 "justification": "The paper controls contamination deliberately as the experimental variable rather than treating it as an uncontrolled threat to validity.", 303 "source": "haiku" 304 }, 305 "benchmark_contamination_addressed": { 306 "applies": false, 307 "answer": false, 308 "justification": "Llama models used for utility evaluation may have benchmark data in pretraining, but this paper's experimental validity rests on the from-scratch trained models, not on those models being clean.", 309 "source": "haiku" 310 } 311 }, 312 "human_studies": { 313 "pre_registered": { 314 "applies": false, 315 "answer": false, 316 "justification": "No human participants.", 317 "source": "haiku" 318 }, 319 "irb_or_ethics_approval": { 320 "applies": false, 321 "answer": false, 322 "justification": "No human participants.", 323 "source": "haiku" 324 }, 325 "demographics_reported": { 326 "applies": false, 327 "answer": false, 328 "justification": "No human participants.", 329 "source": "haiku" 330 }, 331 "inclusion_exclusion_criteria": { 332 "applies": false, 333 "answer": false, 334 "justification": "No human participants.", 335 "source": "haiku" 336 }, 337 "randomization_described": { 338 "applies": false, 339 "answer": false, 340 "justification": "No human participants.", 341 "source": "haiku" 342 }, 343 "blinding_described": { 344 "applies": false, 345 "answer": false, 346 "justification": "No human participants.", 347 "source": "haiku" 348 }, 349 "attrition_reported": { 350 "applies": false, 351 "answer": false, 352 "justification": "No human participants.", 353 "source": "haiku" 354 } 355 }, 356 "cost_and_practicality": { 357 "inference_cost_reported": { 358 "applies": true, 359 "answer": true, 360 "justification": "Appendix C explicitly states each radioactivity detection test took less than 30 minutes on a single GPU, processing up to 325k tokens for MMLU*.", 361 "source": "haiku" 362 }, 363 "compute_budget_stated": { 364 "applies": true, 365 "answer": true, 366 "justification": "Appendix C estimates approximately 10,000 GPU hours total for training all models, with pretraining of the 1B model taking ~6 hours on 64 A-100 GPUs.", 367 "source": "haiku" 368 } 369 } 370 } 371 }, 372 "claims": [ 373 { 374 "claim": "Watermarking benchmark questions through LLM rephrasing preserves benchmark utility — models rank identically and achieve similar accuracy on watermarked and original versions.", 375 "evidence": "Figure 3a and Figure 7 show Llama-3.2-1B, 3.2-3B, and 3.1-8B achieve nearly identical accuracy on original and watermarked ARC-Easy/Challenge/MMLU* versions across all δ values including δ=4 (79% green tokens).", 376 "supported": "strong" 377 }, 378 { 379 "claim": "Benchmark contamination can be reliably detected via watermark radioactivity with p < 10^-3 when contamination inflates accuracy by ~5% on ARC-Easy and MMLU*.", 380 "evidence": "Table 1 shows 4 contaminations yield log10(p) = -3.0 with +4.3% OOD accuracy gain on ARC-Easy, and log10(p) = -5.7 with +5.1% on MMLU*, both at δ=4.", 381 "supported": "strong" 382 }, 383 { 384 "claim": "Canary insertion is inferior to watermark radioactivity, failing even with 10x more contaminations than the maximum tested in the radioactivity experiments.", 385 "evidence": "Table 3 shows that a 360M model trained with 160 MMLU* contaminations fails to sufficiently memorize a 64-digit canary, with best p-value 0.19 at step 10000.", 386 "supported": "moderate" 387 }, 388 { 389 "claim": "The detection method generalizes across different tokenizers with only modest loss in detection confidence.", 390 "evidence": "Table 4 shows reliable detection (log10 p = -7 to -15) across Llama-1/2, Gemma-1/2, Gemma-3 tokenizers when the watermark was embedded with Llama-3's tokenizer.", 391 "supported": "strong" 392 }, 393 { 394 "claim": "Larger benchmark size directly increases contamination detection confidence at equivalent contamination levels.", 395 "evidence": "Table 1 shows that at 8 contaminations, MMLU* (325k tokens) achieves log10(p) < -12 while ARC-Challenge (64k tokens) achieves only -4.5, despite similar OOD accuracy inflation (~10%).", 396 "supported": "strong" 397 }, 398 { 399 "claim": "Smaller models require more contamination batches to achieve the same benchmark accuracy gain but detection confidence per unit of accuracy gain is consistent across model sizes.", 400 "evidence": "Figure 4 shows 135M, 360M, and 1B models achieve similar detection confidence (~10^-5) after 16, 8, and 4 contaminations respectively, with comparable +6% accuracy gains.", 401 "supported": "moderate" 402 } 403 ], 404 "methodology_tags": [ 405 "empirical", 406 "theoretical", 407 "benchmark-eval" 408 ], 409 "key_findings": "The paper introduces a proactive watermarking approach to detect benchmark contamination, where benchmarks are rephrased using a watermarked LLM before public release. Training on contaminated watermarked data leaves detectable radioactive traces in the model's next-token predictions, testable via a binomial statistical test with a provably controlled false positive rate. Controlled pre-training experiments confirm that contamination sufficient to boost accuracy by ~5% can be detected with p < 10^-3, while benchmark utility is preserved even at strong watermarking strengths. The method outperforms canary insertion and generalizes across different tokenizers, though it requires open-weight model access and is primarily designed for unintentional contamination.", 410 "red_flags": [ 411 { 412 "flag": "No code or data release", 413 "detail": "Neither the watermarked benchmark versions (the novel artifact) nor the detection code are released, making independent replication impossible despite adequate methodological description." 414 }, 415 { 416 "flag": "Single runs, no variance", 417 "detail": "All experiments appear to be single training runs without reporting variance, making it unclear how sensitive results are to random initialization, data ordering, or watermarking seed choice." 418 }, 419 { 420 "flag": "Meta FAIR conflict of interest undisclosed", 421 "detail": "Primary authors are from Meta FAIR, which trains and releases large language models (Llama series); no competing interests statement is present despite Meta having stakes in how contamination detection affects LLM evaluation credibility." 422 }, 423 { 424 "flag": "White-box access assumption limits applicability", 425 "detail": "The method requires open-weight access to the suspect model, which limits applicability to closed-source models like GPT-4 or Gemini — exactly the models most commonly suspected of contamination in practice." 426 }, 427 { 428 "flag": "Only 1B max model size in contamination experiments", 429 "detail": "All from-scratch pre-training uses at most 1B parameters, far smaller than production-scale models (7B–405B+); radioactivity behavior at scale with different learning dynamics is not demonstrated." 430 } 431 ], 432 "cited_papers": [ 433 { 434 "title": "A watermark for large language models", 435 "relevance": "Kirchenbauer et al.'s red/green list watermarking scheme is the direct technical foundation for the benchmark watermarking method used throughout." 436 }, 437 { 438 "title": "Watermarking makes language models radioactive", 439 "relevance": "Sander et al. (2024) is the direct predecessor establishing that fine-tuned models retain watermark traces; this paper extends radioactivity to the pre-training and benchmark contamination setting." 440 }, 441 { 442 "title": "Do membership inference attacks work on large language models?", 443 "relevance": "Duan et al. establish that traditional membership inference is ineffective for LLMs in realistic scenarios, directly motivating the watermarking approach as an alternative." 444 }, 445 { 446 "title": "A careful examination of large language model performance on grade school arithmetic", 447 "relevance": "Zhang et al. demonstrate benchmark contamination in practice via GSM8K performance drops, establishing the empirical problem this paper proposes to solve." 448 }, 449 { 450 "title": "Measuring massive multitask language understanding", 451 "relevance": "MMLU is one of the three primary benchmarks watermarked, evaluated, and tested for contamination detection in the paper." 452 }, 453 { 454 "title": "Evaluation data contamination in LLMs: how do we measure it and (when) does it matter?", 455 "relevance": "Singh et al. survey contamination detection approaches and demonstrate limitations of decontamination methods, contextualizing the need for a proactive watermarking solution." 456 }, 457 { 458 "title": "Investigating data contamination for pre-training language models", 459 "relevance": "Jiang et al. use controlled contamination experiments to show that small models exhibit performance gains — experimental methodology closely parallel to this paper's setup." 460 }, 461 { 462 "title": "Rethinking benchmark and contamination for language models with rephrased samples", 463 "relevance": "Yang et al. show that training on reformulated questions is sufficient to boost performance on original benchmarks, establishing that rephrased-question contamination is a real threat." 464 } 465 ], 466 "engagement_factors": { 467 "practical_relevance": { 468 "score": 3, 469 "justification": "Benchmark providers could deploy this method immediately before releasing new benchmarks, providing a concrete and actionable defense against contamination that requires no access to training data." 470 }, 471 "surprise_contrarian": { 472 "score": 2, 473 "justification": "Using watermarking proactively rather than post-hoc detection is a novel reframing that challenges the prevailing membership inference and performance comparison approaches." 474 }, 475 "fear_safety": { 476 "score": 2, 477 "justification": "Addresses a serious reliability problem: undetected benchmark contamination means SOTA claims across the field are potentially untrustworthy, undermining AI progress measurement at scale." 478 }, 479 "drama_conflict": { 480 "score": 1, 481 "justification": "The paper is a methodological contribution without naming specific models or labs as contamination offenders, limiting controversy potential despite the high-stakes topic." 482 }, 483 "demo_ability": { 484 "score": 1, 485 "justification": "The method is well described and reproducible in principle but no code or watermarked benchmarks are released, preventing immediate hands-on demonstration by others." 486 }, 487 "brand_recognition": { 488 "score": 3, 489 "justification": "Meta FAIR is a top-tier AI research lab; the paper comes from the team behind Llama models, lending significant credibility and visibility to the work." 490 } 491 }, 492 "hn_data": { 493 "threads": [ 494 { 495 "hn_id": "43258481", 496 "title": "HumT DumT: Measuring and controlling human-like language in LLMs", 497 "points": 1, 498 "comments": 0, 499 "url": "https://news.ycombinator.com/item?id=43258481" 500 } 501 ], 502 "top_points": 1, 503 "total_points": 1, 504 "total_comments": 0 505 } 506 }