ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (32340B)


      1 {
      2   "paper": {
      3     "title": "On The Fragility of Benchmark Contamination Detection in Reasoning Models",
      4     "authors": ["Han Wang", "Haoyu Li", "Brian Ko", "Huan Zhang"],
      5     "year": 2025,
      6     "venue": "arXiv.org",
      7     "arxiv_id": "2510.02386",
      8     "doi": "10.48550/arXiv.2510.02386"
      9   },
     10   "scan_version": 3,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval", "theoretical"],
     13   "key_findings": "Existing benchmark contamination detection methods are fragile against LRM contamination. RL training (GRPO) with clean data conceals SFT contamination evidence, reducing detection AUROC from ~75% to ~50% (random guessing), with PPO-style importance sampling/clipping identified as the root cause via theoretical analysis and ablation. For advanced LRMs, SFT contamination with chain-of-thought reasoning yields up to 11.76% performance inflation while making all 10 tested detection methods perform near chance, suggesting LRMs generalize from contamination rather than memorizing.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The abstract states 'Our code is available at https://github.com/ASTRAL-Group/LRM_Conta_Detection_Arena.git' — a working repository URL is provided."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "All evaluation benchmarks are publicly available (AIME 2024/2025, AMC 2023, GPQA Diamond, OlympiadBench, Minerva Math). Clean training data comes from public datasets (OpenThoughts3, DeepMath-103K)."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "Appendix F lists hardware (9× NVIDIA L40S, CUDA 12.8, Ubuntu 22.04) and mentions frameworks (LLaMA-Factory, Verl, FlashAttention-2, DeepSpeed ZeRO-1, Liger kernels, vLLM), but no requirements.txt, Dockerfile, or versioned dependency list is provided. Framework versions are cited only by arXiv paper references."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The paper provides detailed experimental setup in Appendix D (contamination pipelines, hyperparameters, prompt templates) and a code repository URL, but no step-by-step reproduction instructions (e.g., 'run this command') are included in the paper itself."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Tables 1-5 report point estimates for AUROC and Pass@1 with no confidence intervals or error bars. Results are 'averaged over detection scores from 8 rollouts' but no uncertainty measures are provided."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Claims that GRPO 'conceals' contamination and that detection methods 'perform near random guess' are based on comparing raw AUROC numbers without any statistical significance tests (no p-values, t-tests, or bootstrap tests)."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Tables 2, 3, and 5 include a Δ column reporting absolute AUROC changes relative to the baseline (e.g., -14.22, -16.42 for GRPO concealment). Table 1 reports Pass@1 with absolute differences across conditions, providing magnitude context."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No justification is given for using these particular benchmarks or their sizes. Some benchmarks are very small (AIME 2024/2025 have only 30 problems each, yielding just 15 members and 15 non-members per benchmark), and no power analysis or sample size rationale is provided."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "AUROC is averaged over 8 rollouts and Pass@1 over 3-10 rollouts, but no standard deviations, interquartile ranges, or other spread measures are reported for any results."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "10 representative contamination detection methods are evaluated spanning generation-based, perturbation-based, reference-based, and reference-free approaches (Table 2). Multiple training regimes (SFT only, SFT+GRPO, SFT+RAFT, SFT+RAFT++) serve as comparison conditions."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Detection methods range from 2018-2025, including recent work like Min-K%++ (Zhang et al., 2024), DICE (Tu et al., 2024), CDD (Dong et al., 2024), and Verbatim (Wu et al., 2025). The RL algorithms (GRPO, RAFT++) are also contemporary."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Table 3 presents a carefully designed ablation isolating the effect of importance sampling and clipping. RAFT (no clipping), RAFT++ (with/without clipping), and GRPO (with/without clipping) are compared, directly testing the theoretical prediction."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The paper reports both AUROC for detection performance and Pass@1 for benchmark performance/contamination inflation, providing complementary views of the contamination problem."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No human evaluation is included. All evaluation is automated (AUROC computation, Pass@1 with automated answer checking). Human expert review of detection failures or contamination evidence could have strengthened the analysis."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "For each benchmark, 'we randomly sample half of the questions as the member set (used for contamination) and leave the remaining half as the non-member set (for detection evaluation)' (Section 3). The member/non-member split provides clear separation."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down by each of the 6 benchmarks (OlympiadBench, GPQA, AIME25, AIME24, Minerva, AMC23) and by each detection method category (generation-based, perturbation-based, reference-based, reference-free) in all tables."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Extensive failure analysis is provided: log-probability distributions show why detection fails (Figures 3-4), the discussion in Section 4 explains why LRMs generalize rather than memorize, and embedding visualizations (Appendix E.6, Figures 10-12) show member/non-member indistinguishability."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "RAFT does NOT conceal contamination (Table 3, Δ=+2.03), RL contamination provides 'no significant difference compared to using a clean RL training set' (Section 3.1), and further SFT does NOT conceal contamination (Figure 2, Table 14). These are explicitly reported negative results."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims that (1) GRPO conceals contamination signals — supported by Table 2 showing AUROC drops averaging -14 to -20 points; (2) PPO-style clipping is the root cause — supported by Theorem 3.1 and ablation in Table 3; (3) detection methods perform near random on advanced LRMs — supported by Table 5 averaging ~50-55% AUROC."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The causal claim that 'PPO-style importance sampling and clipping objectives are the root cause of detection concealment' is supported by (1) theoretical analysis (Theorem 3.1), (2) controlled ablation removing clipping from RAFT++/GRPO (Table 3), and (3) comparison of RAFT (no clipping, no concealment) vs RAFT++ (with clipping, concealment)."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The title claims fragility of 'benchmark contamination detection in reasoning models' broadly, but experiments use only 7-8B parameter models (Qwen2.5-7B-Instruct, DeepSeek-R1-Distill-Llama-8B/Qwen-7B) on 6 math/science benchmarks. No results on larger models, coding benchmarks, or other reasoning domains. The paper does not bound its claims to the tested scale or domains."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Section 3.1 explicitly rules out alternative explanations: (1) 'simply training with more clean samples' by comparing continued SFT vs GRPO (Figure 2, Table 14); (2) 'further training makes models forget contamination' by showing Pass@1 inflation persists (Table 1). Section 4 discusses the alternative that LRMs generalize rather than memorize."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper measures AUROC for detection and Pass@1 for benchmark performance — both directly match the claimed constructs (detection effectiveness and performance inflation). No proxy gap exists between measurements and claims."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Specific model checkpoints are named: 'Qwen2.5-7B-Instruct', 'DeepSeek-R1-Distill-Llama-8B', 'DeepSeek-R1-Distill-Qwen-7B', 'QwQ-32B' for distillation, and 'bespokelabs/Bespoke-Stratos-7B' as the reference model. These are identifiable checkpoints."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Appendix D.4 provides the actual prompt templates used: the math reasoning template ('{question}\\nPlease reason step by step, and put your final answer within \\boxed{}.') and the multiple-choice template, with concrete examples showing how they are instantiated."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Appendix D.4 provides comprehensive hyperparameter tables for SFT (batch size 128, LR 4e-5, 5 epochs, cosine scheduler, etc.) and RL (batch size 64, ε=0.2, LR 1e-6, rollout num 4, temp 0.6) training, plus inference settings (temperature=0.6, top_p=0.95, max_new_tokens=32768)."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. The paper trains and evaluates language models directly without any scaffolding layer."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Appendix D.1 describes the contamination pipeline: 10K clean samples from OpenThoughts3, distillation with QwQ-32B using rejection sampling (64 rollouts), 3× replication of member data. Deduplication uses '13-gram overlap deduplication' against evaluation benchmarks. Table 6 shows the proportion of questions solved after distillation rollouts."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Appendix A contains a dedicated 'LIMITATIONS' section discussing the scope of the findings and acknowledging that no new detection algorithm is proposed."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "The limitations section (Appendix A) discusses the implications of their findings but does not address specific threats to their own study's validity, such as whether results generalize beyond 7-8B models, whether the small benchmark sizes (15 member items in AIME) introduce noise, or whether base model pre-training contamination could confound results."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "The paper does not explicitly state what the results do NOT show. It generalizes from a few 7-8B models to 'LRMs' broadly without bounding claims to the tested scale, model families, or benchmark domains."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "While code is released and benchmarks are public, the paper does not release raw detection scores, contaminated model checkpoints, or intermediate experimental data needed for independent verification of the reported AUROC and Pass@1 values."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Appendix D describes data collection in detail: contamination pipeline construction (D.1), detection method implementations (D.2), benchmark descriptions with sizes (D.3), and SFT/RL implementation specifics (D.4)."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. All data sources are standard public benchmarks (AIME, AMC, GPQA Diamond, OlympiadBench, Minerva Math) and public training datasets."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The full pipeline is documented: base model → SFT contamination (10K clean + 1,866 member samples after 3× replication = 11,866 samples) → RL training (4,096 clean + optional members). Detection pipeline: 8 rollouts per question → compute detection score per rollout → average. Deduplication step is documented."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding, acknowledgments, or grants section is present in the paper."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly listed: University of Illinois Urbana-Champaign and University of Washington. None of the authors are affiliated with the model providers being evaluated (DeepSeek, Qwen/Alibaba)."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "Cannot assess funder independence because no funding source is disclosed. The absence of a funding disclosure makes this unanswerable."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement appears in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No training data cutoff dates are stated for the base models (Qwen2.5-7B-Instruct, DeepSeek-R1-Distill models). While the paper controls its own contamination, the base models may have already been exposed to the evaluation benchmarks during pre-training."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": true,
    235         "justification": "Train/test overlap is the central topic. The paper explicitly splits benchmarks into member/non-member halves and uses 13-gram deduplication on clean training data against evaluation benchmarks to prevent unintended overlap."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": true,
    240         "justification": "Benchmark contamination is the entire subject of the paper. They control contamination as an experimental variable and explicitly address it through member/non-member splits and deduplication of clean data."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No inference costs, API costs, or per-example wall-clock times are reported. The paper generates many rollouts per question (8 for detection, 3-10 for Pass@1 across 6 benchmarks) but does not quantify the computational cost."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "Appendix F states hardware specs (9× NVIDIA L40S GPUs, 48 GiB each) but does not report total GPU hours, wall-clock training time, or total compute spent across all experiments."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No results across multiple random seeds are reported. The member/non-member split is performed once ('we randomly sample half of the questions') with no sensitivity analysis across different random splits."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "The paper states 'Each AUROC is averaged over detection scores from 8 rollouts' and 'we evaluate pass@1 and run 10 rollouts on AIME 2024 & 2025, AMC 2023, and 3 rollouts on OlympiadBench, GPQA Diamond, and Minerva Math' (Appendix D.4)."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No hyperparameter search is reported. The paper adopts SFT hyperparameters 'suggested by OpenThought3' and uses fixed RL hyperparameters without stating whether alternatives were explored."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "The paper uses recommended hyperparameters from prior work ('we adopt the SFT hyperparameters suggested by OpenThought3 for medium dataset scales', Appendix D.4), providing transparent justification for configuration choices rather than cherry-picking."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": false,
    315         "answer": false,
    316         "justification": "No statistical tests are performed, so multiple comparison correction is inapplicable."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors re-implement 10 detection methods from other groups but do not acknowledge the potential bias of their own implementations of these baselines. No independent evaluation or verification of implementation correctness is discussed."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": true,
    326         "justification": "Figure 2 and Tables 11-12 show AUROC as a function of RL training steps (0, 64, 110, 156 steps), providing performance-vs-compute curves that reveal the monotonic decline in detection with more training."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The paper uses AUROC as the sole measure of detection effectiveness without discussing whether it adequately captures real-world detection utility (e.g., at what false positive rate a method becomes useful). No discussion of whether the member/non-member random split on small benchmarks is a valid construct for evaluating practical detectability."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No scaffolding is used. Models are trained and evaluated directly."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "The paper does not discuss whether the base models (Qwen2.5-7B-Instruct, DeepSeek-R1-Distill) may have already been exposed to the evaluation benchmarks (AIME, GPQA, etc.) during pre-training. AIME 2024 problems may have appeared in 2025-era model training data."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether the evaluation setup inadvertently leaks information. For example, non-members from the same benchmark may share structural features with members, making them not truly 'unseen' in distribution."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "Members and non-members are randomly split from the same benchmarks but share strong distributional similarity (same competition, same difficulty distribution). This non-independence is not discussed and could affect AUROC measurements."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": true,
    358         "justification": "The paper uses '13-gram overlap deduplication' on clean training datasets against evaluation benchmarks 'to ensure conclusive results' (Appendix D.4). This is a concrete decontamination method applied to their own training data."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "Contamination inflation mainly comes from SFT, not RL training",
    365       "evidence": "Table 1 shows SFT contamination yields 8.82% additional Pass@1 inflation across six benchmarks, while RL contamination shows 'no significant difference compared to using a clean RL training set.'",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "SFT contamination is initially detectable by existing methods",
    370       "evidence": "Table 2 shows reference-free methods achieve ~73-75% AUROC and LiRA achieves 89.13% AUROC on SFT-contaminated models before RL training.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "GRPO training conceals SFT contamination evidence across all detection methods",
    375       "evidence": "Table 2 shows consistent AUROC decreases after GRPO across all 10 detection methods and 6 benchmarks. Average drops range from -0.60 (Verbatim) to -19.84 (Max-K%). Figure 2 shows monotonic AUROC decline with more GRPO steps.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "PPO-style importance sampling and clipping is the root cause of detection concealment",
    380       "evidence": "Theorem 3.1 provides theoretical analysis; Table 3 shows ablation where removing clipping from GRPO raises AUROC from 61.26% to 73.28% (Δ from -14.22 to -2.20), and RAFT (no clipping) shows Δ=+2.03 while RAFT++ (with clipping) shows Δ=-17.91.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "Extensive SFT contamination with CoT on advanced LRMs leaves little detectable evidence",
    385       "evidence": "Table 5 shows almost all detection methods perform near random guessing (AUROC ≈50%) across three advanced LRMs and six benchmarks. Best method (LiRA) achieves only 58.74% average AUROC.",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "LRMs generalize from contamination rather than memorizing specific sequences",
    390       "evidence": "Figures 4, 8, and 9 show that after SFT contamination, log-probability distributions shift upward for both members and non-members at similar margins, and embedding visualizations (Figures 10-12) show member/non-member embeddings are indistinguishable. Discussed in Section 4.",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "Further GRPO training preserves performance inflation while reducing detectable evidence",
    395       "evidence": "Table 1 shows the SFT+GRPO contaminated model retains 7.14% average performance inflation vs clean SFT, while Table 2 and Figure 2 show AUROC drops toward random guessing.",
    396       "supported": "strong"
    397     }
    398   ],
    399   "red_flags": [
    400     {
    401       "flag": "No uncertainty quantification",
    402       "detail": "No error bars, confidence intervals, or standard deviations are reported for any AUROC or Pass@1 results despite averaging over multiple rollouts. With very small evaluation sets (15 member items for AIME), point estimates may be highly unstable."
    403     },
    404     {
    405       "flag": "Very small evaluation set sizes",
    406       "detail": "AIME 2024 and 2025 have only 30 problems each (15 members, 15 non-members). AMC 2023 has 40 (20/20). AUROC computed on 15-20 items is inherently noisy and may not reliably discriminate between methods. No discussion of the statistical power implications."
    407     },
    408     {
    409       "flag": "Limited model scale",
    410       "detail": "All experiments use 7-8B parameter models only. The paper's claims about 'LRMs' broadly are not tested at larger scales (e.g., 70B, 405B) where contamination dynamics and detection may differ substantially."
    411     },
    412     {
    413       "flag": "No significance testing for key claims",
    414       "detail": "Claims that detection methods 'perform near random guess' or that GRPO 'conceals' contamination are based on comparing raw numbers without statistical tests. Some reported AUROC values (e.g., 55-62%) may not be statistically distinguishable from 50% given the small sample sizes."
    415     }
    416   ],
    417   "cited_papers": [
    418     {
    419       "title": "Leak, cheat, repeat: Data contamination and evaluation malpractices in closed-source LLMs",
    420       "authors": ["Simone Balloccu", "Patrícia Schmidtová", "Mateusz Lango", "Ondřej Dušek"],
    421       "year": 2024,
    422       "arxiv_id": "2402.03927",
    423       "relevance": "Documents data contamination and evaluation malpractices in closed-source LLMs, directly relevant to benchmark integrity."
    424     },
    425     {
    426       "title": "Detecting pretraining data from large language models",
    427       "authors": ["Weijia Shi", "Anirudh Ajith", "Mengzhou Xia"],
    428       "year": 2023,
    429       "arxiv_id": "2310.16789",
    430       "relevance": "Proposes Min-K% detection method for pre-training data, a key baseline evaluated in this paper."
    431     },
    432     {
    433       "title": "Generalization or memorization: Data contamination and trustworthy evaluation for large language models",
    434       "authors": ["Yihong Dong", "Xue Jiang", "Huanyu Liu"],
    435       "year": 2024,
    436       "arxiv_id": "2402.15938",
    437       "relevance": "Proposes the CDD generation-based contamination detection method evaluated in this paper."
    438     },
    439     {
    440       "title": "DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning",
    441       "authors": ["Daya Guo", "Dejian Yang", "Haowei Zhang"],
    442       "year": 2025,
    443       "arxiv_id": "2501.12948",
    444       "relevance": "Foundational LRM work whose distilled models are used as experimental subjects for Stage II contamination."
    445     },
    446     {
    447       "title": "The emperor's new clothes in benchmarking? A rigorous examination of mitigation strategies for LLM benchmark data contamination",
    448       "authors": ["Yifan Sun", "Han Wang", "Dongbai Li"],
    449       "year": 2025,
    450       "arxiv_id": "2503.16402",
    451       "relevance": "Examines mitigation strategies for LLM benchmark contamination, directly related to evaluation integrity."
    452     },
    453     {
    454       "title": "Does data contamination detection work (well) for LLMs? A survey and evaluation on detection assumptions",
    455       "authors": ["Yujuan Fu", "Ozlem Uzuner", "Meliha Yetisgen", "Fei Xia"],
    456       "year": 2024,
    457       "arxiv_id": "2410.18966",
    458       "relevance": "Surveys contamination detection assumptions and their effectiveness, providing foundational context for this work."
    459     },
    460     {
    461       "title": "Evading data contamination detection for language models is (too) easy",
    462       "authors": ["Jasper Dekoninck", "Mark Niklas Müller", "Maximilian Baader"],
    463       "year": 2024,
    464       "arxiv_id": "2402.02823",
    465       "relevance": "Demonstrates evasion of contamination detection through benchmark augmentation, a precursor to this paper's findings on RL-based concealment."
    466     },
    467     {
    468       "title": "Reasoning or memorization? Unreliable results of reinforcement learning due to data contamination",
    469       "authors": ["Mingqi Wu", "Zhihao Zhang", "Qiaole Dong"],
    470       "year": 2025,
    471       "arxiv_id": "2507.10532",
    472       "relevance": "Studies how RL training results become unreliable due to data contamination, complementary to this paper's findings."
    473     },
    474     {
    475       "title": "How much do language models memorize?",
    476       "authors": ["John X Morris", "Chawin Sitawarin", "Chuan Guo"],
    477       "year": 2025,
    478       "arxiv_id": "2505.24832",
    479       "relevance": "Studies memorization in language models, providing theoretical grounding for the memorization assumptions that this paper challenges."
    480     },
    481     {
    482       "title": "Benchmark data contamination of large language models: A survey",
    483       "authors": ["Cheng Xu", "Shuhao Guan", "Derek Greene"],
    484       "year": 2024,
    485       "arxiv_id": "2406.04244",
    486       "relevance": "Comprehensive survey of benchmark contamination in LLMs, providing the taxonomic framework for detection methods."
    487     },
    488     {
    489       "title": "LLMs can easily learn to reason from demonstrations structure, not content, is what matters!",
    490       "authors": ["Dacheng Li", "Shiyi Cao", "Tyler Griggs"],
    491       "year": 2025,
    492       "arxiv_id": "2502.07374",
    493       "relevance": "Demonstrates that LLMs learn reasoning structure rather than content from demonstrations, relevant to why contamination with CoT generalizes."
    494     },
    495     {
    496       "title": "DeepSeekMath: Pushing the limits of mathematical reasoning in open language models",
    497       "authors": ["Zhihong Shao", "Peiyi Wang", "Qihao Zhu"],
    498       "year": 2024,
    499       "arxiv_id": "2402.03300",
    500       "relevance": "Introduces the GRPO training algorithm, which this paper identifies as a key mechanism for contamination concealment."
    501     }
    502   ],
    503   "engagement_factors": {
    504     "practical_relevance": {
    505       "score": 1,
    506       "justification": "Findings inform contamination detection researchers and leaderboard operators, but are not directly usable as a tool by practitioners."
    507     },
    508     "surprise_contrarian": {
    509       "score": 3,
    510       "justification": "Directly challenges the widely-held assumption that contamination detection methods work, showing they are trivially evaded by standard RL training."
    511     },
    512     "fear_safety": {
    513       "score": 2,
    514       "justification": "Raises alarm about the integrity of LLM leaderboards and the ease of undetectable benchmark gaming, undermining trust in model evaluations."
    515     },
    516     "drama_conflict": {
    517       "score": 3,
    518       "justification": "Strong 'leaderboards are fake' angle — demonstrates that model developers could easily contaminate LRMs to achieve inflated rankings while evading all 10 tested detection methods."
    519     },
    520     "demo_ability": {
    521       "score": 1,
    522       "justification": "Code is released on GitHub but this is a research pipeline requiring significant compute (9× L40S GPUs), not something easily tried."
    523     },
    524     "brand_recognition": {
    525       "score": 1,
    526       "justification": "University researchers (UIUC, UW) without major lab affiliation. Uses DeepSeek models which have moderate recognition."
    527     }
    528   }
    529 }

Impressum · Datenschutz