scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30706B)
      1 {
      2   "paper": {
      3     "title": "Detecting Benchmark Contamination Through Watermarking",
      4     "authors": [
      5       "Tom Sander",
      6       "Pierre Fernandez",
      7       "Saeed Mahloujifar",
      8       "Alain Durmus",
      9       "Chuan Guo"
     10     ],
     11     "year": 2025,
     12     "venue": "arXiv.org",
     13     "arxiv_id": "2502.17259",
     14     "doi": "10.48550/arXiv.2502.17259"
     15   },
     16   "scan_version": 3,
     17   "active_modules": ["experimental_rigor", "data_leakage"],
     18   "methodology_tags": ["benchmark-eval"],
     19   "key_findings": "Watermarking benchmark questions via LLM rephrasing preserves benchmark utility (models rank identically on original and watermarked versions) while enabling reliable contamination detection through radioactivity tests. Detection confidence scales with watermark strength, number of contaminations, and benchmark size — achieving p-values below 10^-5 when accuracy is inflated by just 5% on 5000 MMLU questions. The method works across different tokenizers with reasonable confidence loss, and substantially outperforms canary-based detection, which fails even with 10x more contamination exposures.",
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "No code repository URL is provided in the paper. They reference Meta Lingua (Videau et al., 2024) as the training framework but do not release their own watermarking or detection code."
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "The original benchmarks (ARC, MMLU) are public, but the watermarked versions of the benchmarks and the trained model checkpoints are not released."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "They mention A-100 GPUs with 80GB memory and the Meta Lingua framework, but provide no requirements.txt, Dockerfile, or detailed library version listing sufficient to recreate the environment."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No step-by-step reproduction instructions are provided. The method and hyperparameters are described but there is no README, script, or explicit guide to replicate experiments."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "Accuracy numbers and p-values are reported as point estimates. No confidence intervals or error bars appear on accuracy results. The paper notes that [-1, 0] is a 90% CI for log10(p) under H0, but this is a theoretical property of the test, not an uncertainty measure on the main results."
     48       },
     49       "significance_tests": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "The core contribution is a binomial significance test for contamination detection (Equation 2, Proposition 1). P-values are reported throughout (Table 1, Figures 3b, 4, 8). The test has a formally proven False Positive Rate equal to α."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Table 1 reports accuracy improvements with baseline context: e.g., '+4.3%' (from 53.5% to 57.9%), '+9.5%' (from 53.5% to 63.0%), '+18.2%' (from 53.5% to 71.7%) for ARC-Easy at different contamination levels."
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "MMLU* uses 5000 questions 'to accelerate experimentation and maintain a comparable size to the other benchmarks,' but no power analysis or formal justification for any of the benchmark sizes is provided."
     63       },
     64       "variance_reported": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "All results appear to be from single training runs. No standard deviation, variance across seeds, or spread measures are reported for accuracy or detection metrics."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Section 4.3 compares against canary-based contamination detection as an alternative baseline (Table 3). They also compare contaminated vs. uncontaminated models."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "The canary method from BIG-bench (Srivastava et al., 2022) is a standard and commonly used approach for contamination detection. Membership inference attacks (Duan et al., 2024) are discussed as contemporary alternatives that are ineffective."
     80       },
     81       "ablation_study": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Section 4.3 provides extensive ablations: watermark strength δ (0, 1, 2, 4), window size k (0, 1, 2), model size (135M, 360M, 1B), number of contaminations (0, 4, 8, 16), rephrasing model (8B vs 70B), and tokenizer (Section 4.4)."
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Results are reported on two distinct metrics: benchmark accuracy (performance impact) and detection confidence (log10 p-value from the radioactivity test). Both are reported in Table 1 and throughout Section 4."
     90       },
     91       "human_evaluation": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No human evaluation of the watermarked questions' quality. Benchmark utility is assessed entirely through automated comparison of model performance on original vs. watermarked versions."
     95       },
     96       "held_out_test_set": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Models are trained with controlled contamination and evaluated on the benchmark test sets. An out-of-distribution (OOD) evaluation template is used to further separate training and evaluation contexts (Section 4.2)."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Results are broken down per benchmark (ARC-Easy, ARC-Challenge, MMLU*) in Table 1, per model size in Figure 4, per window size in Table 2 and Figures 7-8, and per watermark strength in Figure 3."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Figure 6 shows a concrete rephrasing failure on a math question. Section 4.3 discusses how smaller benchmarks lead to weaker detection (ARC-Challenge with 4 contaminations has 'doubtful' p-value). Section 5 discusses evasion scenarios."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Table 3 shows the canary-based method fails even with 160 contaminations (p-val=0.19). They report that detection on ARC-Challenge with 4 contaminations is 'doubtful' (log10(p)=-1.2). Figure 6 shows the 8B rephrasing model failing on math questions."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The abstract claims 'p-val = 10^-3 for +5% on ARC-Easy.' Table 1 shows 4 contaminations on ARC-Easy: log10(p)=-3.0, Acc +4.3%, which is consistent. Claims of preserved utility are supported by Figure 3a and Figure 7."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The paper's experimental design controls contamination levels by training models from scratch with and without benchmark data, enabling causal inference about the relationship between contamination and detection confidence. This is a controlled experiment with single-variable manipulation."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "Results are tested on 3 benchmarks, 3 model sizes, and 4 tokenizer families. The limitations section explicitly bounds scope: 'primarily designed for unintentional contamination' and acknowledges that math/code questions are harder to watermark."
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": true,
    136         "justification": "Proposition 1 explicitly distinguishes three meanings of 'contaminated': (i) trained on test set, (ii) memorized the watermark, (iii) performance artificially enhanced. The paragraph below Proposition 1 acknowledges the scenario where a model performs well without overfitting to watermark biases."
    137       },
    138       "proxy_outcome_distinction": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "The paper explicitly distinguishes between what is measured (watermark memorization, Definition ii) and what is claimed (benchmark contamination). The text below Proposition 1 states: 'This is distinct from the hypothesis that the model's test performance is not artificially enhanced due to training on the benchmark.'"
    142       }
    143     },
    144     "setup_transparency": {
    145       "model_versions_specified": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "Specific model versions are given: 'Llama-3.1-8B-Instruct' for rephrasing, 'Llama-3.2-1B', 'Llama-3.2-3B', 'Llama-3.1-8B' for evaluation. Tokenizer experiments use Llama-1/2, Llama-3, Gemma-1/2, Gemma-3 (Table 4)."
    149       },
    150       "prompts_provided": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Figure 5 provides the exact system prompt ('You are a problem rephrasing assistant...') and instruction. Section 4.2 provides the exact evaluation templates for both in-distribution and OOD evaluation."
    154       },
    155       "hyperparameters_reported": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Section 4.1 reports top-p=0.7, temperature=0.5, k=2, γ=50%. Section 4.2 reports hidden dim=2048, 25 layers, 16 attention heads, LR=3×10^-3, weight decay=0.033, warmup=5000 steps, batch size=4, sequence length=4096, gradient clipping=1.0."
    159       },
    160       "scaffolding_described": {
    161         "applies": false,
    162         "answer": false,
    163         "justification": "No agentic scaffolding is used. The method involves direct LLM generation for rephrasing and standard model training/evaluation."
    164       },
    165       "data_preprocessing_documented": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section 4.2 describes the contamination procedure in detail: batches taken between steps 2500-7500, format template provided, DCLM as base training data. Section 4.1 describes MMLU* as a random subset of 5000 questions across all disciplines."
    169       }
    170     },
    171     "limitations_and_scope": {
    172       "limitations_section_present": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "Section 5 is titled 'Limitations & Conclusion' and contains two substantive bullet points addressing rephrasing impact and intentional evasion."
    176       },
    177       "threats_to_validity_specific": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Section 5 identifies study-specific threats: 'some questions lose coherence after rephrasing' (Figure 6 demonstrates this), and 'Malicious actors could rephrase questions to weaken the watermark or train only on answers conditioned on questions, which would bypass radioactivity detection.'"
    181       },
    182       "scope_boundaries_stated": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "The paper states the method is 'primarily designed for unintentional contamination.' It explicitly acknowledges that intentional evasion (rephrasing to remove watermarks, training only on answers) would bypass detection. Section 4.3 notes that math/code questions present challenges for rephrasing."
    186       }
    187     },
    188     "data_integrity": {
    189       "raw_data_available": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "No trained model checkpoints, watermarked benchmark datasets, or raw experimental outputs are released for independent verification."
    193       },
    194       "data_collection_described": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "Training data source is DCLM (Li et al., 2024). Benchmarks are ARC-Easy (1172 questions), ARC-Challenge (2372 questions), MMLU* (5000 randomly selected questions). Contamination injection procedure is fully described in Section 4.2."
    198       },
    199       "recruitment_methods_described": {
    200         "applies": false,
    201         "answer": false,
    202         "justification": "No human participants. Data sources are standard public benchmarks (ARC, MMLU) and a public training corpus (DCLM)."
    203       },
    204       "data_pipeline_documented": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "The full pipeline is documented: benchmark selection → watermarked rephrasing with specified parameters → training with controlled contamination injection at specified steps → evaluation with specified templates → radioactivity detection scoring."
    208       }
    209     },
    210     "conflicts_of_interest": {
    211       "funding_disclosed": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No funding or acknowledgments section is present. Authors are Meta FAIR employees and École polytechnique affiliates, implying corporate funding, but this is not explicitly disclosed."
    215       },
    216       "affiliations_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "Author affiliations are clearly stated: 'Meta FAIR' and 'École polytechnique CMAP' in the paper header."
    220       },
    221       "funder_independent_of_outcome": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "Meta has a vested interest in the integrity of LLM benchmarks, as it trains and releases Llama models that are evaluated on them. The paper uses exclusively Meta models (Llama family) for all experiments. No statement of funder independence."
    225       },
    226       "financial_interests_declared": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No competing interests statement or financial disclosure section is present in the paper."
    230       }
    231     },
    232     "contamination": {
    233       "training_cutoff_stated": {
    234         "applies": true,
    235         "answer": true,
    236         "justification": "The paper trains models from scratch with fully controlled training data (10B tokens from DCLM with specified contamination injections). The entire training pipeline is described, so the 'cutoff' is effectively complete — the authors know exactly what the model saw."
    237       },
    238       "train_test_overlap_discussed": {
    239         "applies": true,
    240         "answer": true,
    241         "justification": "Train-test overlap is the central topic of the paper. They intentionally control contamination levels (0, 4, 8, 16 injections) and analyze the resulting overlap's impact on detection and performance."
    242       },
    243       "benchmark_contamination_addressed": {
    244         "applies": true,
    245         "answer": true,
    246         "justification": "Benchmark contamination is the paper's primary research question. The entire method is designed to detect it, and experiments systematically vary contamination levels to validate detection."
    247       }
    248     },
    249     "human_studies": {
    250       "pre_registered": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in this study. All experiments involve training and evaluating language models."
    254       },
    255       "irb_or_ethics_approval": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in this study."
    259       },
    260       "demographics_reported": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in this study."
    264       },
    265       "inclusion_exclusion_criteria": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in this study."
    269       },
    270       "randomization_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in this study."
    274       },
    275       "blinding_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in this study."
    279       },
    280       "attrition_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants in this study."
    284       }
    285     },
    286     "cost_and_practicality": {
    287       "inference_cost_reported": {
    288         "applies": true,
    289         "answer": true,
    290         "justification": "Appendix C states: 'Each radioactivity detection test took less than 30 minutes on a single GPU.' This quantifies the cost of the detection method."
    291       },
    292       "compute_budget_stated": {
    293         "applies": true,
    294         "answer": true,
    295         "justification": "Appendix C provides detailed compute: '1B models was conducted on 8 nodes (so 64 GPUs) and took approximately six hours.' Total budget estimated at 'approximately 10,000 GPU hours' including all experiments."
    296       }
    297     },
    298     "experimental_rigor": {
    299       "seed_sensitivity_reported": {
    300         "applies": true,
    301         "answer": false,
    302         "justification": "No mention of multiple random seeds. All experiments appear to be single-run training with single-seed evaluation."
    303       },
    304       "number_of_runs_stated": {
    305         "applies": true,
    306         "answer": false,
    307         "justification": "The number of experimental runs is not explicitly stated. Results appear to come from single training runs for each configuration."
    308       },
    309       "hyperparameter_search_budget": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "No hyperparameter search budget is reported. Training hyperparameters are listed but no indication of how they were selected or how many configurations were tried."
    313       },
    314       "best_config_selection_justified": {
    315         "applies": true,
    316         "answer": true,
    317         "justification": "Results are reported across all tested configurations (multiple δ values, window sizes, model sizes, contamination levels, tokenizers) rather than selecting a single best. The paper sweeps over parameter space rather than optimizing."
    318       },
    319       "multiple_comparison_correction": {
    320         "applies": true,
    321         "answer": false,
    322         "justification": "Multiple statistical tests are performed across 3 benchmarks, 4 contamination levels, 4 watermark strengths, and 3 window sizes. No correction for multiple comparisons is applied, though many p-values are so extreme (10^-12) that correction would not change conclusions."
    323       },
    324       "self_comparison_bias_addressed": {
    325         "applies": true,
    326         "answer": false,
    327         "justification": "The authors propose the watermarking method and evaluate it themselves. No discussion of self-comparison bias or independent evaluation."
    328       },
    329       "compute_budget_vs_performance": {
    330         "applies": true,
    331         "answer": false,
    332         "justification": "While different model sizes are tested (135M, 360M, 1B), performance is not systematically analyzed as a function of compute budget. The focus is on contamination level, not compute."
    333       },
    334       "benchmark_construct_validity": {
    335         "applies": true,
    336         "answer": false,
    337         "justification": "No discussion of whether ARC-Easy, ARC-Challenge, and MMLU actually measure the capabilities they claim to measure. The benchmarks are treated as given without questioning their construct validity."
    338       },
    339       "scaffold_confound_addressed": {
    340         "applies": false,
    341         "answer": false,
    342         "justification": "No scaffolding is used. Models are evaluated directly via loss comparison on answer choices."
    343       }
    344     },
    345     "data_leakage": {
    346       "temporal_leakage_addressed": {
    347         "applies": true,
    348         "answer": true,
    349         "justification": "The paper trains models from scratch on controlled data, fully addressing temporal leakage by design. The authors know exactly when each piece of data enters training (contamination injected between steps 2500 and 7500)."
    350       },
    351       "feature_leakage_addressed": {
    352         "applies": true,
    353         "answer": true,
    354         "justification": "Section 4.2 uses an out-of-distribution (OOD) evaluation template distinct from the contamination template to ensure the model isn't merely memorizing the formatting context. Results are reported for both in-distribution and OOD evaluation."
    355       },
    356       "non_independence_addressed": {
    357         "applies": true,
    358         "answer": true,
    359         "justification": "Each benchmark uses a different secret watermarking key s, ensuring independence between detection tests. The base training corpus (DCLM) is separate from the benchmarks, and the paper controls exactly which benchmark data enters training."
    360       },
    361       "leakage_detection_method": {
    362         "applies": true,
    363         "answer": true,
    364         "justification": "The paper's entire contribution IS a concrete leakage detection method: watermark embedding via rephrasing followed by a binomial radioactivity test with formal p-value guarantees (Proposition 1, Algorithm 1)."
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "Watermarking benchmark questions through LLM rephrasing preserves benchmark utility — models perform similarly and rank identically on watermarked vs. original versions.",
    371       "evidence": "Figure 3a shows Llama-3.2-1B, 3B, and 8B perform comparably on original and watermarked ARC-Easy across all δ values (0-4). Figure 7 extends this across all benchmarks and window sizes. Some MMLU* discrepancies noted but occur even without watermarking.",
    372       "supported": "strong"
    373     },
    374     {
    375       "claim": "Contamination detection achieves p-value below 10^-5 when accuracy is inflated by 5% on 5000 MMLU questions.",
    376       "evidence": "Table 1 shows 4 contaminations on MMLU*: log10(p)=-5.7 with +5.1% accuracy gain. For ARC-Easy with 4 contaminations: log10(p)=-3.0 with +4.3% gain.",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "Detection confidence increases with both watermark strength δ and number of contaminations.",
    381       "evidence": "Figure 3b and Table 1 show monotonic increase in -log10(p) across contamination levels (0, 4, 8, 16) and δ values (0, 1, 2, 4). With 16 contaminations and δ=4, all benchmarks achieve log10(p) < -12.",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "The method works across different tokenizers with reasonable loss in detection confidence.",
    386       "evidence": "Table 4 shows detection remains reliable across Llama-1/2, Llama-3, Gemma-1/2, and Gemma-3 tokenizers. log10(p) ranges from -7 (Llama-1/2) to -15 (Gemma-3) with contamination, and is near 0 without.",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "Watermark radioactivity substantially outperforms canary-based contamination detection.",
    391       "evidence": "Table 3 shows a 64-digit canary seen 160 times (10x more than max watermark contaminations) yields p-val=0.19, while watermarking achieves p-val < 10^-12 with just 16 contaminations.",
    392       "supported": "strong"
    393     },
    394     {
    395       "claim": "For a fixed performance gain, p-values are consistent across model sizes.",
    396       "evidence": "Section 4.3 and Figure 4 show that 1B (4 contaminations), 360M (8 contaminations), and 135M (16 contaminations) all achieve ~+6% gain on MMLU* with detection p-values around 10^-5.",
    397       "supported": "moderate"
    398     }
    399   ],
    400   "red_flags": [
    401     {
    402       "flag": "Meta evaluating using exclusively Meta models",
    403       "detail": "All experiments use Meta's Llama models for both watermark embedding and evaluation. While the paper is about a detection method rather than model capability, there is no validation with non-Meta models for the rephrasing step. The authors partially mitigate this by testing detection with different tokenizers, including non-Meta ones (Gemma)."
    404     },
    405     {
    406       "flag": "No variance across experimental runs",
    407       "detail": "All results appear to be from single training runs. Given the stochasticity of pretraining, detection p-values and accuracy numbers could vary across seeds. The absence of multi-seed results means the reported p-values may not be representative."
    408     },
    409     {
    410       "flag": "No code or data release",
    411       "detail": "Despite proposing a method that benchmark creators should adopt, neither the watermarking code, detection code, watermarked benchmarks, nor model checkpoints are released, making independent verification impossible."
    412     }
    413   ],
    414   "cited_papers": [
    415     {
    416       "title": "A watermark for large language models",
    417       "authors": ["John Kirchenbauer", "Jonas Geiping", "Yuxin Wen", "Jonathan Katz", "Ian Miers", "Tom Goldstein"],
    418       "year": 2023,
    419       "arxiv_id": "2301.10226",
    420       "relevance": "Foundation watermarking scheme (green/red list) that this paper builds upon for benchmark contamination detection."
    421     },
    422     {
    423       "title": "Watermarking makes language models radioactive",
    424       "authors": ["Tom Sander", "Pierre Fernandez", "Alain Durmus", "Matthijs Douze", "Teddy Furon"],
    425       "year": 2024,
    426       "arxiv_id": "2402.14904",
    427       "relevance": "Prior work by same authors demonstrating watermark radioactivity in fine-tuning; this paper extends the concept to pre-training contamination detection."
    428     },
    429     {
    430       "title": "Evaluation data contamination in LLMs: how do we measure it and (when) does it matter?",
    431       "authors": ["Aaditya K Singh", "Muhammed Yusuf Kocyigit", "Andrew Poulton", "David Esiobu", "Maria Lomeli", "Gergely Szilvasy", "Dieuwke Hupkes"],
    432       "year": 2024,
    433       "arxiv_id": "2411.03923",
    434       "relevance": "Directly addresses the benchmark contamination measurement problem that this paper proposes to solve via watermarking."
    435     },
    436     {
    437       "title": "Leak, cheat, repeat: Data contamination and evaluation malpractices in closed-source LLMs",
    438       "authors": ["Simone Balloccu", "Patrícia Schmidtová", "Mateusz Lango", "Ondřej Dušek"],
    439       "year": 2024,
    440       "arxiv_id": "2402.03927",
    441       "relevance": "Documents data contamination and evaluation malpractice in closed-source LLMs, motivating the need for proactive contamination detection."
    442     },
    443     {
    444       "title": "A careful examination of large language model performance on grade school arithmetic",
    445       "authors": ["Hugh Zhang", "Jeff Da", "Dean Lee", "Vaughn Robinson", "Catherine Wu", "Will Song"],
    446       "year": 2024,
    447       "arxiv_id": "2405.00332",
    448       "relevance": "Demonstrated significant performance drops on new GSM8K-like questions, providing key evidence of benchmark contamination in LLMs."
    449     },
    450     {
    451       "title": "Do membership inference attacks work on large language models?",
    452       "authors": ["Michael Duan", "Anshuman Suri", "Niloofar Mireshghallah", "Sewon Min", "Weijia Shi"],
    453       "year": 2024,
    454       "arxiv_id": "2402.07841",
    455       "relevance": "Shows membership inference is ineffective for LLMs in realistic scenarios, motivating alternative contamination detection approaches like watermarking."
    456     },
    457     {
    458       "title": "Investigating data contamination for pre-training language models",
    459       "authors": ["Minhao Jiang", "Ken Ziyu Liu", "Ming Zhong", "Rylan Schaeffer", "Siru Ouyang", "Jiawei Han", "Sanmi Koyejo"],
    460       "year": 2024,
    461       "arxiv_id": "2401.06059",
    462       "relevance": "Studies data contamination during pre-training showing even small models exhibit benchmark performance gains from contamination."
    463     },
    464     {
    465       "title": "Quantifying memorization across neural language models",
    466       "authors": ["Nicholas Carlini", "Daphne Ippolito", "Matthew Jagielski", "Katherine Lee", "Florian Tramer", "Chiyuan Zhang"],
    467       "year": 2022,
    468       "arxiv_id": "2202.07646",
    469       "relevance": "Foundational work on quantifying memorization in language models, closely related to the contamination detection problem."
    470     },
    471     {
    472       "title": "Measuring massive multitask language understanding",
    473       "authors": ["Dan Hendrycks", "Collin Burns", "Steven Basart", "Andy Zou", "Mantas Mazeika", "Dawn Song", "Jacob Steinhardt"],
    474       "year": 2020,
    475       "arxiv_id": "2009.03300",
    476       "relevance": "Introduces MMLU, one of the three benchmarks used in this paper's contamination detection experiments."
    477     },
    478     {
    479       "title": "The Llama 3 herd of models",
    480       "authors": ["Abhimanyu Dubey", "Abhinav Jauhri", "Abhinav Pandey"],
    481       "year": 2024,
    482       "arxiv_id": "2407.21783",
    483       "relevance": "Describes Llama-3 models used throughout this paper for both watermark embedding and model evaluation."
    484     },
    485     {
    486       "title": "WARD: Provable RAG dataset inference via LLM watermarks",
    487       "authors": ["Nikola Jovanović", "Robin Staab", "Maximilian Baader", "Martin Vechev"],
    488       "year": 2024,
    489       "arxiv_id": "2410.03537",
    490       "relevance": "Applies watermark radioactivity concepts to RAG systems, extending the contamination detection paradigm to retrieval-augmented generation."
    491     },
    492     {
    493       "title": "Rethinking benchmark and contamination for language models with rephrased samples",
    494       "authors": ["Shuo Yang", "Wei-Lin Chiang", "Lianmin Zheng", "Joseph E Gonzalez", "Ion Stoica"],
    495       "year": 2023,
    496       "arxiv_id": "2311.04850",
    497       "relevance": "Shows training on reformulated questions boosts performance on originals, establishing that contamination via rephrasing is a real threat."
    498     }
    499   ],
    500   "engagement_factors": {
    501     "practical_relevance": {
    502       "score": 2,
    503       "justification": "Benchmark creators could adopt this watermarking technique before releasing new benchmarks, though no code is provided for immediate use."
    504     },
    505     "surprise_contrarian": {
    506       "score": 1,
    507       "justification": "Confirms the known problem of benchmark contamination and proposes a solution; does not challenge conventional wisdom."
    508     },
    509     "fear_safety": {
    510       "score": 1,
    511       "justification": "Highlights the unreliability of current benchmark evaluations due to contamination, a mild integrity concern rather than a safety threat."
    512     },
    513     "drama_conflict": {
    514       "score": 2,
    515       "justification": "Feeds into the 'benchmarks are unreliable' narrative and implicitly suggests models may be scoring higher than they should due to contamination."
    516     },
    517     "demo_ability": {
    518       "score": 0,
    519       "justification": "No code, demo, or tool released; the method cannot be tried without reimplementation."
    520     },
    521     "brand_recognition": {
    522       "score": 2,
    523       "justification": "Meta FAIR is a well-known AI research lab, though the paper is not about a flagship product."
    524     }
    525   }
    526 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs