scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (32272B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Detecting Benchmark Contamination Through Watermarking",
      6     "authors": [
      7       "Tom Sander",
      8       "Pierre Fernandez",
      9       "Saeed Mahloujifar",
     10       "Alain Durmus",
     11       "Chuan Guo"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2502.17259",
     16     "doi": "10.48550/arXiv.2502.17259"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The abstract claims 'p-val = 10^-3 for +5% on ARC-Easy.' Table 1 shows 4 contaminations on ARC-Easy: log10(p)=-3.0, Acc +4.3%, which is consistent. Claims of preserved utility are supported by Figure 3a and Figure 7.",
     24         "source": "opus"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The paper's experimental design controls contamination levels by training models from scratch with and without benchmark data, enabling causal inference about the relationship between contamination and detection confidence. This is a controlled experiment with single-variable manipulation.",
     30         "source": "opus"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "Results are tested on 3 benchmarks, 3 model sizes, and 4 tokenizer families. The limitations section explicitly bounds scope: 'primarily designed for unintentional contamination' and acknowledges that math/code questions are harder to watermark.",
     36         "source": "opus"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Proposition 1 explicitly distinguishes three meanings of 'contaminated': (i) trained on test set, (ii) memorized the watermark, (iii) performance artificially enhanced. The paragraph below Proposition 1 acknowledges the scenario where a model performs well without overfitting to watermark biases.",
     42         "source": "opus"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper explicitly distinguishes between what is measured (watermark memorization, Definition ii) and what is claimed (benchmark contamination). The text below Proposition 1 states: 'This is distinct from the hypothesis that the model's test performance is not artificially enhanced due to training on the benchmark.'",
     48         "source": "opus"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 5 is titled 'Limitations & Conclusion' and contains two substantive bullet points addressing rephrasing impact and intentional evasion.",
     56         "source": "opus"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Section 5 identifies study-specific threats: 'some questions lose coherence after rephrasing' (Figure 6 demonstrates this), and 'Malicious actors could rephrase questions to weaken the watermark or train only on answers conditioned on questions, which would bypass radioactivity detection.'",
     62         "source": "opus"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The paper states the method is 'primarily designed for unintentional contamination.' It explicitly acknowledges that intentional evasion (rephrasing to remove watermarks, training only on answers) would bypass detection. Section 4.3 notes that math/code questions present challenges for rephrasing.",
     68         "source": "opus"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding or acknowledgments section is present. Authors are Meta FAIR employees and École polytechnique affiliates, implying corporate funding, but this is not explicitly disclosed.",
     76         "source": "opus"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are clearly stated: 'Meta FAIR' and 'École polytechnique CMAP' in the paper header.",
     82         "source": "opus"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "Meta has a vested interest in the integrity of LLM benchmarks, as it trains and releases Llama models that are evaluated on them. The paper uses exclusively Meta models (Llama family) for all experiments. No statement of funder independence.",
     88         "source": "opus"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement or financial disclosure section is present in the paper.",
     94         "source": "opus"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Key terms are defined: 'contamination' is given three explicit definitions in Section 3.2, 'radioactivity' is defined via reference to Sander et al. (2024), 'benchmark utility' is operationalized as similar accuracy across watermarked and original versions.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The introduction lists three explicit contributions: (1) rephrasing benchmarks with watermarking, (2) extending watermark radioactivity to pre-training, and (3) a new detection algorithm for different tokenizers.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 2 provides a thorough related work review covering membership inference limitations, canary approaches, prior watermarking schemes, and radioactivity, explicitly explaining why existing methods fail and how this work extends Sander et al. (2024).",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "No code repository URL is provided in the paper. They reference Meta Lingua (Videau et al., 2024) as the training framework but do not release their own watermarking or detection code.",
    125           "source": "opus"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "The original benchmarks (ARC, MMLU) are public, but the watermarked versions of the benchmarks and the trained model checkpoints are not released.",
    131           "source": "opus"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "They mention A-100 GPUs with 80GB memory and the Meta Lingua framework, but provide no requirements.txt, Dockerfile, or detailed library version listing sufficient to recreate the environment.",
    137           "source": "opus"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No step-by-step reproduction instructions are provided. The method and hyperparameters are described but there is no README, script, or explicit guide to replicate experiments.",
    143           "source": "opus"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "Accuracy numbers and p-values are reported as point estimates. No confidence intervals or error bars appear on accuracy results. The paper notes that [-1, 0] is a 90% CI for log10(p) under H0, but this is a theoretical property of the test, not an uncertainty measure on the main results.",
    151           "source": "opus"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": true,
    156           "justification": "The core contribution is a binomial significance test for contamination detection (Equation 2, Proposition 1). P-values are reported throughout (Table 1, Figures 3b, 4, 8). The test has a formally proven False Positive Rate equal to α.",
    157           "source": "opus"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Table 1 reports accuracy improvements with baseline context: e.g., '+4.3%' (from 53.5% to 57.9%), '+9.5%' (from 53.5% to 63.0%), '+18.2%' (from 53.5% to 71.7%) for ARC-Easy at different contamination levels.",
    163           "source": "opus"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "MMLU* uses 5000 questions 'to accelerate experimentation and maintain a comparable size to the other benchmarks,' but no power analysis or formal justification for any of the benchmark sizes is provided.",
    169           "source": "opus"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "All results appear to be from single training runs. No standard deviation, variance across seeds, or spread measures are reported for accuracy or detection metrics.",
    175           "source": "opus"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Section 4.3 compares against canary-based contamination detection as an alternative baseline (Table 3). They also compare contaminated vs. uncontaminated models.",
    183           "source": "opus"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "The canary method from BIG-bench (Srivastava et al., 2022) is a standard and commonly used approach for contamination detection. Membership inference attacks (Duan et al., 2024) are discussed as contemporary alternatives that are ineffective.",
    189           "source": "opus"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Section 4.3 provides extensive ablations: watermark strength δ (0, 1, 2, 4), window size k (0, 1, 2), model size (135M, 360M, 1B), number of contaminations (0, 4, 8, 16), rephrasing model (8B vs 70B), and tokenizer (Section 4.4).",
    195           "source": "opus"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Results are reported on two distinct metrics: benchmark accuracy (performance impact) and detection confidence (log10 p-value from the radioactivity test). Both are reported in Table 1 and throughout Section 4.",
    201           "source": "opus"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": false,
    206           "justification": "No human evaluation of the watermarked questions' quality. Benchmark utility is assessed entirely through automated comparison of model performance on original vs. watermarked versions.",
    207           "source": "opus"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "Models are trained with controlled contamination and evaluated on the benchmark test sets. An out-of-distribution (OOD) evaluation template is used to further separate training and evaluation contexts (Section 4.2).",
    213           "source": "opus"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Results are broken down per benchmark (ARC-Easy, ARC-Challenge, MMLU*) in Table 1, per model size in Figure 4, per window size in Table 2 and Figures 7-8, and per watermark strength in Figure 3.",
    219           "source": "opus"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Figure 6 shows a concrete rephrasing failure on a math question. Section 4.3 discusses how smaller benchmarks lead to weaker detection (ARC-Challenge with 4 contaminations has 'doubtful' p-value). Section 5 discusses evasion scenarios.",
    225           "source": "opus"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "Table 3 shows the canary-based method fails even with 160 contaminations (p-val=0.19). They report that detection on ARC-Challenge with 4 contaminations is 'doubtful' (log10(p)=-1.2). Figure 6 shows the 8B rephrasing model failing on math questions.",
    231           "source": "opus"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "Specific model versions are given: 'Llama-3.1-8B-Instruct' for rephrasing, 'Llama-3.2-1B', 'Llama-3.2-3B', 'Llama-3.1-8B' for evaluation. Tokenizer experiments use Llama-1/2, Llama-3, Gemma-1/2, Gemma-3 (Table 4).",
    239           "source": "opus"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Figure 5 provides the exact system prompt ('You are a problem rephrasing assistant...') and instruction. Section 4.2 provides the exact evaluation templates for both in-distribution and OOD evaluation.",
    245           "source": "opus"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Section 4.1 reports top-p=0.7, temperature=0.5, k=2, γ=50%. Section 4.2 reports hidden dim=2048, 25 layers, 16 attention heads, LR=3×10^-3, weight decay=0.033, warmup=5000 steps, batch size=4, sequence length=4096, gradient clipping=1.0.",
    251           "source": "opus"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "No agentic scaffolding is used. The method involves direct LLM generation for rephrasing and standard model training/evaluation.",
    257           "source": "opus"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Section 4.2 describes the contamination procedure in detail: batches taken between steps 2500-7500, format template provided, DCLM as base training data. Section 4.1 describes MMLU* as a random subset of 5000 questions across all disciplines.",
    263           "source": "opus"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "No trained model checkpoints, watermarked benchmark datasets, or raw experimental outputs are released for independent verification.",
    271           "source": "opus"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Training data source is DCLM (Li et al., 2024). Benchmarks are ARC-Easy (2372 questions), ARC-Challenge (1172 questions), MMLU* (5000 randomly selected questions). Contamination injection procedure is fully described in Section 4.2.",
    277           "source": "opus"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants. Data sources are standard public benchmarks (ARC, MMLU) and a public training corpus (DCLM).",
    283           "source": "opus"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The full pipeline is documented: benchmark selection → watermarked rephrasing with specified parameters → training with controlled contamination injection at specified steps → evaluation with specified templates → radioactivity detection scoring.",
    289           "source": "opus"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": true,
    296           "justification": "The paper trains models from scratch with fully controlled training data (10B tokens from DCLM with specified contamination injections). The entire training pipeline is described, so the 'cutoff' is effectively complete — the authors know exactly what the model saw.",
    297           "source": "opus"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": true,
    302           "justification": "Train-test overlap is the central topic of the paper. They intentionally control contamination levels (0, 4, 8, 16 injections) and analyze the resulting overlap's impact on detection and performance.",
    303           "source": "opus"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": true,
    308           "justification": "Benchmark contamination is the paper's primary research question. The entire method is designed to detect it, and experiments systematically vary contamination levels to validate detection.",
    309           "source": "opus"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants in this study. All experiments involve training and evaluating language models.",
    317           "source": "opus"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants in this study.",
    323           "source": "opus"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants in this study.",
    329           "source": "opus"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants in this study.",
    335           "source": "opus"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants in this study.",
    341           "source": "opus"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants in this study.",
    347           "source": "opus"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants in this study.",
    353           "source": "opus"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": true,
    360           "justification": "Appendix C states: 'Each radioactivity detection test took less than 30 minutes on a single GPU.' This quantifies the cost of the detection method.",
    361           "source": "opus"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": true,
    366           "justification": "Appendix C provides detailed compute: training of the 1B models 'was conducted on 8 nodes (so 64 GPUs) and took approximately six hours.' Total budget estimated at 'approximately 10,000 GPU hours' including all experiments.",
    367           "source": "opus"
    368         }
    369       },
    370       "experimental_rigor": {
    371         "seed_sensitivity_reported": {
    372           "applies": true,
    373           "answer": false,
    374           "justification": "No mention of multiple random seeds. All experiments appear to be single-run training with single-seed evaluation.",
    375           "source": "opus"
    376         },
    377         "number_of_runs_stated": {
    378           "applies": true,
    379           "answer": false,
    380           "justification": "The number of experimental runs is not explicitly stated. Results appear to come from single training runs for each configuration.",
    381           "source": "opus"
    382         },
    383         "hyperparameter_search_budget": {
    384           "applies": true,
    385           "answer": false,
    386           "justification": "No hyperparameter search budget is reported. Training hyperparameters are listed but no indication of how they were selected or how many configurations were tried.",
    387           "source": "opus"
    388         },
    389         "best_config_selection_justified": {
    390           "applies": true,
    391           "answer": true,
    392           "justification": "Results are reported across all tested configurations (multiple δ values, window sizes, model sizes, contamination levels, tokenizers) rather than selecting a single best. The paper sweeps over parameter space rather than optimizing.",
    393           "source": "opus"
    394         },
    395         "multiple_comparison_correction": {
    396           "applies": true,
    397           "answer": false,
    398           "justification": "Multiple statistical tests are performed across 3 benchmarks, 4 contamination levels, 4 watermark strengths, and 3 window sizes. No correction for multiple comparisons is applied, though many p-values are so extreme (10^-12) that correction would not change conclusions.",
    399           "source": "opus"
    400         },
    401         "self_comparison_bias_addressed": {
    402           "applies": true,
    403           "answer": false,
    404           "justification": "The authors propose the watermarking method and evaluate it themselves. No discussion of self-comparison bias or independent evaluation.",
    405           "source": "opus"
    406         },
    407         "compute_budget_vs_performance": {
    408           "applies": true,
    409           "answer": false,
    410           "justification": "While different model sizes are tested (135M, 360M, 1B), performance is not systematically analyzed as a function of compute budget. The focus is on contamination level, not compute.",
    411           "source": "opus"
    412         },
    413         "benchmark_construct_validity": {
    414           "applies": true,
    415           "answer": false,
    416           "justification": "No discussion of whether ARC-Easy, ARC-Challenge, and MMLU actually measure the capabilities they claim to measure. The benchmarks are treated as given without questioning their construct validity.",
    417           "source": "opus"
    418         },
    419         "scaffold_confound_addressed": {
    420           "applies": false,
    421           "answer": false,
    422           "justification": "No scaffolding is used. Models are evaluated directly via loss comparison on answer choices.",
    423           "source": "opus"
    424         }
    425       },
    426       "data_leakage": {
    427         "temporal_leakage_addressed": {
    428           "applies": true,
    429           "answer": true,
    430           "justification": "The paper trains models from scratch on controlled data, fully addressing temporal leakage by design. The authors know exactly when each piece of data enters training (contamination injected between steps 2500 and 7500).",
    431           "source": "opus"
    432         },
    433         "feature_leakage_addressed": {
    434           "applies": true,
    435           "answer": true,
    436           "justification": "Section 4.2 uses an out-of-distribution (OOD) evaluation template distinct from the contamination template to ensure the model isn't merely memorizing the formatting context. Results are reported for both in-distribution and OOD evaluation.",
    437           "source": "opus"
    438         },
    439         "non_independence_addressed": {
    440           "applies": true,
    441           "answer": true,
    442           "justification": "Each benchmark uses a different secret watermarking key s, ensuring independence between detection tests. The base training corpus (DCLM) is separate from the benchmarks, and the paper controls exactly which benchmark data enters training.",
    443           "source": "opus"
    444         },
    445         "leakage_detection_method": {
    446           "applies": true,
    447           "answer": true,
    448           "justification": "The paper's entire contribution IS a concrete leakage detection method: watermark embedding via rephrasing followed by a binomial radioactivity test with formal p-value guarantees (Proposition 1, Algorithm 1).",
    449           "source": "opus"
    450         }
    451       }
    452     }
    453   },
    454   "claims": [
    455     {
    456       "claim": "Watermarking benchmarks via LLM rephrasing preserves benchmark utility: models rank the same and achieve similar accuracy on watermarked vs. original versions even at 80% green-token density.",
    457       "evidence": "Figure 3a and Figure 7 show Llama-3.2-1B/3B and Llama-3.1-8B achieve comparable accuracy on all watermarked versions of ARC-Easy; Table 1 reports baseline accuracy with 0 contaminations matching original benchmark performance.",
    458       "supported": "strong"
    459     },
    460     {
    461       "claim": "Benchmark contamination can be reliably detected via radioactivity testing with p-values as low as 10^-6 even for modest performance gains (~10%).",
    462       "evidence": "Table 1 shows log10(p) < -12 for 8 contaminations (+9.5% gain on ARC-Easy) and log10(p) = -5.7 for 4 contaminations (+5.1% gain on MMLU*).",
    463       "supported": "strong"
    464     },
    465     {
    466       "claim": "Canary insertion (as used in BIG-bench) is less effective than radioactivity for contamination detection.",
    467       "evidence": "Table 3 shows a 360M model trained with 160 MMLU* contaminations (10× more than the most contaminated radioactivity setup) fails to memorize a 64-digit canary sufficiently to yield a low p-value.",
    468       "supported": "strong"
    469     },
    470     {
    471       "claim": "The detection method generalizes across different tokenizers used by the watermarking and suspect models.",
    472       "evidence": "Table 4 shows detection confidence log10(p) between -7 and -15 for contaminated models using Llama-1/2, Gemma-1/2, and Gemma-3 tokenizers, while uncontaminated models yield p-values near 0.5.",
    473       "supported": "strong"
    474     },
    475     {
    476       "claim": "Larger benchmarks, stronger watermarks, and more contamination passes all increase detection confidence proportionally.",
    477       "evidence": "Table 1 and Figure 3b demonstrate monotonic increases in -log10(p) with both δ (watermark strength) and number of contaminations; Table 2 shows benchmark size directly increases the number of scored tokens and thus detection confidence.",
    478       "supported": "strong"
    479     },
    480     {
    481       "claim": "The false positive rate of the contamination test equals the significance threshold α by design.",
    482       "evidence": "Proposition 1 provides a formal proof based on independence of token predictions under H0, confirmed empirically by uncontaminated models consistently yielding log10(p) ≈ -0.3.",
    483       "supported": "strong"
    484     }
    485   ],
    486   "methodology_tags": [
    487     "benchmark-eval",
    488     "theoretical"
    489   ],
    490   "key_findings": "Benchmark contamination can be reliably detected by embedding LLM watermarks into benchmark questions during rephrasing before release; models trained on contaminated watermarked data retain detectable radioactive traces even for modest performance gains (p-val ≤ 10^-5 for +5% accuracy gain on MMLU*). Benchmark utility is preserved across all watermark strengths tested (up to 80% green tokens), with model rankings unchanged. The canary-based alternative completely fails under the same conditions (160 contaminations, still no significant detection). The method extends to cross-tokenizer scenarios with only modest loss in detection confidence, and detection scales predictably with benchmark size, watermark strength, and number of contamination passes.",
    491   "red_flags": [
    492     {
    493       "flag": "No code or model release",
    494       "detail": "Despite sufficient implementation detail to describe the method, no code repository or watermarked benchmark versions are released, making independent replication require full re-implementation."
    495     },
    496     {
    497       "flag": "Only models up to 1B parameters trained",
    498       "detail": "All training experiments use models up to 1B parameters; whether the method scales to the 70B–405B range of production LLMs is not validated, only speculated."
    499     },
    500     {
    501       "flag": "Requires white-box model access",
    502       "detail": "The radioactivity detection requires full model weights for the 'reading mode'; the method does not apply to closed-source APIs, which is where contamination is most suspected and hardest to audit."
    503     },
    504     {
    505       "flag": "No variance across runs",
    506       "detail": "All results reflect single training runs; there is no statistical characterization of run-to-run variability in either accuracy or detection p-values."
    507     },
    508     {
    509       "flag": "Meta FAIR affiliation and undisclosed funding",
    510       "detail": "All primary authors are from Meta FAIR, a major LLM developer with commercial interest in contamination detection, and no funding disclosure or competing interests statement is provided."
    511     }
    512   ],
    513   "cited_papers": [
    514     {
    515       "title": "Watermarking makes language models radioactive",
    516       "relevance": "Direct predecessor: the paper extends this fine-tuning radioactivity result to the pre-training / benchmark contamination setting."
    517     },
    518     {
    519       "title": "A watermark for large language models",
    520       "relevance": "The green/red list watermarking scheme (Kirchenbauer et al. 2023a/b) is the core technical primitive used throughout the paper."
    521     },
    522     {
    523       "title": "Do membership inference attacks work on large language models?",
    524       "relevance": "Establishes that membership inference is ineffective for LLMs, motivating the need for the watermarking-based alternative approach."
    525     },
    526     {
    527       "title": "A careful examination of large language model performance on grade school arithmetic",
    528       "relevance": "Provides empirical evidence that models exhibit performance drops on fresh questions vs. contaminated benchmarks, motivating the contamination problem."
    529     },
    530     {
    531       "title": "Measuring massive multitask language understanding",
    532       "relevance": "MMLU is one of the three benchmarks watermarked and tested in the experiments."
    533     },
    534     {
    535       "title": "Evaluation data contamination in LLMs: how do we measure it and (when) does it matter?",
    536       "relevance": "Recent work surveying contamination measurement approaches, providing context for the problem the paper addresses."
    537     },
    538     {
    539       "title": "Investigating data contamination for pre-training language models",
    540       "relevance": "Shows that even small contamination can improve benchmark performance in pre-training, directly motivating the detection method."
    541     },
    542     {
    543       "title": "WARD: Provable RAG dataset inference via LLM watermarks",
    544       "relevance": "Related work applying watermark radioactivity to a different setting (RAG dataset inference), showing the general applicability of the radioactivity concept."
    545     }
    546   ],
    547   "engagement_factors": {
    548     "practical_relevance": {
    549       "score": 2,
    550       "justification": "Benchmark creators could adopt this watermarking technique before releasing new benchmarks, though no code is provided for immediate use."
    551     },
    552     "surprise_contrarian": {
    553       "score": 1,
    554       "justification": "Confirms the known problem of benchmark contamination and proposes a solution; does not challenge conventional wisdom."
    555     },
    556     "fear_safety": {
    557       "score": 1,
    558       "justification": "Highlights the unreliability of current benchmark evaluations due to contamination, a mild integrity concern rather than a safety threat."
    559     },
    560     "drama_conflict": {
    561       "score": 2,
    562       "justification": "Feeds into the 'benchmarks are unreliable' narrative and implicitly suggests models may be scoring higher than they should due to contamination."
    563     },
    564     "demo_ability": {
    565       "score": 0,
    566       "justification": "No code, demo, or tool released; the method cannot be tried without reimplementation."
    567     },
    568     "brand_recognition": {
    569       "score": 2,
    570       "justification": "Meta FAIR is a well-known AI research lab, though the paper is not about a flagship product."
    571     }
    572   },
    573   "hn_data": {
    574     "threads": [
    575       {
    576         "hn_id": "43258481",
    577         "title": "HumT DumT: Measuring and controlling human-like language in LLMs",
    578         "points": 1,
    579         "comments": 0,
    580         "url": "https://news.ycombinator.com/item?id=43258481"
    581       }
    582     ],
    583     "top_points": 1,
    584     "total_points": 1,
    585     "total_comments": 0
    586   }
    587 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs