scan.json (32417B)
1 { 2 "paper": { 3 "title": "PaCoST: Paired Confidence Significance Testing for Benchmark Contamination Detection in Large Language Models", 4 "authors": [ 5 "Huixuan Zhang", 6 "Yun Lin", 7 "Xiaojun Wan" 8 ], 9 "year": 2024, 10 "venue": "Conference on Empirical Methods in Natural Language Processing", 11 "arxiv_id": "2406.18326", 12 "doi": "10.48550/arXiv.2406.18326" 13 }, 14 "scan_version": 3, 15 "active_modules": ["experimental_rigor", "data_leakage"], 16 "methodology_tags": ["benchmark-eval"], 17 "key_findings": "PaCoST detects benchmark contamination by comparing model confidence on original vs. rephrased benchmark instances via paired t-test, satisfying five desirable properties no prior method achieves simultaneously. In controlled experiments with intentionally contaminated Llama-2 and Mistral models, PaCoST achieves zero false positives and zero false negatives. Applied to 10 open-source LLMs across 6 popular benchmarks, nearly all models and benchmarks show some contamination evidence, calling into question the trustworthiness of standard benchmark evaluations.", 18 "checklist": { 19 "artifacts": { 20 "code_released": { 21 "applies": true, 22 "answer": false, 23 "justification": "The paper states 'Our code will be released at https://github.com/lleozhang/PaCoST' (Section 1 footnote). This is a promise of future release, not an actual release." 24 }, 25 "data_released": { 26 "applies": true, 27 "answer": true, 28 "justification": "All benchmarks used are publicly available: WMDP (Li et al., 2024), MMLU, HellaSwag, ARC-E, ARC-C, TruthfulQA, WinoGrande. No proprietary data was collected." 29 }, 30 "environment_specified": { 31 "applies": true, 32 "answer": false, 33 "justification": "No requirements.txt, Dockerfile, or environment setup instructions are provided. The paper does not specify library versions or dependencies needed to reproduce the experiments."
34 }, 35 "reproduction_instructions": { 36 "applies": true, 37 "answer": false, 38 "justification": "No step-by-step reproduction instructions or README with commands are provided. The algorithm pseudocode (Algorithm 1) describes the method but not how to run it." 39 } 40 }, 41 "statistical_methodology": { 42 "confidence_intervals_or_error_bars": { 43 "applies": true, 44 "answer": false, 45 "justification": "Results are reported as point-estimate p-values (Tables 2, 3, 4) without confidence intervals or error bars. No uncertainty bounds are provided on the p-values or the underlying confidence differences." 46 }, 47 "significance_tests": { 48 "applies": true, 49 "answer": true, 50 "justification": "The entire method is built on a paired samples t-test (Section 4.3). P-values are reported for all experiments, with the standard p < 0.05 threshold for significance. This is the core contribution of the paper." 51 }, 52 "effect_sizes_reported": { 53 "applies": true, 54 "answer": false, 55 "justification": "Only p-values are reported in all results tables (Tables 2, 3, 4). No effect sizes (Cohen's d, mean confidence difference magnitudes, or similar) are provided, making it impossible to assess the practical magnitude of the detected contamination signal." 56 }, 57 "sample_size_justified": { 58 "applies": true, 59 "answer": false, 60 "justification": "Sample sizes of 1000 trained and 400 untrained samples are stated, but the authors note 'The choice of number of samples are just for simplicity' (Section 5.1). No power analysis is provided. Table 3 shows stability across 100-1000 samples but this is post-hoc validation, not justification." 61 }, 62 "variance_reported": { 63 "applies": true, 64 "answer": false, 65 "justification": "Table 7 shows p-values across 5 random seeds but does not report standard deviation or spread of the underlying confidence differences. The main results (Tables 2, 4) appear to be single-run results with no variance information." 
66 } 67 }, 68 "evaluation_design": { 69 "baselines_included": { 70 "applies": true, 71 "answer": true, 72 "justification": "Guided-Prompting (Golchin & Surdeanu, 2023b) is used as the main baseline in Table 2. DCQ and Min-k% Prob are compared in Appendices A and B. A simplified version of PaCoST (Algorithm 2) also serves as a comparison." 73 }, 74 "baselines_contemporary": { 75 "applies": true, 76 "answer": true, 77 "justification": "Baselines include Guided-Prompting (2023), DCQ (2023), and Min-k% Prob (2024) — all recent contamination detection methods. Table 1 compares against all major existing approaches." 78 }, 79 "ablation_study": { 80 "applies": true, 81 "answer": true, 82 "justification": "The simplified version (Algorithm 2) ablates the confidence estimation step by using ground-truth answers instead of model responses. Appendix C tests different rephrase models (Llama vs. Mistral), different contamination types, and different random seeds — all effectively ablating design choices." 83 }, 84 "multiple_metrics": { 85 "applies": true, 86 "answer": false, 87 "justification": "The only metric for PaCoST's performance is the p-value from the paired t-test. No secondary metrics (detection accuracy, AUC, precision/recall) are computed for the main method. Accuracy is reported only for DCQ and Min-k% Prob comparisons." 88 }, 89 "human_evaluation": { 90 "applies": true, 91 "answer": false, 92 "justification": "Human evaluation is used only for rephrase quality assessment (Table 8), not for evaluating the contamination detection outputs. The detection evaluation is entirely automated via the p-value threshold." 93 }, 94 "held_out_test_set": { 95 "applies": true, 96 "answer": true, 97 "justification": "In controlled experiments, 1000 samples are used for contamination and a separate 400 samples from the remaining WMDP data form the clean test set. These splits are clearly separated and used for different evaluation purposes (Section 5.1)." 
98 }, 99 "per_category_breakdown": { 100 "applies": true, 101 "answer": true, 102 "justification": "Results are broken down by model (10 models) and benchmark (6 benchmarks) in Table 4, by data type (trained/untrained) in Table 2, and by sample size in Table 3. Appendix C provides breakdowns by contamination type and rephrase model." 103 }, 104 "failure_cases_discussed": { 105 "applies": true, 106 "answer": false, 107 "justification": "No failure cases of PaCoST are discussed. The method achieves zero false positives and zero false negatives in all controlled experiments (Table 2). The limitations section discusses structural limitations but not specific cases where the method breaks down." 108 }, 109 "negative_results_reported": { 110 "applies": true, 111 "answer": true, 112 "justification": "The simplified version (Algorithm 2) produces a false negative on contaminated Llama (p=0.94 in Table 2). The authors discuss why this variant fails: 'using ground-truth answer may result in contaminated model behaving similarly on original samples and rephrased samples' (Appendix A)." 113 } 114 }, 115 "claims_and_evidence": { 116 "abstract_claims_supported": { 117 "applies": true, 118 "answer": true, 119 "justification": "The abstract claims PaCoST 'effectively detect[s] benchmark contamination' (supported by Tables 2-3) and 'almost all models and benchmarks we tested are suspected contaminated' (supported by Table 4). Both claims are backed by experimental results." 120 }, 121 "causal_claims_justified": { 122 "applies": true, 123 "answer": true, 124 "justification": "The key causal claim — that higher confidence on original vs. rephrased data indicates contamination — is validated through controlled experiments where the authors intentionally contaminate models via fine-tuning (Section 5.1). This controlled manipulation establishes the causal link between contamination and the confidence signal."
125 }, 126 "generalization_bounded": { 127 "applies": true, 128 "answer": true, 129 "justification": "The abstract specifies 'open-source LLMs.' The limitations section explicitly states the method requires probability distributions unavailable in black-box models, cannot detect instance-level contamination, and has efficiency costs. Models tested range from 0.5B to 13B parameters." 130 }, 131 "alternative_explanations_discussed": { 132 "applies": true, 133 "answer": false, 134 "justification": "The paper does not substantively discuss alternative explanations for confidence differences beyond contamination. Rephrasing could introduce subtle difficulty changes; certain phrasings might be more common in training data without being benchmark contamination; or model familiarity with certain question formats could affect confidence. None of these alternatives are discussed." 135 }, 136 "proxy_outcome_distinction": { 137 "applies": true, 138 "answer": true, 139 "justification": "The paper explicitly distinguishes between confidence (what is measured via P(True)) and contamination (what is claimed). Section 4.2 justifies why P(True) is a better proxy than raw probabilities or verbalized confidence, and Section 4.3 frames the p-value as 'evidence for potential contamination' rather than proof." 140 } 141 }, 142 "setup_transparency": { 143 "model_versions_specified": { 144 "applies": true, 145 "answer": true, 146 "justification": "Exact model versions are specified: 'Mistral-7B-Instruct-v0.2', 'Llama-2-7B-Chat', 'Llama-2-Chat (7B, 13B)', 'Llama-3-Instruct (8B)', 'Phi-3 (3.8B)', 'Qwen1.5 (0.5B, 7B)', 'Qwen2 (7B)', 'Yi (6B)', 'DeepSeek (7B)' (Section 5.2)." 147 }, 148 "prompts_provided": { 149 "applies": true, 150 "answer": true, 151 "justification": "Full prompt text is provided in Appendix D for both the rephrase prompt and confidence estimation prompt, including in-context example formatting. The reader can reconstruct the exact prompts used." 
152 }, 153 "hyperparameters_reported": { 154 "applies": true, 155 "answer": false, 156 "justification": "No hyperparameters are reported for the fine-tuning process (learning rate, epochs, batch size) or for inference (temperature, top-p, max tokens). The only parameters mentioned are the p < 0.05 threshold and k=20 for the Min-k% Prob baseline." 157 }, 158 "scaffolding_described": { 159 "applies": false, 160 "answer": false, 161 "justification": "No agentic scaffolding is used. The method is a three-step statistical pipeline (rephrase, confidence estimation, t-test) with direct model calls." 162 }, 163 "data_preprocessing_documented": { 164 "applies": true, 165 "answer": true, 166 "justification": "Section 5.1 documents data preparation: '1000 samples from biology split from the WMDP dataset' for contamination, '400 samples from the remaining data' for clean set. Section 5.2 states 'we randomly sample 400 samples in each benchmark for detection.' The supervised fine-tuning procedure is described (second contamination type)." 167 } 168 }, 169 "limitations_and_scope": { 170 "limitations_section_present": { 171 "applies": true, 172 "answer": true, 173 "justification": "A dedicated 'Limitations' section discusses three specific limitations: inability to detect instance-level contamination, lower efficiency due to multiple LLM interactions, and requirement for probability distribution access (excluding black-box models)."
174 }, 175 "threats_to_validity_specific": { 176 "applies": true, 177 "answer": true, 178 "justification": "The limitations are specific to this study: 'our method focuses on detecting benchmark-level contamination and is not suitable for identifying instance-level contamination,' 'our method involves multiple interactions with the LLM' causing efficiency concerns, and 'our approach cannot be used to detect benchmark contamination in black-box LLMs.'" 179 }, 180 "scope_boundaries_stated": { 181 "applies": true, 182 "answer": true, 183 "justification": "The paper explicitly bounds scope: benchmark-level only (not instance-level), open-source models only (not black-box), and notes in the Discussion (Section 5.3) that 'we do not intend to accuse any LLM provider of intentional contamination' and 'contamination does not inherently imply that a model is ineffective.'" 184 } 185 }, 186 "data_integrity": { 187 "raw_data_available": { 188 "applies": true, 189 "answer": false, 190 "justification": "Raw confidence values, per-instance confidence differences, and t-statistics are not released. Only aggregated p-values are shown in tables. There is no supplementary data download." 191 }, 192 "data_collection_described": { 193 "applies": true, 194 "answer": true, 195 "justification": "Section 5.1 describes the controlled contamination procedure: sampling from WMDP, supervised fine-tuning on the biology split. Section 5.2 lists all benchmarks and models used. The three-step pipeline (rephrase, confidence, test) is fully described." 196 }, 197 "recruitment_methods_described": { 198 "applies": false, 199 "answer": false, 200 "justification": "No human participants in the study. All data comes from standard public benchmarks (WMDP, MMLU, HellaSwag, ARC, TruthfulQA, WinoGrande) and public model checkpoints." 
201 }, 202 "data_pipeline_documented": { 203 "applies": true, 204 "answer": true, 205 "justification": "Algorithm 1 documents the full pipeline: rephrase each instance, compute confidence pairs via P(True), calculate mean difference, standard deviation, t-value, and p-value. Section 4 describes each step in detail." 206 } 207 }, 208 "conflicts_of_interest": { 209 "funding_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "The Acknowledgement section discloses: 'Beijing Science and Technology Program (Z231100007423011), National Science Foundation of China (No. 62161160339) and Key Laboratory of Science, Technology and Standard in Press Industry.'" 213 }, 214 "affiliations_disclosed": { 215 "applies": true, 216 "answer": true, 217 "justification": "Author affiliations are clearly stated: Wangxuan Institute of Computer Technology and School of Foreign Languages, Peking University. The authors are not affiliated with any of the model providers being tested." 218 }, 219 "funder_independent_of_outcome": { 220 "applies": true, 221 "answer": true, 222 "justification": "Funding comes from Chinese government research grants and a university lab, none of which have financial interest in whether specific models or benchmarks are found to be contaminated." 223 }, 224 "financial_interests_declared": { 225 "applies": true, 226 "answer": false, 227 "justification": "No competing interests or financial interests statement is present in the paper." 228 } 229 }, 230 "contamination": { 231 "training_cutoff_stated": { 232 "applies": false, 233 "answer": false, 234 "justification": "The paper does not evaluate model capability on benchmarks — it detects benchmark contamination. The benchmarks are inputs to the detection method, not targets for capability evaluation." 235 }, 236 "train_test_overlap_discussed": { 237 "applies": false, 238 "answer": false, 239 "justification": "The paper is itself a contamination detection method. 
It does not evaluate model capability on benchmarks in the traditional sense, so the standard train/test overlap concern does not apply to the paper's own evaluation framework." 240 }, 241 "benchmark_contamination_addressed": { 242 "applies": false, 243 "answer": false, 244 "justification": "Benchmark contamination is the subject of the paper, not a confound of its evaluation. The controlled experiments use WMDP (released May 2024, post-training of tested models) to ensure clean baselines." 245 } 246 }, 247 "human_studies": { 248 "pre_registered": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants in the study. Human annotators evaluated rephrase quality but were not research subjects." 252 }, 253 "irb_or_ethics_approval": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants. The Ethics Statement discusses responsible reporting of results but no IRB is needed." 257 }, 258 "demographics_reported": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in the study." 262 }, 263 "inclusion_exclusion_criteria": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in the study." 267 }, 268 "randomization_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in the study." 272 }, 273 "blinding_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in the study." 277 }, 278 "attrition_reported": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants in the study." 
282 } 283 }, 284 "cost_and_practicality": { 285 "inference_cost_reported": { 286 "applies": true, 287 "answer": false, 288 "justification": "The Limitations section acknowledges the method 'involves multiple interactions with the LLM, including one for paraphrasing, two for answer generation, and two for confidence estimation' but provides no quantification of wall-clock time, API costs, or tokens consumed." 289 }, 290 "compute_budget_stated": { 291 "applies": true, 292 "answer": false, 293 "justification": "No compute budget is stated for either the fine-tuning experiments or the detection pipeline. GPU hours, hardware specifications, and total compute costs are all absent." 294 } 295 }, 296 "experimental_rigor": { 297 "seed_sensitivity_reported": { 298 "applies": true, 299 "answer": true, 300 "justification": "Table 7 reports results across 5 random seeds (0, 42, 302, 3407, 9056) for both contaminated and original Llama on trained and untrained data, showing that significance/insignificance is preserved across seeds." 301 }, 302 "number_of_runs_stated": { 303 "applies": true, 304 "answer": false, 305 "justification": "The seed sensitivity experiment uses 5 seeds, which is stated. However, the main results in Tables 2 and 4 do not state how many runs produced them — it appears they are single-run results." 306 }, 307 "hyperparameter_search_budget": { 308 "applies": true, 309 "answer": false, 310 "justification": "No hyperparameter search is described. The fine-tuning hyperparameters are not even reported, let alone any search procedure. The detection method uses fixed settings (p < 0.05) but fine-tuning settings are opaque." 
311 }, 312 "best_config_selection_justified": { 313 "applies": true, 314 "answer": true, 315 "justification": "The method uses fixed, pre-determined settings: p < 0.05 threshold (standard statistical convention), Llama-2-Chat-7B for rephrasing (validated in Appendix C with an alternative), and P(True) for confidence (justified in Section 4.2 against alternatives). No configuration search was performed." 316 }, 317 "multiple_comparison_correction": { 318 "applies": true, 319 "answer": false, 320 "justification": "Table 4 reports 60 significance tests (10 models × 6 benchmarks) at p < 0.05; if no model-benchmark pair were contaminated, about 3 would still be expected to fall below the threshold by chance. No multiple comparison correction (Bonferroni, Holm, Benjamini-Hochberg) is applied or discussed." 321 }, 322 "self_comparison_bias_addressed": { 323 "applies": true, 324 "answer": false, 325 "justification": "The authors compare PaCoST against their own re-implementation or application of baseline methods (Guided-Prompting, DCQ, Min-k% Prob) without acknowledging the potential bias of authors evaluating their own system." 326 }, 327 "compute_budget_vs_performance": { 328 "applies": true, 329 "answer": false, 330 "justification": "The Limitations section acknowledges the method requires 5 LLM interactions per instance (vs. 1-2 for baselines) but does not quantify the compute cost difference or report performance as a function of compute budget." 331 }, 332 "benchmark_construct_validity": { 333 "applies": true, 334 "answer": false, 335 "justification": "The paper does not discuss whether the paired confidence difference actually measures what is claimed. Alternatives — such as the possibility that rephrasing introduces difficulty changes rather than removing contamination signal — are not explored as construct validity concerns." 336 }, 337 "scaffold_confound_addressed": { 338 "applies": false, 339 "answer": false, 340 "justification": "No scaffolding is involved in the method. Models are queried directly with prompts."
341 } 342 }, 343 "data_leakage": { 344 "temporal_leakage_addressed": { 345 "applies": true, 346 "answer": true, 347 "justification": "For controlled experiments, the authors explicitly select WMDP (released May 2024) to ensure models (Llama-2, Mistral) could not have been contaminated on it, establishing clean temporal separation (Section 5.1)." 348 }, 349 "feature_leakage_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "Not discussed. The confidence estimation prompt includes the model's own answer, and whether the P(True) probing setup could leak information about answer quality independent of contamination is not analyzed." 353 }, 354 "non_independence_addressed": { 355 "applies": true, 356 "answer": false, 357 "justification": "The trained (1000) and untrained (400) samples are drawn from different splits of the same WMDP biology dataset. Whether these splits share structural similarities (same question style, topic overlap) that could affect independence is not discussed." 358 }, 359 "leakage_detection_method": { 360 "applies": true, 361 "answer": true, 362 "justification": "The entire paper IS a leakage/contamination detection method. The controlled experiments use temporal splits (WMDP post-training) as the prevention mechanism, and PaCoST itself is the detection method validated against known contamination status." 363 } 364 } 365 }, 366 "claims": [ 367 { 368 "claim": "PaCoST successfully detects intentionally contaminated benchmarks with zero false positives and zero false negatives in controlled experiments.", 369 "evidence": "Table 2 shows p-values < 0.05 for trained data on contaminated models (6e-8 for Llama, 2e-4 for Mistral) and p > 0.05 for all untrained/original conditions. 
Section 5.1.", 370 "supported": "strong" 371 }, 372 { 373 "claim": "PaCoST satisfies all five proposed detection requirements (TDA Free, CT Free, TDL Free, SP, T Free) while no existing method does.", 374 "evidence": "Table 1 compares PaCoST against 6 existing methods across 5 properties. The requirements are defined in Section 3.2.", 375 "supported": "moderate" 376 }, 377 { 378 "claim": "The method is stable across sample sizes from 100 to 1000 without generating false positives or false negatives.", 379 "evidence": "Table 3 shows consistent significant/insignificant results across 100, 200, 400, 500, and 1000 sample sizes for both contaminated and original models.", 380 "supported": "strong" 381 }, 382 { 383 "claim": "Almost all tested open-source models and benchmarks show evidence of benchmark contamination.", 384 "evidence": "Table 4 shows p < 0.05 for at least 2 benchmarks on 9 out of 10 models, and all 6 benchmarks are flagged for at least some models. Section 5.2.", 385 "supported": "moderate" 386 }, 387 { 388 "claim": "The method is robust to choice of rephrase model and random seeds.", 389 "evidence": "Table 9 (Appendix C) shows consistent results with Llama vs. Mistral for rephrasing. Table 7 shows consistent significance across 5 random seeds.", 390 "supported": "strong" 391 }, 392 { 393 "claim": "Guided-Prompting, DCQ, and Min-k% Prob are ineffective for benchmark contamination detection.", 394 "evidence": "Table 2 shows Guided-Prompting fails to detect contamination and generates false positives. Table 5 (Appendix B) shows DCQ accuracy worse than random. Table 6 shows Min-k% Prob conflates accuracy with contamination detection.", 395 "supported": "moderate" 396 } 397 ], 398 "red_flags": [ 399 { 400 "flag": "No multiple comparison correction", 401 "detail": "Table 4 reports 60 significance tests at p < 0.05; if no model-benchmark pair were contaminated, about 3 would still be expected to fall below the threshold by chance alone. Without Bonferroni or similar correction, some of the 'contaminated' findings may be spurious.
This weakens the claim that 'almost all models and benchmarks are contaminated.'" 402 }, 403 { 404 "flag": "Model anonymization prevents verification", 405 "detail": "Table 4 anonymizes the 10 models as 'Model I' through 'Model X,' preventing independent verification of specific contamination claims and making the results unreproducible. While ethically motivated, this undermines scientific transparency." 406 }, 407 { 408 "flag": "No effect sizes reported", 409 "detail": "Only p-values are reported for all experiments. The magnitude of confidence differences (mean d-bar, Cohen's d) is never shown, making it impossible to assess whether the detected contamination signal is practically meaningful or just statistically detectable." 410 }, 411 { 412 "flag": "Suspiciously perfect controlled results", 413 "detail": "PaCoST shows zero false positives and zero false negatives across all controlled conditions in Tables 2 and 3, while all baselines fail. The lack of any failure mode in controlled experiments, combined with missing effect sizes, makes it difficult to assess the method's true robustness boundary." 414 }, 415 { 416 "flag": "Construct validity gap for rephrasing assumption", 417 "detail": "The method assumes confidence differences between original and rephrased questions indicate contamination. However, rephrasing could introduce subtle difficulty changes, unfamiliar phrasing, or distribution shifts unrelated to contamination. This alternative explanation is not tested or discussed." 418 } 419 ], 420 "cited_papers": [ 421 { 422 "title": "Detecting Pretraining Data from Large Language Models", 423 "authors": ["Weijia Shi", "Anirudh Ajith", "Mengzhou Xia", "Yangsibo Huang", "Daogao Liu", "Terra Blevins", "Danqi Chen", "Luke Zettlemoyer"], 424 "year": 2024, 425 "relevance": "Proposes Min-K% Prob for detecting training data membership — a key baseline for contamination detection methods." 
426 }, 427 { 428 "title": "Time Travel in LLMs: Tracing Data Contamination in Large Language Models", 429 "authors": ["Shahriar Golchin", "Mihai Surdeanu"], 430 "year": 2023, 431 "arxiv_id": "2308.08493", 432 "relevance": "Introduces Guided-Prompting for contamination detection via data replication, directly compared as a baseline." 433 }, 434 { 435 "title": "Data Contamination Quiz: A Tool to Detect and Estimate Contamination in Large Language Models", 436 "authors": ["Shahriar Golchin", "Mihai Surdeanu"], 437 "year": 2023, 438 "arxiv_id": "2311.06233", 439 "relevance": "Proposes the DCQ framework for contamination detection via multiple-choice quizzes, evaluated as a baseline." 440 }, 441 { 442 "title": "Generalization or Memorization: Data Contamination and Trustworthy Evaluation for Large Language Models", 443 "authors": ["Yihong Dong", "Xue Jiang", "Huanyu Liu", "Zhi Jin", "Ge Li"], 444 "year": 2024, 445 "arxiv_id": "2402.15938", 446 "relevance": "Proposes CDD and TED methods for contamination detection based on LLM output distribution analysis." 447 }, 448 { 449 "title": "NLP Evaluation in Trouble: On the Need to Measure LLM Data Contamination for Each Benchmark", 450 "authors": ["Oscar Sainz", "Jon Ander Campos", "Iker García-Ferrero", "Julen Etxaniz", "Oier Lopez de Lacalle", "Eneko Agirre"], 451 "year": 2023, 452 "arxiv_id": "2310.18018", 453 "relevance": "Highlights the pervasive data contamination problem in LLM evaluation and the need for per-benchmark contamination measurement." 454 }, 455 { 456 "title": "Proving Test Set Contamination in Black Box Language Models", 457 "authors": ["Yonatan Oren", "Nicole Meister", "Niladri Chatterji", "Faisal Ladhak", "Tatsunori B Hashimoto"], 458 "year": 2023, 459 "arxiv_id": "2310.17623", 460 "relevance": "Develops sharded-likelihood method for contamination detection in black-box models, addressing a complementary setting to PaCoST." 
461 }, 462 { 463 "title": "Rethinking Benchmark and Contamination for Language Models with Rephrased Samples", 464 "authors": ["Shuo Yang", "Wei-Lin Chiang", "Lianmin Zheng", "Joseph E Gonzalez", "Ion Stoica"], 465 "year": 2023, 466 "arxiv_id": "2311.04850", 467 "relevance": "Studies how rephrasing benchmark samples affects LLM performance, directly motivating PaCoST's rephrasing approach." 468 }, 469 { 470 "title": "The WMDP Benchmark: Measuring and Reducing Malicious Use with Unlearning", 471 "authors": ["Nathaniel Li", "Alexander Pan", "Anjali Gopal"], 472 "year": 2024, 473 "arxiv_id": "2403.03218", 474 "relevance": "Provides the WMDP benchmark used for controlled contamination experiments, enabling temporal separation from model training." 475 }, 476 { 477 "title": "A Careful Examination of Large Language Model Performance on Grade School Arithmetic", 478 "authors": ["Hugh Zhang", "Jeff Da", "Dean Lee"], 479 "year": 2024, 480 "arxiv_id": "2405.00332", 481 "relevance": "Examines LLM benchmark performance with attention to contamination concerns in arithmetic evaluation." 482 }, 483 { 484 "title": "Scalable Extraction of Training Data from (Production) Language Models", 485 "authors": ["Milad Nasr", "Nicholas Carlini", "Jonathan Hayase"], 486 "year": 2023, 487 "arxiv_id": "2311.17035", 488 "relevance": "Demonstrates extraction of training data from production LLMs, establishing the feasibility and severity of memorization in large models." 489 }, 490 { 491 "title": "Extracting Training Data from Large Language Models", 492 "authors": ["Nicholas Carlini", "Florian Tramer", "Eric Wallace"], 493 "year": 2021, 494 "arxiv_id": "2012.07805", 495 "relevance": "Foundational work on training data extraction from language models, establishing the field of LLM membership inference." 
496 }, 497 { 498 "title": "GPT-4 Technical Report", 499 "authors": ["OpenAI"], 500 "year": 2023, 501 "arxiv_id": "2303.08774", 502 "relevance": "Documents GPT-4's contamination analysis using string matching, illustrating the limitations of vendor-side contamination detection." 503 } 504 ], 505 "engagement_factors": { 506 "practical_relevance": { 507 "score": 2, 508 "justification": "Researchers and benchmark maintainers can apply PaCoST to audit LLMs for contamination, though it requires probability distribution access limiting applicability to open-source models." 509 }, 510 "surprise_contrarian": { 511 "score": 2, 512 "justification": "The finding that 'almost all models and benchmarks are suspected contaminated' challenges trust in standard LLM leaderboard rankings." 513 }, 514 "fear_safety": { 515 "score": 1, 516 "justification": "Raises concerns about unreliable LLM evaluation but does not demonstrate a novel attack or safety risk." 517 }, 518 "drama_conflict": { 519 "score": 2, 520 "justification": "Implies widespread benchmark cheating across the LLM industry, though model anonymization softens the controversy." 521 }, 522 "demo_ability": { 523 "score": 0, 524 "justification": "Code is promised but not released; no demo or installable tool available." 525 }, 526 "brand_recognition": { 527 "score": 1, 528 "justification": "From Peking University; tests popular models (Llama, Mistral, Qwen) but not from a headline lab." 529 } 530 } 531 }