ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (33469B)


      1 {
      2   "paper": {
      3     "title": "The Emperor's New Clothes in Benchmarking? A Rigorous Examination of Mitigation Strategies for LLM Benchmark Data Contamination",
      4     "authors": [
      5       "Yifan Sun",
      6       "Han Wang",
      7       "Dongbai Li",
      8       "Gang Wang",
      9       "Huan Zhang"
     10     ],
     11     "year": 2025,
     12     "venue": "Preprint",
     13     "arxiv_id": "2503.16402",
     14     "doi": "10.48550/arXiv.2503.16402"
     15   },
     16   "scan_version": 3,
     17   "active_modules": ["experimental_rigor", "data_leakage"],
     18   "methodology_tags": ["benchmark-eval"],
     19   "key_findings": "This paper introduces two metrics—fidelity and contamination resistance—for assessing benchmark data contamination (BDC) mitigation strategies at the question level rather than by aggregate accuracy. Testing 20 strategies across 10 LLMs (3B–34B) and 5 benchmarks under mild and intensive contamination, the authors find that no existing strategy significantly outperforms the vanilla case (no update) in resistance across all benchmarks. Minor modifications preserve fidelity but offer negligible resistance; aggressive modifications improve resistance at the cost of fidelity, with no strategy achieving both simultaneously.",
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "The abstract states: 'Our code repository is available at https://github.com/ASTRAL-Group/BDC_mitigation_assessment.' A working URL is provided."
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "All five benchmarks used (Arc-C, MMLU, TruthfulQA, GSM8K, RepliQA) are publicly available standard datasets. The paper uses standard public splits documented in Table 8."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "Table 12 lists GPU hardware (9x NVIDIA L40S) and training hyperparameters, but no Python version, library versions, requirements.txt, Dockerfile, or dependency specifications are provided in the paper."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper provides a detailed pipeline description (Sections 4.1–4.5) and a code repository link, but no step-by-step reproduction instructions, README commands, or 'Reproducing Results' section are included in the paper itself."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "Table 3 reports averages over 10 LLMs but includes no confidence intervals, error bars, or ± notation. Only point estimates are presented."
     48       },
     49       "significance_tests": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "One-sided paired hypothesis testing at the 0.05 significance level is used throughout (Section 5.1). Green-highlighted values in Tables 3 and 4 indicate statistically significant improvements over vanilla."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Fidelity and resistance scores are reported as absolute values alongside the vanilla baseline (e.g., MPA resistance 0.921 vs. vanilla 0.923 in Table 3), providing full context for the magnitude of differences."
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "The paper uses 10 LLMs, 5 benchmarks, and 20 strategies. They explain model selection (14 candidates filtered to 10 via contamination checks) but provide no power analysis or justification that 10 LLMs is sufficient for their statistical claims."
     63       },
     64       "variance_reported": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "Table 3 states 'Each value represents the average of 10 scores obtained using different LLMs' but no standard deviation, interquartile range, or other spread measure is reported."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "The 'vanilla' case (no benchmark update, fidelity=1.000) serves as the baseline throughout. All 20 mitigation strategies are compared against it in Tables 3 and 4."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "The 20 mitigation strategies are from contemporary work (Zhu et al. 2023b, 2024a/b; Ying et al. 2024; Wang et al. 2021). The vanilla baseline is the natural comparison point. The paper claims to 'comprehensively cover all existing BDC mitigation strategies proposed to date' (Section 4.3)."
     80       },
     81       "ablation_study": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "The study compares 11 single strategies and their combinations (e.g., MPA = S2+S3+S4+S9+S10+S11 vs. individual components S2, S3, S4, etc.), showing how combining strategies affects fidelity and resistance. Two contamination recipes (mild and intensive) also serve as controlled variable manipulation."
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Two complementary metrics are proposed and used: fidelity (matching between clean evaluations on original vs. updated benchmark) and contamination resistance (matching between clean and contaminated evaluations on updated benchmark). Section 3 defines both formally."
     90       },
     91       "human_evaluation": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "All evaluation is automated (evaluation vectors, normalized Hamming distance). The paper notes the need for manual validation of low-fidelity strategies (Section 5.3, Tables 5–6) but does not actually perform systematic human evaluation."
     95       },
     96       "held_out_test_set": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Table 8 specifies standard splits: Arc-C test (1172), MMLU test (50 per subset), TruthfulQA validation (817), GSM8K test (1319), RepliQA (1000). These are held out from contamination tuning."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Results are broken down per benchmark (5 benchmarks), per strategy (20 strategies), and per contamination type (mild vs. intensive) in Tables 3 and 4. Figure 4 aggregates at the benchmark level."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Section 5.3 provides qualitative examples of strategy failures: Table 5 shows Analysis Extension dramatically increasing problem complexity, Table 6 shows MPA introducing incorrect constraints. Appendix C.4.2 shows LLM-generated wrong answers."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The central finding is a negative result: 'no existing strategy significantly improves resistance over the vanilla case across all benchmarks' and 'none achieves strong fidelity and contamination resistance simultaneously' (Section 5.1, Section 6)."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The abstract claims 'no existing strategy significantly improves resistance over the vanilla case across all benchmarks' and 'none effectively balances fidelity and contamination resistance.' Both are supported by Table 3 (no green-highlighted values span all benchmarks for any strategy) and Figure 4 (no strategy in upper-right corner)."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": true,
    126       "justification": "The paper uses a controlled experimental design: deliberately selecting uncontaminated LLMs (verified via 3 detection methods), manually introducing contamination via fine-tuning, and measuring effects on evaluation vectors. This controlled manipulation supports the paper's causal claims about how contamination changes evaluation outcomes under each mitigation strategy."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "Claims are bounded to the tested setting: 10 specific LLMs (3B–34B, listed in Table 7), 5 specific benchmarks (Table 8), 20 specific strategies (Table 2), and two contamination recipes. The paper states it covers 'all existing BDC mitigation strategies proposed to date' but does not overclaim to untested models or benchmarks."
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "The paper does not discuss whether fine-tuning-based contamination (their method) may behave fundamentally differently from pre-training contamination. It does not consider whether their metrics might be insensitive to certain forms of strategy effectiveness, or whether model size (3B–34B only) limits conclusions about frontier models."
    137       },
    138       "proxy_outcome_distinction": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "The paper measures exactly what it claims: question-level evaluation matching via normalized Hamming distance. The metrics (fidelity and resistance) directly correspond to the stated goals without proxy gaps. Section 3 provides formal definitions."
    142       }
    143     },
    144     "setup_transparency": {
    145       "model_versions_specified": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "Table 7 lists specific model versions: Llama-3.2-3B-Instruct, Yi-1.5-6B-Chat, vicuna-7b-v1.5, Llama-3.1-8B-Instruct, Falcon3-10B-Instruct, Qwen2.5-14B-Instruct, Phi-3-medium-128k-instruct, DeepSeek-V2-Lite-Chat, Qwen2.5-32B-Instruct, Yi-1.5-34B-Chat. GPT-4o-2024-08-06 is specified for mitigation strategy generation."
    149       },
    150       "prompts_provided": {
    151         "applies": true,
    152         "answer": false,
    153         "justification": "The evaluation template ('Question:{input}\\n Answer:') and full 5-shot GSM8K prompt are provided in Appendix C.5. However, the GPT-4o prompts used to generate the 20 mitigated benchmarks—a key part of the methodology—are not provided. Only natural-language descriptions and output examples are given."
    154       },
    155       "hyperparameters_reported": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Table 12 reports fine-tuning details: AdamW optimizer, batch sizes 2/3/4, learning rates 1e-5/3e-5, linear LR schedule, weight decay 0, warmup ratio 5%, epochs 1/3. GPT-4o temperature 0.7 is stated for mitigation. Evaluation settings (zero-shot, 5-shot, max tokens) are in Appendix C.5."
    159       },
    160       "scaffolding_described": {
    161         "applies": false,
    162         "answer": false,
    163         "justification": "No agentic scaffolding is used. The paper uses standard LLM fine-tuning and evaluation without agent-based workflows."
    164       },
    165       "data_preprocessing_documented": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "The full pipeline is documented: 14 candidate LLMs filtered to 10 via 3 contamination detection methods (Tables 9–11), benchmark selection with rationale (Section 4.2), contamination recipes with validation checks (Section 4.4, Tables 13–15), and evaluation procedures (Section 4.5)."
    169       }
    170     },
    171     "limitations_and_scope": {
    172       "limitations_section_present": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "The paper has no dedicated Limitations or Threats-to-Validity section. The 'Impact Statement' (after Section 6) is two sentences and does not discuss limitations of the methodology."
    176       },
    177       "threats_to_validity_specific": {
    178         "applies": true,
    179         "answer": false,
    180         "justification": "No specific threats to validity are discussed anywhere in the paper. Key unaddressed threats include: fine-tuning vs. pre-training contamination differences, model size limitations (3B–34B only), and whether Hamming distance captures all relevant dimensions of strategy effectiveness."
    181       },
    182       "scope_boundaries_stated": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "The paper does not explicitly state what the results do NOT show. It does not acknowledge limitations of its model size range, benchmark selection, or contamination simulation method. No explicit boundary statements about untested settings."
    186       }
    187     },
    188     "data_integrity": {
    189       "raw_data_available": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "A code repository is provided but the paper does not explicitly state that raw evaluation vectors, contaminated model weights, or per-question results are available for independent verification."
    193       },
    194       "data_collection_described": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "Data collection is described in detail: LLM selection via 3 detection methods on 14 candidates (Section 4.2, Tables 9–11), benchmark selection with rationale and splits (Section 4.2, Table 8), contamination procedure (Section 4.4, Table 12), and evaluation procedure (Section 4.5)."
    198       },
    199       "recruitment_methods_described": {
    200         "applies": false,
    201         "answer": false,
    202         "justification": "No human participants. Data sources are standard public benchmarks."
    203       },
    204       "data_pipeline_documented": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "Figure 3 provides an overview of the full pipeline. Each step is documented: LLM-benchmark pair selection with contamination checks, mitigation strategy application, model contamination with two recipes and three validation checks (accuracy inflation, retained correctness, perplexity), evaluation vector computation, and metric derivation."
    208       }
    209     },
    210     "conflicts_of_interest": {
    211       "funding_disclosed": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No funding source, acknowledgments section, or grant numbers are mentioned anywhere in the paper."
    215       },
    216       "affiliations_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "All authors are listed as affiliated with University of Illinois Urbana-Champaign. No conflict with evaluated products (they evaluate open-source models and use GPT-4o only for benchmark generation, not as the subject of evaluation)."
    220       },
    221       "funder_independent_of_outcome": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "No funding is disclosed, so independence of the funder cannot be assessed."
    225       },
    226       "financial_interests_declared": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No competing interests statement or financial interest disclosure is present in the paper."
    230       }
    231     },
    232     "contamination": {
    233       "training_cutoff_stated": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "Training data cutoff dates are not stated for any of the 10 LLMs. The paper instead relies on contamination detection methods (Min-K% Prob, Sharded Rank Comparison Test, TS-Guessing) to verify non-contamination rather than using temporal cutoffs."
    237       },
    238       "train_test_overlap_discussed": {
    239         "applies": true,
    240         "answer": true,
    241       "justification": "Train/test overlap is the central topic of the paper. Three BDC detection methods are applied to every LLM-benchmark pair formed from the 14 candidate LLMs (Section 4.2, Tables 9–11). Only models passing all three detection methods on all benchmarks are retained."
    242       },
    243       "benchmark_contamination_addressed": {
    244         "applies": true,
    245         "answer": true,
    246         "justification": "The entire paper addresses benchmark contamination. They verify non-contamination before experiments (Section 4.2), deliberately introduce contamination (Section 4.4), validate contamination effectiveness (Tables 13–15), and measure mitigation strategy resistance to contamination."
    247       }
    248     },
    249     "human_studies": {
    250       "pre_registered": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in this study. All experiments involve LLMs evaluated on benchmarks."
    254       },
    255       "irb_or_ethics_approval": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants."
    259       },
    260       "demographics_reported": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants."
    264       },
    265       "inclusion_exclusion_criteria": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants."
    269       },
    270       "randomization_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants."
    274       },
    275       "blinding_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants."
    279       },
    280       "attrition_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants."
    284       }
    285     },
    286     "cost_and_practicality": {
    287       "inference_cost_reported": {
    288         "applies": true,
    289         "answer": false,
    290         "justification": "No inference costs, API costs, or per-example compute costs are reported despite the study requiring extensive LLM evaluations across 10 models × 5 benchmarks × 20 strategies × 2 contamination recipes, plus GPT-4o calls for generating mitigated benchmarks."
    291       },
    292       "compute_budget_stated": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "Table 12 mentions '9x NVIDIA L40S' GPUs but does not quantify total GPU hours, training time, or overall computational budget for the full experimental campaign."
    296       }
    297     },
    298     "experimental_rigor": {
    299       "seed_sensitivity_reported": {
    300         "applies": true,
    301         "answer": false,
    302         "justification": "No seed sensitivity analysis is reported. Fine-tuning involves stochastic optimization, but no results across multiple random seeds are provided. Variation comes only from different LLMs."
    303       },
    304       "number_of_runs_stated": {
    305         "applies": true,
    306         "answer": false,
    307         "justification": "The number of experimental runs per LLM-benchmark-strategy combination is not stated. Each value is averaged across 10 different LLMs, but whether each individual experiment was run once or multiple times is not specified."
    308       },
    309       "hyperparameter_search_budget": {
    310         "applies": true,
    311         "answer": false,
    312       "justification": "Fixed hyperparameters are reported in Table 12 (e.g., learning rate 1e-5 or 3e-5 depending on benchmark type) but no hyperparameter search is described. The rationale for choosing these specific values is not provided."
    313       },
    314       "best_config_selection_justified": {
    315         "applies": true,
    316         "answer": false,
    317         "justification": "Different learning rates are used for multiple-choice (1e-5) vs. open-ended (3e-5) benchmarks without justification for how these values were selected. No validation-based selection process is described."
    318       },
    319       "multiple_comparison_correction": {
    320         "applies": true,
    321         "answer": false,
    322         "justification": "One-sided paired hypothesis tests are performed for each of 20 strategies across 5 benchmarks and 2 contamination types, yielding up to 200 comparisons. No multiple comparison correction (Bonferroni, Holm, etc.) is mentioned."
    323       },
    324       "self_comparison_bias_addressed": {
    325         "applies": true,
    326         "answer": false,
    327         "justification": "The paper introduces two novel combination strategies (MPA-Ques+Trans-CN and MPA-Choice+Trans-CN) alongside 18 existing strategies but does not acknowledge potential bias in evaluating their own proposed combinations within the same framework."
    328       },
    329       "compute_budget_vs_performance": {
    330         "applies": true,
    331         "answer": false,
    332         "justification": "Different mitigation strategies have vastly different computational costs (e.g., adding a typo vs. translation via GPT-4o vs. generating entirely new questions), but performance is never analyzed as a function of compute cost."
    333       },
    334       "benchmark_construct_validity": {
    335         "applies": true,
    336         "answer": true,
    337         "justification": "The fidelity metric directly addresses construct validity: it measures whether the updated benchmark still evaluates the same model capability as the original. Section 3 and Figure 2(b) explicitly argue that matching aggregate accuracy is insufficient because the evaluation objective may have shifted. Tables 5–6 provide concrete examples."
    338       },
    339       "scaffold_confound_addressed": {
    340         "applies": false,
    341         "answer": false,
    342         "justification": "No scaffolding is involved. The paper evaluates LLMs directly on benchmarks without agent scaffolding."
    343       }
    344     },
    345     "data_leakage": {
    346       "temporal_leakage_addressed": {
    347         "applies": true,
    348         "answer": true,
    349         "justification": "RepliQA is explicitly included because 'Its recent release (December 9, 2024) and non-factual nature ensure that none of the LLMs in our study have been contaminated by this benchmark' (Section 4.2). All other model-benchmark pairs are verified via three detection methods."
    350       },
    351       "feature_leakage_addressed": {
    352         "applies": true,
    353         "answer": false,
    354         "justification": "The paper does not discuss whether the evaluation format (e.g., multiple-choice probability-based selection) or prompt structure could leak answer information, or whether mitigation strategies inadvertently introduce feature leakage."
    355       },
    356       "non_independence_addressed": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "The paper does not discuss potential non-independence between training data and benchmark examples beyond the contamination detection step. Structural similarities or near-duplicates between benchmark questions and training corpora are not analyzed."
    360       },
    361       "leakage_detection_method": {
    362         "applies": true,
    363         "answer": true,
    364         "justification": "Three concrete detection methods are applied: Min-K% Prob (token probability-based, Table 11), Sharded Rank Comparison Test (order-based, Table 9), and TS-Guessing (generation-based, Table 10). These are applied to all 14 candidate LLMs across 4 benchmarks."
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "No existing BDC mitigation strategy significantly improves contamination resistance over the vanilla case (no benchmark update) across all benchmarks.",
    371       "evidence": "Table 3 shows that while some strategies achieve statistically significant resistance improvements on specific benchmarks (green-highlighted values), no single strategy achieves significant improvement across all 5 benchmarks. Significance is assessed with one-sided paired hypothesis testing at the 0.05 level (Section 5.1).",
    372       "supported": "strong"
    373     },
    374     {
    375       "claim": "No existing mitigation strategy achieves both high fidelity and high contamination resistance simultaneously.",
    376       "evidence": "Figure 4 shows strategies clustering in either the lower-right (high fidelity, low resistance) or upper-left (high resistance, low fidelity) regions. No strategy occupies the upper-right corner. Section 5.1 discusses this trade-off explicitly.",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "Minor modifications (typos, synonyms) achieve high fidelity (>0.9) but do not improve resistance beyond vanilla.",
    381       "evidence": "Table 3 shows typographical perturbation and synonym replacement achieve fidelity >0.9 on most benchmarks, but their resistance scores are not statistically significantly higher than vanilla on any benchmark.",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "Semantic-altering strategies achieve statistically significantly higher resistance than vanilla but at the cost of ~0.15 lower fidelity.",
    386       "evidence": "Table 4 shows all four semantic-altering strategies (Mimicking, Remember-Understand, Application, Analysis Extension) achieve significant resistance improvements on Arc-C and MMLU (all green-highlighted), with fidelity ranging from 0.655 to 0.766 compared to ~0.85–0.95 for semantic-preserving strategies.",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "Intensive contamination leads to lower resistance scores than mild contamination, but the relative ranking of strategies remains stable.",
    391       "evidence": "Table 3 shows resistance under intensive contamination is consistently lower than under mild contamination across all strategies and benchmarks. Section 5.1 states 'strategies that perform well under mild contamination continue to rank highly under intensive contamination.'",
    392       "supported": "strong"
    393     },
    394     {
    395       "claim": "Existing accuracy-based assessment methods (accuracy drop, accuracy matching) are incomplete and can be misleading.",
    396       "evidence": "Figure 2 illustrates the limitations: accuracy drop lacks a clean baseline reference, and accuracy matching can show matching scalar accuracy while question-level results differ significantly. This motivates the question-level metrics (Section 1).",
    397       "supported": "moderate"
    398     }
    399   ],
    400   "red_flags": [
    401     {
    402       "flag": "No limitations section",
    403       "detail": "Despite making strong negative claims about all existing BDC mitigation strategies, the paper has no Limitations or Threats-to-Validity section. Key unaddressed limitations include: whether fine-tuning adequately simulates real-world pre-training contamination, the 3B–34B model size range excluding frontier models, and whether Hamming distance captures all relevant dimensions of strategy effectiveness."
    404     },
    405     {
    406       "flag": "No multiple comparison correction",
    407       "detail": "The paper performs up to 200 one-sided paired hypothesis tests (20 strategies × 5 benchmarks × 2 contamination types) without any correction for multiple comparisons. This inflates the family-wise error rate and could produce false-positive significant results among the green-highlighted cells."
    408     },
    409     {
    410       "flag": "Fine-tuning as contamination proxy",
    411       "detail": "Contamination is introduced via full-parameter fine-tuning (1 or 3 epochs), which is a much more direct form of memorization than incidental inclusion during pre-training on massive corpora. The paper's two recipes (mild and intensive) vary only in intensity, not in the fundamental mechanism. Results may not transfer to real-world contamination scenarios."
    412     },
    413     {
    414       "flag": "No variance reported for averaged metrics",
    415       "detail": "Table 3 averages fidelity and resistance over 10 LLMs without reporting any spread measure (std dev, IQR, range). The reader cannot assess whether the averaged metrics hide high variability across models."
    416     }
    417   ],
    418   "cited_papers": [
    419     {
    420       "title": "Don't make your LLM an evaluation benchmark cheater",
    421       "authors": ["Kun Zhou", "Yutao Zhu", "Zhipeng Chen"],
    422       "year": 2023,
    423       "arxiv_id": "2311.01964",
    424       "relevance": "Directly addresses LLM benchmark cheating and BDC detection, a foundational paper for the contamination mitigation space."
    425     },
    426     {
    427       "title": "Benchmark data contamination of large language models: A survey",
    428       "authors": ["Cheng Xu", "Shuhao Guan", "Derek Greene"],
    429       "year": 2024,
    430       "arxiv_id": "2406.04244",
    431       "relevance": "Comprehensive survey of LLM benchmark data contamination, directly relevant to the survey's evaluation reliability theme."
    432     },
    433     {
    434       "title": "Clean-eval: Clean evaluation on contaminated large language models",
    435       "authors": ["Wenhao Zhu", "Hongyi Hao", "Zhiwei He"],
    436       "year": 2023,
    437       "arxiv_id": "2311.09154",
    438       "relevance": "One of the key BDC mitigation strategies evaluated in this paper (S12), combining syntactic modification, synonym replacement, and back-translation."
    439     },
    440     {
    441       "title": "Inference-time decontamination: Reusing leaked benchmarks for large language model evaluation",
    442       "authors": ["Qin Zhu", "Qingyuan Cheng", "Runyu Peng"],
    443       "year": 2024,
    444       "arxiv_id": "2406.13990",
    445       "relevance": "Proposes the ITD mitigation strategy (S13) that is one of the most extensively evaluated approaches in this paper."
    446     },
    447     {
    448       "title": "Dynamic evaluation of large language models by meta probing agents",
    449       "authors": ["Kaijie Zhu", "Jiaao Wang", "Qiang Zhao"],
    450       "year": 2024,
    451       "relevance": "Proposes the MPA mitigation strategy (S14), one of the combined strategies that achieves higher resistance but lower fidelity."
    452     },
    453     {
    454       "title": "Automating dataset updates towards reliable and timely evaluation of large language models",
    455       "authors": ["Jiahao Ying", "Yixin Cao", "Yushi Bai"],
    456       "year": 2024,
    457       "relevance": "Proposes semantic-altering mitigation strategies (Mimicking, Remember-Understand, Application, Analysis Extension), which are evaluated in this paper as strategies S17–S20."
    458     },
    459     {
    460       "title": "LiveBench: A challenging, contamination-free LLM benchmark",
    461       "authors": ["Colin White", "Samuel Dooley", "Manley Roberts"],
    462       "year": 2024,
    463       "arxiv_id": "2406.19314",
    464       "relevance": "Proposes a contamination-free benchmark approach as an alternative to mitigation strategies, relevant to evaluation reliability."
    465     },
    466     {
    467       "title": "Detecting pretraining data from large language models",
    468       "authors": ["Weijia Shi", "Anirudh Ajith", "Mengzhou Xia"],
    469       "year": 2023,
    470       "arxiv_id": "2310.16789",
    471       "relevance": "Proposes Min-K% Prob, one of the three contamination detection methods used in this paper's pipeline to verify model non-contamination."
    472     },
    473     {
    474       "title": "Proving test set contamination in black box language models",
    475       "authors": ["Yonatan Oren", "Nicole Meister", "Niladri Chatterji"],
    476       "year": 2023,
    477       "arxiv_id": "2310.17623",
    478       "relevance": "Proposes the Sharded Rank Comparison Test used as one of three contamination detection methods in this paper."
    479     },
    480     {
    481       "title": "Investigating data contamination in modern benchmarks for large language models",
    482       "authors": ["Chunyuan Deng", "Yilun Zhao", "Xiangru Tang"],
    483       "year": 2023,
    484       "arxiv_id": "2311.09783",
    485       "relevance": "Proposes TS-Guessing, the generation-based contamination detection method used as one of three detection methods in this paper."
    486     },
    487     {
    488       "title": "Evading data contamination detection for language models is (too) easy",
    489       "authors": ["Jasper Dekoninck", "Mark Niklas Müller", "Maximilian Baader"],
    490       "year": 2024,
    491       "arxiv_id": "2402.02823",
    492       "relevance": "Demonstrates that contamination detection can be evaded, motivating the need for robust contamination mitigation assessment."
    493     },
    494     {
    495       "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code",
    496       "authors": ["Naman Jain", "King Han", "Alex Gu"],
    497       "year": 2024,
    498       "arxiv_id": "2403.07974",
    499       "relevance": "Contamination-free code evaluation benchmark, relevant to the survey's coverage of evaluation integrity for LLM code generation."
    500     },
    501     {
    502       "title": "Top leaderboard ranking = top coding proficiency, always? EvoEval: Evolving coding benchmarks via LLM",
    503       "authors": ["Chunqiu Steven Xia", "Yinlin Deng", "Lingming Zhang"],
    504       "year": 2024,
    505       "arxiv_id": "2403.19114",
    506       "relevance": "Questions whether leaderboard rankings reflect true coding proficiency and proposes evolving benchmarks via LLMs, directly relevant to benchmark reliability."
    507     }
    508   ],
    509   "engagement_factors": {
    510     "practical_relevance": {
    511       "score": 2,
    512       "justification": "The framework and metrics could be used by benchmark developers to assess mitigation strategies, but it requires significant compute infrastructure and is not a drop-in tool."
    513     },
    514     "surprise_contrarian": {
    515       "score": 3,
    516       "justification": "The 'Emperor's New Clothes' framing directly challenges the widely held belief that benchmark update strategies effectively mitigate contamination—finding that none significantly outperforms doing nothing across all benchmarks."
    517     },
    518     "fear_safety": {
    519       "score": 1,
    520       "justification": "Raises concerns about unreliable LLM evaluation due to contamination but does not demonstrate a novel attack or immediate safety risk."
    521     },
    522     "drama_conflict": {
    523       "score": 2,
    524       "justification": "The provocative title and the finding that current mitigation strategies are essentially ineffective create a 'benchmarks are broken' narrative, though it targets methodology rather than specific companies."
    525     },
    526     "demo_ability": {
    527       "score": 1,
    528       "justification": "Code repository is provided but running the full pipeline requires fine-tuning 10 LLMs across multiple configurations on multiple GPUs—not a quick demo."
    529     },
    530     "brand_recognition": {
    531       "score": 1,
    532       "justification": "From the University of Illinois Urbana-Champaign, which has a well-known CS department but is not a famous AI lab like OpenAI, Google DeepMind, or Anthropic."
    533     }
    534   }
    535 }

Impressum · Datenschutz