ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v4.json (35463B)


      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "The Emperor's New Clothes in Benchmarking? A Rigorous Examination of Mitigation Strategies for LLM Benchmark Data Contamination",
      6     "authors": [
      7       "Yifan Sun",
      8       "Han Wang",
      9       "Dongbai Li",
     10       "Gang Wang",
     11       "Huan Zhang"
     12     ],
     13     "year": 2025,
     14     "venue": "International Conference on Machine Learning",
     15     "arxiv_id": "2503.16402",
     16     "doi": "10.48550/arXiv.2503.16402"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The abstract claims 'no existing strategy significantly improves resistance over the vanilla case across all benchmarks' and 'none effectively balances fidelity and contamination resistance.' Both are supported by Table 3 (no green-highlighted values span all benchmarks for any strategy) and Figure 4 (no strategy in upper-right corner).",
     24         "source": "opus"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The paper uses a controlled experimental design: deliberately selecting uncontaminated LLMs (verified via 3 detection methods), manually introducing contamination via fine-tuning, and measuring effects on evaluation vectors. This controlled manipulation supports the paper's causal claims about how contamination alters benchmark evaluations and how well specific mitigation strategies resist that effect.",
     30         "source": "opus"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "Claims are bounded to the tested setting: 10 specific LLMs (3B–34B, listed in Table 7), 5 specific benchmarks (Table 8), 20 specific strategies (Table 2), and two contamination recipes. The paper states it covers 'all existing BDC mitigation strategies proposed to date' but does not overclaim to untested models or benchmarks.",
     36         "source": "opus"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper does not discuss whether fine-tuning-based contamination (their method) may behave fundamentally differently from pre-training contamination. It does not consider whether their metrics might be insensitive to certain forms of strategy effectiveness, or whether model size (3B–34B only) limits conclusions about frontier models.",
     42         "source": "opus"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper measures exactly what it claims: question-level evaluation matching via normalized Hamming distance. The metrics (fidelity and resistance) directly correspond to the stated goals without proxy gaps. Section 3 provides formal definitions.",
     48         "source": "opus"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The paper has no dedicated Limitations or Threats-to-Validity section. The 'Impact Statement' (after Section 6) is two sentences and does not discuss limitations of the methodology.",
     56         "source": "opus"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No specific threats to validity are discussed anywhere in the paper. Key unaddressed threats include: fine-tuning vs. pre-training contamination differences, model size limitations (3B–34B only), and whether Hamming distance captures all relevant dimensions of strategy effectiveness.",
     62         "source": "opus"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper does not explicitly state what the results do NOT show. It does not acknowledge limitations of its model size range, benchmark selection, or contamination simulation method. No explicit boundary statements about untested settings.",
     68         "source": "opus"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding source, acknowledgments section, or grant numbers are mentioned anywhere in the paper.",
     76         "source": "opus"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All authors are listed as affiliated with the University of Illinois Urbana-Champaign. There is no apparent conflict with the evaluated products: the authors evaluate open-source models and use GPT-4o only for benchmark generation, not as the subject of evaluation.",
     82         "source": "opus"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "No funding is disclosed, so independence of the funder cannot be assessed.",
     88         "source": "opus"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement or financial interest disclosure is present in the paper.",
     94         "source": "opus"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Benchmark Data Contamination is defined in the introduction; fidelity and contamination resistance are formally defined with notation in Section 3; contamination scenarios (clean/contaminated/mitigated) are defined in Table 1.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper explicitly states three contributions: two novel metrics, a controlled pipeline with triple contamination verification checks, and comprehensive experiments revealing that no existing strategy works across all benchmarks.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The paper engages substantively with prior work, demonstrating why existing assessment methods are flawed (Fig. 2), categorizing 20 strategies from prior literature, and situating their framework relative to both mitigation and detection research streams.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "The abstract states: 'Our code repository is available at https://github.com/ASTRAL-Group/BDC_mitigation_assessment.' A working URL is provided.",
    125           "source": "opus"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "All five benchmarks used (Arc-C, MMLU, TruthfulQA, GSM8K, RepliQA) are publicly available standard datasets. The paper uses standard public splits documented in Table 8.",
    131           "source": "opus"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "Table 12 lists GPU hardware (9x NVIDIA L40S) and training hyperparameters, but no Python version, library versions, requirements.txt, Dockerfile, or dependency specifications are provided in the paper.",
    137           "source": "opus"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "The paper provides a detailed pipeline description (Sections 4.1–4.5) and a code repository link, but no step-by-step reproduction instructions, README commands, or 'Reproducing Results' section are included in the paper itself.",
    143           "source": "opus"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "Table 3 reports averages over 10 LLMs but includes no confidence intervals, error bars, or ± notation. Only point estimates are presented.",
    151           "source": "opus"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": true,
    156           "justification": "One-sided paired hypothesis testing at the 0.05 significance level is used throughout (Section 5.1). Green-highlighted values in Tables 3 and 4 indicate statistically significant improvements over vanilla.",
    157           "source": "opus"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Fidelity and resistance scores are reported as absolute values alongside the vanilla baseline (e.g., MPA resistance 0.921 vs. vanilla 0.923 in Table 3), providing full context for the magnitude of differences.",
    163           "source": "opus"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The paper uses 10 LLMs, 5 benchmarks, and 20 strategies. They explain model selection (14 candidates filtered to 10 via contamination checks) but provide no power analysis or justification that 10 LLMs is sufficient for their statistical claims.",
    169           "source": "opus"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "Table 3 states 'Each value represents the average of 10 scores obtained using different LLMs' but no standard deviation, interquartile range, or other spread measure is reported.",
    175           "source": "opus"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "The 'vanilla' case (no benchmark update, fidelity=1.000) serves as the baseline throughout. All 20 mitigation strategies are compared against it in Tables 3 and 4.",
    183           "source": "opus"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "The 20 mitigation strategies are from contemporary work (Zhu et al. 2023b, 2024a/b; Ying et al. 2024; Wang et al. 2021). The vanilla baseline is the natural comparison point. The paper claims to 'comprehensively cover all existing BDC mitigation strategies proposed to date' (Section 4.3).",
    189           "source": "opus"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "The study compares 11 single strategies and their combinations (e.g., MPA = S2+S3+S4+S9+S10+S11 vs. individual components S2, S3, S4, etc.), showing how combining strategies affects fidelity and resistance. Two contamination recipes (mild and intensive) also serve as controlled variable manipulation.",
    195           "source": "opus"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Two complementary metrics are proposed and used: fidelity (matching between clean evaluations on original vs. updated benchmark) and contamination resistance (matching between clean and contaminated evaluations on updated benchmark). Section 3 defines both formally.",
    201           "source": "opus"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": false,
    206           "justification": "All evaluation is automated (evaluation vectors, normalized Hamming distance). The paper notes the need for manual validation of low-fidelity strategies (Section 5.3, Tables 5–6) but does not actually perform systematic human evaluation.",
    207           "source": "opus"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "Table 8 specifies standard splits: Arc-C test (1172), MMLU test (50 per subset), TruthfulQA validation (817), GSM8K test (1319), RepliQA (1000). These are held out from contamination tuning.",
    213           "source": "opus"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Results are broken down per benchmark (5 benchmarks), per strategy (20 strategies), and per contamination type (mild vs. intensive) in Tables 3 and 4. Figure 4 aggregates at the benchmark level.",
    219           "source": "opus"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Section 5.3 provides qualitative examples of strategy failures: Table 5 shows Analysis Extension dramatically increasing problem complexity, Table 6 shows MPA introducing incorrect constraints. Appendix C.4.2 shows LLM-generated wrong answers.",
    225           "source": "opus"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "The central finding is a negative result: 'no existing strategy significantly improves resistance over the vanilla case across all benchmarks' and 'none achieves strong fidelity and contamination resistance simultaneously' (Section 5.1, Section 6).",
    231           "source": "opus"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "Table 7 lists specific model versions: Llama-3.2-3B-Instruct, Yi-1.5-6B-Chat, vicuna-7b-v1.5, Llama-3.1-8B-Instruct, Falcon3-10B-Instruct, Qwen2.5-14B-Instruct, Phi-3-medium-128k-instruct, DeepSeek-V2-Lite-Chat, Qwen2.5-32B-Instruct, Yi-1.5-34B-Chat. GPT-4o-2024-08-06 is specified for mitigation strategy generation.",
    239           "source": "opus"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": false,
    244           "justification": "The evaluation template ('Question:{input}\\n Answer:') and full 5-shot GSM8K prompt are provided in Appendix C.5. However, the GPT-4o prompts used to generate the 20 mitigated benchmarks—a key part of the methodology—are not provided. Only natural-language descriptions and output examples are given.",
    245           "source": "opus"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Table 12 reports fine-tuning details: AdamW optimizer, batch sizes 2/3/4, learning rates 1e-5/3e-5, linear LR schedule, weight decay 0, warmup ratio 5%, epochs 1/3. GPT-4o temperature 0.7 is stated for mitigation. Evaluation settings (zero-shot, 5-shot, max tokens) are in Appendix C.5.",
    251           "source": "opus"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "No agentic scaffolding is used. The paper uses standard LLM fine-tuning and evaluation without agent-based workflows.",
    257           "source": "opus"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "The full pipeline is documented: 14 candidate LLMs filtered to 10 via 3 contamination detection methods (Tables 9–11), benchmark selection with rationale (Section 4.2), contamination recipes with validation checks (Section 4.4, Tables 13–15), and evaluation procedures (Section 4.5).",
    263           "source": "opus"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "A code repository is provided but the paper does not explicitly state that raw evaluation vectors, contaminated model weights, or per-question results are available for independent verification.",
    271           "source": "opus"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Data collection is described in detail: LLM selection via 3 detection methods on 14 candidates (Section 4.2, Tables 9–11), benchmark selection with rationale and splits (Section 4.2, Table 8), contamination procedure (Section 4.4, Table 12), and evaluation procedure (Section 4.5).",
    277           "source": "opus"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants. Data sources are standard public benchmarks.",
    283           "source": "opus"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "Figure 3 provides an overview of the full pipeline. Each step is documented: LLM-benchmark pair selection with contamination checks, mitigation strategy application, model contamination with two recipes and three validation checks (accuracy inflation, retained correctness, perplexity), evaluation vector computation, and metric derivation.",
    289           "source": "opus"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "Training data cutoff dates are not stated for any of the 10 LLMs. The paper instead relies on contamination detection methods (Min-K% Prob, Sharded Rank Comparison Test, TS-Guessing) to verify non-contamination rather than using temporal cutoffs.",
    297           "source": "opus"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": true,
    302           "justification": "Train/test overlap is the central topic of the paper. Three BDC detection methods are applied to all 14 candidate LLMs (Section 4.2, Tables 9–11). Only models passing all three detection methods on all benchmarks are retained.",
    303           "source": "opus"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": true,
    308           "justification": "The entire paper addresses benchmark contamination. They verify non-contamination before experiments (Section 4.2), deliberately introduce contamination (Section 4.4), validate contamination effectiveness (Tables 13–15), and measure mitigation strategy resistance to contamination.",
    309           "source": "opus"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants in this study. All experiments involve LLMs evaluated on benchmarks.",
    317           "source": "opus"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants.",
    323           "source": "opus"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants.",
    329           "source": "opus"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants.",
    335           "source": "opus"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants.",
    341           "source": "opus"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants.",
    347           "source": "opus"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants.",
    353           "source": "opus"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "No inference costs, API costs, or per-example compute costs are reported despite the study requiring extensive LLM evaluations across 10 models × 5 benchmarks × 20 strategies × 2 contamination recipes, plus GPT-4o calls for generating mitigated benchmarks.",
    361           "source": "opus"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "Table 12 mentions '9x NVIDIA L40S' GPUs but does not quantify total GPU hours, training time, or overall computational budget for the full experimental campaign.",
    367           "source": "opus"
    368         }
    369       },
    370       "experimental_rigor": {
    371         "seed_sensitivity_reported": {
    372           "applies": true,
    373           "answer": false,
    374           "justification": "No seed sensitivity analysis is reported. Fine-tuning involves stochastic optimization, but no results across multiple random seeds are provided. Variation comes only from different LLMs.",
    375           "source": "opus"
    376         },
    377         "number_of_runs_stated": {
    378           "applies": true,
    379           "answer": false,
    380           "justification": "The number of experimental runs per LLM-benchmark-strategy combination is not stated. Each value is averaged across 10 different LLMs, but whether each individual experiment was run once or multiple times is not specified.",
    381           "source": "opus"
    382         },
    383         "hyperparameter_search_budget": {
    384           "applies": true,
    385           "answer": false,
    386           "justification": "Fixed hyperparameters are reported in Table 12 (e.g., learning rate 1e-5 or 3e-5 depending on benchmark type), but no hyperparameter search is described. The rationale for choosing these specific values is not provided.",
    387           "source": "opus"
    388         },
    389         "best_config_selection_justified": {
    390           "applies": true,
    391           "answer": false,
    392           "justification": "Different learning rates are used for multiple-choice (1e-5) vs. open-ended (3e-5) benchmarks without justification for how these values were selected. No validation-based selection process is described.",
    393           "source": "opus"
    394         },
    395         "multiple_comparison_correction": {
    396           "applies": true,
    397           "answer": false,
    398           "justification": "One-sided paired hypothesis tests are performed for each of 20 strategies across 5 benchmarks and 2 contamination types, yielding up to 200 comparisons. No multiple comparison correction (Bonferroni, Holm, etc.) is mentioned.",
    399           "source": "opus"
    400         },
    401         "self_comparison_bias_addressed": {
    402           "applies": true,
    403           "answer": false,
    404           "justification": "The paper introduces two novel combination strategies (MPA-Ques+Trans-CN and MPA-Choice+Trans-CN) alongside 18 existing strategies but does not acknowledge potential bias in evaluating their own proposed combinations within the same framework.",
    405           "source": "opus"
    406         },
    407         "compute_budget_vs_performance": {
    408           "applies": true,
    409           "answer": false,
    410           "justification": "Different mitigation strategies have vastly different computational costs (e.g., adding a typo vs. translation via GPT-4o vs. generating entirely new questions), but performance is never analyzed as a function of compute cost.",
    411           "source": "opus"
    412         },
    413         "benchmark_construct_validity": {
    414           "applies": true,
    415           "answer": true,
    416           "justification": "The fidelity metric directly addresses construct validity: it measures whether the updated benchmark still evaluates the same model capability as the original. Section 3 and Figure 2(b) explicitly argue that matching aggregate accuracy is insufficient because the evaluation objective may have shifted. Tables 5–6 provide concrete examples.",
    417           "source": "opus"
    418         },
    419         "scaffold_confound_addressed": {
    420           "applies": false,
    421           "answer": false,
    422           "justification": "No scaffolding is involved. The paper evaluates LLMs directly on benchmarks without agent scaffolding.",
    423           "source": "opus"
    424         }
    425       },
    426       "data_leakage": {
    427         "temporal_leakage_addressed": {
    428           "applies": true,
    429           "answer": true,
    430           "justification": "RepliQA is explicitly included because 'Its recent release (December 9, 2024) and non-factual nature ensure that none of the LLMs in our study have been contaminated by this benchmark' (Section 4.2). All other model-benchmark pairs are verified via three detection methods.",
    431           "source": "opus"
    432         },
    433         "feature_leakage_addressed": {
    434           "applies": true,
    435           "answer": false,
    436           "justification": "The paper does not discuss whether the evaluation format (e.g., multiple-choice probability-based selection) or prompt structure could leak answer information, or whether mitigation strategies inadvertently introduce feature leakage.",
    437           "source": "opus"
    438         },
    439         "non_independence_addressed": {
    440           "applies": true,
    441           "answer": false,
    442           "justification": "The paper does not discuss potential non-independence between training data and benchmark examples beyond the contamination detection step. Structural similarities or near-duplicates between benchmark questions and training corpora are not analyzed.",
    443           "source": "opus"
    444         },
    445         "leakage_detection_method": {
    446           "applies": true,
    447           "answer": true,
    448           "justification": "Three concrete detection methods are applied: Min-K% Prob (token probability-based, Table 11), Sharded Rank Comparison Test (order-based, Table 9), and TS-Guessing (generation-based, Table 10). These are applied to all 14 candidate LLMs across 4 benchmarks.",
    449           "source": "opus"
    450         }
    451       }
    452     }
    453   },
    454   "claims": [
    455     {
    456       "claim": "Existing BDC assessment methods (accuracy drop and accuracy matching) are insufficient and can produce misleading conclusions because they focus on aggregate accuracy rather than question-level evaluation.",
    457       "evidence": "Fig. 2 shows that even when mitigated accuracy matches clean accuracy exactly, the per-question correct/incorrect pattern can differ significantly, meaning the updated benchmark evaluates different capabilities.",
    458       "supported": "strong"
    459     },
    460     {
    461       "claim": "No existing BDC mitigation strategy significantly improves contamination resistance over the vanilla (no update) case across all five benchmarks.",
    462       "evidence": "Table 3, using one-sided paired hypothesis testing at the 0.05 significance level across 10 LLMs, shows that while some strategies achieve significant gains on subsets of benchmarks, no single strategy achieves significance across all five.",
    463       "supported": "strong"
    464     },
    465     {
    466       "claim": "No existing strategy effectively balances high fidelity and high contamination resistance simultaneously.",
    467       "evidence": "Fig. 4 shows strategies cluster in either the lower-right (high fidelity, low-to-moderate resistance) or upper-left (high resistance, low fidelity) with no strategy achieving the upper-right ideal region.",
    468       "supported": "strong"
    469     },
    470     {
    471       "claim": "Semantic-altering strategies achieve significantly higher contamination resistance than vanilla but at the cost of approximately 0.15 lower fidelity on average.",
    472       "evidence": "Table 4 shows Remember-Understand and Application extensions reach resistance ~0.97–0.98 but fidelity ~0.65–0.77, compared to semantic-preserving strategies with fidelity >0.85.",
    473       "supported": "strong"
    474     },
    475     {
    476       "claim": "The controlled fine-tuning contamination process effectively causes benchmark memorization while preserving general model capabilities.",
    477       "evidence": "Tables 13–15 show substantial accuracy inflation (e.g., 40.4% for Yi-1.5-6B on Arc-C), retained correctness >90% in most cases, and stable perplexity on the held-out Alpaca dataset after contamination.",
    478       "supported": "strong"
    479     },
    480     {
    481       "claim": "Translation-based strategies achieve relatively higher resistance on open-ended benchmarks (RepliQA) without sacrificing fidelity as severely.",
    482       "evidence": "Table 3 shows MPA-Ques+Trans-CN achieves 0.965/0.964 and Translation (Chinese) achieves 0.966/0.959 resistance on RepliQA under mild/intensive contamination, compared to vanilla's 0.709/0.597.",
    483       "supported": "moderate"
    484     }
    485   ],
    486   "methodology_tags": [
    487     "benchmark-eval"
    488   ],
    489   "key_findings": "No existing benchmark data contamination (BDC) mitigation strategy significantly outperforms the unmodified baseline across all tested benchmarks when assessed using question-level fidelity and resistance metrics. Semantic-altering strategies (e.g., analysis extension) achieve high contamination resistance (~0.97) but sacrifice approximately 0.15 fidelity, meaning they evaluate different model capabilities than the original benchmark. The paper's proposed question-level metrics reveal that the dominant existing evaluation method — aggregate accuracy matching — is misleading: per-question results can diverge substantially even when aggregate accuracy matches. These findings indicate that the BDC mitigation problem remains fundamentally unsolved and that prior claims of successful mitigation in the literature were based on inadequate evaluation methodology.",
    490   "red_flags": [
    491     {
    492       "flag": "No variance reported",
    493       "detail": "Table 3 reports averages across 10 LLMs without standard deviations, error bars, or per-model breakdown, making it impossible to assess cross-model variability or whether results are driven by outliers."
    494     },
    495     {
    496       "flag": "No limitations section",
    497       "detail": "The paper has no dedicated limitations or threats-to-validity section. Critical questions about whether fine-tuning-based contamination generalizes to real pre-training contamination are not addressed."
    498     },
    499     {
    500       "flag": "Contamination mechanism validity gap",
    501       "detail": "Real-world BDC occurs during pre-training on web-scraped data; the paper simulates it via targeted fine-tuning, which may produce qualitatively different memorization behavior and limit external validity of conclusions."
    502     },
    503     {
    504       "flag": "No funding disclosure",
    505       "detail": "No acknowledgments or funding disclosure section is present anywhere in the paper."
    506     },
    507     {
    508       "flag": "No environment specification",
    509       "detail": "Despite releasing code, no requirements.txt, Dockerfile, or Python version specification is provided, limiting reproducibility."
    510     }
    511   ],
    512   "cited_papers": [
    513     {
    514       "title": "Detecting pretraining data from large language models (Min-K% Prob, Shi et al. 2023)",
    515       "relevance": "Core BDC detection method used to verify that LLMs are uncontaminated before controlled contamination is introduced"
    516     },
    517     {
    518       "title": "Proving test set contamination in black box language models (Sharded Rank Comparison, Oren et al. 2023)",
    519       "relevance": "Second BDC detection method used in the three-method pipeline for verifying that LLMs are uncontaminated"
    520     },
    521     {
    522       "title": "Investigating data contamination in modern benchmarks for large language models (TS-Guessing, Deng et al. 2023)",
    523       "relevance": "Third BDC detection method used to verify clean LLM-benchmark pairs"
    524     },
    525     {
    526       "title": "Clean-eval: Clean evaluation on contaminated large language models (Zhu et al. 2023b)",
    527       "relevance": "One of the 20 BDC mitigation strategies evaluated; combines syntactic modification, synonym replacement, and back-translation"
    528     },
    529     {
    530       "title": "Dynamic evaluation of large language models by meta probing agents / MPA (Zhu et al. 2024a)",
    531       "relevance": "Key combined mitigation strategy evaluated; one of the strongest-performing strategies in terms of resistance"
    532     },
    533     {
    534       "title": "Inference-time decontamination: Reusing leaked benchmarks for large language model evaluation / ITD (Zhu et al. 2024b)",
    535       "relevance": "Another key combined mitigation strategy evaluated across all 5 benchmarks"
    536     },
    537     {
    538       "title": "Automating dataset updates towards reliable and timely evaluation of large language models (Ying et al. 2024)",
    539       "relevance": "Source of the 4 semantic-altering strategies (mimicking, remember-understand, application, analysis extension) evaluated in Section 5.2"
    540     },
    541     {
    542       "title": "Don't make your LLM an evaluation benchmark cheater (Zhou et al. 2023)",
    543       "relevance": "Early influential work on BDC concerns that motivates this paper's research question"
    544     }
    545   ],
    546   "engagement_factors": {
    547     "practical_relevance": {
    548       "score": 2,
    549       "justification": "The framework and metrics could be used by benchmark developers to assess mitigation strategies, but they require significant compute infrastructure and are not a drop-in tool."
    550     },
    551     "surprise_contrarian": {
    552       "score": 3,
    553       "justification": "The 'Emperor's New Clothes' framing directly challenges the widely held belief that benchmark update strategies effectively mitigate contamination—finding that none significantly outperforms doing nothing across all benchmarks."
    554     },
    555     "fear_safety": {
    556       "score": 1,
    557       "justification": "Raises concerns about unreliable LLM evaluation due to contamination but does not demonstrate a novel attack or immediate safety risk."
    558     },
    559     "drama_conflict": {
    560       "score": 2,
    561       "justification": "The provocative title and finding that current mitigation strategies are essentially ineffective creates a 'benchmarks are broken' narrative, though it targets methodology rather than specific companies."
    562     },
    563     "demo_ability": {
    564       "score": 1,
    565       "justification": "Code repository is provided but running the full pipeline requires fine-tuning 10 LLMs across multiple configurations on multiple GPUs—not a quick demo."
    566     },
    567     "brand_recognition": {
    568       "score": 1,
    569       "justification": "From University of Illinois Urbana-Champaign, a well-known CS department but not a famous AI lab like OpenAI, Google DeepMind, or Anthropic."
    570     }
    571   },
    572   "hn_data": {
    573     "threads": [
    574       {
    575         "hn_id": "45489599",
    576         "title": "Tutorials for Sandia's Lammps Simulation Package",
    577         "points": 8,
    578         "comments": 1,
    579         "url": "https://news.ycombinator.com/item?id=45489599"
    580       },
    581       {
    582         "hn_id": "43454946",
    583         "title": "Exploring Hidden Reasoning Process of Large Language Models by Misleading Them",
    584         "points": 8,
    585         "comments": 0,
    586         "url": "https://news.ycombinator.com/item?id=43454946"
    587       },
    588       {
    589         "hn_id": "47533914",
    590         "title": "An Efficient Heterogeneous Co-Design for Fine-Tuning on a Single GPU",
    591         "points": 3,
    592         "comments": 0,
    593         "url": "https://news.ycombinator.com/item?id=47533914"
    594       },
    595       {
    596         "hn_id": "45015577",
    597         "title": "AetherCode: Evaluating LLMs' Ability to Win in Premier Programming Competitions",
    598         "points": 2,
    599         "comments": 1,
    600         "url": "https://news.ycombinator.com/item?id=45015577"
    601       },
    602       {
    603         "hn_id": "26657061",
    604         "title": "Intel HEXL: Accelerating Homomorphic Encryption with Intel AVX512-IFMA52",
    605         "points": 2,
    606         "comments": 0,
    607         "url": "https://news.ycombinator.com/item?id=26657061"
    608       },
    609       {
    610         "hn_id": "45010576",
    611         "title": "AetherCode: Evaluating LLMs' Ability to Win in Premier Programming Competitions",
    612         "points": 1,
    613         "comments": 0,
    614         "url": "https://news.ycombinator.com/item?id=45010576"
    615       }
    616     ],
    617     "top_points": 8,
    618     "total_points": 24,
    619     "total_comments": 2
    620   }
    621 }

Impressum · Datenschutz