ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (23694B)


      1 {
      2   "paper": {
      3     "title": "A Survey on Data Contamination for Large Language Models",
      4     "authors": [
      5       "Yuxing Cheng",
      6       "Yi Chang",
      7       "Yuan Wu"
      8     ],
      9     "year": 2025,
     10     "venue": "arXiv",
     11     "arxiv_id": "2502.14425",
     12     "doi": "10.48550/arXiv.2502.14425"
     13   },
     14   "scan_version": 2,
     15   "active_modules": ["survey_methodology"],
     16   "methodology_tags": ["meta-analysis"],
     17   "key_findings": "This survey categorizes data contamination into phase-based (pre-training, fine-tuning, post-deployment, multi-modal) and benchmark-based (text, text-label, augmentation-based, benchmark-level) types. It reviews contamination-free evaluation strategies (data updating, data rewriting, prevention-based methods, dynamic evaluation, LLM-as-a-judge) and detection paradigms (white-box, gray-box, black-box). Five key characteristics of contamination are identified: inevitability at scale, scaling law effects (larger models memorize more), cross-stage dynamics, task-specificity, and cross-lingual transfer effects.",
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "No code repository or analysis scripts are released. The survey references external tools (Contamination Detector, Overlapy, etc. in Appendix C) but does not release its own artifacts."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No dataset of surveyed papers, extracted data tables, or supplementary materials are released. The survey lists external benchmarks (WikiMIA, BookMIA, MIMIR, PatentMIA, StackMIAsub) in Section 3.4 but does not release its own collected data."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No environment specifications provided. A survey could release a reproducible analysis environment but this one does not."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No reproduction instructions for the survey's paper selection or analysis process are provided."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": false,
     44         "answer": false,
     45         "justification": "Pure survey paper with no experiments or statistical analysis of its own."
     46       },
     47       "significance_tests": {
     48         "applies": false,
     49         "answer": false,
     50         "justification": "Pure survey paper with no experiments or comparative statistical claims of its own."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": false,
     54         "answer": false,
     55         "justification": "Pure survey paper with no experiments producing effect sizes."
     56       },
     57       "sample_size_justified": {
     58         "applies": false,
     59         "answer": false,
     60         "justification": "Pure survey paper; no experimental sample size to justify."
     61       },
     62       "variance_reported": {
     63         "applies": false,
     64         "answer": false,
     65         "justification": "Pure survey paper with no experimental runs producing variance."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Table 1 compares this survey against five prior surveys (Ravaut et al., Xu et al., Fu et al., Chen et al., Deng et al.) across coverage dimensions (definition, detection, mitigation). The 'Difference with previous survey' subsection in Section 1 also positions against prior work."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The compared prior surveys are all recent (2024-2025): Ravaut et al. 2024, Xu et al. 2024, Fu et al. 2024, Chen et al. 2025, Deng et al. 2024. These represent the current state of contamination survey literature."
     78       },
     79       "ablation_study": {
     80         "applies": false,
     81         "answer": false,
     82         "justification": "Survey paper with no system or components to ablate."
     83       },
     84       "multiple_metrics": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "Survey paper with no experiments producing metrics."
     88       },
     89       "human_evaluation": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "Survey paper with no system outputs to evaluate."
     93       },
     94       "held_out_test_set": {
     95         "applies": false,
     96         "answer": false,
     97         "justification": "Survey paper with no experiments requiring test sets."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "The survey provides detailed categorical breakdowns: contamination types (phase-based vs benchmark-based, Section 2), mitigation strategies (data updating, rewriting, prevention in Section 3.1), evaluation approaches (dynamic, LLM-as-a-judge in Sections 3.2-3.3), and detection paradigms (white-box, gray-box, black-box in Section 4). Figure 1 provides a comprehensive structural overview."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Section 5.2 discusses limitations of detection methods: 'Existing black-box contamination detection approaches rely on heuristic rules that Fu et al. (2024) showed fail under certain conditions.' Section 3.3 notes LLM-as-a-judge has preference contamination issues (Li et al. 2025). Section 5.1 notes unlearning may be 'fundamentally unachievable' (Shumailov et al. 2024)."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The survey reports negative findings from the literature: detection methods fail under certain conditions (Section 5.2), safety mechanisms complicate detection, augmentation-based contamination evades traditional methods (Dekoninck et al. 2024b), and MIA methods have 'underwhelming performance on LLMs' (Duan et al. 2024 in Section 4.2)."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The abstract claims to (1) examine definitions and impacts of data contamination (covered in Section 2), (2) review contamination-free evaluation methods (covered in Section 3), and (3) categorize detection methods into white-box, gray-box, and black-box (covered in Section 4). All three are substantively addressed."
    120       },
    121       "causal_claims_justified": {
    122         "applies": false,
    123         "answer": false,
    124         "justification": "The survey reports on others' causal findings but does not make original causal claims requiring its own study design. Statements like 'data contamination artificially inflates model performance' are summaries of cited work, not independent causal claims."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Section 7 (Limitations) explicitly bounds the scope: 'our focus is primarily on data contamination within the context of LLMs, and we may not have fully incorporated previous research on data contamination in other areas of machine learning.' The title itself scopes to LLMs."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": false,
    133         "answer": false,
    134         "justification": "Pure survey/taxonomy paper with no original empirical results requiring alternative explanations."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": false,
    138         "answer": false,
    139         "justification": "Survey paper with no measurements of its own; no proxy-outcome gap to address."
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": false,
    145         "answer": false,
    146         "justification": "Survey paper that does not run any models."
    147       },
    148       "prompts_provided": {
    149         "applies": false,
    150         "answer": false,
    151         "justification": "Survey paper that does not use prompting."
    152       },
    153       "hyperparameters_reported": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "Survey paper with no experiments requiring hyperparameters."
    157       },
    158       "scaffolding_described": {
    159         "applies": false,
    160         "answer": false,
    161         "justification": "Survey paper with no agentic scaffolding."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": false,
    166         "justification": "The survey does not describe its paper selection methodology. No search queries, databases searched, inclusion/exclusion criteria, or paper counts at each filtering stage are provided. It is unclear how the surveyed papers were identified and selected."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section 7 is a dedicated 'Limitations' section with substantive discussion of the survey's coverage gaps."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 7 identifies specific limitations: (1) new contamination mechanisms may not be captured, (2) focus on LLMs may miss other ML contamination research, (3) only some static benchmarks are listed, (4) related areas like MIA, machine unlearning, and LLM memorization are not fully covered. These are specific to this survey's scope decisions."
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "Section 7 explicitly states what is not covered: 'our focus is primarily on data contamination within the context of LLMs,' 'we may not cover all related areas such as membership inference attacks (MIA), machine unlearning, and LLM memorization.' The paper is clear about its boundaries."
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "No raw data (e.g., list of all surveyed papers, extracted metadata, classification decisions) is made available for independent verification."
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": false,
    195         "justification": "The survey does not describe how papers were collected. No search databases, queries, time periods, or inclusion/exclusion criteria are specified. It is unclear how the corpus of reviewed work was assembled."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": false,
    199         "answer": false,
    200         "justification": "No human participants in this survey paper."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": false,
    205         "justification": "No documentation of the pipeline from initial paper discovery to final inclusion in the survey. The reader cannot determine how comprehensive or systematic the coverage is."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No funding source or acknowledgments section is present in the paper text."
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Author affiliations are clearly listed: Jilin University (College of Software, School of Artificial Intelligence, Engineering Research Center of Knowledge-Driven Human-Machine Intelligence, International Center of Future Science)."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No funding is disclosed, so independence cannot be assessed. The absence of a funding statement means the reader cannot determine whether any funder had a stake in the survey's conclusions."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No competing interests or financial interests statement is present in the paper."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "Survey paper that does not evaluate any pre-trained model's capability on benchmarks."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "Survey paper that does not evaluate any pre-trained model on benchmarks."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "Survey paper that does not evaluate any pre-trained model on benchmarks."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this survey."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this survey."
    257       },
    258       "demographics_reported": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this survey."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this survey."
    267       },
    268       "randomization_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this survey."
    272       },
    273       "blinding_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this survey."
    277       },
    278       "attrition_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in this survey."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "Survey paper with no method to cost."
    289       },
    290       "compute_budget_stated": {
    291         "applies": false,
    292         "answer": false,
    293         "justification": "Survey paper with no computation performed."
    294       }
    295     },
    296     "survey_methodology": {
    297       "prisma_or_structured_protocol": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "No PRISMA flow diagram, no protocol registration, no structured search strategy with reproducible queries. The survey organizes by topic but does not describe a systematic paper collection methodology. The reader cannot determine how papers were found or whether the coverage is comprehensive."
    301       },
    302       "quality_assessment_of_sources": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "The survey treats all cited papers equally regardless of their methodological quality. No quality scoring rubric, risk-of-bias assessment, or structured evaluation of included studies is performed. For example, papers with controlled contamination experiments are cited alongside informal observations with no differentiation."
    306       },
    307       "publication_bias_discussed": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "No discussion of publication bias. The survey does not consider whether published contamination research skews toward certain findings (e.g., positive detection results) or whether negative detection results are underrepresented."
    311       }
    312     }
    313   },
    314   "claims": [
    315     {
    316       "claim": "Data contamination is inevitable as LLM training datasets expand, since web-scraped data inadvertently overlaps with evaluation benchmarks.",
    317       "evidence": "Section 2.2.4 cites Villalobos et al. (2024) on scaling data requirements and Deng et al. (2023) on web crawl overlap with benchmarks.",
    318       "supported": "moderate"
    319     },
    320     {
    321       "claim": "Larger models exhibit stronger contamination effects than smaller ones due to increased memorization capacity.",
    322       "evidence": "Section 2.2.4 cites Kocyigit et al. (2025) and Riddell et al. (2024) on scaling laws of memorization.",
    323       "supported": "moderate"
    324     },
    325     {
    326       "claim": "Cross-lingual contamination can inflate English benchmark performance without direct exposure, while evading existing detection methods.",
    327       "evidence": "Section 2.2.4 cites Yao et al. (2024) on cross-lingual benchmark inflation and Zhang et al. (2024a) on evasion of detection.",
    328       "supported": "moderate"
    329     },
    330     {
    331       "claim": "Existing black-box contamination detection methods rely on heuristic rules that fail under certain conditions, raising concerns about their fundamental reliability.",
    332       "evidence": "Section 5.2 cites Fu et al. (2024) showing failure of black-box methods. Section 4.3 discusses assumptions underlying each method (detailed in Appendix B).",
    333       "supported": "moderate"
    334     },
    335     {
    336       "claim": "LLM-as-a-judge evaluation may suffer from preference contamination, where models trained on synthetic data from architecturally similar foundations receive unfair preference.",
    337       "evidence": "Section 3.3 cites Li et al. (2025) identifying systematic bias in LLM-as-a-judge evaluations.",
    338       "supported": "weak"
    339     },
    340     {
    341       "claim": "This survey provides comprehensive coverage of data contamination definition, detection, and mitigation, unlike prior surveys which cover only partial aspects.",
    342       "evidence": "Table 1 compares against five prior surveys, claiming this survey covers all three areas comprehensively while others have gaps.",
    343       "supported": "weak"
    344     }
    345   ],
    346   "red_flags": [
    347     {
    348       "flag": "No systematic search methodology",
    349       "detail": "The survey does not describe how papers were identified, what databases were searched, what search terms were used, or what inclusion/exclusion criteria were applied. The reader cannot assess whether coverage is comprehensive or opportunistic."
    350     },
    351     {
    352       "flag": "No quality assessment of sources",
    353       "detail": "All cited papers are treated equally regardless of methodological rigor. Papers with controlled experiments are mixed with informal observations and position statements without any quality differentiation, potentially laundering weak findings through the survey."
    354     },
    355     {
    356       "flag": "Comprehensiveness claim weakly supported",
    357       "detail": "Table 1 claims 'Comprehensive' coverage of definitions versus partial coverage by competitors, but the criteria for this comparison are not defined. The self-assessment appears subjective — the authors grade themselves highest without explicit evaluation criteria."
    358     }
    359   ],
    360   "cited_papers": [
    361     {
    362       "title": "Leak, cheat, repeat: Data contamination and evaluation malpractices in closed-source LLMs",
    363       "authors": ["Simone Balloccu", "Patrícia Schmidtová", "Mateusz Lango", "Ondrej Dusek"],
    364       "year": 2024,
    365       "relevance": "Studies data contamination and evaluation malpractices in closed-source LLMs, directly relevant to benchmark evaluation integrity."
    366     },
    367     {
    368       "title": "Benchmarking benchmark leakage in large language models",
    369       "authors": ["Ruijie Xu", "Zengzhi Wang", "Run-Ze Fan", "Pengfei Liu"],
    370       "year": 2024,
    371       "arxiv_id": "2404.18824",
    372       "relevance": "Analyzes 31 LLMs for benchmark leakage in mathematical reasoning, quantifying widespread data contamination."
    373     },
    374     {
    375       "title": "Quantifying memorization across neural language models",
    376       "authors": ["Nicholas Carlini", "Daphne Ippolito", "Matthew Jagielski", "Katherine Lee", "Florian Tramer", "Chiyuan Zhang"],
    377       "year": 2022,
    378       "arxiv_id": "2202.07646",
    379       "relevance": "Foundational work on measuring LLM memorization of training data, key to understanding contamination mechanisms."
    380     },
    381     {
    382       "title": "NLP evaluation in trouble: On the need to measure LLM data contamination for each benchmark",
    383       "authors": ["Oscar Sainz", "Jon Campos", "Iker García-Ferrero", "Julen Etxaniz", "Oier Lopez de Lacalle", "Eneko Agirre"],
    384       "year": 2023,
    385       "relevance": "Argues that contamination evidence is fragmented and prevalence is underestimated, calling for per-benchmark contamination measurement."
    386     },
    387     {
    388       "title": "Detecting pretraining data from large language models",
    389       "authors": ["Weijia Shi", "Anirudh Ajith", "Mengzhou Xia", "Yangsibo Huang", "Daogao Liu", "Terra Blevins", "Danqi Chen", "Luke Zettlemoyer"],
    390       "year": 2024,
    391       "arxiv_id": "2310.16789",
    392       "relevance": "Proposes Min-K% method and WikiMIA/BookMIA benchmarks for pretraining data detection, a key gray-box contamination detection technique."
    393     },
    394     {
    395       "title": "Quantifying contamination in evaluating code generation capabilities of language models",
    396       "authors": ["Martin Riddell", "Ansong Ni", "Arman Cohan"],
    397       "year": 2024,
    398       "arxiv_id": "2403.04811",
    399       "relevance": "Quantifies contamination between code generation benchmarks and pretraining corpora, directly relevant to LLM code evaluation reliability."
    400     },
    401     {
    402       "title": "LiveBench: A challenging, contamination-free LLM benchmark",
    403       "authors": ["Colin White", "Samuel Dooley", "Manley Roberts"],
    404       "year": 2024,
    405       "arxiv_id": "2406.19314",
    406       "relevance": "Introduces a dynamically updated contamination-free benchmark with automated scoring, exemplifying data updating-based mitigation."
    407     },
    408     {
    409       "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code",
    410       "authors": ["Naman Jain", "King Han", "Alex Gu"],
    411       "year": 2024,
    412       "arxiv_id": "2403.07974",
    413       "relevance": "Dynamic code evaluation benchmark that addresses contamination through continuous updating, relevant to contamination-free evaluation of code LLMs."
    414     },
    415     {
    416       "title": "Evading data contamination detection for language models is (too) easy",
    417       "authors": ["Jasper Dekoninck", "Mark Niklas Müller", "Maximilian Baader", "Marc Fischer", "Martin Vechev"],
    418       "year": 2024,
    419       "arxiv_id": "2402.02823",
    420       "relevance": "Demonstrates that traditional detection methods fail against augmentation-based contamination, highlighting fundamental challenges in contamination detection."
    421     },
    422     {
    423       "title": "Evaluation data contamination in LLMs: how do we measure it and (when) does it matter?",
    424       "authors": ["Aaditya K Singh", "Muhammed Yusuf Kocyigit", "Andrew Poulton"],
    425       "year": 2024,
    426       "arxiv_id": "2411.03923",
    427       "relevance": "Proposes ConTAM contamination evaluation protocol and explores when contamination actually affects LLM evaluation results."
    428     },
    429     {
    430       "title": "Unveiling the spectrum of data contamination in language models: A survey from detection to remediation",
    431       "authors": ["Chunyuan Deng", "Yilun Zhao", "Yuzhao Heng", "Yitong Li", "Jiannan Cao", "Xiangru Tang", "Arman Cohan"],
    432       "year": 2024,
    433       "relevance": "Prior comprehensive contamination survey covering detection through remediation, one of the key comparison points for this work."
    434     },
    435     {
    436       "title": "Overestimation in LLM evaluation: A controlled large-scale study on data contamination's impact on machine translation",
    437       "authors": ["Muhammed Yusuf Kocyigit", "Eleftheria Briakou", "Daniel Deutsch"],
    438       "year": 2025,
    439       "arxiv_id": "2501.18771",
    440       "relevance": "Large-scale controlled study showing larger models exhibit stronger contamination effects and cross-lingual contamination thresholds, key empirical evidence for contamination scaling laws."
    441     }
    442   ]
    443 }

Impressum · Datenschutz