scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (23458B)
      1 {
      2   "paper": {
      3     "title": "Benchmark Data Contamination of Large Language Models: A Survey",
      4     "authors": ["Cheng Xu", "Shuhao Guan", "Derek Greene", "M-Tahar Kechadi"],
      5     "year": 2024,
      6     "venue": "arXiv preprint",
      7     "arxiv_id": "2406.04244",
      8     "doi": null
      9   },
     10   "checklist": {
     11     "artifacts": {
     12       "code_released": {
     13         "applies": true,
     14         "answer": false,
     15         "justification": "No repository URL, code archive, or data release is mentioned anywhere in the paper. As a survey, it could have released analysis scripts, a curated bibliography, or a structured dataset of reviewed papers."
     16       },
     17       "data_released": {
     18         "applies": true,
     19         "answer": false,
     20         "justification": "No structured dataset of reviewed papers, methods, or results is released. The paper presents tables summarizing methods (Tables 2 and 3), but no downloadable data artifact is provided."
     21       },
     22       "environment_specified": {
     23         "applies": false,
     24         "answer": false,
     25         "justification": "This is a literature survey with no computational experiments, so environment specifications are not applicable."
     26       },
     27       "reproduction_instructions": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "No instructions are provided for reproducing the survey methodology — how papers were found, which databases were searched, what search queries were used, or how the final set of reviewed works was determined."
     31       }
     32     },
     33     "statistical_methodology": {
     34       "confidence_intervals_or_error_bars": {
     35         "applies": false,
     36         "answer": false,
     37         "justification": "This is a narrative literature survey with no original experiments or statistical analysis of its own."
     38       },
     39       "significance_tests": {
     40         "applies": false,
     41         "answer": false,
     42         "justification": "No statistical tests are performed. The paper reviews and summarizes existing work narratively."
     43       },
     44       "effect_sizes_reported": {
     45         "applies": false,
     46         "answer": false,
     47         "justification": "No original quantitative analysis is performed. The paper cites results from other works but does not aggregate or compute effect sizes."
     48       },
     49       "sample_size_justified": {
     50         "applies": false,
     51         "answer": false,
     52         "justification": "No experiments are conducted. This is a qualitative survey paper."
     53       },
     54       "variance_reported": {
     55         "applies": false,
     56         "answer": false,
     57         "justification": "No experiments are conducted, so variance across runs is not applicable."
     58       }
     59     },
     60     "evaluation_design": {
     61       "baselines_included": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "The paper does not compare its survey against prior surveys on benchmark contamination. It mentions that 'there is currently no comprehensive and systematic research that thoroughly discusses and defines this problem' (Section 1) but does not formally compare against any prior survey work."
     65       },
     66       "baselines_contemporary": {
     67         "applies": false,
     68         "answer": false,
     69         "justification": "No baseline comparisons are made, so contemporaneity of baselines is not applicable."
     70       },
     71       "ablation_study": {
     72         "applies": false,
     73         "answer": false,
     74         "justification": "This is a survey paper with no system components to ablate."
     75       },
     76       "multiple_metrics": {
     77         "applies": false,
     78         "answer": false,
     79         "justification": "No evaluation metrics are used for the survey itself. The paper discusses metrics used by others (Table 1) but does not apply evaluation metrics to its own work."
     80       },
     81       "human_evaluation": {
     82         "applies": false,
     83         "answer": false,
     84         "justification": "The paper makes no claims about system outputs that would require human evaluation. It is a literature review."
     85       },
     86       "held_out_test_set": {
     87         "applies": false,
     88         "answer": false,
     89         "justification": "No experiments are conducted, so held-out test sets are not applicable."
     90       },
     91       "per_category_breakdown": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "The paper provides structured breakdowns of methods by category: detection techniques are split into matching-based and comparison-based (Table 2, Section 3), and mitigation strategies into data curation, data refactoring, and benchmark-free evaluation (Table 3, Section 4). Seven NLP task categories susceptible to BDC are also enumerated (Section 2.4)."
     95       },
     96       "failure_cases_discussed": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "The paper discusses limitations and failure modes of each category of methods. For example, it notes that n-gram matching has high false negative rates (Section 3.1), that paraphrasing can bypass matching-based detection (Section 3.1, citing Yang et al. and Dekoninck et al.), and that benchmark-free methods face secondary contamination risks (Section 4.3)."
    100       },
    101       "negative_results_reported": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The paper reports negative findings from the literature, including that contamination does not always correlate with improved model performance (Li et al. [91], Section 3.1), and that 'for classification tasks without task contamination, LLMs show no significant improvement over simple majority baselines' (Section 3.1). Section 5 also frankly states that it is 'impracticable to fully remove the risks associated with contamination.'"
    105       }
    106     },
    107     "claims_and_evidence": {
    108       "abstract_claims_supported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The abstract claims the paper 'reviews the complex challenge of BDC in LLM evaluation and explores alternative assessment methods to mitigate the risks associated with traditional benchmarks.' These claims are directly supported by Sections 3 (detection) and 4 (mitigation), which comprehensively review these topics."
    112       },
    113       "causal_claims_justified": {
    114         "applies": false,
    115         "answer": false,
    116         "justification": "The paper is a literature survey and does not make its own causal claims. It describes causal findings from other papers but does not generate new causal evidence."
    117       },
    118       "generalization_bounded": {
    119         "applies": true,
    120         "answer": false,
    121         "justification": "The paper claims to be a 'comprehensive survey' of BDC but does not describe its search methodology, inclusion criteria, or whether any systematic search was conducted. Without knowing how papers were selected, the comprehensiveness claim is unbounded. The title and framing suggest completeness that is not substantiated by a documented selection process."
    122       },
    123       "alternative_explanations_discussed": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The paper discusses alternative perspectives on contamination. Section 1 notes that 'some studies see this phenomenon as beneficial [12] or do not consider it to be a problem [16].' Section 5 discusses multiple alternative future directions and trade-offs. The paper considers that contamination may not always lead to inflated performance (Section 3.1, citing Li et al. [91])."
    127       }
    128     },
    129     "setup_transparency": {
    130       "model_versions_specified": {
    131         "applies": false,
    132         "answer": false,
    133         "justification": "This is a survey paper that does not run any models. Model versions discussed are those from the reviewed papers."
    134       },
    135       "prompts_provided": {
    136         "applies": false,
    137         "answer": false,
    138         "justification": "No prompting is used in this survey paper."
    139       },
    140       "hyperparameters_reported": {
    141         "applies": false,
    142         "answer": false,
    143         "justification": "No experiments are conducted, so hyperparameters are not applicable."
    144       },
    145       "scaffolding_described": {
    146         "applies": false,
    147         "answer": false,
    148         "justification": "No agentic scaffolding is used in this survey paper."
    149       },
    150       "data_preprocessing_documented": {
    151         "applies": true,
    152         "answer": false,
    153         "justification": "The paper does not describe any systematic literature search methodology. There is no description of databases searched, search queries used, inclusion/exclusion criteria, or how the final set of reviewed papers was selected. The paper appears to be a narrative review rather than a systematic one, but even so, the selection process is undocumented."
    154       }
    155     },
    156     "limitations_and_scope": {
    157       "limitations_section_present": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "There is no dedicated limitations section. Section 5 ('Challenges and Future Directions') discusses challenges of the BDC problem in general but does not discuss limitations of the survey itself (e.g., potential incompleteness, selection bias, methodological limitations of the review)."
    161       },
    162       "threats_to_validity_specific": {
    163         "applies": true,
    164         "answer": false,
    165         "justification": "No threats to validity of the survey methodology are discussed. The paper does not acknowledge potential biases in paper selection, gaps in coverage, or limitations of its narrative review approach."
    166       },
    167       "scope_boundaries_stated": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "The paper does not explicitly state what is out of scope. It does not specify time boundaries for literature coverage, which types of models or benchmarks are included/excluded, or what the survey does not claim to cover. The framing as 'comprehensive' without stated boundaries is a concern."
    171       }
    172     },
    173     "data_integrity": {
    174       "raw_data_available": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "No raw data (e.g., a complete list of papers considered, search results, screening decisions) is made available. The reader cannot verify what was included or excluded."
    178       },
    179       "data_collection_described": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "The paper does not describe how the reviewed papers were collected. There is no mention of databases searched (e.g., Google Scholar, Semantic Scholar, ACL Anthology), search terms, date ranges, or other data collection procedures."
    183       },
    184       "recruitment_methods_described": {
    185         "applies": false,
    186         "answer": false,
    187         "justification": "No human participants are involved. Data source is existing published literature."
    188       },
    189       "data_pipeline_documented": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "No data pipeline is documented. The paper jumps from 'there is currently no comprehensive and systematic research' to presenting categorized methods without explaining how the literature corpus was assembled or organized."
    193       }
    194     },
    195     "conflicts_of_interest": {
    196       "funding_disclosed": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "No funding information is provided anywhere in the paper. There is no acknowledgments section listing grants or sponsors."
    200       },
    201       "affiliations_disclosed": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "All four authors are listed with their affiliation at University College Dublin, Ireland, with email addresses provided. The affiliations are clearly stated on the first page."
    205       },
    206       "funder_independent_of_outcome": {
    207         "applies": false,
    208         "answer": false,
    209         "justification": "No funding source is disclosed, so independence cannot be assessed. Since the paper is an academic survey from a university and does not evaluate any commercial product, this is treated as NA (likely unfunded academic work)."
    210       },
    211       "financial_interests_declared": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No competing interests or financial interests statement is provided anywhere in the paper."
    215       }
    216     },
    217     "contamination": {
    218       "training_cutoff_stated": {
    219         "applies": false,
    220         "answer": false,
    221         "justification": "This is a survey paper that does not evaluate any pre-trained model on a benchmark."
    222       },
    223       "train_test_overlap_discussed": {
    224         "applies": false,
    225         "answer": false,
    226         "justification": "This is a survey paper that does not evaluate any pre-trained model on a benchmark. (Ironically, the paper is about contamination, but it does not evaluate models itself.)"
    227       },
    228       "benchmark_contamination_addressed": {
    229         "applies": false,
    230         "answer": false,
    231         "justification": "No model evaluation is performed in this survey paper."
    232       }
    233     },
    234     "human_studies": {
    235       "pre_registered": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "No human participants are involved in this survey paper."
    239       },
    240       "irb_or_ethics_approval": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants are involved in this survey paper."
    244       },
    245       "demographics_reported": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants are involved in this survey paper."
    249       },
    250       "inclusion_exclusion_criteria": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants are involved in this survey paper."
    254       },
    255       "randomization_described": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants are involved in this survey paper."
    259       },
    260       "blinding_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants are involved in this survey paper."
    264       },
    265       "attrition_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants are involved in this survey paper."
    269       }
    270     },
    271     "cost_and_practicality": {
    272       "inference_cost_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "This is a survey paper with no computational method of its own."
    276       },
    277       "compute_budget_stated": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "This is a survey paper with no computational experiments."
    281       }
    282     }
    283   },
    284   "claims": [
    285     {
    286       "claim": "Benchmark Data Contamination (BDC) poses significant challenges to the reliability and validity of LLM evaluations, undermining trust in their outputs.",
    287       "evidence": "Section 1 cites multiple works [69, 83, 98, 119, 126, 178] supporting this claim. The paper provides a four-level taxonomy of contamination severity (semantic, information, data, label levels) in Section 2.2.",
    288       "supported": "moderate"
    289     },
    290     {
    291       "claim": "BDC detection methods can be categorized into matching-based and comparison-based approaches, each with distinct strengths and limitations.",
    292       "evidence": "Section 3 reviews 13 representative works organized into these two categories, with detailed descriptions of each approach. Table 2 provides a structured summary.",
    293       "supported": "strong"
    294     },
    295     {
    296       "claim": "BDC mitigation strategies can be categorized into data curation, data refactoring, and benchmark-free evaluation approaches.",
    297       "evidence": "Section 4 reviews these three categories with representative works for each. Table 3 provides a structured summary of methods, descriptions, and references.",
    298       "supported": "strong"
    299     },
    300     {
    301       "claim": "It is currently impracticable to fully remove the risks associated with benchmark contamination.",
    302       "evidence": "Section 5 provides two main reasons: the imperative of large-scale pre-training and the ascendancy of AI-generated content. The argument is logical but based on reasoning rather than empirical evidence from the paper itself.",
    303       "supported": "moderate"
    304     },
    305     {
    306       "claim": "LLMs perform better on datasets released before their training data creation date, suggesting the presence of contamination.",
    307       "evidence": "Section 3.1, citing Li and Flanigan [85], who employed four detection methods to provide evidence of this contamination. Also corroborated by Huang et al. [61] showing GPT-4's performance decline on problems released after September 2021 (Section 3.2).",
    308       "supported": "moderate"
    309     },
    310     {
    311       "claim": "Contamination does not always correlate with improved model performance.",
    312       "evidence": "Section 3.1, citing Li et al. [91] who found 'contamination does not always correlate with improved model performance' and that 'for classification tasks without task contamination, LLMs show no significant improvement over simple majority baselines.'",
    313       "supported": "moderate"
    314     }
    315   ],
    316   "methodology_tags": ["meta-analysis"],
    317   "key_findings": "This survey provides a comprehensive taxonomy of benchmark data contamination in LLMs, categorizing the problem into four severity levels (semantic, information, data, and label) and organizing existing research into detection techniques (matching-based and comparison-based) and mitigation strategies (data curation, data refactoring, and benchmark-free evaluation). The paper identifies that while methods like n-gram overlap detection are simple and efficient, they have high false negative rates and can be easily bypassed by paraphrasing. The authors argue that fully eliminating BDC is impracticable due to the necessity of large-scale pre-training and the rise of AI-generated content, and propose five future directions including human evaluation, dynamic systems, benchmark content tags, adversarial evaluation, and comprehensive evaluation systems.",
    318   "red_flags": [
    319     {
    320       "flag": "No systematic search methodology",
    321       "detail": "The paper claims to be a 'comprehensive survey' but provides no description of how papers were selected — no search databases, queries, date ranges, inclusion/exclusion criteria, or PRISMA-style flow diagram. This makes it impossible to assess whether coverage is truly comprehensive or systematically biased toward certain sub-areas."
    322     },
    323     {
    324       "flag": "No quality assessment of reviewed papers",
    325       "detail": "The survey summarizes findings from reviewed papers without any structured quality assessment. Papers with strong experimental designs are presented alongside those with weaker evidence, which obscures differences in evidence quality and can lend weakly supported findings unearned credibility."
    326     },
    327     {
    328       "flag": "Unbounded comprehensiveness claim",
    329       "detail": "The paper claims there is 'currently no comprehensive and systematic research that thoroughly discusses and defines this problem' and positions itself as filling this gap, but without a documented search methodology, readers cannot verify either the claimed gap or the survey's own comprehensiveness."
    330     },
    331     {
    332       "flag": "No limitations discussion for the survey itself",
    333       "detail": "Section 5 discusses challenges of the BDC problem but never reflects on limitations of the survey methodology itself — potential gaps in coverage, language bias (only English-language literature appears to be reviewed), recency bias, or selection bias."
    334     }
    335   ],
    336   "cited_papers": [
    337     {
    338       "title": "Evaluating Large Language Models Trained on Code",
    339       "authors": ["Mark Chen"],
    340       "year": 2021,
    341       "arxiv_id": "2107.03374",
    342       "relevance": "Introduces HumanEval benchmark for code generation, a key contamination-susceptible benchmark in LLM evaluation."
    343     },
    344     {
    345       "title": "Don't Make Your LLM an Evaluation Benchmark Cheater",
    346       "authors": ["Kun Zhou"],
    347       "year": 2023,
    348       "arxiv_id": "2311.01964",
    349       "relevance": "Directly addresses the integrity of LLM evaluation benchmarks and BDC risks."
    350     },
    351     {
    352       "title": "Quantifying Contamination in Evaluating Code Generation Capabilities of Language Models",
    353       "authors": ["Martin Riddell"],
    354       "year": 2024,
    355       "arxiv_id": "2403.04811",
    356       "relevance": "Quantifies contamination effects specifically in code generation evaluation, directly relevant to LLM programming assessment quality."
    357     },
    358     {
    359       "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code",
    360       "authors": ["Naman Jain"],
    361       "year": 2024,
    362       "arxiv_id": "2403.07974",
    363       "relevance": "Proposes a contamination-free coding benchmark using continuously updated competitive programming problems."
    364     },
    365     {
    366       "title": "Top Leaderboard Ranking = Top Coding Proficiency, Always? EvoEval: Evolving Coding Benchmarks via LLM",
    367       "authors": ["Chunqiu Steven Xia"],
    368       "year": 2024,
    369       "arxiv_id": "2403.19114",
    370       "relevance": "Addresses benchmark gaming in code generation through evolving evaluation, showing 39.4% average performance reduction when contamination is mitigated."
    371     },
    372     {
    373       "title": "Chatbot Arena: An Open Platform for Evaluating LLMs by Human Preference",
    374       "authors": ["Wei-Lin Chiang"],
    375       "year": 2024,
    376       "arxiv_id": "2403.04132",
    377       "relevance": "Major benchmark-free evaluation platform using crowdsourced human preferences, relevant to alternative LLM evaluation methodology."
    378     },
    379     {
    380       "title": "FreeEval: A Modular Framework for Trustworthy and Efficient Evaluation of Large Language Models",
    381       "authors": ["Zhuohao Yu"],
    382       "year": 2024,
    383       "arxiv_id": "2404.06003",
    384       "relevance": "Comprehensive evaluation framework integrating multiple methods including LLM-as-judge and human participation to mitigate BDC."
    385     },
    386     {
    387       "title": "Leak, Cheat, Repeat: Data Contamination and Evaluation Malpractices in Closed-Source LLMs",
    388       "authors": ["Simone Balloccu"],
    389       "year": 2024,
    390       "relevance": "Documents evaluation malpractices and data contamination in closed-source LLMs, directly relevant to methodological quality assessment."
    391     },
    392     {
    393       "title": "Inadequacies of Large Language Model Benchmarks in the Era of Generative Artificial Intelligence",
    394       "authors": ["Timothy R. McIntosh"],
    395       "year": 2024,
    396       "arxiv_id": "2402.09880",
    397       "relevance": "Critiques current LLM benchmarking approaches, relevant to understanding evaluation methodology limitations."
    398     },
    399     {
    400       "title": "Rethinking Benchmark and Contamination for Language Models with Rephrased Samples",
    401       "authors": ["Shuo Yang"],
    402       "year": 2023,
    403       "arxiv_id": "2311.04850",
    404       "relevance": "Demonstrates that paraphrasing can bypass contamination detection, proposes stronger decontamination methods."
    405     },
    406     {
    407       "title": "Concerned with Data Contamination? Assessing Countermeasures in Code Language Model",
    408       "authors": ["Jialun Cao"],
    409       "year": 2024,
    410       "arxiv_id": "2403.16898",
    411       "relevance": "Assesses contamination countermeasures specifically in code language models."
    412     },
    413     {
    414       "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena",
    415       "authors": ["Lianmin Zheng"],
    416       "year": 2023,
    417       "relevance": "Foundational work on using LLMs as evaluators, relevant to benchmark-free evaluation methodology and its reliability."
    418     }
    419   ]
    420 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs