scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29511B)
      1 {
      2   "scan_version": 3,
      3   "active_modules": [
      4     "experimental_rigor",
      5     "data_leakage"
      6   ],
      7   "paper": {
      8     "title": "Investigating Data Contamination in Modern Benchmarks for Large Language Models",
      9     "authors": [
     10       "Chunyuan Deng",
     11       "Yilun Zhao",
     12       "Xiangru Tang",
     13       "Mark Gerstein",
     14       "Arman Cohan"
     15     ],
     16     "year": 2023,
     17     "venue": "arXiv",
     18     "arxiv_id": "2311.09783"
     19   },
     20   "methodology_tags": [
     21     "benchmark-eval"
     22   ],
     23   "checklist": {
     24     "artifacts": {
     25       "code_released": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No repository URL, code archive, or link to released code found anywhere in the paper."
     29       },
     30       "data_released": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "The paper uses publicly available benchmarks (MMLU, TruthfulQA, HellaSwag, WinoGrande, GSM8K, OpenbookQA, PIQA) and publicly available pretraining corpora (The Pile, C4). All datasets are publicly accessible."
     34       },
     35       "environment_specified": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided. The paper mentions Pyserini but does not give version details or dependency lists."
     39       },
     40       "reproduction_instructions": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The method is described at a high level but without sufficient detail to reproduce exact results."
     44       }
     45     },
     46     "statistical_methodology": {
     47       "confidence_intervals_or_error_bars": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "All results in Tables 1-3 are point estimates with no confidence intervals or error bars reported."
     51       },
     52       "significance_tests": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The paper makes comparative claims (e.g., GPT-4 vs ChatGPT, stronger vs weaker models) but uses no statistical significance tests. Differences are compared by raw numbers only."
     56       },
     57       "effect_sizes_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "The paper reports exact match rates with context: e.g., 'ChatGPT and GPT-4 demonstrated an exact match rate of 52% and 57%, respectively' (§4.2.2), with baselines like LLaMa 2-13B at 0% for comparison."
     61       },
     62       "sample_size_justified": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No justification is given for sample sizes. The human evaluation uses 17 annotators and 100 data points drawn from 7 benchmarks, but no power analysis or justification for these numbers is provided."
     66       },
     67       "variance_reported": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "No variance, standard deviation, or spread measures are reported for any experiment. All results appear to be single-run numbers."
     71       }
     72     },
     73     "evaluation_design": {
     74       "baselines_included": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Multiple models serve as implicit baselines against each other (ChatGPT, GPT-4, Claude-2, Claude-instant-1, LLaMa 2-13B, Mistral-7B). Open-source models with transparent training data serve as baselines for proprietary models."
     78       },
     79       "baselines_contemporary": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "All evaluated models (GPT-4, ChatGPT, Claude-2, Mistral-7B, LLaMa 2-13B) were state-of-the-art at the time of writing (2023)."
     83       },
     84       "ablation_study": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "The paper ablates query types (question-only, label-only, question+label) in Table 4, tests different hint types (type-hint, category-hint, url-hint) in Table 2, and varies top-k retrieval settings (1, 5, 10) in Table 1."
     88       },
     89       "multiple_metrics": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Multiple metrics are used: BM25, SacreBLEU, Rouge-L, BLEURT, GPTscore for retrieval (Table 1); Exact Match and Rouge-L F1 for TS-Guessing (Table 3); Spearman correlation for metric validation (Figure 3)."
     93       },
     94       "human_evaluation": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "17 NLP volunteers evaluated 100 data points on a binary scale for contamination. Inter-annotator agreement was measured via Krippendorff's alpha (0.8673). Described in §4.1.1."
     98       },
     99       "held_out_test_set": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "The paper uses established benchmark test sets (MMLU test, TruthfulQA, etc.). For benchmarks without public test labels (HellaSwag, WinoGrande, PIQA), development sets are used, which is explicitly stated (§4.2.1)."
    103       },
    104       "per_category_breakdown": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Results are broken down per benchmark (7 benchmarks in Table 1, 6 in Table 3), per model, per query type, and per hint type. Individual benchmark contamination levels are discussed."
    108       },
    109       "failure_cases_discussed": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section 7 (Limitations) discusses failure modes: BM25 limitations, open-source models failing to follow TS-Guessing instructions, and models predicting correct answers instead of masked wrong answers."
    113       },
    114       "negative_results_reported": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "The paper reports that PIQA does not show high contamination despite word overlaps (§4.1.2), that open-source models (LLaMa 2, Mistral) show near-zero EM in multichoice guessing on several benchmarks (Table 3), and that TruthfulQA has negative correlation between TS-Guessing and task accuracy (Table 6)."
    118       }
    119     },
    120     "claims_and_evidence": {
    121       "abstract_claims_supported": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The abstract claims ChatGPT/GPT-4 achieve 52%/57% EM on MMLU, which matches Table 3. Claims about the TS-Guessing method and retrieval system are supported by experimental results."
    125       },
    126       "causal_claims_justified": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The paper implies that high TS-Guessing scores are caused by training data contamination, but acknowledges this is only 'suspicion' — it cannot rule out that models are reasoning to the answer. The contamination probing experiment (§4.3) partially addresses this by showing fully contaminated ChatGPT reaches ~100% EM, but the causal link from high EM to contamination in the 'clean' model remains unproven."
    130       },
    131       "generalization_bounded": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The title says 'Modern Benchmarks' broadly but tests only 7 QA-style benchmarks. The TS-Guessing method is tested on multiple-choice formats only. The paper does not bound its claims to these specific benchmark types."
    135       },
    136       "alternative_explanations_discussed": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "Section 5 discusses tradeoffs and limitations of both methods. The paper acknowledges that TS-Guessing success could partly reflect reasoning rather than memorization, and that PIQA's overlapping words don't indicate contamination due to its physical reasoning nature (§4.1.2). Appendix D discusses the correlation between TS-Guessing and actual task performance."
    140       },
    141       "proxy_outcome_distinction": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "The paper measures exact match rates for slot guessing and retrieval overlap rates, and frames findings at that granularity: 'ChatGPT and GPT-4 demonstrated an exact match rate of 52% and 57%.' The paper explicitly distinguishes between detecting contamination signals (what is measured) and proving benchmark inflation (the broader concern), noting limitations of each method."
    145       }
    146     },
    147     "setup_transparency": {
    148       "model_versions_specified": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "Models are listed as 'ChatGPT (GPT-3.5-turbo)', 'GPT-4', 'Claude-instant-1-100k', 'Claude-2', 'LLaMa 2-13B', 'Mistral-7B'. No snapshot dates or API versions are specified for any model."
    152       },
    153       "prompts_provided": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Full prompt templates are shown in Figure 2 for both Question-based and Question-Multichoice settings, including the actual text used with masking instructions."
    157       },
    158       "hyperparameters_reported": {
    159         "applies": true,
    160         "answer": false,
    161         "justification": "No temperature, top-p, or other sampling parameters are reported for any of the LLM API calls. The Rouge-L threshold of 0.65 for filtering is mentioned, but inference hyperparameters are absent."
    162       },
    163       "scaffolding_described": {
    164         "applies": false,
    165         "answer": false,
    166         "justification": "No agentic scaffolding is used. The paper makes direct API calls to models."
    167       },
    168       "data_preprocessing_documented": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "Pre-filtering steps are well-documented in §4.2.1: removal of short questions (≤4 words), removal of 'Indexical Error' category from TruthfulQA, removal of Yes/No and True/False options, Rouge-L threshold filtering at 0.65. Keyword searching process using Stanford POS Tagger and ChatGPT ICL is described."
    172       }
    173     },
    174     "limitations_and_scope": {
    175       "limitations_section_present": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 7 'Limitations' is a dedicated section discussing multiple specific limitations of both the retrieval and TS-Guessing methods."
    179       },
    180       "threats_to_validity_specific": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "Specific threats discussed: BM25 may miss contamination, computation time makes the system impractical without high-performance hardware, text generation scores are superficial for true contamination detection, open-source models tend to predict correct answers instead of following TS-Guessing instructions, and models may be overfitting to multi-choice format (§7)."
    184       },
    185       "scope_boundaries_stated": {
    186         "applies": true,
    187         "answer": false,
    188         "justification": "The paper does not explicitly state what it does NOT show. It doesn't bound claims to specific benchmark types (only QA-style) or acknowledge that TS-Guessing cannot work for non-multiple-choice benchmarks. The title implies broad coverage without explicit scoping."
    189       }
    190     },
    191     "data_integrity": {
    192       "raw_data_available": {
    193         "applies": true,
    194         "answer": false,
    195         "justification": "No raw experimental data (retrieval results, per-example TS-Guessing outputs, human annotations) is released for independent verification."
    196       },
    197       "data_collection_described": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Data collection is clearly described: 7 public benchmarks are named with citations, 2 pretraining corpora (The Pile, C4) are specified, 100 data points randomly sampled for human evaluation from 7 benchmarks."
    201       },
    202       "recruitment_methods_described": {
    203         "applies": true,
    204         "answer": false,
    205         "justification": "17 volunteers with NLP backgrounds participated in human evaluation (§4.1.1), but no details on how they were recruited, their experience level, or potential selection bias are provided."
    206       },
    207       "data_pipeline_documented": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "The pipeline is documented: indexing corpora with Pyserini → BM25 retrieval with top-k → 13-gram tokenization → overlap scoring. For TS-Guessing: filtering → keyword extraction → masking → model querying → EM/Rouge-L scoring."
    211       }
    212     },
    213     "conflicts_of_interest": {
    214       "funding_disclosed": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding source is disclosed. The Acknowledgement section thanks reviewers and colleagues but mentions no grants or sponsors."
    218       },
    219       "affiliations_disclosed": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "Author affiliations are clearly listed: Georgia Institute of Technology, Yale University, Allen Institute for AI. None of the affiliations are companies whose models are evaluated."
    223       },
    224       "funder_independent_of_outcome": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No funding is disclosed, so independence cannot be assessed. One author is from Allen Institute for AI, but no funding statement is provided."
    228       },
    229       "financial_interests_declared": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "No competing interests statement or financial disclosure is present in the paper."
    233       }
    234     },
    235     "contamination": {
    236       "training_cutoff_stated": {
    237         "applies": true,
    238         "answer": true,
    239         "justification": "The paper states 'According to OpenAI, their training data is current up to September 2021' (§4.2.2). For open-source models, the training corpora (The Pile, C4) are identified."
    240       },
    241       "train_test_overlap_discussed": {
    242         "applies": true,
    243         "answer": true,
    244         "justification": "This is the central focus of the paper. The entire retrieval-based method (§3.1) directly measures train-test overlap, and TS-Guessing (§3.2) probes for memorization of test data."
    245       },
    246       "benchmark_contamination_addressed": {
    247         "applies": true,
    248         "answer": true,
    249         "justification": "The paper directly addresses that MMLU, TruthfulQA, and other benchmarks were available before model training cutoffs and could be in training data. This is the paper's core contribution."
    250       }
    251     },
    252     "human_studies": {
    253       "pre_registered": {
    254         "applies": true,
    255         "answer": false,
    256         "justification": "The human evaluation component (17 annotators, 100 data points) is not pre-registered."
    257       },
    258       "irb_or_ethics_approval": {
    259         "applies": true,
    260         "answer": false,
    261         "justification": "No IRB or ethics approval is mentioned despite using human annotators. The Ethics Statement (§8) discusses data ethics but not human subjects approval."
    262       },
    263       "demographics_reported": {
    264         "applies": true,
    265         "answer": false,
    266         "justification": "Annotators are described only as '17 volunteers with backgrounds in NLP.' No further demographics (experience level, education, etc.) are reported."
    267       },
    268       "inclusion_exclusion_criteria": {
    269         "applies": true,
    270         "answer": false,
    271         "justification": "No inclusion/exclusion criteria for annotator selection are stated beyond 'backgrounds in NLP.'"
    272       },
    273       "randomization_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "This is not a randomized experiment comparing conditions for human participants; annotators all performed the same evaluation task."
    277       },
    278       "blinding_described": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "Blinding is not applicable — annotators evaluated contamination overlap, not a treatment condition."
    282       },
    283       "attrition_reported": {
    284         "applies": true,
    285         "answer": false,
    286         "justification": "No information on whether all 17 annotators completed the task or whether any dropped out."
    287       }
    288     },
    289     "cost_and_practicality": {
    290       "inference_cost_reported": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "Neither API costs nor total token consumption is reported, despite extensive API calls to GPT-4, ChatGPT, and Claude models. The paper only mentions '2-3 minutes per data point' for retrieval in §7."
    294       },
    295       "compute_budget_stated": {
    296         "applies": true,
    297         "answer": false,
    298         "justification": "The paper mentions needing approximately 4TB (reduced to 2TB) disk space for corpora and notes the system requires a 'high-performance computer' (§7), but no GPU hours, total API spend, or hardware specs are provided."
    299       }
    300     },
    301     "experimental_rigor": {
    302       "seed_sensitivity_reported": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "No multiple random seeds or sensitivity analysis is reported. Results appear to be from single runs."
    306       },
    307       "number_of_runs_stated": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "The number of experimental runs is not stated for any experiment."
    311       },
    312       "hyperparameter_search_budget": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "The Rouge-L threshold of 0.65 is described as 'chosen based on initial experiments' (§4.2.1 footnote) but no search budget or alternatives tried are reported."
    316       },
    317       "best_config_selection_justified": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "The filtering threshold of 0.65 and choice of top-k values are not justified beyond brief mentions. No validation set-based selection is described."
    321       },
    322       "multiple_comparison_correction": {
    323         "applies": false,
    324         "answer": false,
    325         "justification": "No statistical hypothesis tests are performed, so multiple comparison correction is not applicable."
    326       },
    327       "self_comparison_bias_addressed": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "The authors propose and evaluate their own TS-Guessing method without acknowledging potential bias in evaluating their own system."
    331       },
    332       "compute_budget_vs_performance": {
    333         "applies": false,
    334         "answer": false,
    335         "justification": "Not applicable — the methods don't involve training or compute-intensive baselines where compute budget differences would affect comparison."
    336       },
    337       "benchmark_construct_validity": {
    338         "applies": true,
    339         "answer": true,
    340         "justification": "The paper explicitly questions whether high TS-Guessing scores truly indicate contamination versus reasoning ability. Section 5 discusses tradeoffs and §4.2.2 notes that the method 'may not heavily rely on advanced reasoning skills, although its performance may vary depending on the training data available.' The contamination probing experiment (§4.3) validates the construct."
    341       },
    342       "scaffold_confound_addressed": {
    343         "applies": false,
    344         "answer": false,
    345         "justification": "The paper probes models directly via prompting without any scaffolding. No multi-model comparisons are made through different scaffolds. The study evaluates contamination signals, not model capability through a scaffold."
    346       }
    347     },
    348     "data_leakage": {
    349       "temporal_leakage_addressed": {
    350         "applies": true,
    351         "answer": true,
    352         "justification": "The paper directly addresses temporal leakage: TruthfulQA's camera-ready was May 2022 but source data from Wikipedia predates training cutoffs (§4.2.2). This temporal analysis is central to the paper."
    353       },
    354       "feature_leakage_addressed": {
    355         "applies": true,
    356         "answer": true,
    357         "justification": "The paper addresses how TruthfulQA metadata (type, category, URL) acts as hints that could leak information. The hint-augmented experiments (Table 2) explicitly test whether metadata provides unfair signal."
    358       },
    359       "non_independence_addressed": {
    360         "applies": true,
    361         "answer": true,
    362         "justification": "The retrieval-based approach (§3.1) directly measures overlap between training corpora (Pile, C4) and benchmark instances, testing whether benchmark items appear in training data."
    363       },
    364       "leakage_detection_method": {
    365         "applies": true,
    366         "answer": true,
    367         "justification": "Two concrete detection methods are applied: (1) BM25 retrieval with 13-gram overlap scoring on The Pile and C4, and (2) TS-Guessing protocol that probes for memorization via slot-filling. Both are applied and results reported."
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "ChatGPT and GPT-4 can guess missing incorrect options in MMLU with 52% and 57% exact match rates respectively.",
    374       "evidence": "Table 3 shows EM rates of 0.52 (ChatGPT) and 0.57 (GPT-4) for MMLU Question-Multichoice guessing, after filtering correlated options.",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "Stronger models do not necessarily show higher proficiency in TS-Guessing, suggesting the protocol measures memorization rather than reasoning.",
    379       "evidence": "Table 2 shows only 1% difference between ChatGPT and GPT-4 in Question-based guessing without hints. Claude-instant-1 sometimes outperforms Claude-2 (Table 2).",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "After fully contaminating ChatGPT with MMLU test data via fine-tuning, TS-Guessing EM rate reaches nearly 100%, validating the method's sensitivity.",
    384       "evidence": "Figure 4 shows ~100% EM for both question-based and question-multichoice formats after fine-tuning on MMLU test set (§4.3).",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "TruthfulQA has significant overlap with pretraining corpora due to its web-sourced content.",
    389       "evidence": "Table 1 shows high BM25 and GPTscore for TruthfulQA-C4 overlap. Appendix C provides a specific contamination example from C4. Human annotators identified 23/100 examples as contaminated (§4.1.2).",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "GPTscore aligns more closely with human evaluation than traditional metrics for contamination detection.",
    394       "evidence": "Figure 3 shows Spearman correlation between metrics and human scores across 100 examples. GPTscore achieves the highest correlation.",
    395       "supported": "moderate"
    396     }
    397   ],
    398   "key_findings": "The paper proposes two methods for detecting benchmark data contamination in LLMs: retrieval-based search in pretraining corpora and a novel TS-Guessing protocol that masks test set elements and asks models to fill them in. ChatGPT achieves 52% exact match guessing missing incorrect MMLU options, suggesting potential contamination. TruthfulQA shows significant overlap with the C4 corpus. A validation experiment fine-tuning ChatGPT on MMLU reaches ~100% EM, confirming the method's sensitivity to contamination.",
    399   "red_flags": [
    400     {
    401       "flag": "No code or raw experimental output release",
    402       "detail": "Despite proposing a contamination detection methodology, no code, scripts, or raw experimental outputs are released for others to verify or apply the methods."
    403     },
    404     {
    405       "flag": "No uncertainty quantification",
    406       "detail": "All results are point estimates with no error bars, confidence intervals, or variance across runs. The TS-Guessing results could be sensitive to prompt variations or API non-determinism."
    407     },
    408     {
    409       "flag": "Model versions unspecified",
    410       "detail": "No API snapshot dates or model versions are specified. ChatGPT and GPT-4 behavior changes across versions, making results non-reproducible."
    411     },
    412     {
    413       "flag": "Causal inference gap",
    414       "detail": "High TS-Guessing scores are interpreted as evidence of contamination, but the paper cannot rule out that models are reasoning about likely wrong answers from the structure of multiple-choice questions."
    415     }
    416   ],
    417   "cited_papers": [
    418     {
    419       "title": "Time Travel in LLMs: Tracing Data Contamination in Large Language Models",
    420       "authors": [
    421         "Shahriar Golchin",
    422         "Mihai Surdeanu"
    423       ],
    424       "year": 2023,
    425       "relevance": "Proposes alternative contamination detection method using prompt-based probing of LLMs."
    426     },
    427     {
    428       "title": "Proving Test Set Contamination in Black Box Language Models",
    429       "authors": [
    430         "Yonatan Oren",
    431         "Nicole Meister",
    432         "Niladri Chatterji",
    433         "Faisal Ladhak",
    434         "Tatsunori B. Hashimoto"
    435       ],
    436       "year": 2023,
    437       "relevance": "Presents probing method for contamination detection using canonical data order, applicable to black-box models."
    438     },
    439     {
    440       "title": "NLP evaluation in trouble: On the need to measure LLM data contamination for each benchmark",
    441       "authors": [
    442         "Oscar Sainz",
    443         "Jon Campos",
    444         "Iker García-Ferrero",
    445         "Julen Etxaniz",
    446         "Oier Lopez de Lacalle",
    447         "Eneko Agirre"
    448       ],
    449       "year": 2023,
    450       "relevance": "Argues for systematic contamination measurement across NLP benchmarks."
    451     },
    452     {
    453       "title": "Detecting Pretraining Data from Large Language Models",
    454       "authors": [
    455         "Weijia Shi",
    456         "Anirudh Ajith",
    457         "Mengzhou Xia",
    458         "Yangsibo Huang",
    459         "Daogao Liu",
    460         "Terra Blevins",
    461         "Danqi Chen",
    462         "Luke Zettlemoyer"
    463       ],
    464       "year": 2023,
    465       "relevance": "Proposes membership inference methods for detecting whether specific data was in LLM pretraining."
    466     },
    467     {
    468       "title": "Stop Uploading Test Data in Plain Text: Practical Strategies for Mitigating Data Contamination by Evaluation Benchmarks",
    469       "authors": [
    470         "Alon Jacovi",
    471         "Avi Caciularu",
    472         "Omer Goldman",
    473         "Yoav Goldberg"
    474       ],
    475       "year": 2023,
    476       "relevance": "Proposes practical mitigation strategies against benchmark contamination."
    477     },
    478     {
    479       "title": "DyVal: Graph-informed Dynamic Evaluation of Large Language Models",
    480       "authors": [
    481         "Kaijie Zhu",
    482         "Jiaao Chen",
    483         "Jindong Wang",
    484         "Neil Zhenqiang Gong",
    485         "Diyi Yang",
    486         "Xing Xie"
    487       ],
    488       "year": 2023,
    489       "relevance": "Proposes dynamic evaluation as a solution to benchmark contamination."
    490     },
    491     {
    492       "title": "Don't Make Your LLM an Evaluation Benchmark Cheater",
    493       "authors": [
    494         "Kun Zhou",
    495         "Yutao Zhu",
    496         "Zhipeng Chen",
    497         "Wentong Chen",
    498         "Wayne Xin Zhao",
    499         "Xu Chen",
    500         "Yankai Lin",
    501         "Ji-Rong Wen",
    502         "Jiawei Han"
    503       ],
    504       "year": 2023,
    505       "relevance": "Investigates how LLMs can game evaluation benchmarks through contamination."
    506     },
    507     {
    508       "title": "Quantifying Contamination in Evaluating Code Generation Capabilities of Language Models",
    509       "authors": [
    510         "Martin Riddell",
    511         "Ansong Ni",
    512         "Arman Cohan"
    513       ],
    514       "year": 2024,
    515       "relevance": "Extends contamination investigation to code generation benchmarks at both surface and semantic levels."
    516     },
    517     {
    518       "title": "Data contamination: From memorization to exploitation",
    519       "authors": [
    520         "Inbal Magar",
    521         "Roy Schwartz"
    522       ],
    523       "year": 2022,
    524       "relevance": "Foundational work investigating the relationship between pretraining memorization and downstream task exploitation."
    525     },
    526     {
    527       "title": "What's In My Big Data?",
    528       "authors": [
    529         "Yanai Elazar",
    530         "Akshita Bhagia",
    531         "Ian Magnusson"
    532       ],
    533       "year": 2023,
    534       "relevance": "Provides analysis of contamination in open pretraining corpora including C4 and The Pile."
    535     },
    536     {
    537       "title": "Measuring Massive Multitask Language Understanding",
    538       "authors": [
    539         "Dan Hendrycks",
    540         "Collin Burns",
    541         "Steven Basart",
    542         "Andy Zou",
    543         "Mantas Mazeika",
    544         "Dawn Song",
    545         "Jacob Steinhardt"
    546       ],
    547       "year": 2021,
    548       "relevance": "MMLU benchmark — the primary subject of contamination investigation in this paper."
    549     },
    550     {
    551       "title": "Pretraining on the Test Set Is All You Need",
    552       "authors": [
    553         "Rylan Schaeffer"
    554       ],
    555       "year": 2023,
    556       "relevance": "Demonstrates that pretraining contamination can artificially inflate benchmark performance."
    557     }
    558   ],
    559   "engagement_factors": {
    560     "practical_relevance": {
    561       "score": 1,
    562       "justification": "The TS-Guessing method could help benchmark designers check for contamination, but requires significant adaptation and no code is released."
    563     },
    564     "surprise_contrarian": {
    565       "score": 2,
    566       "justification": "The finding that ChatGPT can guess 52% of missing wrong MMLU options is genuinely surprising and suggests widely-cited benchmark scores may be inflated."
    567     },
    568     "fear_safety": {
    569       "score": 1,
    570       "justification": "Raises concerns about trustworthiness of LLM evaluations but doesn't demonstrate direct safety risks or attacks."
    571     },
    572     "drama_conflict": {
    573       "score": 2,
    574       "justification": "Directly challenges the validity of MMLU scores for ChatGPT and GPT-4, implying that benchmark results for OpenAI's flagship models may be inflated by contamination."
    575     },
    576     "demo_ability": {
    577       "score": 0,
    578       "justification": "No code, scripts, or reproducible artifacts are released despite proposing a detection methodology."
    579     },
    580     "brand_recognition": {
    581       "score": 3,
    582       "justification": "Directly investigates ChatGPT and GPT-4 on MMLU, one of the most widely discussed benchmarks in the LLM space."
    583     }
    584   }
    585 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs