ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v4.json (34994B)


      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Investigating Data Contamination in Modern Benchmarks for Large Language Models",
      6     "authors": [
      7       "Chunyuan Deng",
      8       "Yilun Zhao",
      9       "Xiangru Tang",
     10       "Mark Gerstein",
     11       "Arman Cohan"
     12     ],
     13     "year": 2023,
     14     "venue": "arXiv",
     15     "arxiv_id": "2311.09783",
     16     "doi": null
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The abstract claims ChatGPT/GPT-4 achieve 52%/57% EM on MMLU, which matches Table 3. Claims about the TS-Guessing method and retrieval system are supported by experimental results.",
     24         "source": "opus"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper implies that high TS-Guessing scores are caused by training data contamination, but acknowledges this is only 'suspicion' — it cannot rule out that models are reasoning to the answer. The contamination probing experiment (§4.3) partially addresses this by showing fully contaminated ChatGPT reaches ~100% EM, but the causal link from high EM to contamination in the 'clean' model remains unproven.",
     30         "source": "opus"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The title says 'Modern Benchmarks' broadly but tests only 7 QA-style benchmarks. The TS-Guessing method is tested on multiple-choice formats only. The paper does not bound its claims to these specific benchmark types.",
     36         "source": "opus"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Section 5 discusses tradeoffs and limitations of both methods. The paper acknowledges that TS-Guessing success could partly reflect reasoning rather than memorization, and that PIQA's overlapping words don't indicate contamination due to its physical reasoning nature (§4.1.2). Appendix D discusses the correlation between TS-Guessing and actual task performance.",
     42         "source": "opus"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper measures exact match rates for slot guessing and retrieval overlap rates, and frames findings at that granularity: 'ChatGPT and GPT-4 demonstrated an exact match rate of 52% and 57%.' The paper explicitly distinguishes between detecting contamination signals (what is measured) and proving benchmark inflation (the broader concern), noting limitations of each method.",
     48         "source": "opus"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 7 'Limitations' is a dedicated section discussing multiple specific limitations of both the retrieval and TS-Guessing methods.",
     56         "source": "opus"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Specific threats discussed: BM25 may miss contamination, computation time makes the system impractical without high-performance hardware, text generation scores are superficial for true contamination detection, open-source models tend to predict correct answers instead of following TS-Guessing instructions, and models may be overfitting to multi-choice format (§7).",
     62         "source": "opus"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper does not explicitly state what it does NOT show. It doesn't bound claims to specific benchmark types (only QA-style) or acknowledge that TS-Guessing cannot work for non-multiple-choice benchmarks. The title implies broad coverage without explicit scoping.",
     68         "source": "opus"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding source is disclosed. The Acknowledgement section thanks reviewers and colleagues but mentions no grants or sponsors.",
     76         "source": "opus"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are clearly listed: Georgia Institute of Technology, Yale University, Allen Institute for AI. None of the affiliations are companies whose models are evaluated.",
     82         "source": "opus"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "No funding is disclosed, so independence cannot be assessed. One author is from Allen Institute for AI, but no funding statement is provided.",
     88         "source": "opus"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement or financial disclosure is present in the paper.",
     94         "source": "opus"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "'Data contamination' is operationally defined as benchmark instances appearing in pretraining corpora; 'TS-Guessing' is formally defined with problem formulations and mathematical notation in Sections 3.2.1 and 3.2.2.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper explicitly contributes two methods: a retrieval-based IR system for open models with accessible training data, and the novel TS-Guessing protocol applicable to both open and closed models; stated clearly in abstract and Section 1.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 2 substantively compares the proposed methods against n-gram matching (GPT-3, PaLM, LLaMA approaches), Golchin and Surdeanu (2023), Oren et al. (2023), and others, explaining specific limitations this work addresses.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "No repository URL, code archive, or link to released code is found anywhere in the paper.",
    125           "source": "opus"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "The paper uses publicly available benchmarks (MMLU, TruthfulQA, HellaSwag, WinoGrande, GSM8K, OpenbookQA, PIQA) and publicly available pretraining corpora (The Pile, C4). All datasets are publicly accessible.",
    131           "source": "opus"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided. The paper mentions Pyserini but does not give version details or dependency lists.",
    137           "source": "opus"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The method is described at a high level but without sufficient detail to reproduce exact results.",
    143           "source": "opus"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "All results in Tables 1-3 are point estimates with no confidence intervals or error bars reported.",
    151           "source": "opus"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "The paper makes comparative claims (e.g., GPT-4 vs ChatGPT, stronger vs weaker models) but uses no statistical significance tests. Differences are compared by raw numbers only.",
    157           "source": "opus"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "The paper reports exact match rates with context: e.g., 'ChatGPT and GPT-4 demonstrated an exact match rate of 52% and 57%, respectively' (§4.2.2), with baselines like LLaMa 2-13B at 0% for comparison.",
    163           "source": "opus"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "No justification is given for sample sizes. The human evaluation uses 100 data points from 7 benchmarks and 17 annotators, but no power analysis or justification for these numbers is provided.",
    169           "source": "opus"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "No variance, standard deviation, or spread measures are reported for any experiment. All results appear to be single-run numbers.",
    175           "source": "opus"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Multiple models serve as implicit baselines against each other (ChatGPT, GPT-4, Claude-2, Claude-instant-1, LLaMa 2-13B, Mistral-7B). Open-source models with transparent training data serve as baselines for proprietary models.",
    183           "source": "opus"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "All evaluated models (GPT-4, ChatGPT, Claude-2, Mistral-7B, LLaMa 2-13B) were state-of-the-art at the time of writing (2023).",
    189           "source": "opus"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "The paper ablates query types (question-only, label-only, question+label) in Table 4, tests different hint types (type-hint, category-hint, url-hint) in Table 2, and varies top-k retrieval settings (1, 5, 10) in Table 1.",
    195           "source": "opus"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Multiple metrics are used: BM25, SacreBLEU, Rouge-L, BLEURT, GPTscore for retrieval (Table 1); Exact Match and Rouge-L F1 for TS-Guessing (Table 3); Spearman correlation for metric validation (Figure 3).",
    201           "source": "opus"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": true,
    206           "justification": "17 NLP volunteers evaluated 100 data points on a binary scale for contamination. Inter-annotator agreement was measured via Krippendorff's alpha (0.8673). Described in §4.1.1.",
    207           "source": "opus"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "The paper uses established benchmark test sets (MMLU test, TruthfulQA, etc.). For benchmarks without public test labels (HellaSwag, WinoGrande, PIQA), development sets are used, which is explicitly stated (§4.2.1).",
    213           "source": "opus"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Results are broken down per benchmark (7 benchmarks in Table 1, 6 in Table 3), per model, per query type, and per hint type. Individual benchmark contamination levels are discussed.",
    219           "source": "opus"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Section 7 (Limitations) discusses failure modes: BM25 limitations, open-source models failing to follow TS-Guessing instructions, and models predicting correct answers instead of masked wrong answers.",
    225           "source": "opus"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "The paper reports that PIQA does not show high contamination despite word overlaps (§4.1.2), that open-source models (LLaMa 2, Mistral) show near-zero EM in multichoice guessing on several benchmarks (Table 3), and that TruthfulQA has negative correlation between TS-Guessing and task accuracy (Table 6).",
    231           "source": "opus"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "Models are listed as 'ChatGPT (GPT-3.5-turbo)', 'GPT-4', 'Claude-instant-1-100k', 'Claude-2', 'LLaMa 2-13B', 'Mistral-7B'. No snapshot dates or API versions are specified for any model.",
    239           "source": "opus"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Full prompt templates are shown in Figure 2 for both Question-based and Question-Multichoice settings, including the actual text used with masking instructions.",
    245           "source": "opus"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "No temperature, top-p, or other sampling parameters are reported for any of the LLM API calls. The Rouge-L threshold of 0.65 for filtering is mentioned, but inference hyperparameters are absent.",
    251           "source": "opus"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "No agentic scaffolding is used. The paper makes direct API calls to models.",
    257           "source": "opus"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Pre-filtering steps are well-documented in §4.2.1: removal of short questions (≤4 words), removal of 'Indexical Error' category from TruthfulQA, removal of Yes/No and True/False options, Rouge-L threshold filtering at 0.65. Keyword searching process using Stanford POS Tagger and ChatGPT ICL is described.",
    263           "source": "opus"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "No raw experimental data (retrieval results, per-example TS-Guessing outputs, human annotations) is released for independent verification.",
    271           "source": "opus"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Data collection is clearly described: 7 public benchmarks are named with citations, 2 pretraining corpora (The Pile, C4) are specified, 100 data points randomly sampled for human evaluation from 7 benchmarks.",
    277           "source": "opus"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": true,
    281           "answer": false,
    282           "justification": "17 volunteers with NLP backgrounds participated in human evaluation (§4.1.1), but no details on how they were recruited, their experience level, or potential selection bias are provided.",
    283           "source": "opus"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The pipeline is documented: indexing corpora with Pyserini → BM25 retrieval with top-k → 13-gram tokenization → overlap scoring. For TS-Guessing: filtering → keyword extraction → masking → model querying → EM/Rouge-L scoring.",
    289           "source": "opus"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": true,
    296           "justification": "The paper states 'According to OpenAI, their training data is current up to September 2021' (§4.2.2). For open-source models, the training corpora (The Pile, C4) are identified.",
    297           "source": "opus"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": true,
    302           "justification": "This is the central focus of the paper. The entire retrieval-based method (§3.1) directly measures train-test overlap, and TS-Guessing (§3.2) probes for memorization of test data.",
    303           "source": "opus"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": true,
    308           "justification": "The paper directly addresses that MMLU, TruthfulQA, and other benchmarks were available before model training cutoffs and could be in training data. This is the paper's core contribution.",
    309           "source": "opus"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": true,
    315           "answer": false,
    316           "justification": "The human evaluation component (17 annotators, 100 data points) is not pre-registered.",
    317           "source": "opus"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": true,
    321           "answer": false,
    322           "justification": "No IRB or ethics approval is mentioned despite using human annotators. The Ethics Statement (§8) discusses data ethics but not human subjects approval.",
    323           "source": "opus"
    324         },
    325         "demographics_reported": {
    326           "applies": true,
    327           "answer": false,
    328           "justification": "Annotators are described only as '17 volunteers with backgrounds in NLP.' No further demographics (experience level, education, etc.) are reported.",
    329           "source": "opus"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": true,
    333           "answer": false,
    334           "justification": "No inclusion/exclusion criteria for annotator selection are stated beyond 'backgrounds in NLP.'",
    335           "source": "opus"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "This is not a randomized experiment comparing conditions for human participants; annotators all performed the same evaluation task.",
    341           "source": "opus"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "Blinding is not applicable — annotators evaluated contamination overlap, not a treatment condition.",
    347           "source": "opus"
    348         },
    349         "attrition_reported": {
    350           "applies": true,
    351           "answer": false,
    352           "justification": "No information on whether all 17 annotators completed the task or whether any dropped out.",
    353           "source": "opus"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "No API costs or total token consumption are reported, despite extensive API calls to GPT-4, ChatGPT, and Claude models. The paper only mentions '2-3 minutes per data point' for retrieval in §7.",
    361           "source": "opus"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "The paper mentions needing approximately 4TB (reduced to 2TB) disk space for corpora and notes the system requires a 'high-performance computer' (§7), but no GPU hours, total API spend, or hardware specs are provided.",
    367           "source": "opus"
    368         }
    369       },
    370       "experimental_rigor": {
    371         "seed_sensitivity_reported": {
    372           "applies": true,
    373           "answer": false,
    374           "justification": "Neither results across multiple random seeds nor a sensitivity analysis are reported. Results appear to be from single runs.",
    375           "source": "opus"
    376         },
    377         "number_of_runs_stated": {
    378           "applies": true,
    379           "answer": false,
    380           "justification": "The number of experimental runs is not stated for any experiment.",
    381           "source": "opus"
    382         },
    383         "hyperparameter_search_budget": {
    384           "applies": true,
    385           "answer": false,
    386           "justification": "The Rouge-L threshold of 0.65 is described as 'chosen based on initial experiments' (§4.2.1 footnote) but no search budget or alternatives tried are reported.",
    387           "source": "opus"
    388         },
    389         "best_config_selection_justified": {
    390           "applies": true,
    391           "answer": false,
    392           "justification": "The filtering threshold of 0.65 and choice of top-k values are not justified beyond brief mentions. No validation set-based selection is described.",
    393           "source": "opus"
    394         },
    395         "multiple_comparison_correction": {
    396           "applies": false,
    397           "answer": false,
    398           "justification": "No statistical hypothesis tests are performed, so multiple comparison correction is not applicable.",
    399           "source": "opus"
    400         },
    401         "self_comparison_bias_addressed": {
    402           "applies": true,
    403           "answer": false,
    404           "justification": "The authors propose and evaluate their own TS-Guessing method without acknowledging potential bias in evaluating their own system.",
    405           "source": "opus"
    406         },
    407         "compute_budget_vs_performance": {
    408           "applies": false,
    409           "answer": false,
    410           "justification": "Not applicable — the methods don't involve training or compute-intensive baselines where compute budget differences would affect comparison.",
    411           "source": "opus"
    412         },
    413         "benchmark_construct_validity": {
    414           "applies": true,
    415           "answer": true,
    416           "justification": "The paper explicitly questions whether high TS-Guessing scores truly indicate contamination versus reasoning ability. Section 5 discusses tradeoffs and §4.2.2 notes that the method 'may not heavily rely on advanced reasoning skills, although its performance may vary depending on the training data available.' The contamination probing experiment (§4.3) validates the construct.",
    417           "source": "opus"
    418         },
    419         "scaffold_confound_addressed": {
    420           "applies": false,
    421           "answer": false,
    422           "justification": "The paper probes models directly via prompting without any scaffolding. No multi-model comparisons are made through different scaffolds. The study evaluates contamination signals, not model capability through a scaffold.",
    423           "source": "opus"
    424         }
    425       },
    426       "data_leakage": {
    427         "temporal_leakage_addressed": {
    428           "applies": true,
    429           "answer": true,
    430           "justification": "The paper directly addresses temporal leakage: TruthfulQA's camera-ready was May 2022 but source data from Wikipedia predates training cutoffs (§4.2.2). This temporal analysis is central to the paper.",
    431           "source": "opus"
    432         },
    433         "feature_leakage_addressed": {
    434           "applies": true,
    435           "answer": true,
    436           "justification": "The paper addresses how TruthfulQA metadata (type, category, URL) acts as hints that could leak information. The hint-augmented experiments (Table 2) explicitly test whether metadata provides unfair signal.",
    437           "source": "opus"
    438         },
    439         "non_independence_addressed": {
    440           "applies": true,
    441           "answer": true,
    442           "justification": "The retrieval-based approach (§3.1) directly measures overlap between training corpora (Pile, C4) and benchmark instances, testing whether benchmark items appear in training data.",
    443           "source": "opus"
    444         },
    445         "leakage_detection_method": {
    446           "applies": true,
    447           "answer": true,
    448           "justification": "Two concrete detection methods are applied: (1) BM25 retrieval with 13-gram overlap scoring on The Pile and C4, and (2) TS-Guessing protocol that probes for memorization via slot-filling. Both are applied and results reported.",
    449           "source": "opus"
    450         }
    451       }
    452     }
    453   },
    454   "claims": [
    455     {
    456       "claim": "ChatGPT achieves 52% and GPT-4 achieves 57% exact match rate when predicting masked incorrect options in MMLU test set",
    457       "evidence": "Table 3 reports MMLU EM: ChatGPT=0.52, GPT-4=0.57. Post-filtering removed correlated options, Yes-No answers, and mathematical symbols to minimize reasoning-based guessing.",
    458       "supported": "strong"
    459     },
    460     {
    461       "claim": "MMLU shows significantly higher contamination signals than other tested benchmarks (PIQA, HellaSwag, WinoGrande show near-zero EM)",
    462       "evidence": "Table 3: PIQA EM=0.00 for ChatGPT and GPT-4; HellaSwag ChatGPT EM=0.00, GPT-4 EM=0.02; MMLU ChatGPT EM=0.52 — a qualitative gap across benchmarks.",
    463       "supported": "strong"
    464     },
    465     {
    466       "claim": "Fine-tuning ChatGPT on the MMLU test set produces near-100% EM rate in TS-Guessing, validating the method's sensitivity",
    467       "evidence": "Figure 4 shows the contaminated ChatGPT achieves close to 100% EM in both question-based and question-multichoice formats, compared to ~52% for the clean model.",
    468       "supported": "strong"
    469     },
    470     {
    471       "claim": "Stronger models (GPT-4 vs ChatGPT) do not show meaningfully higher TS-Guessing performance",
    472       "evidence": "Table 3 shows only 5pp gap (0.52 vs 0.57) on MMLU; Section 4.2.2 notes only 1% difference in original setting and ~4% with URL hints, described as 'not significant improvements.'",
    473       "supported": "moderate"
    474     },
    475     {
    476       "claim": "TruthfulQA exhibits significant overlap with C4 corpus, likely because its content is sourced from Wikipedia",
    477       "evidence": "Appendix C shows a specific TruthfulQA example with BM25 score 50.24 matching C4 document text almost verbatim; Section 4.1.2 attributes this to Wikipedia-sourced content in both TruthfulQA and C4.",
    478       "supported": "moderate"
    479     },
    480     {
    481       "claim": "GPTscore correlates more closely with human judgment on contamination than traditional metrics (BM25, BLEU, ROUGE, BLEURT)",
    482       "evidence": "Figure 3 shows Spearman correlation coefficients where GPTscore visually outperforms other metrics against human evaluation on 100 examples across four benchmarks.",
    483       "supported": "weak"
    484     }
    485   ],
    486   "methodology_tags": [
    487     "benchmark-eval",
    488     "observational"
    489   ],
    490   "key_findings": "The paper introduces two complementary contamination detection methods: a BM25-based information retrieval system for open models and the novel TS-Guessing protocol applicable to black-box models. Applied to MMLU, ChatGPT and GPT-4 can guess the exact missing incorrect option at 52% and 57% rates respectively — far above chance for an open-ended generation task — suggesting significant exposure to MMLU during training. TruthfulQA shows substantial textual overlap with the C4 pretraining corpus via retrieval analysis, while benchmarks requiring physical or commonsense reasoning (PIQA, HellaSwag) show near-zero contamination signals. A validation probe fine-tuning ChatGPT on MMLU test data achieves ~100% EM, confirming the method can detect known contamination.",
    491   "red_flags": [
    492     {
    493       "flag": "Internal number inconsistency",
    494       "detail": "Abstract correctly states ChatGPT=52% and GPT-4=57% EM on MMLU, but Section 4.2.2 and the conclusion both incorrectly state 'ChatGPT could precisely predict missing choices in MMLU with 57% EM rate' — attributing GPT-4's number to ChatGPT."
    495     },
    496     {
    497       "flag": "No random baseline",
    498       "detail": "EM rates for predicting specific incorrect options are never compared against a random baseline. For a 4-option MCQ where one correct option is revealed, estimating what chance performance would be for the remaining three options is non-trivial and not calculated."
    499     },
    500     {
    501       "flag": "Sample sizes after filtering not reported",
    502       "detail": "The filtering procedure (Rouge-L threshold, removing Yes-No/math options) is described qualitatively but the number of examples remaining per benchmark for TS-Guessing is never reported, making the absolute EM counts unclear."
    503     },
    504     {
    505       "flag": "No confidence intervals on key claims",
    506       "detail": "The headline 52%/57% EM rates on MMLU have no statistical uncertainty quantification; single-run point estimates are presented as definitive contamination evidence."
    507     },
    508     {
    509       "flag": "Alternative hypothesis not ruled out",
    510       "detail": "High MMLU EM rates could partially reflect that incorrect MMLU distractors are domain-predictable (e.g., wrong answers about China policy that fit a narrow vocabulary). The filtering procedure reduces but does not eliminate this alternative explanation."
    511     },
    512     {
    513       "flag": "Human evaluation on only 100 examples",
    514       "detail": "The correlation between automated metrics and human judgment is computed on only 100 data points from 7 benchmarks (~14 per benchmark), which is too small for reliable per-benchmark conclusions."
    515     }
    516   ],
    517   "cited_papers": [
    518     {
    519       "title": "Measuring Massive Multitask Language Understanding (MMLU)",
    520       "relevance": "Primary benchmark shown to have high contamination signals; central to the TS-Guessing empirical results"
    521     },
    522     {
    523       "title": "TruthfulQA: Measuring How Models Mimic Human Falsehoods",
    524       "relevance": "Second key benchmark analyzed for contamination; shown to have substantial C4 corpus overlap due to Wikipedia sourcing"
    525     },
    526     {
    527       "title": "Time Travel in LLMs: Tracing Data Contamination in Large Language Models (Golchin and Surdeanu, 2023)",
    528       "relevance": "Direct prior work on contamination detection that TS-Guessing is positioned against"
    529     },
    530     {
    531       "title": "Proving Test Set Contamination in Black Box Language Models (Oren et al., 2023)",
    532       "relevance": "Related black-box contamination detection method using canonical ordering; compared to TS-Guessing in positioning"
    533     },
    534     {
    535       "title": "Data Contamination Through the Lens of Time (Roberts et al., 2023)",
    536       "relevance": "Related temporal contamination analysis approach"
    537     },
    538     {
    539       "title": "What's In My Big Data? (Elazar et al., 2023)",
    540       "relevance": "Prior work on retrieval-based contamination detection for GLUE/SuperGLUE; motivates extending to more recent benchmarks"
    541     },
    542     {
    543       "title": "NLP evaluation in trouble: On the need to measure LLM data contamination for each benchmark (Sainz et al., 2023)",
    544       "relevance": "Concurrent work arguing for systematic contamination measurement; supports paper's motivation"
    545     },
    546     {
    547       "title": "Stop Uploading Test Data in Plain Text: Practical Strategies for Mitigating Data Contamination by Evaluation Benchmarks (Jacovi et al., 2023)",
    548       "relevance": "Complementary mitigation perspective — the problem this paper helps detect"
    549     }
    550   ],
    551   "engagement_factors": {
    552     "practical_relevance": {
    553       "score": 1,
    554       "justification": "The TS-Guessing method could help benchmark designers check for contamination, but it requires significant adaptation, and no code is released."
    555     },
    556     "surprise_contrarian": {
    557       "score": 2,
    558       "justification": "The finding that ChatGPT can guess 52% of missing wrong MMLU options is genuinely surprising and suggests widely-cited benchmark scores may be inflated."
    559     },
    560     "fear_safety": {
    561       "score": 1,
    562       "justification": "Raises concerns about trustworthiness of LLM evaluations but doesn't demonstrate direct safety risks or attacks."
    563     },
    564     "drama_conflict": {
    565       "score": 2,
    566       "justification": "Directly challenges the validity of MMLU scores for ChatGPT and GPT-4, implying that the reported benchmark results of OpenAI's flagship models may be inflated by contamination."
    567     },
    568     "demo_ability": {
    569       "score": 0,
    570       "justification": "No code, scripts, or reproducible artifacts are released despite proposing a detection methodology."
    571     },
    572     "brand_recognition": {
    573       "score": 3,
    574       "justification": "Directly investigates ChatGPT and GPT-4 on MMLU, one of the most widely discussed benchmarks in the LLM space."
    575     }
    576   },
    577   "hn_data": {
    578     "threads": [
    579       {
    580         "hn_id": "40229022",
    581         "title": "When can transformers reason with abstract symbols?",
    582         "points": 3,
    583         "comments": 0,
    584         "url": "https://news.ycombinator.com/item?id=40229022",
    585         "created_at": "2024-05-01T20:34:43Z"
    586       },
    587       {
    588         "hn_id": "29493664",
    589         "title": "Training Neural Networks with Fixed Sparse Masks",
    590         "points": 2,
    591         "comments": 1,
    592         "url": "https://news.ycombinator.com/item?id=29493664",
    593         "created_at": "2021-12-09T03:58:40Z"
    594       },
    595       {
    596         "hn_id": "42299972",
    597         "title": "Modeling AdaGrad, RMSProp, and Adam with Integro-Differential Equations",
    598         "points": 2,
    599         "comments": 0,
    600         "url": "https://news.ycombinator.com/item?id=42299972",
    601         "created_at": "2024-12-02T20:17:28Z"
    602       },
    603       {
    604         "hn_id": "42171440",
    605         "title": "Modeling AdaGrad, RMSProp, and Adam with Integro-Differential Equations",
    606         "points": 2,
    607         "comments": 0,
    608         "url": "https://news.ycombinator.com/item?id=42171440",
    609         "created_at": "2024-11-18T11:19:17Z"
    610       },
    611       {
    612         "hn_id": "39313991",
    613         "title": "Information content of note transitions in the music of J. S. Bach",
    614         "points": 2,
    615         "comments": 0,
    616         "url": "https://news.ycombinator.com/item?id=39313991",
    617         "created_at": "2024-02-09T12:09:43Z"
    618       },
    619       {
    620         "hn_id": "33625737",
    621         "title": "Optimal sizing of seasonal renewable energy storage considering degradation",
    622         "points": 2,
    623         "comments": 0,
    624         "url": "https://news.ycombinator.com/item?id=33625737",
    625         "created_at": "2022-11-16T16:25:26Z"
    626       },
    627       {
    628         "hn_id": "38576071",
    629         "title": "Large Language Models on Graphs: A Comprehensive Survey",
    630         "points": 1,
    631         "comments": 0,
    632         "url": "https://news.ycombinator.com/item?id=38576071",
    633         "created_at": "2023-12-08T23:15:13Z"
    634       },
    635       {
    636         "hn_id": "37771757",
    637         "title": "Can LLMs provide useful feedback on research papers? A broad empirical analysis",
    638         "points": 1,
    639         "comments": 0,
    640         "url": "https://news.ycombinator.com/item?id=37771757",
    641         "created_at": "2023-10-04T21:03:08Z"
    642       },
    643       {
    644         "hn_id": "35284995",
    645         "title": "Self Supervision Does Not Help Natural Language Supervision at Scale",
    646         "points": 1,
    647         "comments": 0,
    648         "url": "https://news.ycombinator.com/item?id=35284995",
    649         "created_at": "2023-03-24T04:12:16Z"
    650       },
    651       {
    652         "hn_id": "29320363",
    653         "title": "ClipClap: Image Captioning with Clip Encoder and GPT2 [pdf]",
    654         "points": 1,
    655         "comments": 0,
    656         "url": "https://news.ycombinator.com/item?id=29320363",
    657         "created_at": "2021-11-23T17:15:20Z"
    658       }
    659     ],
    660     "top_points": 3,
    661     "total_points": 17,
    662     "total_comments": 1
    663   }
    664 }

Impressum · Datenschutz