ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (30414B)


      1 {
      2   "paper": {
      3     "title": "On Leakage of Code Generation Evaluation Datasets",
      4     "authors": [
      5       "Alexandre Matton",
      6       "Tom Sherborne",
      7       "Dennis Aumiller",
      8       "Elena Tommasone",
      9       "Milad Alizadeh",
     10       "Jingyi He",
     11       "Raymond Ma",
     12       "Maxime Voisin",
     13       "Ellen Gilsenan-McMahon",
     14       "Matthias Gallé"
     15     ],
     16     "year": 2024,
     17     "venue": "Conference on Empirical Methods in Natural Language Processing",
     18     "arxiv_id": "2407.07565",
     19     "doi": "10.48550/arXiv.2407.07565"
     20   },
     21   "scan_version": 3,
     22   "active_modules": ["experimental_rigor", "data_leakage"],
     23   "methodology_tags": ["benchmark-eval", "observational"],
     24   "key_findings": "HumanEval and MBPP are pervasively contaminated in LLM training data through three mechanisms: direct data leakage (every HumanEval prompt appears 43+ times on GitHub), indirect leakage through synthetic data pipelines (high embedding similarity between evol-instruct/Starcoder-Instruct and HumanEval/MBPP), and checkpoint selection overfitting. The authors release LBPP, a 161-problem uncontaminated Python benchmark on which SOTA models score up to 43% lower than on HumanEval, with model rankings changing between contaminated and uncontaminated benchmarks.",
     25   "checklist": {
     26     "artifacts": {
     27       "code_released": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "No source code repository is provided. Only the LBPP dataset is released on HuggingFace. No evaluation harness, embedding analysis scripts, or reproduction code is linked."
     31       },
     32       "data_released": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "LBPP is released at https://huggingface.co/datasets/CohereForAI/lbpp as stated in the abstract and Section 4."
     36       },
     37       "environment_specified": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No environment specifications, requirements files, or dependency information is provided for reproducing the evaluation or analysis."
     41       },
     42       "reproduction_instructions": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "No step-by-step instructions for reproducing the benchmark evaluations, GitHub search analysis, or embedding similarity experiments are provided."
     46       }
     47     },
     48     "statistical_methodology": {
     49       "confidence_intervals_or_error_bars": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "Table 2 reports Pass@1 as single point estimates for all 28 models across 3 benchmarks. No confidence intervals or error bars are provided for any result."
     53       },
     54       "significance_tests": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "The paper claims ranking changes between benchmarks and performance drops of up to 43% without any statistical significance tests. The correlation in Fig. 3 is described as 'strong significant' but no test statistics or p-values are reported."
     58       },
     59       "effect_sizes_reported": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Raw performance numbers are given for all models on all benchmarks (Table 2), with rank changes and absolute differences. The abstract states 'up to 43% worse' and Section 3.2 reports specific deltas (+9% HumanEval, +2% MBPP, unchanged LBPP) from fine-tuning."
     63       },
     64       "sample_size_justified": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "LBPP contains 161 prompts with no justification for this size. No power analysis or discussion of whether 161 problems are sufficient to reliably distinguish model capabilities."
     68       },
     69       "variance_reported": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "All Pass@1 results are single-run point estimates. No standard deviation, interquartile range, or variance across runs is reported for any model or benchmark."
     73       }
     74     },
     75     "evaluation_design": {
     76       "baselines_included": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Table 2 compares 28 models across 8 families (Mistral, Meta, OpenAI, Anthropic, Qwen, Cohere, Deepseek, Databricks) on HumanEval, MBPP, and LBPP, serving as mutual baselines."
     80       },
     81       "baselines_contemporary": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Models include GPT-4o, Claude 3.5 Sonnet, Llama3 70B, Codestral, and Qwen 2—all contemporary at time of publication (2024)."
     85       },
     86       "ablation_study": {
     87         "applies": false,
     88         "answer": false,
     89         "justification": "LBPP is a benchmark, not a multi-component system. There are no components to ablate."
     90       },
     91       "multiple_metrics": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "Only Pass@1 is reported. No other metrics (e.g., Pass@10, code quality metrics, or diversity measures) are used for any benchmark."
     95       },
     96       "human_evaluation": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "All evaluation is automated through unit test pass/fail. While human annotators created LBPP and reviewed prompts, no human evaluation of model outputs is performed."
    100       },
    101       "held_out_test_set": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "LBPP is explicitly designed as a held-out, uncontaminated test set. The paper's core contribution is creating a benchmark that was not in any model's training data."
    105       },
    106       "per_category_breakdown": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Section 4 provides a per-category error analysis: 21% 2D/3D array problems, 18% graph algorithms, 17% complex programming concepts, 8% bit arithmetic, 8% Pandas, 8% file IO."
    110       },
    111       "failure_cases_discussed": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Section 4 analyzes common failure patterns between Claude 3.5 Sonnet and Command R Refresh. Table 1 shows specific unsolved problems. Failure categories are enumerated."
    115       },
    116       "negative_results_reported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The paper is fundamentally about negative results: contamination undermines benchmarks. Section 3.2 reports that fine-tuning with evol-instruct leaves LBPP unchanged despite boosting HumanEval +9%. Section 5 acknowledges LBPP will eventually become contaminated too."
    120       }
    121     },
    122     "claims_and_evidence": {
    123       "abstract_claims_supported": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "Abstract claims about three contamination sources (direct, synthetic, checkpoint selection) are each supported with evidence in Sections 3.1, 3.2, and 3.3. The claim about 'up to 43% worse' on LBPP is supported by Table 2 (Mistral Large: 0.92 HumanEval vs 0.50 LBPP = 42pp drop)."
    127       },
    128       "causal_claims_justified": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The paper claims contamination causes inflated benchmark scores, but the evidence is primarily observational. The fine-tuning experiment (Section 3.2, evol-instruct) is a single model/dataset pair. The core difficulty confound—LBPP is intentionally harder—is not controlled for, making it impossible to separate contamination effects from difficulty effects."
    132       },
    133       "generalization_bounded": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "The title claims to address 'Code Generation Evaluation Datasets' generally, and the conclusion states 'contamination is likely unavoidable at the LLM scale,' but the analysis covers only HumanEval, MBPP, and two synthetic datasets. No other code generation benchmarks (APPS, CodeContests, SWE-bench) are analyzed."
    137       },
    138       "alternative_explanations_discussed": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Section 3.2 explicitly considers whether high similarity between synthetic data and benchmarks is due to 'synthetic data filling the space of problems similar to HumanEval/MBPP or more direct leakage.' The paper presents three distinct hypotheses for contamination rather than a single explanation."
    142       },
    143       "proxy_outcome_distinction": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "The paper's central argument is that Pass@1 on contaminated benchmarks is a poor proxy for code generation capability. They explicitly argue LBPP provides 'a less biased measure' and note that HumanEval/MBPP 'cannot be used as the only proxies to evaluate a model's code abilities' (Section 3.2)."
    147       }
    148     },
    149     "setup_transparency": {
    150       "model_versions_specified": {
    151         "applies": true,
    152         "answer": false,
    153         "justification": "Table 2 lists marketing names (GPT-4o, Claude-3.5-Sonnet, Llama3 70B Instruct) without API versions or snapshot dates. No model is specified with a version string like 'gpt-4-0613'. The schema requires exact versions."
    154       },
    155       "prompts_provided": {
    156         "applies": true,
    157         "answer": false,
    158         "justification": "Table 2 describes 'zero-shot' evaluation but the exact prompt format used to evaluate models is not shown. Table 4 shows partial prompts used for the memorization test, but the standard evaluation prompt template is not provided."
    159       },
    160       "hyperparameters_reported": {
    161         "applies": true,
    162         "answer": false,
    163         "justification": "No temperature, top-p, max tokens, or other sampling parameters are reported for any of the 28 model evaluations."
    164       },
    165       "scaffolding_described": {
    166         "applies": false,
    167         "answer": false,
    168         "justification": "No agentic scaffolding is used. Models are evaluated directly on code completion tasks."
    169       },
    170       "data_preprocessing_documented": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section 4 documents the LBPP creation pipeline: annotators with competitive programming experience create problems, model-in-the-loop filtering, about one-third disqualified, manual review by authors. Section 3.1 describes the GitHub keyword search methodology. Section 3.2 describes the embedding similarity analysis using Cohere embed v3."
    174       }
    175     },
    176     "limitations_and_scope": {
    177       "limitations_section_present": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Section 6 is a dedicated 'Limitations' section discussing black-box analysis constraints and the inevitability of future LBPP contamination."
    181       },
    182       "threats_to_validity_specific": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "Section 6 states specific limitations: 'All the model analysis was done black-box, without inspecting the model weights or the training set' and acknowledges LBPP 'will follow the same path' as HumanEval/MBPP regarding future contamination."
    186       },
    187       "scope_boundaries_stated": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section 6 explicitly bounds what the analysis does NOT show: they did not inspect model weights or training sets (except synthetic data analysis). They acknowledge they cannot definitively prove contamination, only present supporting evidence."
    191       }
    192     },
    193     "data_integrity": {
    194       "raw_data_available": {
    195         "applies": true,
    196         "answer": false,
    197         "justification": "While LBPP is released on HuggingFace, the underlying analysis data is not: raw GitHub search results, embedding vectors, model outputs for Table 2, and fine-tuning experiment data are all unavailable for independent verification."
    198       },
    199       "data_collection_described": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "Section 4 describes LBPP annotation in detail: annotators created novel problems, inspired by textbooks not freely available online, with model-in-the-loop filtering and manual review. Section 3.1 describes GitHub search methodology. Section 3.2 describes embedding comparison process."
    203       },
    204       "recruitment_methods_described": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "Annotators are described as having 'competitive programming experience' and being 'paid above minimum wage,' but recruitment channels and selection process are not described. All authors are from Cohere, suggesting annotators may be employees/contractors, but this is not stated."
    208       },
    209       "data_pipeline_documented": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "The LBPP pipeline is documented: creation → model-in-the-loop filtering → review → ~1/3 disqualified → final manual review by authors. The contamination analysis pipeline is also documented: GitHub keyword search, embedding with Cohere embed v3, nearest-neighbor similarity comparison."
    213       }
    214     },
    215     "conflicts_of_interest": {
    216       "funding_disclosed": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "No funding information or acknowledgments section is present. All authors are from Cohere but no corporate funding disclosure is made."
    220       },
    221       "affiliations_disclosed": {
    222         "applies": true,
    223         "answer": true,
    224         "justification": "All authors list Cohere as their affiliation on the title page."
    225       },
    226       "funder_independent_of_outcome": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "The work is produced by Cohere, which has commercial interest in (a) promoting its own LBPP benchmark, (b) demonstrating that competitors' benchmark scores may be inflated by contamination, and (c) evaluating its own Command R models alongside competitors. The funder is not independent of the outcome."
    230       },
    231       "financial_interests_declared": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No competing interests or financial disclosure statement is present. The authors work for Cohere which sells competing LLM products evaluated in the paper."
    235       }
    236     },
    237     "contamination": {
    238       "training_cutoff_stated": {
    239         "applies": true,
    240         "answer": false,
    241         "justification": "Despite the paper being about benchmark contamination, no training cutoff dates are stated for any of the 28 evaluated models. This makes it impossible to precisely determine which models could have seen which benchmark data."
    242       },
    243       "train_test_overlap_discussed": {
    244         "applies": true,
    245         "answer": true,
    246         "justification": "Train/test overlap is the central topic of the paper. Section 3.1 quantifies HumanEval presence on GitHub (Fig. 1). Section 3.2 analyzes embedding similarity between training and test data (Fig. 2). Table 4 shows apparent memorization of HumanEval solutions."
    247       },
    248       "benchmark_contamination_addressed": {
    249         "applies": true,
    250         "answer": true,
    251         "justification": "Benchmark contamination is the paper's primary focus. Three contamination mechanisms are analyzed across Sections 3.1-3.3, and LBPP is designed specifically as an uncontaminated alternative."
    252       }
    253     },
    254     "human_studies": {
    255       "pre_registered": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in the evaluation. LBPP annotators created the benchmark but were not study participants."
    259       },
    260       "irb_or_ethics_approval": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in the evaluation study."
    264       },
    265       "demographics_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in the evaluation study."
    269       },
    270       "inclusion_exclusion_criteria": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in the evaluation study."
    274       },
    275       "randomization_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in the evaluation study."
    279       },
    280       "blinding_described": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants in the evaluation study."
    284       },
    285       "attrition_reported": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "No human participants in the evaluation study."
    289       }
    290     },
    291     "cost_and_practicality": {
    292       "inference_cost_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "No inference costs, API costs, or latency data reported despite evaluating 28 models across 3 benchmarks."
    296       },
    297       "compute_budget_stated": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "No compute budget stated for model evaluations, the fine-tuning experiment, the embedding similarity analysis, or the GitHub search analysis."
    301       }
    302     },
    303     "experimental_rigor": {
    304       "seed_sensitivity_reported": {
    305         "applies": true,
    306         "answer": false,
    307         "justification": "No results across multiple random seeds. All Pass@1 numbers in Table 2 appear to be from single runs."
    308       },
    309       "number_of_runs_stated": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "The number of evaluation runs per model is never stated. It is unclear whether Pass@1 values are from 1 sample or averaged over multiple generations."
    313       },
    314       "hyperparameter_search_budget": {
    315         "applies": true,
    316         "answer": false,
    317         "justification": "No hyperparameter search budget reported for the fine-tuning experiment (Command R + evol-instruct) or any model evaluation configuration."
    318       },
    319       "best_config_selection_justified": {
    320         "applies": true,
    321         "answer": false,
    322         "justification": "No explanation of how model configurations were selected for evaluation (default settings? specific versions?). The fine-tuning experiment configuration is not justified."
    323       },
    324       "multiple_comparison_correction": {
    325         "applies": false,
    326         "answer": false,
    327         "justification": "No statistical tests are performed, so multiple comparison correction is not applicable."
    328       },
    329       "self_comparison_bias_addressed": {
    330         "applies": true,
    331         "answer": false,
    332         "justification": "Cohere employees evaluate Cohere models (Command R, R+, R Refresh, R+ Refresh) alongside competitors and promote Cohere's LBPP benchmark without acknowledging self-evaluation bias."
    333       },
    334       "compute_budget_vs_performance": {
    335         "applies": true,
    336         "answer": false,
    337         "justification": "Models ranging from 7B parameters to unknown large sizes are compared in Table 2 without any discussion of compute requirements or performance per unit compute."
    338       },
    339       "benchmark_construct_validity": {
    340         "applies": true,
    341         "answer": true,
    342         "justification": "The paper's central thesis is that HumanEval and MBPP lack construct validity due to contamination. Section 3.2 explicitly argues these benchmarks 'cannot be used as the only proxies to evaluate a model's code abilities' and Section 3.3 discusses overfitting to narrow metrics."
    343       },
    344       "scaffold_confound_addressed": {
    345         "applies": false,
    346         "answer": false,
    347         "justification": "No scaffolding is involved; models are evaluated directly on code completion."
    348       }
    349     },
    350     "data_leakage": {
    351       "temporal_leakage_addressed": {
    352         "applies": true,
    353         "answer": true,
    354         "justification": "Section 3.1 discusses how HumanEval (published 2021) and MBPP predated most evaluated models' training, and Section 3 describes how benchmark data accumulates in training corpora over time through replication and synthetic data generation."
    355       },
    356       "feature_leakage_addressed": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "The paper does not discuss whether evaluation setups might leak answer information through context (e.g., whether prompt formats or test harness details provide hints unavailable in real usage)."
    360       },
    361       "non_independence_addressed": {
    362         "applies": true,
    363         "answer": true,
    364         "justification": "Section 3.2 and Fig. 2 directly analyze the non-independence of training data (evol-instruct, Starcoder-Instruct) and test data (HumanEval, MBPP) through embedding similarity analysis, showing high cosine similarities."
    365       },
    366       "leakage_detection_method": {
    367         "applies": true,
    368         "answer": true,
    369         "justification": "Multiple concrete detection methods are used: GitHub keyword search for direct contamination (Fig. 1), embedding cosine similarity for semantic overlap (Fig. 2), partial-prompt memorization tests on a commercial model (Table 4), and qualitative prompt matching (Tables 3, 5)."
    370       }
    371     }
    372   },
    373   "claims": [
    374     {
    375       "claim": "HumanEval prompts are pervasively present in public code repositories, with every prompt appearing at least 43 times on GitHub (median 99).",
    376       "evidence": "Fig. 1 shows a histogram of GitHub code search hits for all HumanEval prompts. Minimum is 43, median is 99, mean is 130.2 ± 153.7.",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "Synthetic code training datasets have high semantic similarity to HumanEval and MBPP, suggesting indirect data leakage.",
    381       "evidence": "Fig. 2 shows cosine similarity histograms between HumanEval/MBPP and evol-instruct/Starcoder-Instruct. Table 5 shows near-identical prompt pairs. LBPP has uniformly lower similarity to these training datasets.",
    382       "supported": "moderate"
    383     },
    384     {
    385       "claim": "Leading models perform up to 43% worse on the uncontaminated LBPP benchmark compared to HumanEval.",
    386       "evidence": "Table 2 shows Mistral Large drops from 0.92 (HumanEval) to 0.50 (LBPP), a 42 percentage-point drop. All 28 models perform worse on LBPP.",
    387       "supported": "moderate"
    388     },
    389     {
    390       "claim": "Model rankings change between contaminated benchmarks (HumanEval/MBPP) and the uncontaminated LBPP benchmark.",
    391       "evidence": "Table 2's rightmost column shows rank changes. For example, Claude-3-Haiku is rank 8 on HumanEval but rank 14 on LBPP. Mistral Large is rank 1 on HumanEval but rank 5 on LBPP.",
    392       "supported": "moderate"
    393     },
    394     {
    395       "claim": "Fine-tuning on evol-instruct increases HumanEval by +9% and MBPP by +2% but leaves LBPP unchanged, indicating differential contamination.",
    396       "evidence": "Section 3.2 reports this result for Command R Refresh. Only a single model/dataset pair, no variance or repeated trials.",
    397       "supported": "weak"
    398     },
    399     {
    400       "claim": "A major commercial model can reproduce exact HumanEval gold solutions from under-specified partial prompts, demonstrating memorization.",
    401       "evidence": "Table 4 in Appendix A shows four examples where a commercial system completes partial HumanEval prompts with solutions matching the gold standard exactly, despite deliberately ambiguous prompts.",
    402       "supported": "moderate"
    403     }
    404   ],
    405   "red_flags": [
    406     {
    407       "flag": "Conflict of interest: company evaluating own models",
    408       "detail": "All authors are Cohere employees. The paper evaluates 4 Cohere models alongside competitors and promotes Cohere's LBPP benchmark. Cohere benefits commercially from showing competitors' scores may be inflated and from establishing LBPP as a standard benchmark."
    409     },
    410     {
    411       "flag": "No variance or error bars on any result",
    412       "detail": "All 84 Pass@1 values in Table 2 (28 models × 3 benchmarks) are single point estimates. With no variance, it is impossible to know whether observed ranking changes are meaningful or within run-to-run noise."
    413     },
    414     {
    415       "flag": "Difficulty confound not controlled",
    416       "detail": "LBPP is intentionally designed to be harder than HumanEval/MBPP ('adversarial collection resulted in more difficult problems'). Performance drops could be entirely explained by difficulty rather than contamination. The paper cannot separate these effects."
    417     },
    418     {
    419       "flag": "Single metric only",
    420       "detail": "Only Pass@1 is reported. No Pass@k for other k values, no code quality metrics, no partial credit. This limits the sensitivity of the comparison and may miss important performance patterns."
    421     },
    422     {
    423       "flag": "Anonymized evidence for strongest claim",
    424       "detail": "Table 4 demonstrates exact memorization of HumanEval solutions by a 'popular commercial system (kept anonymous).' Anonymizing the model prevents verification and limits the scientific value of this evidence."
    425     }
    426   ],
    427   "cited_papers": [
    428     {
    429       "title": "Evaluating large language models trained on code",
    430       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    431       "year": 2021,
    432       "arxiv_id": "2107.03374",
    433       "relevance": "Introduces HumanEval, one of the two primary contaminated benchmarks studied in this paper."
    434     },
    435     {
    436       "title": "Program synthesis with large language models",
    437       "authors": ["Jacob Austin", "Augustus Odena", "Maxwell Nye"],
    438       "year": 2021,
    439       "arxiv_id": "2108.07732",
    440       "relevance": "Introduces MBPP, the other primary contaminated benchmark studied in this paper."
    441     },
    442     {
    443       "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code",
    444       "authors": ["Naman Jain", "King Han", "Alex Gu"],
    445       "year": 2024,
    446       "arxiv_id": "2403.07974",
    447       "relevance": "Proposes continuously updated code evaluation to combat contamination, closely related to LBPP's motivation."
    448     },
    449     {
    450       "title": "SWE-bench: Can language models resolve real-world GitHub issues?",
    451       "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig"],
    452       "year": 2024,
    453       "relevance": "Proposes a more challenging software engineering benchmark as an alternative to simpler code generation benchmarks."
    454     },
    455     {
    456       "title": "Quantifying contamination in evaluating code generation capabilities of language models",
    457       "authors": ["Martin Riddell", "Ansong Ni", "Arman Cohan"],
    458       "year": 2024,
    459       "arxiv_id": "2403.04811",
    460       "relevance": "Directly quantifies code benchmark contamination in pretraining datasets, finding 12.2% of HumanEval in The Pile and 18.9% in The Stack."
    461     },
    462     {
    463       "title": "Starcoder: may the source be with you",
    464       "authors": ["Raymond Li", "Loubna Ben Allal", "Yangtian Zi"],
    465       "year": 2023,
    466       "arxiv_id": "2305.06161",
    467       "relevance": "Major code LLM whose decontamination is analyzed; shows limitations of exact-match decontamination for code."
    468     },
    469     {
    470       "title": "WizardLM: Empowering large pre-trained language models to follow complex instructions",
    471       "authors": ["Can Xu", "Qingfeng Sun", "Kai Zheng"],
    472       "year": 2023,
    473       "relevance": "Source of evol-instruct, a widely-used synthetic code dataset shown to have high similarity to HumanEval/MBPP."
    474     },
    475     {
    476       "title": "Magicoder: Source code is all you need",
    477       "authors": ["Yuxiang Wei", "Zhe Wang", "Jiawei Liu"],
    478       "year": 2023,
    479       "arxiv_id": "2312.02120",
    480       "relevance": "Uses synthetic data pipeline for code LLM training; authors reported high similarity between evol-instruct and HumanEval."
    481     },
    482     {
    483       "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation",
    484       "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang"],
    485       "year": 2024,
    486       "relevance": "Proposes additional tests for HumanEval/MBPP evaluation, addressing a related benchmark quality concern."
    487     },
    488     {
    489       "title": "A careful examination of large language model performance on grade school arithmetic",
    490       "authors": ["Hugh Zhang", "Jeff Da", "Dean Lee"],
    491       "year": 2024,
    492       "arxiv_id": "2405.00332",
    493       "relevance": "Proposes hidden evaluation sets to combat contamination, a related approach discussed in Section 2."
    494     },
    495     {
    496       "title": "What's in my big data?",
    497       "authors": ["Yanai Elazar", "Akshita Bhagia", "Ian Helgi Magnusson"],
    498       "year": 2024,
    499       "relevance": "Reports that only 1.22% of HumanEval appears verbatim in OSCAR, illustrating limitations of surface-level decontamination."
    500     },
    501     {
    502       "title": "The Stack: 3 TB of permissively licensed source code",
    503       "authors": ["Denis Kocetkov", "Raymond Li", "Loubna Ben Allal"],
    504       "year": 2022,
    505       "arxiv_id": "2211.15533",
    506       "relevance": "Major code pretraining dataset found to contain 18.9% of HumanEval samples per Riddell et al. analysis."
    507     },
    508     {
    509       "title": "Deduplicating training data makes language models better",
    510       "authors": ["Katherine Lee", "Daphne Ippolito", "Andrew Nystrom"],
    511       "year": 2022,
    512       "relevance": "Foundational work on training data deduplication whose n-gram methods are shown to be insufficient for code decontamination."
    513     }
    514   ],
    515   "engagement_factors": {
    516     "practical_relevance": {
    517       "score": 2,
    518       "justification": "LBPP is a directly usable drop-in benchmark replacement, and the contamination findings inform how practitioners should interpret model evaluations."
    519     },
    520     "surprise_contrarian": {
    521       "score": 2,
    522       "justification": "Challenges widely-cited HumanEval/MBPP rankings and suggests model scores are inflated, though contamination concerns are not entirely new in the community."
    523     },
    524     "fear_safety": {
    525       "score": 0,
    526       "justification": "No AI safety, security, or risk concerns raised; the paper is about evaluation methodology rather than dangerous capabilities."
    527     },
    528     "drama_conflict": {
    529       "score": 2,
    530       "justification": "'Benchmarks are contaminated' narrative with concrete model rankings changing, plus anonymized evidence of a major commercial system memorizing benchmark solutions."
    531     },
    532     "demo_ability": {
    533       "score": 2,
    534       "justification": "LBPP is released on HuggingFace and can be used immediately for model evaluation, though no evaluation harness code is provided."
    535     },
    536     "brand_recognition": {
    537       "score": 2,
    538       "justification": "From Cohere (a notable AI company) and evaluates GPT-4o, Claude 3.5 Sonnet, Llama3, and other well-known models."
    539     }
    540   }
    541 }

Impressum · Datenschutz