scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29045B)
      1 {
      2   "paper": {
      3     "title": "LessLeak-Bench: A First Investigation of Data Leakage in LLMs Across 83 Software Engineering Benchmarks",
      4     "authors": [
      5       "Xin Zhou",
      6       "Martin Weyssow",
      7       "Ratnadira Widyasari",
      8       "Ting Zhang",
      9       "Junda He",
     10       "Yunbo Lyu",
     11       "Jianming Chang",
     12       "Beiqi Zhang",
     13       "Dan Huang",
     14       "David Lo"
     15     ],
     16     "year": 2025,
     17     "venue": "arXiv",
     18     "arxiv_id": "2502.06215",
     19     "doi": "10.48550/arXiv.2502.06215"
     20   },
     21   "scan_version": 2,
     22   "active_modules": ["experimental_rigor", "data_leakage"],
     23   "methodology_tags": ["benchmark-eval", "observational"],
     24   "key_findings": "Data leakage in 83 SE benchmarks is generally minimal (avg 4.8% Python, 2.8% Java, 0.7% C/C++), but some benchmarks are severely affected—QuixBugs at 100% and BigCloneBench at 55.7%. Leakage significantly inflates evaluation metrics: StarCoder-7b achieves Pass@1 4.9x higher on leaked vs non-leaked APPS samples. Four causes of high leakage are identified: direct inclusion of benchmark repos in pre-training data, repository overlap, reliance on coding platforms like LeetCode, and shared data sources like GitHub issues. Perplexity-based automated leakage detection is ineffective, achieving only 40-50% accuracy.",
     25   "checklist": {
     26     "artifacts": {
     27       "code_released": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "The paper introduces DetectLeak and LessLeak-Bench but provides no repository URL or download link for their code or cleaned benchmarks in the paper text."
     31       },
     32       "data_released": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "LessLeak-Bench and AutoDetectLeak-Bench are described but no download URL or data archive is provided in the paper. The paper references StarCoder's pre-training data on HuggingFace but does not release their own labeled data."
     36       },
     37       "environment_specified": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper mentions 'NVIDIA GeForce A5000 GPU with 24 GB of memory' and references the BigCode Team's implementation, but provides no requirements.txt, dependency list, or sufficient detail to recreate the environment."
     41       },
     42       "reproduction_instructions": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "No step-by-step reproduction instructions are provided. The methodology is described at a conceptual level but without runnable scripts, commands, or a README."
     46       }
     47     },
     48     "statistical_methodology": {
     49       "confidence_intervals_or_error_bars": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "All leakage ratios (Tables 2-4) and Pass@k results (Table 7) are reported as point estimates with no confidence intervals or error bars."
     53       },
     54       "significance_tests": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "The paper claims 'StarCoder-7b achieves a Pass@1 score that is 4.9 times higher on leaked samples' but provides no statistical significance tests for any comparative claims."
     58       },
     59       "effect_sizes_reported": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "The paper reports effect sizes with context: '4.9 times higher' with actual values (4.4% vs 0.9% Pass@1), '5.6 and 6.0 times higher' for Pass@2 and Pass@3 (Table 7, Section 5.3). Leakage ratios are reported as proportions with counts."
     63       },
     64       "sample_size_justified": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "No justification for the number of benchmarks studied (83), number of annotators (8), or the size of AutoDetectLeak-Bench (1,300 samples). No power analysis is discussed."
     68       },
     69       "variance_reported": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "No variance, standard deviation, or spread measures are reported for any experimental results. Pass@k results appear to be from single runs."
     73       }
     74     },
     75     "evaluation_design": {
     76       "baselines_included": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Section 3.1 discusses and compares against alternative approaches (Exact Match, code clone detection tools) and justifies choosing MinHash+LSH. RQ3 compares leaked vs non-leaked subsets. RQ4 evaluates Perplexity as a detection method. Related work (Section 7) compares against Yang et al. and López et al."
     80       },
     81       "baselines_contemporary": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Related work includes 2024 publications (López et al., Riddell et al., Matton et al.). MinHash+LSH is the method used by contemporary LLMs (StarCoder, StarCoder2, Qwen) for deduplication."
     85       },
     86       "ablation_study": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "The DetectLeak pipeline has multiple components (automated detection + manual verification) but no ablation study removes individual components to measure their contribution. Figure 3 shows overall filtering statistics but not a controlled ablation."
     90       },
     91       "multiple_metrics": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Multiple metrics are used: leakage count, leakage ratio (Section 3.3), Pass@1/2/3 (Table 7), Cohen's Kappa for annotator agreement (0.9424), and top-k accuracy for Perplexity detection (Section 5.4)."
     95       },
     96       "human_evaluation": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Eight experienced annotators (3 postdocs, 4 PhD, 1 Master's) manually labeled 6,643 potential duplicate pairs flagged by MinHash+LSH, with two independent annotators per pair and a third resolving conflicts (Section 3.2)."
    100       },
    101       "held_out_test_set": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "For RQ3, the APPS test set (5,000 samples, separate from 5,000 training samples) is used. For RQ4, AutoDetectLeak-Bench is constructed from manually labeled data as a separate evaluation set (Section 5.4)."
    105       },
    106       "per_category_breakdown": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Results are broken down by programming language (Tables 2-4), by individual benchmark (83 benchmarks), by SE task type (Section 5.1.4), and by model size (StarCoder-7b/3b/1b in Table 7)."
    110       },
    111       "failure_cases_discussed": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "RQ4 reports that Perplexity-based detection achieves only 40-50% accuracy, a clear failure case. Figure 3 shows 72% of auto-flagged pairs are non-duplicates (false positive analysis). Section 6.3 acknowledges the automated tool may miss cases."
    115       },
    116       "negative_results_reported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "RQ4 is essentially a negative result: Perplexity-based automated leakage detection is unreliable (40-50% accuracy). The paper also reports that most SE benchmarks have minimal leakage, which is a 'negative' finding relative to the concern being investigated."
    120       }
    121     },
    122     "claims_and_evidence": {
    123       "abstract_claims_supported": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "Abstract claims are supported: average leakage ratios of 4.8%/2.8%/0.7% match Tables 2-4; QuixBugs 100% and BigCloneBench 55.7% match Tables 2 and 3, respectively; StarCoder-7b 4.9x higher Pass@1 on leaked samples matches Table 7."
    127       },
    128       "causal_claims_justified": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The paper claims 'data leakage has a significant impact on the evaluation of LLMs' (Section 5.3), which is a causal claim. However, the leaked vs non-leaked comparison is observational—leaked APPS samples may differ systematically from non-leaked in difficulty, domain, or complexity. This confound is not addressed."
    132       },
    133       "generalization_bounded": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "The title says 'Data Leakage in LLMs' broadly but only StarCoder is studied. Section 6.3 acknowledges limitations ('may not generalize to all SE benchmarks and LLMs'), but the paper also claims generalizability to derivative LLMs (WizardCoder, OctoPack, CodeShell, DeepSeek-Coder) based on shared pre-training data, which is speculative."
    137       },
    138       "alternative_explanations_discussed": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "For RQ3 (leaked vs non-leaked performance gap), the paper does not discuss alternative explanations such as systematic difficulty differences between leaked and non-leaked samples. The threats to validity section (6.3) discusses representativeness but not alternative explanations for observed effects."
    142       },
    143       "proxy_outcome_distinction": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "The paper defines data leakage as inclusion of evaluation data in pre-training data (Section 2.1), which is exactly what MinHash+LSH detects. Their measurement (near-duplicate overlap with pre-training corpus) directly maps to their definition. RQ3 separately tests whether this overlap actually affects performance, demonstrating awareness of the gap between presence in training data and behavioral impact."
    147       }
    148     },
    149     "setup_transparency": {
    150       "model_versions_specified": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "StarCoder-7b, StarCoder-3b, and StarCoder-1b are specified with sizes. StarCoder is referenced via its paper [45] and HuggingFace homepage. These are specific, versioned models."
    154       },
    155       "prompts_provided": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Section 5.3 provides the exact prompt format: '### Instruction: [instruction input from APPS data sample] ### Response:'. The fill values come from the public APPS dataset, making prompts fully reconstructable."
    159       },
    160       "hyperparameters_reported": {
    161         "applies": true,
    162         "answer": false,
    163         "justification": "MinHash+LSH hyperparameters are reported (n-gram=2, Jaccard threshold=0.7). However, for the StarCoder evaluation in RQ3, no LLM inference parameters are stated (temperature, top-p, max tokens, sampling strategy)."
    164       },
    165       "scaffolding_described": {
    166         "applies": false,
    167         "answer": false,
    168         "justification": "No agentic scaffolding is used. StarCoder models are prompted directly in zero-shot mode."
    169       },
    170       "data_preprocessing_documented": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "The data pipeline is well-documented: benchmark selection from a survey of 395 papers (Section 4.2), MinHash+LSH detection with 1.7 trillion comparisons, manual labeling with 8 annotators, 6,643 flagged pairs → 1,950 real duplicates and 4,741 non-duplicates (Figure 2, Section 3). For RQ4, deduplication and under-sampling are described."
    174       }
    175     },
    176     "limitations_and_scope": {
    177       "limitations_section_present": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Section 6.3 'Threats to Validity' provides substantive discussion of limitations including generalizability, automated technique limitations, and LLM selection rationale."
    181       },
    182       "threats_to_validity_specific": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "Section 6.3 identifies specific threats: 'limited to the specific SE benchmarks and LLMs studied,' the automated technique 'may not detect all leaked SE benchmark samples,' and explains the specific rationale for StarCoder selection."
    186       },
    187       "scope_boundaries_stated": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "The paper states scope boundaries: focus on pre-training leakage (not fine-tuning, Section 2.1), three programming languages only, StarCoder family only, and 83 specific benchmarks. Section 6.3 explicitly states results 'may not generalize to all SE benchmarks and LLMs.'"
    191       }
    192     },
    193     "data_integrity": {
    194       "raw_data_available": {
    195         "applies": true,
    196         "answer": false,
    197         "justification": "The manually labeled duplicate pairs, AutoDetectLeak-Bench, and LessLeak-Bench are described but no download URL or data archive is provided in the paper text for independent verification."
    198       },
    199       "data_collection_described": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "Data collection is described in detail: benchmarks selected from a survey of 395 papers (Section 4.2), StarCoder pre-training data from The Stack on HuggingFace (Section 4.1), manual annotation process with 8 annotators (Section 3.2)."
    203       },
    204       "recruitment_methods_described": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "The 8 annotators are characterized: 'three post-doctoral researchers, four PhD students, and one Master's student, each with a minimum of four years of experience in programming' (Section 3.2)."
    208       },
    209       "data_pipeline_documented": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Figure 2 illustrates the full DetectLeak pipeline. Each step includes counts: 1.7 trillion comparisons → 6,643 auto-flagged pairs → 1,950 verified duplicates → 606, 816, and 108 leaked samples for Python, Java, and C/C++, respectively. Cohen's Kappa is reported (0.9424)."
    213       }
    214     },
    215     "conflicts_of_interest": {
    216       "funding_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "Acknowledgement section states: 'supported by the National Research Foundation, under its Investigatorship Grant (NRF-NRFI08-2022-0002).'"
    220       },
    221       "affiliations_disclosed": {
    222         "applies": true,
    223         "answer": true,
    224         "justification": "All author affiliations are disclosed: primarily Singapore Management University, with co-authors at Southeast University (China) and Wuhan University (China). Authors are not affiliated with any LLM company being evaluated."
    225       },
    226       "funder_independent_of_outcome": {
    227         "applies": true,
    228         "answer": true,
    229         "justification": "The National Research Foundation of Singapore is a government research funding body with no financial stake in whether SE benchmarks exhibit data leakage."
    230       },
    231       "financial_interests_declared": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No competing interests or financial interests statement is present in the paper."
    235       }
    236     },
    237     "contamination": {
    238       "training_cutoff_stated": {
    239         "applies": true,
    240         "answer": true,
    241         "justification": "Section 4.1 states StarCoder's pre-training data is from The Stack, 'collected from public GitHub repositories between 2015 and 2022.'"
    242       },
    243       "train_test_overlap_discussed": {
    244         "applies": true,
    245         "answer": true,
    246         "justification": "Train/test overlap is the central topic of the entire paper. The DetectLeak framework systematically identifies overlap between pre-training data and 83 SE benchmarks."
    247       },
    248       "benchmark_contamination_addressed": {
    249         "applies": true,
    250         "answer": true,
    251         "justification": "Benchmark contamination is the core research question. The paper quantifies exactly how much of each benchmark appeared in pre-training data and introduces LessLeak-Bench to remove contaminated samples."
    252       }
    253     },
    254     "human_studies": {
    255       "pre_registered": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants. The 8 annotators are research team members performing data labeling, not study participants."
    259       },
    260       "irb_or_ethics_approval": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants. The study analyzes code data and LLM pre-training datasets."
    264       },
    265       "demographics_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants. Annotator qualifications are described but they are researchers, not study participants."
    269       },
    270       "inclusion_exclusion_criteria": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in the study."
    274       },
    275       "randomization_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants; not an experimental study with human subjects."
    279       },
    280       "blinding_described": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants; not an experimental study with human subjects."
    284       },
    285       "attrition_reported": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "No human participants in the study."
    289       }
    290     },
    291     "cost_and_practicality": {
    292       "inference_cost_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "The paper mentions '1.7 trillion comparisons' but does not report actual compute time, cost, or wall-clock time for any stage of the pipeline or the LLM inference experiments."
    296       },
    297       "compute_budget_stated": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "The GPU type is mentioned (NVIDIA GeForce A5000, 24GB) but total compute time, GPU hours, or overall budget for the extensive experiments is not quantified."
    301       }
    302     },
    303     "experimental_rigor": {
    304       "seed_sensitivity_reported": {
    305         "applies": true,
    306         "answer": false,
    307         "justification": "The Pass@k experiments in RQ3 (Table 7) do not report results across multiple random seeds. Results appear to be from single runs."
    308       },
    309       "number_of_runs_stated": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "The number of experimental runs for the LLM evaluation in RQ3 is not stated. It is unclear whether results are from one or multiple runs."
    313       },
    314       "hyperparameter_search_budget": {
    315         "applies": true,
    316         "answer": false,
    317         "justification": "MinHash+LSH parameters (n-gram=2, threshold=0.7) were set 'based on initial small-scale trials' (Section 3.1) but the number of configurations tried and selection criteria are not reported."
    318       },
    319       "best_config_selection_justified": {
    320         "applies": true,
    321         "answer": false,
    322         "justification": "The MinHash+LSH configuration is justified only as 'based on initial small-scale trials' without describing the trials, how many configurations were tested, or the selection criterion."
    323       },
    324       "multiple_comparison_correction": {
    325         "applies": false,
    326         "answer": false,
    327         "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable."
    328       },
    329       "self_comparison_bias_addressed": {
    330         "applies": true,
    331         "answer": false,
    332         "justification": "The authors evaluate their own DetectLeak framework without acknowledging self-evaluation bias. No independent evaluation or discussion of author-evaluation bias is present."
    333       },
    334       "compute_budget_vs_performance": {
    335         "applies": true,
    336         "answer": false,
    337         "justification": "Three model sizes are compared (StarCoder-7b/3b/1b) in Table 7 but performance is not discussed as a function of compute budget. The compute cost differences across model sizes are not addressed."
    338       },
    339       "benchmark_construct_validity": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "The paper uses APPS for RQ3 evaluation without discussing whether Pass@k on APPS adequately measures the impact of data leakage, or whether the leaked/non-leaked split introduces systematic differences beyond leakage (e.g., difficulty confound)."
    343       },
    344       "scaffold_confound_addressed": {
    345         "applies": false,
    346         "answer": false,
    347         "justification": "No scaffolding is used. StarCoder models are evaluated directly with zero-shot prompting."
    348       }
    349     },
    350     "data_leakage": {
    351       "temporal_leakage_addressed": {
    352         "applies": true,
    353         "answer": true,
    354         "justification": "The paper explicitly addresses temporal leakage: StarCoder's pre-training data is from 2015-2022 (Section 4.1), and they identify that many benchmarks were created before this period, enabling leakage. Table 6 traces causes to benchmark creation dates and pre-training data overlap."
    355       },
    356       "feature_leakage_addressed": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "The paper focuses on data leakage (presence of benchmark data in pre-training) but does not discuss feature leakage—whether evaluation inputs contain information not available at prediction time in real usage."
    360       },
    361       "non_independence_addressed": {
    362         "applies": true,
    363         "answer": true,
    364         "justification": "Non-independence between pre-training and benchmark data is the central research question. The paper systematically identifies and quantifies overlap between StarCoder's pre-training corpus and 83 SE benchmarks."
    365       },
    366       "leakage_detection_method": {
    367         "applies": true,
    368         "answer": true,
    369         "justification": "The paper develops DetectLeak, a concrete leakage detection pipeline combining MinHash+LSH near-duplicate detection with manual verification by 8 annotators (Section 3, Figure 2). They also evaluate Perplexity-based detection as an alternative (RQ4)."
    370       }
    371     }
    372   },
    373   "claims": [
    374     {
    375       "claim": "Data leakage in SE benchmarks is generally minimal, with average leakage ratios of only 4.8% (Python), 2.8% (Java), and 0.7% (C/C++).",
    376       "evidence": "Tables 2-4 present per-benchmark leakage ratios computed from 1,950 manually verified duplicate pairs out of 6,643 auto-flagged pairs across 83 benchmarks (Section 5.1).",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "Some benchmarks exhibit very high leakage: QuixBugs has 100% leakage, BigCloneBench has 55.7%.",
    381       "evidence": "Table 2 shows QuixBugs (40/40 samples leaked), Table 3 shows BigCloneBench (508/912 leaked). Root causes identified in Table 6 (Section 5.2).",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "Data leakage significantly inflates LLM evaluation metrics: StarCoder-7b achieves Pass@1 4.9 times higher on leaked APPS samples than non-leaked.",
    386       "evidence": "Table 7 shows StarCoder-7b Pass@1 of 4.4% on leaked vs 0.9% on non-leaked APPS test samples (Section 5.3). Similar ratios for StarCoder-3b and 1b.",
    387       "supported": "moderate"
    388     },
    389     {
    390       "claim": "Four primary causes of high data leakage: direct inclusion of benchmark repos, repository overlap, coding platform dependence (LeetCode), and shared data sources (GitHub issues).",
    391       "evidence": "Section 5.2, Tables 5-6 analyze the top leaked benchmarks and trace leakage to specific repositories and patterns (e.g., PatrickShaw/QuixBugs, LeetCode-named repos).",
    392       "supported": "moderate"
    393     },
    394     {
    395       "claim": "Perplexity-based automated leakage detection without access to pre-training data is ineffective, with accuracy only 40-50%.",
    396       "evidence": "Figure 5 shows detection accuracy across top-k rankings for StarCoder-7b/3b/1b, all achieving near-chance accuracy (Section 5.4). Figure 4 shows no clear separation in Perplexity distributions.",
    397       "supported": "moderate"
    398     },
    399     {
    400       "claim": "Program repair benchmarks are particularly vulnerable to data leakage, with an average leakage rate of 12.5% versus 0.62% for code generation.",
    401       "evidence": "Section 5.1.4 reports that only 1/9 program repair benchmarks has 0% leakage, while 26/33 code generation benchmarks have 0% leakage.",
    402       "supported": "strong"
    403     }
    404   ],
    405   "red_flags": [
    406     {
    407       "flag": "Confound in leaked vs non-leaked comparison",
    408       "detail": "RQ3 compares LLM performance on leaked vs non-leaked APPS samples, but leaked samples may systematically differ from non-leaked in difficulty, domain, or problem type. This confound is not discussed—the performance gap could partially reflect difficulty differences, not just memorization from leakage."
    409     },
    410     {
    411       "flag": "Single LLM family studied despite broad title",
    412       "detail": "The title says 'Data Leakage in LLMs' but only StarCoder is studied. Claims of generalizability to derivative models (WizardCoder, DeepSeek-Coder) are speculative since derivative models may deduplicate differently or use additional data."
    413     },
    414     {
    415       "flag": "No statistical significance tests",
    416       "detail": "The 4.9x performance gap on leaked vs non-leaked data (Table 7) and all comparative claims lack statistical significance testing. With small leaked sample sizes for some benchmarks, observed differences could be unstable."
    417     },
    418     {
    419       "flag": "Missing artifact release",
    420       "detail": "LessLeak-Bench, DetectLeak, and AutoDetectLeak-Bench are introduced as contributions but no repository URL, data download link, or archive is provided in the paper text."
    421     }
    422   ],
    423   "cited_papers": [
    424     {
    425       "title": "Evaluating large language models trained on code",
    426       "authors": ["M. Chen", "J. Tworek", "H. Jun"],
    427       "year": 2021,
    428       "arxiv_id": "2107.03374",
    429       "relevance": "Introduces HumanEval benchmark, one of the most widely used code generation benchmarks studied for leakage in this paper."
    430     },
    431     {
    432       "title": "SWE-bench: Can language models resolve real-world github issues?",
    433       "authors": ["C. E. Jimenez", "J. Yang", "A. Wettig"],
    434       "year": 2024,
    435       "relevance": "Major SE benchmark for issue fixing that the paper finds has 8.7% leakage ratio, raising concerns for LLM evaluation."
    436     },
    437     {
    438       "title": "StarCoder: may the source be with you!",
    439       "authors": ["R. Li", "L. B. Allal", "Y. Zi"],
    440       "year": 2023,
    441       "relevance": "The target LLM for this study, chosen for its fully open pre-training data enabling concrete leakage analysis."
    442     },
    443     {
    444       "title": "On inter-dataset code duplication and data leakage in large language models",
    445       "authors": ["J. A. H. López", "B. Chen", "M. Saad"],
    446       "year": 2024,
    447       "relevance": "Prior work on data leakage in smaller LLMs (CodeBERT), limited to 3 benchmarks—this paper extends to 83 benchmarks and larger models."
    448     },
    449     {
    450       "title": "Unveiling memorization in code models",
    451       "authors": ["Z. Yang", "Z. Zhao", "C. Wang"],
    452       "year": 2024,
    453       "relevance": "Studies memorization in CodeParrot using clone detection, related but distinct focus from benchmark-specific data leakage."
    454     },
    455     {
    456       "title": "Program synthesis with large language models",
    457       "authors": ["J. Austin", "A. Odena", "M. I. Nye"],
    458       "year": 2021,
    459       "arxiv_id": "2108.07732",
    460       "relevance": "Introduces MBPP benchmark, widely used for code generation evaluation and studied for leakage."
    461     },
    462     {
    463       "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code",
    464       "authors": ["N. Jain", "K. Han", "A. Gu"],
    465       "year": 2024,
    466       "arxiv_id": "2403.07974",
    467       "relevance": "Benchmark designed to be contamination-free using temporal splits, found to have 0% leakage in this study."
    468     },
    469     {
    470       "title": "Leak, cheat, repeat: Data contamination and evaluation malpractices in closed-source LLMs",
    471       "authors": ["S. Balloccu", "P. Schmidtová", "M. Lango"],
    472       "year": 2024,
    473       "arxiv_id": "2402.03927",
    474       "relevance": "Addresses data contamination in closed-source LLMs, complementary to this paper's focus on open-source LLMs."
    475     },
    476     {
    477       "title": "Quantifying contamination in evaluating code generation capabilities of language models",
    478       "authors": ["M. Riddell", "A. Ni", "A. Cohan"],
    479       "year": 2024,
    480       "relevance": "Studies leakage in HumanEval and MBPP using Levenshtein similarity, a gray-literature baseline for this work."
    481     },
    482     {
    483       "title": "Measuring coding challenge competence with APPS",
    484       "authors": ["D. Hendrycks", "S. Basart", "S. Kadavath"],
    485       "year": 2021,
    486       "relevance": "Introduces APPS benchmark used in RQ3 to demonstrate the impact of data leakage on LLM evaluation metrics."
    487     },
    488     {
    489       "title": "DeepSeek-Coder: When the large language model meets programming",
    490       "authors": ["D. Guo", "Q. Zhu", "D. Yang"],
    491       "year": 2024,
    492       "relevance": "Derivative LLM using data curation techniques inspired by StarCoder, illustrating how leakage findings propagate to other models."
    493     },
    494     {
    495       "title": "Large language models for software engineering: A systematic literature review",
    496       "authors": ["X. Hou", "Y. Zhao", "Y. Liu"],
    497       "year": 2024,
    498       "relevance": "Comprehensive survey of LLMs for SE tasks that served as the source for benchmark selection in this study."
    499     }
    500   ]
    501 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs