scan-v5.json (26640B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "LessLeak-Bench: A First Investigation of Data Leakage in LLMs Across 83 Software Engineering Benchmarks", 6 "authors": [ 7 "Xin Zhou", 8 "Martin Weyssow", 9 "Ratnadira Widyasari", 10 "Ting Zhang", 11 "Junda He", 12 "Yunbo Lyu", 13 "Jianming Chang", 14 "Beiqi Zhang", 15 "Dan Huang", 16 "David Lo" 17 ], 18 "year": 2025, 19 "venue": "arXiv.org", 20 "arxiv_id": "2502.06215", 21 "doi": "10.48550/arXiv.2502.06215" 22 }, 23 "checklist": { 24 "claims_and_evidence": { 25 "abstract_claims_supported": { 26 "applies": true, 27 "answer": true, 28 "justification": "All major abstract claims — average leakage ratios (4.8%/2.8%/0.7%), specific high-leakage benchmarks (QuixBugs 100%, BigCloneBench 55.7%), and the 4.9× inflation finding — are directly supported by Tables 2–4 and Table 7.", 29 "source": "haiku" 30 }, 31 "causal_claims_justified": { 32 "applies": true, 33 "answer": true, 34 "justification": "The causal claim that leakage inflates metrics is tested by splitting APPS into leaked vs. 
non-leaked subsets and comparing Pass@k on the same models; this controlled comparison is appropriate for the claim, though confounds like problem difficulty are not ruled out.", 35 "source": "haiku" 36 }, 37 "generalization_bounded": { 38 "applies": true, 39 "answer": true, 40 "justification": "The paper explicitly states findings are limited to StarCoder (one fully open-source LLM family) and 83 SE benchmarks; the threats-to-validity section acknowledges results may not generalize to all LLMs and benchmarks.", 41 "source": "haiku" 42 }, 43 "alternative_explanations_discussed": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper does not discuss whether leaked samples might be systematically easier problems (a plausible confound), attributing all performance inflation solely to memorization without ruling out difficulty bias.", 47 "source": "haiku" 48 }, 49 "proxy_outcome_distinction": { 50 "applies": true, 51 "answer": true, 52 "justification": "Pass@k measures functional correctness directly via test case execution; leakage count/ratio measures data overlap directly. 
The paper does not conflate proxy measures with downstream claims.", 53 "source": "haiku" 54 } 55 }, 56 "limitations_and_scope": { 57 "limitations_section_present": { 58 "applies": true, 59 "answer": true, 60 "justification": "Section 6.3 'Threats to Validity' is a dedicated subsection discussing generalizability and detection completeness limitations.", 61 "source": "haiku" 62 }, 63 "threats_to_validity_specific": { 64 "applies": true, 65 "answer": true, 66 "justification": "Specific threats named include: restriction to one LLM (StarCoder) and 83 benchmarks, and that MinHash+LSH may miss leakage cases since it operates on near-duplicates rather than semantic equivalents.", 67 "source": "haiku" 68 }, 69 "scope_boundaries_stated": { 70 "applies": true, 71 "answer": true, 72 "justification": "The paper explicitly bounds results to StarCoder's pre-training data (The Stack, 2015–2022 GitHub) and three programming languages across 83 benchmarks, stating findings on proprietary LLMs cannot be verified.", 73 "source": "haiku" 74 } 75 }, 76 "conflicts_of_interest": { 77 "funding_disclosed": { 78 "applies": true, 79 "answer": true, 80 "justification": "Acknowledgment section discloses funding from Singapore's National Research Foundation under Investigatorship Grant NRF-NRFI08-2022-0002.", 81 "source": "haiku" 82 }, 83 "affiliations_disclosed": { 84 "applies": true, 85 "answer": true, 86 "justification": "All author affiliations are listed (Singapore Management University, Southeast University, Wuhan University) and none have affiliation with the evaluated LLM providers.", 87 "source": "haiku" 88 }, 89 "funder_independent_of_outcome": { 90 "applies": true, 91 "answer": true, 92 "justification": "NRF Singapore is a government research funder with no commercial stake in LLM benchmark validity findings.", 93 "source": "haiku" 94 }, 95 "financial_interests_declared": { 96 "applies": true, 97 "answer": false, 98 "justification": "There is no competing interests or financial 
interests declaration beyond the funding acknowledgment; no explicit 'no competing interests' statement is present.", 99 "source": "haiku" 100 } 101 }, 102 "scope_and_framing": { 103 "key_terms_defined": { 104 "applies": true, 105 "answer": true, 106 "justification": "Section 2.1 formally defines 'data leakage' with mathematical notation distinguishing exact leakage and semantic leakage, and Section 2.2 distinguishes data leakage detection from code clone detection.", 107 "source": "haiku" 108 }, 109 "intended_contribution_clear": { 110 "applies": true, 111 "answer": true, 112 "justification": "The contribution is explicitly stated: first large-scale analysis of leakage across 83 SE benchmarks, DetectLeak framework, AutoDetectLeak-Bench labeled dataset, and LessLeak-Bench cleaned benchmarks.", 113 "source": "haiku" 114 }, 115 "engagement_with_prior_work": { 116 "applies": true, 117 "answer": true, 118 "justification": "Section 7 specifically contrasts this work with Yang et al., Lopez et al., Matton et al., and Riddell et al. on scale (83 vs. 2–3 benchmarks), model size (StarCoder vs. lightweight LLMs), and methodology (manual verification vs. 
automated tools only).", 119 "source": "haiku" 120 } 121 } 122 }, 123 "type_checklist": { 124 "empirical": { 125 "artifacts": { 126 "code_released": { 127 "applies": true, 128 "answer": false, 129 "justification": "DetectLeak framework is described methodologically but no GitHub URL or code release link is provided anywhere in the paper.", 130 "source": "haiku" 131 }, 132 "data_released": { 133 "applies": true, 134 "answer": false, 135 "justification": "LessLeak-Bench and AutoDetectLeak-Bench are introduced but no download URL or repository link is provided; the paper only describes their creation.", 136 "source": "haiku" 137 }, 138 "environment_specified": { 139 "applies": true, 140 "answer": false, 141 "justification": "Only a GPU model (NVIDIA A5000, 24GB) and two library names (BigCode and DataSketches) are mentioned; no requirements.txt, Dockerfile, or complete dependency list is provided.", 142 "source": "haiku" 143 }, 144 "reproduction_instructions": { 145 "applies": true, 146 "answer": false, 147 "justification": "The three-phase DetectLeak pipeline (automated detection → manual labeling → quantification) is described conceptually but no step-by-step instructions or runnable scripts are provided.", 148 "source": "haiku" 149 } 150 }, 151 "statistical_methodology": { 152 "confidence_intervals_or_error_bars": { 153 "applies": true, 154 "answer": false, 155 "justification": "Leakage ratios in Tables 2–4 and Pass@k scores in Table 7 are presented as point estimates with no confidence intervals or error bars.", 156 "source": "haiku" 157 }, 158 "significance_tests": { 159 "applies": true, 160 "answer": false, 161 "justification": "Table 7 compares Pass@k on leaked vs. 
non-leaked subsets across three model sizes with no statistical significance tests; the 4.9× difference is reported without testing.", 162 "source": "haiku" 163 }, 164 "effect_sizes_reported": { 165 "applies": true, 166 "answer": true, 167 "justification": "Effect sizes are reported as ratio comparisons (e.g., '4.9 times higher Pass@1', '5.6 times higher Pass@2') rather than just raw differences, providing context for the magnitude.", 168 "source": "haiku" 169 }, 170 "sample_size_justified": { 171 "applies": true, 172 "answer": false, 173 "justification": "The 83 benchmark selection is justified by coverage of a prior survey, but there is no power analysis and the APPS-only analysis for RQ3 (108 leaked samples) has no sample size justification.", 174 "source": "haiku" 175 }, 176 "variance_reported": { 177 "applies": true, 178 "answer": false, 179 "justification": "All results are point estimates; no standard deviations, variance, or spread metrics are reported across runs or annotation cycles.", 180 "source": "haiku" 181 } 182 }, 183 "evaluation_design": { 184 "baselines_included": { 185 "applies": true, 186 "answer": true, 187 "justification": "Non-leaked sample performance serves as the natural baseline for RQ3; random baseline (50%) serves as comparison for RQ4 perplexity detection accuracy.", 188 "source": "haiku" 189 }, 190 "baselines_contemporary": { 191 "applies": true, 192 "answer": true, 193 "justification": "StarCoder-7b/3b/1b are the same models whose pre-training data is under investigation, making them the appropriate and contemporary models for the comparison.", 194 "source": "haiku" 195 }, 196 "ablation_study": { 197 "applies": false, 198 "answer": false, 199 "justification": "This is an empirical investigation with a pipeline, not a system with components to ablate; the MinHash threshold sensitivity is partially explored but not as a formal ablation.", 200 "source": "haiku" 201 }, 202 "multiple_metrics": { 203 "applies": true, 204 "answer": true, 
205 "justification": "The paper uses leakage count, leakage ratio, Pass@1/2/3, Perplexity, Cohen's Kappa, and Jaccard similarity across different research questions.", 206 "source": "haiku" 207 }, 208 "human_evaluation": { 209 "applies": true, 210 "answer": true, 211 "justification": "Manual labeling by 8 experienced developers (postdocs, PhD/Master's students) with inter-rater agreement measured via Cohen's Kappa (0.9424) is the core validation mechanism.", 212 "source": "haiku" 213 }, 214 "held_out_test_set": { 215 "applies": true, 216 "answer": true, 217 "justification": "RQ3 experiments use the APPS test set (5,000 samples) split into leaked and non-leaked subsets, which is the held-out test portion of the benchmark.", 218 "source": "haiku" 219 }, 220 "per_category_breakdown": { 221 "applies": true, 222 "answer": true, 223 "justification": "Results are broken down by programming language (Tables 2–4), by SE task category (Section 5.1.4), and individually per benchmark with full leakage statistics.", 224 "source": "haiku" 225 }, 226 "failure_cases_discussed": { 227 "applies": true, 228 "answer": true, 229 "justification": "The paper discusses why models don't achieve perfect performance on leaked samples (problem complexity limits memorization) and why perplexity fails to distinguish leaked from non-leaked (Section 5.4).", 230 "source": "haiku" 231 }, 232 "negative_results_reported": { 233 "applies": true, 234 "answer": true, 235 "justification": "RQ4 is a negative result: perplexity-based automated detection achieves only 40–50% accuracy across all model sizes, no better than the 50% random baseline, explicitly framed as a challenge for future work.", 236 "source": "haiku" 237 } 238 }, 239 "setup_transparency": { 240 "model_versions_specified": { 241 "applies": true, 242 "answer": true, 243 "justification": "StarCoder (15.5B), StarCoder-7b, StarCoder-3b, StarCoder-1b are specified by name and size; the pre-training data source (The Stack) is identified with its HuggingFace URL.", 
244 "source": "haiku" 245 }, 246 "prompts_provided": { 247 "applies": true, 248 "answer": true, 249 "justification": "The exact prompt format used for APPS experiments is given: '### Instruction: [instruction input] ### Response:' with zero-shot specification.", 250 "source": "haiku" 251 }, 252 "hyperparameters_reported": { 253 "applies": true, 254 "answer": false, 255 "justification": "MinHash parameters (n-gram=2, Jaccard threshold=0.7) are reported, but LLM inference hyperparameters (temperature, top-p, number of samples for Pass@k) are not disclosed.", 256 "source": "haiku" 257 }, 258 "scaffolding_described": { 259 "applies": false, 260 "answer": false, 261 "justification": "No agentic scaffolding is used; the study evaluates LLMs in a straightforward zero-shot prompting setup.", 262 "source": "haiku" 263 }, 264 "data_preprocessing_documented": { 265 "applies": true, 266 "answer": true, 267 "justification": "Section 3 and Figure 2 document the full three-phase pipeline including automated detection, manual annotation process, and conflict resolution procedure with details on annotation categories.", 268 "source": "haiku" 269 } 270 }, 271 "data_integrity": { 272 "raw_data_available": { 273 "applies": true, 274 "answer": false, 275 "justification": "AutoDetectLeak-Bench is described as a resource to be shared but no repository link, DOI, or download URL is provided in the paper.", 276 "source": "haiku" 277 }, 278 "data_collection_described": { 279 "applies": true, 280 "answer": true, 281 "justification": "Section 4.2 describes the benchmark selection process (leveraging a survey of 395 papers from 2017–2024, expanding via citations, requiring replication packages), with explicit inclusion/exclusion criteria.", 282 "source": "haiku" 283 }, 284 "recruitment_methods_described": { 285 "applies": false, 286 "answer": false, 287 "justification": "The benchmarks are standard public datasets, not recruited participant data; annotators are described by role but this is 
not a participant recruitment study.", 288 "source": "haiku" 289 }, 290 "data_pipeline_documented": { 291 "applies": true, 292 "answer": true, 293 "justification": "Figure 2 and Sections 3.1–3.3 document the full pipeline from raw pre-training data comparison through automated detection, manual annotation, conflict resolution, to leakage quantification.", 294 "source": "haiku" 295 } 296 }, 297 "contamination": { 298 "training_cutoff_stated": { 299 "applies": true, 300 "answer": true, 301 "justification": "StarCoder's pre-training data (The Stack) is explicitly described as GitHub repositories collected between 2015 and 2022, establishing a clear data collection period.", 302 "source": "haiku" 303 }, 304 "train_test_overlap_discussed": { 305 "applies": true, 306 "answer": true, 307 "justification": "This is the entire focus of the paper — systematically identifying and quantifying overlap between pre-training data and benchmark test sets across 83 benchmarks.", 308 "source": "haiku" 309 }, 310 "benchmark_contamination_addressed": { 311 "applies": true, 312 "answer": true, 313 "justification": "Contamination is directly measured through MinHash+LSH comparison of 1.7 trillion pairs, manually verified, and quantified per benchmark with specific counts and ratios.", 314 "source": "haiku" 315 } 316 }, 317 "human_studies": { 318 "pre_registered": { 319 "applies": false, 320 "answer": false, 321 "justification": "No human participants as research subjects; annotators are part of the research team, not subjects.", 322 "source": "haiku" 323 }, 324 "irb_or_ethics_approval": { 325 "applies": false, 326 "answer": false, 327 "justification": "No human subjects research; IRB not applicable.", 328 "source": "haiku" 329 }, 330 "demographics_reported": { 331 "applies": false, 332 "answer": false, 333 "justification": "No human participants as subjects.", 334 "source": "haiku" 335 }, 336 "inclusion_exclusion_criteria": { 337 "applies": false, 338 "answer": false, 339 "justification": 
"No human participants as subjects.", 340 "source": "haiku" 341 }, 342 "randomization_described": { 343 "applies": false, 344 "answer": false, 345 "justification": "No human participants as subjects.", 346 "source": "haiku" 347 }, 348 "blinding_described": { 349 "applies": false, 350 "answer": false, 351 "justification": "No human participants as subjects.", 352 "source": "haiku" 353 }, 354 "attrition_reported": { 355 "applies": false, 356 "answer": false, 357 "justification": "No human participants as subjects.", 358 "source": "haiku" 359 } 360 }, 361 "cost_and_practicality": { 362 "inference_cost_reported": { 363 "applies": true, 364 "answer": false, 365 "justification": "No inference cost, latency, or GPU hours are reported for running StarCoder models on the APPS benchmark experiments.", 366 "source": "haiku" 367 }, 368 "compute_budget_stated": { 369 "applies": true, 370 "answer": false, 371 "justification": "Only the GPU hardware (NVIDIA A5000, 24GB) is mentioned; total compute hours or cost for processing 1.7 trillion pairs is not stated.", 372 "source": "haiku" 373 } 374 } 375 } 376 }, 377 "claims": [ 378 { 379 "claim": "Data leakage in SE benchmarks is generally minimal, with average leakage ratios of only 4.8%, 2.8%, and 0.7% for Python, Java, and C/C++ benchmarks respectively.", 380 "evidence": "Tables 2–4 show per-benchmark leakage counts and ratios across 83 benchmarks, with averages computed at the bottom of each table.", 381 "supported": "strong" 382 }, 383 { 384 "claim": "Some benchmarks exhibit very high leakage: QuixBugs has 100% leakage and BigCloneBench has 55.7% leakage.", 385 "evidence": "Table 2 shows QuixBugs with 40/40 samples leaked (100%), Table 3 shows BigCloneBench with 508/912 samples leaked (55.7%), both verified through manual labeling.", 386 "supported": "strong" 387 }, 388 { 389 "claim": "Data leakage substantially inflates LLM evaluation metrics: StarCoder-7b achieves Pass@1 that is 4.9 times higher on leaked samples than 
non-leaked samples on APPS.", 390 "evidence": "Table 7 shows StarCoder-7b Pass@1 of 4.4% on leaked vs. 0.9% on non-leaked; pattern holds across StarCoder-3b and StarCoder-1b and across Pass@1/2/3.", 391 "supported": "strong" 392 }, 393 { 394 "claim": "Program repair benchmarks have significantly higher average leakage (12.5%) compared to code generation benchmarks (0.62%).", 395 "evidence": "Section 5.1.4 reports task-level analysis: only 1/9 program repair benchmarks has 0% leakage vs. 26/33 code generation benchmarks with 0% leakage.", 396 "supported": "strong" 397 }, 398 { 399 "claim": "Perplexity-based automated leakage detection is unreliable, achieving only 40–50% accuracy across model sizes.", 400 "evidence": "Figure 5 shows top-k detection accuracy ranging 40–50% for StarCoder-7b/3b/1b across k=100–1000, at or below the 50% random baseline.", 401 "supported": "strong" 402 }, 403 { 404 "claim": "High leakage is primarily caused by four factors: direct benchmark inclusion, source repository overlap, LeetCode derivation, and shared GitHub issue data sources.", 405 "evidence": "Table 6 traces each high-leakage benchmark to a specific cause; Table 5 shows 'leetcode' appears in repository names 8 times in the top 20 leakage-contributing repos.", 406 "supported": "moderate" 407 } 408 ], 409 "methodology_tags": [ 410 "benchmark-eval", 411 "observational" 412 ], 413 "key_findings": "The first large-scale study of data leakage across 83 SE benchmarks reveals that average leakage is low (4.8%/2.8%/0.7% for Python/Java/C/C++) but highly uneven, with QuixBugs fully leaked (100%) and BigCloneBench 55.7% leaked into StarCoder's pre-training data. Leakage has a large practical impact: StarCoder-7b achieves 4.9× higher Pass@1 on leaked APPS samples, confirming that contamination inflates reported metrics. Four root causes are identified (direct inclusion, repository overlap, LeetCode derivation, shared GitHub issues), and the cleaned LessLeak-Bench is introduced as a mitigation. 
Automated detection via perplexity scores performs near chance (40–50%), leaving accurate leakage detection without pre-training data access as an open problem.", 414 "red_flags": [ 415 { 416 "flag": "Single LLM generalizability", 417 "detail": "All empirical leakage measurements are restricted to StarCoder and The Stack pre-training data; results may not transfer to GPT-4, Llama3, or other models with different (undisclosed) pre-training corpora." 418 }, 419 { 420 "flag": "No statistical significance testing", 421 "detail": "Table 7's key comparison (leaked vs. non-leaked Pass@k) reports large effect sizes without any statistical tests, making it impossible to assess whether differences on small leaked sample counts are robust." 422 }, 423 { 424 "flag": "Difficulty confound not ruled out", 425 "detail": "Leaked samples may be systematically easier problems (e.g., classic LeetCode-style algorithms) rather than harder novel tasks; no difficulty-controlled analysis is presented to isolate memorization from task difficulty." 426 }, 427 { 428 "flag": "RQ3 analysis on single benchmark", 429 "detail": "The claim that leakage inflates metrics is demonstrated only on APPS; other high-leakage benchmarks (QuixBugs, BigCloneBench) are not used to validate the effect size claim." 430 }, 431 { 432 "flag": "No code or data release URL", 433 "detail": "DetectLeak framework, LessLeak-Bench, and AutoDetectLeak-Bench are described but no repository links are provided, undermining reproducibility claims." 434 } 435 ], 436 "cited_papers": [ 437 { 438 "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?", 439 "relevance": "High-leakage benchmark (8.7%) studied in the paper; central example of issue-fix benchmark with pre-training overlap." 440 }, 441 { 442 "title": "StarCoder: May the Source Be With You!", 443 "relevance": "The target LLM whose pre-training data (The Stack) is the reference dataset for all leakage measurements." 
444 }, 445 { 446 "title": "Evaluating Large Language Models Trained on Code (HumanEval/Codex)", 447 "relevance": "HumanEval is one of the studied benchmarks (1.8% leakage) and a widely used code generation benchmark." 448 }, 449 { 450 "title": "On Inter-Dataset Code Duplication and Data Leakage in Large Language Models", 451 "relevance": "Direct prior work on data leakage in lightweight LLMs (CodeBERT); this paper extends to larger models and 83 benchmarks." 452 }, 453 { 454 "title": "Quantifying Contamination in Evaluating Code Generation Capabilities of Language Models", 455 "relevance": "Prior gray-literature work on HumanEval/MBPP leakage; this paper extends scope and adds manual verification." 456 }, 457 { 458 "title": "Measuring Coding Challenge Competence with APPS", 459 "relevance": "APPS is the primary benchmark used for RQ3 impact analysis (10.8% leakage, 4.9× performance inflation)." 460 }, 461 { 462 "title": "Program Synthesis with Large Language Models (MBPP)", 463 "relevance": "MBPP is one of the studied benchmarks (0.4% leakage) and a standard code generation evaluation benchmark." 464 }, 465 { 466 "title": "The Stack: 3 TB of Permissively Licensed Source Code", 467 "relevance": "StarCoder's pre-training dataset; the reference corpus against which all 83 SE benchmarks are compared for leakage." 468 }, 469 { 470 "title": "QuixBugs: A Multi-Lingual Program Repair Benchmark Set", 471 "relevance": "The highest-leakage benchmark studied (100%), used as a central example of complete benchmark contamination." 472 }, 473 { 474 "title": "Towards a Big Data Curated Benchmark of Inter-Project Code Clones (BigCloneBench)", 475 "relevance": "Second highest-leakage benchmark (55.7%), used as a key example of repository-overlap-driven contamination." 
476 } 477 ], 478 "engagement_factors": { 479 "practical_relevance": { 480 "score": 3, 481 "justification": "Any researcher evaluating LLMs on SE benchmarks should know which benchmarks are contaminated; LessLeak-Bench is a drop-in replacement they can use immediately." 482 }, 483 "surprise_contrarian": { 484 "score": 2, 485 "justification": "The finding that overall leakage is low (surprising to pessimists) but specific widely-used benchmarks like QuixBugs are 100% contaminated (alarming to practitioners) challenges simple narratives." 486 }, 487 "fear_safety": { 488 "score": 1, 489 "justification": "Benchmark contamination undermines AI evaluation validity but is not a direct safety risk; the concern is methodological integrity rather than harm." 490 }, 491 "drama_conflict": { 492 "score": 2, 493 "justification": "The finding that SWE-bench (a benchmark widely used in high-profile agent competitions) has 8.7–10.6% leakage creates some controversy for the community relying on it." 494 }, 495 "demo_ability": { 496 "score": 1, 497 "justification": "LessLeak-Bench is described as available but no URL is provided; practitioners cannot easily try the cleaned benchmarks without contacting the authors." 498 }, 499 "brand_recognition": { 500 "score": 1, 501 "justification": "Singapore Management University hosts a respected SE research group (David Lo's lab) but is not a globally prominent AI lab name." 
502 } 503 }, 504 "hn_data": { 505 "threads": [ 506 { 507 "hn_id": "46908281", 508 "title": "LLMs do plan before they genenrate tokens", 509 "points": 3, 510 "comments": 0, 511 "url": "https://news.ycombinator.com/item?id=46908281", 512 "created_at": "2026-02-06T02:30:48Z" 513 }, 514 { 515 "hn_id": "43025980", 516 "title": "Emergent Response Planning in LLM", 517 "points": 1, 518 "comments": 0, 519 "url": "https://news.ycombinator.com/item?id=43025980", 520 "created_at": "2025-02-12T14:57:10Z" 521 } 522 ], 523 "top_points": 3, 524 "total_points": 4, 525 "total_comments": 0 526 } 527 }