scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (20466B)
      1 {
      2   "paper": {
      3     "title": "Concerned with Data Contamination? Assessing Countermeasures in Code Language Model",
      4     "authors": ["Jialun Cao", "Wuqi Zhang", "Shing-Chi Cheung"],
      5     "year": 2024,
      6     "venue": "arXiv",
      7     "arxiv_id": "2403.16898"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No repository URL or code archive link is provided in the paper."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The paper describes collecting 2.49M Python functions but does not provide a download link or archive for the dataset."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The paper mentions 'two NVIDIA RTX 6000 Ada GPUs, each with 48GB' but does not provide library versions, requirements.txt, or environment setup details."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions, scripts, or README are provided."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Tables II and III report only point estimates (Pass@k scores) with no confidence intervals or error bars."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper compares performance across code groups and claims differences but does not use any statistical significance tests."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Table III reports absolute differences (delta columns) between original and refactored code groups, with baseline context (e.g., '38.5 → 35.1, Δ=-3.4')."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Section III-C states '95% confidence level and 5% margin of error' resulting in 384 sampled functions per year, explicitly justifying the sample size."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No standard deviations, variance, or spread measures are reported across experimental runs. Results appear to be from nucleus sampling but no run-level variance is shown."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The study compares CLM performance across contaminated vs. cleansed data groups, serving as the baseline comparison design central to the paper."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Eight CLMs are evaluated including contemporary models like GPT-3.5, GitHub Copilot, Phind-CodeLlama-34b, and WizardCoder (Table I)."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "RQ3 systematically tests five individual refactoring operators (IFF, Loop, Renm, Param, Deco) applied independently to isolate each operator's effect (Section II-A3, Table III)."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Pass@1, Pass@3, Pass@5 are used for performance evaluation. Six MIA-related metrics (perplexity, ppl_lower, zlib entropy, MIN-K% at 5/10/20) are used in RQ4. Code complexity uses cyclomatic and cognitive complexity."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "Evaluation is entirely automated using exact string matching for code completion correctness. No human evaluation of outputs is conducted."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The temporal splitting design ensures cleansed data (post-cutoff) was not in training data. The study's core design is about separating contaminated from clean test data."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are broken down per model, per year (Code-2018 through Code-2023), per curated dataset (HumanEval, CoderEval), and per refactoring operator (Tables II and III)."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The paper discusses counter-intuitive findings where countermeasures fail (e.g., CLMs performing better on recent/curated data, loop transformation increasing performance). These are essentially failure cases of the countermeasures."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The main finding is negative: existing countermeasures (recent data, curated data, refactoring) do not reliably mitigate data contamination. MIA metrics also shown to be ineffective (Finding 8)."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Abstract claims (CLMs sometimes perform better on post-cutoff data, refactoring can improve performance, MIA metrics cannot distinguish contaminated/cleansed data) are all supported by Tables II, III, and Figure 6."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The paper makes causal suggestions (e.g., 'the popularity of AI programming assistants such as Copilot may further exacerbate data contamination') based on observational evidence without controlling for confounds. The Copilot explanation for Code-2023 performance is speculative."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The threats to validity section explicitly bounds results to Python and code completion tasks, acknowledging these may not generalize to other languages or tasks (Section V)."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Section IV-E discusses alternative explanations for Code-2023 performance: copy-paste culture vs. AI coding assistants. The paper also considers code complexity differences as a confound and measures them (Section IV-A)."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "Table I specifies model sizes and release dates. Section III-A specifies 'ChatGPT3.5-turbo-0613' with specific version. Model commit histories are traced for release dates."
    133       },
    134       "prompts_provided": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "The task is code completion (prefix → suffix prediction), not prompt-based. The models receive code prefixes directly without crafted prompts."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section IV states 'nucleus sampling... five solution samples are randomly generated with a temperature of 0.2.' MIN-K% parameters (K=5.0, 10.0, 20.0) are also specified."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. Models are evaluated directly on code completion."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section III-B3 describes filtering: extracting functions from Python code, filtering out functions inside classes/other functions, filtering empty functions. Section III-B details data collection from Stack v2 and GitHub crawling with specific criteria (50+ stars, permissive licenses)."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section V 'Threats to Validity' provides a dedicated discussion of limitations."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section V discusses specific threats: Python language selection bias, code completion task limitation (masking only last statement), and possible semantic overlap between code groups with their mitigation (similarity metrics)."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "The paper explicitly states scope is limited to Python, code completion tasks, and the specific set of CLMs tested. Section V calls for 'further research to explore a wider array of coding tasks.'"
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "The 2.49M Python functions dataset and sampled 2,304 functions are not made available for download."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section III-B describes data collection in detail: Stack v2 for pre-March 2022 data, GitHub crawling for post-March 2022 data (repos with 50+ stars, permissive licenses, created April 2022 - December 2023). Time range and total counts provided."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. Data is collected from public code repositories."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The pipeline is documented: Stack v2 + GitHub crawl → extract Python functions → filter (remove nested/empty functions) → sample 384 per year at 95% confidence → apply to CLMs. Total counts at each stage are provided (2.49M functions total, 384 sampled per year)."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding or acknowledgments section is present in the paper."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "All authors are affiliated with The Hong Kong University of Science and Technology, clearly stated on the first page."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding information is disclosed, so independence cannot be assessed."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial disclosure statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "Table I documents training data time spans for all eight CLMs (e.g., StarCoder: Jan 2015 - Mar 2022, ChatGPT-3.5: up to Sep 2021). This is central to the study's design."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "The entire paper is about train/test overlap. The temporal splitting design explicitly addresses this, and Section IV-A validates low overlap using similarity metrics."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": true,
    230         "justification": "RQ2 directly addresses benchmark contamination by comparing HumanEval and CoderEval release dates against model cutoff dates (Section II-A2, III-D)."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No API costs, tokens consumed, or wall-clock time reported despite running 8 models across multiple code groups with nucleus sampling."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "Hardware is mentioned (two NVIDIA RTX 6000 Ada GPUs) but total GPU hours or compute budget is not stated."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "CLMs do not necessarily perform worse on recent data (post-cutoff); they sometimes perform better.",
    286       "evidence": "Table II shows Pass@k scores on Code-2023 and Code-2023-oct consistently outperform earlier code groups across all CLMs (Section IV-B).",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "CLMs perform better on curated datasets (HumanEval, CoderEval) compared to contaminated data.",
    291       "evidence": "Table II shows Code-CodE and Code-HumE achieve higher Pass@k than Code-2018 through Code-2022 for most models (Section IV-C).",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "Syntactic refactoring (IFF, Loop) does not reliably reduce CLM performance; loop transformation can even improve it.",
    296       "evidence": "Table III shows Code-Loop increases Pass@1 for all 8 models, while Code-IFF has mixed results (Section IV-D).",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "Semantic refactoring operators (Renm, Param, Deco) are more effective at reducing CLM performance.",
    301       "evidence": "Table III shows Code-Deco consistently decreases Pass@1 across all models, with drops ranging from -1.9 to -15.9 (Section IV-D).",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "Existing MIA-related metrics cannot distinguish contaminated from cleansed data.",
    306       "evidence": "Figure 6 shows similar metric scores across contaminated and cleansed code groups for StarCoder (Section IV-E).",
    307       "supported": "moderate"
    308     },
    309     {
    310       "claim": "AI programming assistants like Copilot may exacerbate data contamination.",
    311       "evidence": "Speculative explanation based on GitHub's report that 40% of code is written by Copilot. No direct evidence linking AI-generated code to the observed performance patterns (Section IV-B).",
    312       "supported": "weak"
    313     }
    314   ],
    315   "methodology_tags": ["benchmark-eval", "observational"],
    316   "key_findings": "The paper systematically evaluates three countermeasures against data contamination in code language models: using recent data, curated benchmarks, and code refactoring. Counter-intuitively, CLMs often perform better on post-cutoff and curated data than on contaminated data, suggesting these countermeasures may not effectively mitigate contamination. Semantic refactoring operators (identifier renaming, parameter appending, decorator addition) show more promise in disrupting memorized patterns than syntactic operators. Existing membership inference metrics (perplexity, Zlib entropy, MIN-K% PROB) cannot reliably distinguish contaminated from clean data.",
    317   "red_flags": [
    318     {
    319       "flag": "No statistical significance tests",
    320       "detail": "Performance differences across code groups are compared using only point estimates without any significance testing, despite the study's central claims being about whether differences exist between contaminated and cleansed data."
    321     },
    322     {
    323       "flag": "No variance or error bars reported",
    324       "detail": "Despite using nucleus sampling with 5 samples per example, no run-level variance or confidence intervals are reported for Pass@k scores."
    325     },
    326     {
    327       "flag": "Speculative causal explanation",
    328       "detail": "The claim that AI coding assistants explain why CLMs perform better on Code-2023 is speculative. No evidence is provided to verify the proportion of AI-generated code in the dataset, and alternative explanations (evolving coding conventions, different repository quality filters for 2023) are not fully explored."
    329     },
    330     {
    331       "flag": "No artifacts released",
    332       "detail": "Neither code nor data is released, limiting reproducibility of a study about data contamination and evaluation reliability in CLM evaluation."
    333     }
    334   ],
    335   "cited_papers": [
    336     {
    337       "title": "Evaluating large language models trained on code",
    338       "authors": ["M. Chen", "J. Tworek", "H. Jun"],
    339       "year": 2021,
    340       "relevance": "Introduces HumanEval benchmark, one of the most widely used code generation benchmarks and a key subject of contamination concerns."
    341     },
    342     {
    343       "title": "NLP evaluation in trouble: On the need to measure LLM data contamination for each benchmark",
    344       "authors": ["O. Sainz", "J. Campos", "I. García-Ferrero"],
    345       "year": 2023,
    346       "relevance": "Directly addresses data contamination measurement in LLM evaluation, a core concern for benchmark reliability."
    347     },
    348     {
    349       "title": "Detecting pretraining data from large language models",
    350       "authors": ["W. Shi", "A. Ajith", "M. Xia"],
    351       "year": 2023,
    352       "arxiv_id": "2310.16789",
    353       "relevance": "Proposes MIN-K% PROB for membership inference attacks on LLMs, one of the metrics evaluated in this paper."
    354     },
    355     {
    356       "title": "Extracting training data from large language models",
    357       "authors": ["N. Carlini", "F. Tramèr", "E. Wallace"],
    358       "year": 2021,
    359       "relevance": "Foundational work on training data extraction and memorization in LLMs, directly relevant to contamination concerns."
    360     },
    361     {
    362       "title": "ClassEval: A manually-crafted benchmark for evaluating LLMs on class-level code generation",
    363       "authors": ["X. Du", "M. Liu", "K. Wang"],
    364       "year": 2023,
    365       "relevance": "Example of curated benchmark designed to reduce contamination risk in code generation evaluation."
    366     },
    367     {
    368       "title": "CoderEval: A benchmark of pragmatic code generation with generative pre-trained models",
    369       "authors": ["J. Yu"],
    370       "year": 2023,
    371       "relevance": "Curated coding benchmark used as one of the countermeasure datasets in this study."
    372     },
    373     {
    374       "title": "How effective are neural networks for fixing security vulnerabilities",
    375       "authors": ["Y. Wu", "N. Jiang", "H. V. Pham"],
    376       "year": 2023,
    377       "relevance": "Prior work that adopted identifier renaming and code structure change as contamination countermeasures."
    378     },
    379     {
    380       "title": "Large language models for software engineering: Survey and open problems",
    381       "authors": ["A. Fan", "B. Gokkaya", "M. Harman"],
    382       "year": 2023,
    383       "arxiv_id": "2310.03533",
    384       "relevance": "Survey of LLMs for SE that discusses using AI for code refactoring to address contamination."
    385     },
    386     {
    387       "title": "Time travel in LLMs: Tracing data contamination in large language models",
    388       "authors": ["S. Golchin", "M. Surdeanu"],
    389       "year": 2023,
    390       "arxiv_id": "2308.08493",
    391       "relevance": "Proposes methods for tracing data contamination in LLMs, directly relevant to benchmark reliability."
    392     },
    393     {
    394       "title": "Task contamination: Language models may not be few-shot anymore",
    395       "authors": ["C. Li", "J. Flanigan"],
    396       "year": 2023,
    397       "arxiv_id": "2312.16337",
    398       "relevance": "Studies task-level contamination in LLMs, a related but distinct contamination concern."
    399     }
    400   ]
    401 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs