scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24634B)
      1 {
      2   "paper": {
      3     "title": "CodeMark: Imperceptible Watermarking for Code Datasets against Neural Code Completion Models",
      4     "authors": ["Zhensu Sun", "Xiaoning Du", "Fu Song", "Li Li"],
      5     "year": 2023,
      6     "venue": "ESEC/FSE 2023",
      7     "arxiv_id": "2308.14401",
      8     "doi": "10.1145/3611643.3616297"
      9   },
     10   "checklist": {
     11     "artifacts": {
     12       "code_released": {
     13         "applies": true,
     14         "answer": true,
     15         "justification": "Section 10 (Data Availability) states: 'source code of our toolkit, all the artifacts and results are available on our website [9].' Reference [9] links to https://sites.google.com/view/codemark."
     16       },
     17       "data_released": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "The paper uses the publicly available CodeSearchNet (CSN) dataset [21] for Python and Java. Section 10 also states all artifacts and results are available on their website."
     21       },
     22       "environment_specified": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "No requirements.txt, Dockerfile, or detailed environment setup section is provided in the paper. The paper mentions using Tree-sitter and GPT-2 (124M) and CodeT5 (60M) but does not specify library versions or environment details."
     26       },
     27       "reproduction_instructions": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "The paper does not include step-by-step reproduction instructions, a README with commands, or a 'Reproducing Results' section. Artifacts are referenced on the website but no instructions are provided in the paper itself."
     31       }
     32     },
     33     "statistical_methodology": {
     34       "confidence_intervals_or_error_bars": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The paper reports point estimates for BLEU and EM scores (e.g., 0.233, 0.352) without confidence intervals or error bars. Only p-values from the t-test validation are reported."
     38       },
     39       "significance_tests": {
     40         "applies": true,
     41         "answer": true,
     42         "justification": "The paper uses independent-samples t-tests (Section 3.5) with a 5% significance level (alpha=0.05) to validate watermark existence. P-values are reported for all validation experiments in Tables 2 and 5."
     43       },
     44       "effect_sizes_reported": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper reports effect sizes in context for harmlessness: 'on average 0.6% and 0.1% in terms of BLEU and Exact Match' reduction (Section 5.1). Table 2 provides baseline and watermarked model scores side by side (e.g., BLEU from 0.233 to 0.230), enabling assessment of magnitude."
     48       },
     49       "sample_size_justified": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "No power analysis or sample size justification is provided. The validation set size is limited to 1000 (Section 4.3) without justification. The human study uses 22 participants without justification for this number."
     53       },
     54       "variance_reported": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. Results appear to be from single training runs of GPT-2 and CodeT5 models."
     58       }
     59     },
     60     "evaluation_design": {
     61       "baselines_included": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "The paper compares CodeMark against CoProtector [38], an existing dead-code-based watermarking method. Unwatermarked models serve as baselines for harmlessness and verifiability evaluations."
     65       },
     66       "baselines_contemporary": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "CoProtector (2021) is the only existing code watermarking method against neural models at the time of this work, making it the most relevant and contemporary baseline."
     70       },
     71       "ablation_study": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "The paper evaluates individual backdoors (B1, B2, B3, B4) separately and in combination (B1,2 and B3,4), showing the contribution of single vs. multiple backdoors. The robustness experiment (RQ4, Table 5) varies the watermarking rate (100%, 50%, 20%, 10%, 0%) to measure the effect of watermark density."
     75       },
     76       "multiple_metrics": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "The paper uses BLEU, Exact Match (EM), p-value for validation, and Recall/Precision for defense method evaluation (Section 4.4)."
     80       },
     81       "human_evaluation": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Section 5.3 describes a human study with 22 participants evaluating the imperceptibility of CodeMark vs. CoProtector across three rounds (watermark-unaware, watermark-aware, method-aware). Results are reported in Table 3."
     85       },
     86       "held_out_test_set": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Section 4.1 states the train and test sets come from 'non-overlapping repositories.' The test split is used for evaluating model accuracy, separate from the training data."
     90       },
     91       "per_category_breakdown": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Results are broken down by programming language (Python vs. Java), model architecture (GPT-2 vs. CodeT5), individual backdoors (B1-B4), single vs. multiple backdoor settings, and watermarking rates (Tables 2, 4, 5)."
     95       },
     96       "failure_cases_discussed": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Section 5.4 and Section 7 discuss cases where backdoors fail: B4 becomes invalid at 10% watermarking rate for GPT-2, and B1 fails for CodeT5 at 20% rate. Section 7 discusses the challenge that 'some inappropriate backdoors may lead to unexpected results.'"
    100       },
    101       "negative_results_reported": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The paper reports that certain backdoors fail at lower watermarking rates (Table 5, e.g., B4 at 10% for GPT-2 yields p=0.28, failing the test). Section 7 acknowledges CodeMark cannot 'fully ensure the effectiveness of all the watermark backdoors during the design phase.'"
    105       }
    106     },
    107     "claims_and_evidence": {
    108       "abstract_claims_supported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The abstract claims CodeMark is 'validated to fulfill all desired properties of practical watermarks, including harmlessness to model accuracy, verifiability, robustness, and imperceptibility.' Each of these is evaluated in dedicated RQs (RQ1-RQ4) with supporting experimental evidence in Section 5."
    112       },
    113       "causal_claims_justified": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The paper's causal claims relate to watermark embedding causing specific model behaviors. These are tested through controlled experiments: training models with and without watermarks and comparing outcomes, which constitutes adequate controlled manipulation."
    117       },
    118       "generalization_bounded": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "Section 6 (Threats to Validity) explicitly bounds generalization: 'we only conducted experiments using two popular NCCMs in two programming languages. Though our method is theoretically applicable to any programming language and NCCM, the effectiveness of CodeMark in other settings has not been experimentally verified yet.'"
    122       },
    123       "alternative_explanations_discussed": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "Section 7 discusses alternative factors: varying model learning abilities for different code semantics, the impact of backdoor design choices, and the practical feasibility of dilution attacks. Section 6 discusses threats including limited generalization and potential bias in the human study."
    127       }
    128     },
    129     "setup_transparency": {
    130       "model_versions_specified": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper specifies 'GPT-2' (124M parameters) and 'CodeT5' (60M parameters) but does not provide specific model version identifiers or snapshot dates (e.g., no Hugging Face model IDs or version numbers beyond parameter counts)."
    134       },
    135       "prompts_provided": {
    136         "applies": false,
    137         "answer": false,
    138         "justification": "The paper does not use prompting in the LLM sense. The models are fine-tuned on code datasets for code completion, not prompted with instructions."
    139       },
    140       "hyperparameters_reported": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "The paper states GPT-2 is fine-tuned for 10 epochs and CodeT5 for 20 epochs (Section 4.2), but does not report learning rate, batch size, optimizer, temperature, or other training hyperparameters."
    144       },
    145       "scaffolding_described": {
    146         "applies": false,
    147         "answer": false,
    148         "justification": "No agentic scaffolding is used. CodeMark is a data transformation tool, not an agentic system."
    149       },
    150       "data_preprocessing_documented": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Section 4.1 describes the dataset source (CodeSearchNet), how it was collected (extracting functions and paired comments from GitHub), and provides exact sizes: 412,178 train / 22,176 test for Python, 454,451 train / 26,909 test for Java. The watermark embedding process is documented in detail in Section 3.3-3.4."
    154       }
    155     },
    156     "limitations_and_scope": {
    157       "limitations_section_present": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 6 is titled 'Threats to Validity' and contains substantive discussion across three subsections: Generalization, Backdoor design, and Limited experiments."
    161       },
    162       "threats_to_validity_specific": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Section 6 includes specific threats: 'we only conducted experiments using two popular NCCMs in two programming languages,' 'the human study can be inherently biased due to its small scale and the potential differences in expertise and backgrounds of the participants,' and 'some inappropriate backdoors may lead to unexpected results.'"
    166       },
    167       "scope_boundaries_stated": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Section 6 explicitly states scope limits: CodeMark targets code completion models only and 'is currently not directly applicable' to code tasks involving natural language (code search, summarization). Section 7 notes they 'cannot fully ensure the effectiveness of all the watermark backdoors during the design phase.'"
    171       }
    172     },
    173     "data_integrity": {
    174       "raw_data_available": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Section 10 states 'source code of our toolkit, all the artifacts and results are available on our website [9].' The underlying CodeSearchNet dataset is publicly available."
    178       },
    179       "data_collection_described": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Section 4.1 describes the CodeSearchNet dataset: 'collected by extracting each function and its paired comment from open-source code repositories on Github.' Train/test split sizes are provided for both languages."
    183       },
    184       "recruitment_methods_described": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "For the human study (Section 5.3), the paper states '22 participants are recruited' with 'more than one year of development experience' but does not describe how participants were recruited (channels, population, potential selection bias)."
    188       },
    189       "data_pipeline_documented": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "The watermark embedding pipeline is documented in Sections 3.2-3.5: code parsing via AST, pattern selection, SPT application, and validation set creation. Table 1 provides the number of transformable instances for each SPT rule."
    193       }
    194     },
    195     "conflicts_of_interest": {
    196       "funding_disclosed": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The Acknowledgments section discloses: 'This work is supported by the National Natural Science Foundation of China under Grant No.: 62072309, CAS Project for Young Scientists in Basic Research under Grant No.: YSBR-040, and ISCAS New Cultivation Project under Grant No.: ISCAS-PYFX-202201.'"
    200       },
    201       "affiliations_disclosed": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "Author affiliations are listed: Beihang University, Monash University, and the State Key Laboratory of Computer Science at the Institute of Software, Chinese Academy of Sciences. No product evaluation conflict exists since the work proposes a new method rather than evaluating a commercial product."
    205       },
    206       "funder_independent_of_outcome": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "The funders are the National Natural Science Foundation of China, CAS, and ISCAS — academic and government funding bodies with no apparent financial interest in the outcome of this watermarking research."
    210       },
    211       "financial_interests_declared": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No competing interests statement or financial interest declaration is present in the paper."
    215       }
    216     },
    217     "contamination": {
    218       "training_cutoff_stated": {
    219         "applies": false,
    220         "answer": false,
    221         "justification": "The paper does not evaluate a pre-trained model's capability on a benchmark. It fine-tunes GPT-2 and CodeT5 on specific code datasets to test watermark embedding, not model knowledge or benchmark performance."
    222       },
    223       "train_test_overlap_discussed": {
    224         "applies": false,
    225         "answer": false,
    226         "justification": "The paper tests watermark embedding effectiveness, not model knowledge. Train/test overlap of the pre-trained model's data is not relevant to the claims about watermark verifiability."
    227       },
    228       "benchmark_contamination_addressed": {
    229         "applies": false,
    230         "answer": false,
    231         "justification": "The paper evaluates a watermarking method, not model capability on a benchmark. Contamination is not relevant to the claims being made."
    232       }
    233     },
    234     "human_studies": {
    235       "pre_registered": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "No pre-registration is mentioned for the human imperceptibility study with 22 participants."
    239       },
    240       "irb_or_ethics_approval": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "No IRB or ethics board approval is mentioned for the human study involving 22 participants."
    244       },
    245       "demographics_reported": {
    246         "applies": true,
    247         "answer": false,
    248         "justification": "The paper only reports that participants have 'more than one year of development experience.' No other demographics (gender, geographic distribution, programming background, expertise level beyond one year) are reported."
    249       },
    250       "inclusion_exclusion_criteria": {
    251         "applies": true,
    252         "answer": false,
    253         "justification": "The only stated criterion is 'more than one year of development experience.' No formal inclusion/exclusion criteria, screening process, or rationale for the threshold is provided."
    254       },
    255       "randomization_described": {
    256         "applies": true,
    257         "answer": true,
    258         "justification": "Section 5.3 states participants are shown 'ten snippets randomly sampled from the pool' and 'Every participant examines the same ten code snippets throughout the three rounds.'"
    259       },
    260       "blinding_described": {
    261         "applies": true,
    262         "answer": true,
    263         "justification": "The study uses progressive disclosure across three rounds: participants start unaware of watermarks (round 1), are told some snippets may be watermarked (round 2), then given technical details (round 3). This effectively describes blinding conditions."
    264       },
    265       "attrition_reported": {
    266         "applies": true,
    267         "answer": false,
    268         "justification": "No information is provided about whether any participants dropped out or failed to complete all three rounds of the study."
    269       }
    270     },
    271     "cost_and_practicality": {
    272       "inference_cost_reported": {
    273         "applies": true,
    274         "answer": false,
    275         "justification": "No inference cost, latency, or time required for watermark embedding or validation is reported."
    276       },
    277       "compute_budget_stated": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "Section 6 mentions 'Limited by our computing resources' but does not quantify GPU hours, training time, or total computational budget for any experiments."
    281       }
    282     }
    283   },
    284   "claims": [
    285     {
    286       "claim": "CodeMark causes negligible performance reduction: on average 0.6% BLEU and 0.1% Exact Match reduction.",
    287       "evidence": "Table 2 shows BLEU/EM for watermarked vs. unwatermarked models across GPT-2 and CodeT5 for Python and Java. The largest difference is 2.5% of the unwatermarked baseline (Section 5.1).",
    288       "supported": "moderate"
    289     },
    290     {
    291       "claim": "The t-test-based validation method correctly distinguishes watermarked from unwatermarked models with statistical significance.",
    292       "evidence": "Table 2 shows all watermarked models pass the t-test (p-values from 3.2E-126 to 2.1E-3) while no unwatermarked model passes (p-values all ≥ 0.71). Section 5.2.",
    293       "supported": "strong"
    294     },
    295     {
    296       "claim": "CodeMark is significantly more imperceptible than CoProtector to human developers.",
    297       "evidence": "Table 3 shows CodeMark suspicious rate stays at 15.6% across rounds while CoProtector rises from 43.9% to 70.7%. 22 participants across 3 rounds. Section 5.3.",
    298       "supported": "moderate"
    299     },
    300     {
    301       "claim": "Automated defense methods (activation clustering and spectral signature) fail to eliminate CodeMark watermarks.",
    302       "evidence": "Table 4 shows both AC and SS achieve low recall (≤0.56 for AC, ≤0.05 for SS) and extremely low precision (≤0.01). Models retrained after elimination still pass watermark validation. Section 5.3.",
    303       "supported": "strong"
    304     },
    305     {
    306       "claim": "CodeMark resists dilution attacks at a 20% watermarking rate, and most backdoors survive at 10%.",
    307       "evidence": "Table 5 shows most backdoors remain valid (p<0.05) at 20% watermarking rate for GPT-2. At 10%, B4 fails for GPT-2 but B3 survives. For CodeT5, more backdoors fail at lower rates. Section 5.4.",
    308       "supported": "moderate"
    309     }
    310   ],
    311   "methodology_tags": ["benchmark-eval"],
    312   "key_findings": "CodeMark proposes semantic-preserving code transformations (syntactic sugar, default parameters, keyword parameters, equivalent implementations) as imperceptible watermarks for code datasets. Experiments on GPT-2 and CodeT5 with Python/Java CodeSearchNet data show negligible accuracy impact (0.6% BLEU, 0.1% EM reduction), statistically significant watermark detection via t-tests, and superior imperceptibility over CoProtector (15.6% vs. 70.7% suspicious rate with method-aware participants). The watermarks resist both automated elimination methods and dataset dilution down to 20% watermarking rate, though some individual backdoors fail at 10%.",
    313   "red_flags": [
    314     {
    315       "flag": "No variance across runs",
    316       "detail": "Model training results appear to be from single runs without reporting standard deviation or variance across different random seeds. Deep learning model training is stochastic, and results can vary significantly between runs, making it impossible to know whether the reported differences are within normal run-to-run variation."
    317     },
    318     {
    319       "flag": "Small human study without recruitment details",
    320       "detail": "The imperceptibility human study uses only 22 participants with no mention of IRB approval, no demographics beyond '1+ year experience,' no recruitment method description, and no power analysis. Showing the same 10 snippets in all 3 rounds (with a 3-minute time limit per round) introduces learning effects. The small N limits the statistical power of comparisons."
    321     },
    322     {
    323       "flag": "Missing training hyperparameters",
    324       "detail": "Only epochs (10 for GPT-2, 20 for CodeT5) are reported. No learning rate, batch size, optimizer, weight decay, or other hyperparameters are given, making reproduction difficult."
    325     },
    326     {
    327       "flag": "No computational cost reporting",
    328       "detail": "The paper mentions being 'limited by computing resources' but never quantifies training time, GPU hours, or the cost of the watermark embedding process. This makes it impossible to assess practical feasibility."
    329     }
    330   ],
    331   "cited_papers": [
    332     {
    333       "title": "CoProtector: Protect Open-Source Code against Unauthorized Training Usage with Data Poisoning",
    334       "authors": ["Zhensu Sun", "Xiaoning Du", "Fu Song", "Mingze Ni", "Li Li"],
    335       "year": 2021,
    336       "relevance": "Direct baseline for code dataset watermarking against neural models, proposes dead-code-based watermarks."
    337     },
    338     {
    339       "title": "CodeSearchNet Challenge: Evaluating the State of Semantic Code Search",
    340       "authors": ["Hamel Husain", "Hongqi Wu", "Tiferet Gazit", "Miltiadis Allamanis", "Marc Brockschmidt"],
    341       "year": 2019,
    342       "relevance": "Provides the code dataset (CSN) used in CodeMark's evaluation, widely used benchmark for code intelligence."
    343     },
    344     {
    345       "title": "StarCoder: may the source be with you!",
    346       "authors": ["Raymond Li"],
    347       "year": 2023,
    348       "arxiv_id": "2305.06161",
    349       "relevance": "Large-scale code LLM that recruited annotators to remove PII from training data, illustrating the importance of code dataset curation."
    350     },
    351     {
    352       "title": "You Autocomplete Me: Poisoning Vulnerabilities in Neural Code Completion",
    353       "authors": ["R. Schuster", "Congzheng Song", "Eran Tromer", "Vitaly Shmatikov"],
    354       "year": 2020,
    355       "relevance": "Demonstrates backdoor poisoning attacks on code completion models, directly relevant to security of code generation systems."
    356     },
    357     {
    358       "title": "CodeT5: Identifier-aware Unified Pre-trained Encoder-Decoder Models for Code Understanding and Generation",
    359       "authors": ["Yue Wang", "Weishi Wang", "Shafiq R. Joty", "Steven C. H. Hoi"],
    360       "year": 2021,
    361       "arxiv_id": "2109.00859",
    362       "relevance": "One of the two code completion models used in CodeMark's evaluation, a prominent pre-trained model for code tasks."
    363     },
    364     {
    365       "title": "On the generalizability of Neural Program Models with respect to semantic-preserving program transformations",
    366       "authors": ["Md. Rafiqul Islam Rabin", "Nghi D. Q. Bui", "Ke Wang", "Yijun Yu", "Lingxiao Jiang", "Mohammad Amin Alipour"],
    367       "year": 2021,
    368       "relevance": "Demonstrates vulnerability of neural code models to semantic-preserving transformations, the theoretical basis for CodeMark's approach."
    369     },
    370     {
    371       "title": "Protecting Intellectual Property of Language Generation APIs with Lexical Watermark",
    372       "authors": ["Xuanli He", "Qiongkai Xu", "L. Lyu", "Fangzhao Wu", "Chenguang Wang"],
    373       "year": 2021,
    374       "arxiv_id": "2112.02701",
    375       "relevance": "Proposes lexical watermarking for NLP datasets using synonyms, the key inspiration for CodeMark's code synonym approach."
    376     },
    377     {
    378       "title": "Turning Your Weakness Into a Strength: Watermarking Deep Neural Networks by Backdooring",
    379       "authors": ["Yossi Adi", "Carsten Baum", "Moustapha Cissé", "Benny Pinkas", "Joseph Keshet"],
    380       "year": 2018,
    381       "relevance": "Foundational work on using backdoor poisoning as watermarks in deep neural networks."
    382     },
    383     {
    384       "title": "Detecting Backdoor Attacks on Deep Neural Networks by Activation Clustering",
    385       "authors": ["Bryant Chen"],
    386       "year": 2019,
    387       "relevance": "One of two automated defense methods tested against CodeMark watermarks; relevant to backdoor detection in ML models."
    388     },
    389     {
    390       "title": "Spectral Signatures in Backdoor Attacks",
    391       "authors": ["Brandon Tran", "Jerry Li", "A. Madry"],
    392       "year": 2018,
    393       "relevance": "Second automated defense method tested against CodeMark; relevant to backdoor detection and ML security."
    394     }
    395   ]
    396 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs