scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24740B)
      1 {
      2   "paper": {
      3     "title": "AI Code Generators for Security: Friend or Foe?",
      4     "authors": ["Roberto Natella", "Pietro Liguori", "Cristina Improta", "Bojan Cukic", "Domenico Cotroneo"],
      5     "year": 2024,
      6     "venue": "IEEE Security & Privacy",
      7     "arxiv_id": "2402.01219",
      8     "doi": "10.1109/MSEC.2024.3355713"
      9   },
     10   "checklist": {
     11     "artifacts": {
     12       "code_released": {
     13         "applies": true,
     14         "answer": true,
     15         "justification": "The dataset is released at https://github.com/dessertlab/violent-python, as stated in the paper with a footnote URL."
     16       },
     17       "data_released": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "The violent-python dataset is publicly available at the GitHub repository linked in the paper (https://github.com/dessertlab/violent-python)."
     21       },
     22       "environment_specified": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "The paper mentions 'a machine with a Debian-based distribution, with 8 vCPU, 16 GB RAM, and two NVIDIA T4 GPUs' but does not provide a requirements.txt, Dockerfile, or detailed library versions needed to recreate the environment."
     26       },
     27       "reproduction_instructions": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "No step-by-step reproduction instructions are provided in the paper. The paper describes the experimental setup in general terms but does not include commands, scripts, or a README with instructions for replicating the experiments."
     31       }
     32     },
     33     "statistical_methodology": {
     34       "confidence_intervals_or_error_bars": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "Results are reported as single point estimates (e.g., '77.89%', '65.38%', '60.76%') with no confidence intervals or error bars in any figure or table."
     38       },
     39       "significance_tests": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The paper claims CodeBERT outperforms competitors and that fine-tuning improves performance, but no statistical significance tests are used to support these comparative claims."
     43       },
     44       "effect_sizes_reported": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper reports absolute edit distance percentages with baselines for comparison (e.g., '19.05% vs 77.89%' for zero-shot vs fine-tuned, and '77.89%' vs '65.38%' vs '60.76%' across generators), and contrasts with general-purpose code generation at '86.7%'. This provides enough context to understand effect magnitude."
     48       },
     49       "sample_size_justified": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The dataset contains 1,372 samples split 80/10/10, but there is no justification for why this size is sufficient. The paper notes the size is 'in line with other state-of-the-art corpora' but provides no power analysis or formal justification."
     53       },
     54       "variance_reported": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No standard deviations, variance, or spread measures are reported for any experimental results. All numbers are single-run point estimates with no indication of result stability."
     58       }
     59     },
     60     "evaluation_design": {
     61       "baselines_included": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "The paper compares three code generators (CodeBERT, GitHub Copilot, Amazon CodeWhisperer) and also compares fine-tuned vs zero-shot performance. Additionally, it references general-purpose code generation performance (86.7%) as an external baseline."
     65       },
     66       "baselines_contemporary": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "GitHub Copilot and Amazon CodeWhisperer were contemporary commercial AI code generators at the time of writing (2024). CodeBERT, while from 2020, is used as a fine-tunable open model and justified as 'representative of the state-of-the-art.'"
     70       },
     71       "ablation_study": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "The comparison of zero-shot learning vs fine-tuning (Figure 1) serves as an ablation showing the contribution of fine-tuning. The three granularity levels (line, block, function) also ablate the effect of description granularity."
     75       },
     76       "multiple_metrics": {
     77         "applies": true,
     78         "answer": false,
     79         "justification": "Only a single metric is used: edit distance (ED). The paper explicitly justifies not using compilation accuracy but does not employ any additional semantic similarity metrics. The paper references their prior work on metric correlation but only reports ED in this study."
     80       },
     81       "human_evaluation": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "The paper acknowledges that 'manual code review where a human evaluator checks if the code generated by the models is semantically correct' is the 'golden standard' but states it is 'often unfeasible.' No human evaluation was performed. Table 3 provides cherry-picked qualitative examples but this is not systematic human evaluation."
     85       },
     86       "held_out_test_set": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "The paper states 'We split the dataset into sets for training (the samples for fine-tuning the model), validation (to tune the hyperparameters of the models), and test (for the evaluation), using a random selection with the common 80%-10%-10% ratio.'"
     90       },
     91       "per_category_breakdown": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Results are broken down by code granularity level (individual lines, multi-line blocks, entire functions) and Table 1 shows the dataset distribution across four security categories (Penetration Testing, Forensic Analysis, Network Traffic Analysis, OSINT and Social Engineering)."
     95       },
     96       "failure_cases_discussed": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Table 3 includes qualitative examples showing where CodeBERT and CodeWhisperer fail (e.g., CodeBERT generating 'something close to the expected output, yet incomplete' and CodeWhisperer generating 'a verbose snippet that is syntactically correct but that diverges from the natural language description')."
    100       },
    101       "negative_results_reported": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The paper reports that fine-tuning does not boost CodeBERT performance for blocks and functions compared to Copilot, and that CodeWhisperer shows lower performance. The paper also notes that without fine-tuning, performance is much lower."
    105       }
    106     },
    107     "claims_and_evidence": {
    108       "abstract_claims_supported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The abstract claims that 'LLMs can be close, but not fully match the accuracy of LLMs at generating general-purpose programs' (supported by comparing 77.89% to 86.7%), that fine-grained descriptions give best results (supported by Figure 1-2), and that fine-tuning CodeBERT helps (supported by Figure 1). These are all supported by the results."
    112       },
    113       "causal_claims_justified": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The main causal claim is that fine-tuning improves performance, demonstrated through a controlled comparison of zero-shot vs fine-tuned CodeBERT on the same test set (Figure 1). The ablation across granularity levels also uses controlled single-variable manipulation."
    117       },
    118       "generalization_bounded": {
    119         "applies": true,
    120         "answer": false,
    121         "justification": "The title 'AI Code Generators for Security: Friend or Foe?' is broad, but the study only tests Python code from a single book ('Violent Python'), using a single similarity metric (ED), on three specific tools. The conclusion generalizes to 'cybersecurity professionals must embrace AI code generators' without bounding to the tested setting."
    122       },
    123       "alternative_explanations_discussed": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The paper does not discuss alternative explanations for the observed results. For example, it does not consider whether CodeBERT's advantage could be due to the ED metric favoring shorter outputs, whether the dataset composition biases results, or whether CodeWhisperer's lower performance could be due to API configuration differences."
    127       }
    128     },
    129     "setup_transparency": {
    130       "model_versions_specified": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper refers to 'GitHub Copilot', 'Amazon CodeWhisperer', and 'CodeBERT' without specifying exact versions, snapshot dates, or API versions. No model version identifiers are provided for any of the three systems."
    134       },
    135       "prompts_provided": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "Table 2 provides concrete examples of the natural language intents used as prompts at all three granularity levels (line, block, function). Since the dataset is released, the full set of prompts is available."
    139       },
    140       "hyperparameters_reported": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "No hyperparameters are reported for any of the three models. Temperature, top-p, max tokens, learning rate, number of epochs for fine-tuning, and other training parameters are all absent."
    144       },
    145       "scaffolding_described": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "The paper mentions running CodeBERT 'along with data processing operations, both before translation to prepare the input data, and after translation to improve the quality and the readability of the code in output' but does not describe these processing operations in detail."
    149       },
    150       "data_preprocessing_documented": {
    151         "applies": true,
    152         "answer": false,
    153         "justification": "The paper describes that the dataset was built from the 'Violent Python' book and that descriptions were 'manually described' in three granularity levels, but does not document specific preprocessing steps, annotation guidelines, or how the 80/10/10 split was performed beyond 'random selection.'"
    154       }
    155     },
    156     "limitations_and_scope": {
    157       "limitations_section_present": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "There is no dedicated limitations, threats to validity, or similar section in the paper."
    161       },
    162       "threats_to_validity_specific": {
    163         "applies": true,
    164         "answer": false,
    165         "justification": "No specific threats to validity are discussed anywhere in the paper."
    166       },
    167       "scope_boundaries_stated": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "The paper does not explicitly state what the results do NOT show. There is no discussion of scope boundaries, such as limitation to Python, to a single book source, or to specific model versions."
    171       }
    172     },
    173     "data_integrity": {
    174       "raw_data_available": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "The violent-python dataset is publicly available at https://github.com/dessertlab/violent-python, allowing independent verification of the data."
    178       },
    179       "data_collection_described": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "The paper explains that the dataset was built from the book 'Violent Python' by T.J. O'Connor, that it covers four security areas, and that the code was 'manually described' at three alternative granularities based on 'the contents of the chapter around each script, and on comments in the code where available.'"
    183       },
    184       "recruitment_methods_described": {
    185         "applies": false,
    186         "answer": false,
    187         "justification": "No human participants were involved. The study evaluates AI code generators on a benchmark dataset."
    188       },
    189       "data_pipeline_documented": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "The paper does not document the full pipeline from the book to the final dataset. It mentions manual annotation but does not describe how many annotators were involved, what quality control was applied, or how the final 1,372 samples were selected from the book's content."
    193       }
    194     },
    195     "conflicts_of_interest": {
    196       "funding_disclosed": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The Acknowledgments section states: 'This work has been partially supported by MUR PRIN 2022, project FLEGREA, CUP E53D23007950001.'"
    200       },
    201       "affiliations_disclosed": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "Author affiliations are clearly listed: University of Naples Federico II and University of North Carolina at Charlotte. None of the authors appear affiliated with the companies whose products are evaluated (GitHub/Microsoft, Amazon)."
    205       },
    206       "funder_independent_of_outcome": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "The funder is MUR PRIN 2022 (Italian Ministry of University and Research), which has no financial stake in the performance of any of the evaluated AI code generators."
    210       },
    211       "financial_interests_declared": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No competing interests or financial interests statement is present in the paper."
    215       }
    216     },
    217     "contamination": {
    218       "training_cutoff_stated": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "The paper does not state the training data cutoff dates for any of the three models (CodeBERT, GitHub Copilot, Amazon CodeWhisperer). This is relevant because the Violent Python book (2012) could be in the training data."
    222       },
    223       "train_test_overlap_discussed": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "The paper does not discuss whether Copilot or CodeWhisperer may have seen the code from the Violent Python book, which was published in 2012 and is publicly available, during pre-training. This is a significant contamination concern."
    227       },
    228       "benchmark_contamination_addressed": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "The Violent Python book was published in 2012 and its code examples are publicly available online. Both Copilot and CodeWhisperer were trained on public code repositories, creating a significant contamination risk that is not addressed."
    232       }
    233     },
    234     "human_studies": {
    235       "pre_registered": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "No human participants were involved in this study."
    239       },
    240       "irb_or_ethics_approval": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants were involved in this study."
    244       },
    245       "demographics_reported": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants were involved in this study."
    249       },
    250       "inclusion_exclusion_criteria": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants were involved in this study."
    254       },
    255       "randomization_described": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants were involved in this study."
    259       },
    260       "blinding_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants were involved in this study."
    264       },
    265       "attrition_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants were involved in this study."
    269       }
    270     },
    271     "cost_and_practicality": {
    272       "inference_cost_reported": {
    273         "applies": true,
    274         "answer": false,
    275         "justification": "No inference costs, API costs, tokens consumed, or wall-clock time are reported for any of the three code generators."
    276       },
    277       "compute_budget_stated": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "The hardware is mentioned (8 vCPU, 16 GB RAM, two NVIDIA T4 GPUs) but no total compute budget, training time, or GPU hours are stated."
    281       }
    282     }
    283   },
    284   "claims": [
    285     {
    286       "claim": "Fine-tuning CodeBERT on offensive code significantly outperforms zero-shot learning, especially for individual lines (77.89% fine-tuned vs 19.05% zero-shot ED).",
    287       "evidence": "Figure 1 shows ED scores for zero-shot vs fine-tuned CodeBERT across three granularity levels: lines (19.05% vs 77.89%), blocks (18.23% vs 43.45%), functions (22.31% vs 37.14%).",
    288       "supported": "moderate"
    289     },
    290     {
    291       "claim": "Fine-tuned CodeBERT provides the best performance for single-line generation (77.89%) compared to Copilot (65.38%) and CodeWhisperer (60.76%).",
    292       "evidence": "Figure 2 shows ED scores for all three generators on the same test set for single lines.",
    293       "supported": "moderate"
    294     },
    295     {
    296       "claim": "CodeBERT and Copilot have similar performance for blocks and functions, while CodeWhisperer shows lower performance.",
    297       "evidence": "Figure 2: blocks (43.45% vs 45.42% vs 38.76%), functions (37.14% vs 35.27% vs 31.31%).",
    298       "supported": "moderate"
    299     },
    300     {
    301       "claim": "Current LLMs can achieve performance close to general-purpose code generation for security-oriented individual line generation.",
    302       "evidence": "The paper compares 77.89% ED for fine-tuned CodeBERT on individual lines to 86.7% for general-purpose Python code generation (cited from reference [6]).",
    303       "supported": "weak"
    304     },
    305     {
    306       "claim": "CodeWhisperer's lower performance is attributed to its focus on Amazon-specific platforms and APIs.",
    307       "evidence": "The paper states 'CodeWhisperer, differently from a general purpose tool such as Copilot, caters first and foremost to development use cases associated with Amazon platforms.' This is speculation without supporting evidence.",
    308       "supported": "unsupported"
    309     }
    310   ],
    311   "methodology_tags": ["benchmark-eval"],
    312   "key_findings": "Fine-tuning CodeBERT on security-oriented Python code from the 'Violent Python' book produces the best edit distance scores for individual line generation (77.89%), outperforming commercial tools GitHub Copilot (65.38%) and Amazon CodeWhisperer (60.76%). For more complex multi-line blocks and functions, the fine-tuning advantage diminishes, with Copilot matching CodeBERT. Performance decreases as code complexity increases (lines > blocks > functions) across all generators. The results suggest LLMs can approach general-purpose code generation accuracy (86.7%) for security code at fine granularity, but need fine-tuning for the security domain.",
    313   "red_flags": [
    314     {
    315       "flag": "Benchmark contamination risk",
    316       "detail": "The dataset is derived from the publicly available 2012 book 'Violent Python.' Both Copilot and CodeWhisperer were trained on public code repositories and could have seen this book's code examples during training. This would inflate their scores and is not discussed."
    317     },
    318     {
    319       "flag": "Single metric evaluation",
    320       "detail": "Only edit distance (ED) is used to evaluate code quality. The paper itself acknowledges that manual review is the 'golden standard' but does not perform it. ED measures textual similarity, not semantic correctness — code that works differently but achieves the same goal would score poorly."
    321     },
    322     {
    323       "flag": "No statistical rigor",
    324       "detail": "All results are point estimates without confidence intervals, error bars, or significance tests. The comparative claims (e.g., CodeBERT outperforms Copilot) have no statistical backing."
    325     },
    326     {
    327       "flag": "No model version specification",
    328       "detail": "The paper does not specify a version for any of the three AI code generators. Copilot and CodeWhisperer are rapidly evolving commercial services; results could differ significantly across versions, making the results non-reproducible."
    329     },
    330     {
    331       "flag": "No limitations section",
    332       "detail": "The paper has no limitations or threats to validity section, despite several significant methodological limitations (single metric, potential contamination, single data source, no version control on commercial APIs)."
    333     },
    334     {
    335       "flag": "Unfair comparison",
    336       "detail": "CodeBERT is fine-tuned on the training split from the same dataset used for evaluation, while Copilot and CodeWhisperer cannot be fine-tuned. This gives CodeBERT a structural advantage that makes direct comparison problematic."
    337     }
    338   ],
    339   "cited_papers": [
    340     {
    341       "title": "Can we generate shellcodes via natural language? An empirical study",
    342       "authors": ["P. Liguori", "E. Al-Hossami", "D. Cotroneo", "R. Natella", "B. Cukic", "S. Shaikh"],
    343       "year": 2022,
    344       "doi": "10.1007/s10515-022-00331-3",
    345       "relevance": "Directly studies AI code generation for security (shellcode), a core topic of LLM-based offensive code generation."
    346     },
    347     {
    348       "title": "EVIL: Exploiting Software via Natural Language",
    349       "authors": ["P. Liguori", "E. Al-Hossami", "V. Orbinato", "R. Natella", "S. Shaikh", "D. Cotroneo", "B. Cukic"],
    350       "year": 2021,
    351       "doi": "10.1109/ISSRE52982.2021.00042",
    352       "relevance": "Presents methods for generating exploits from natural language using neural machine translation, directly related to LLM-based code generation for security."
    353     },
    354     {
    355       "title": "LIMA: Less is More for Alignment",
    356       "authors": ["C. Zhou", "P. Liu", "P. Xu", "S. Iyer", "J. Sun"],
    357       "year": 2023,
    358       "arxiv_id": "2305.11206",
    359       "relevance": "Foundational LLM alignment work relevant to understanding fine-tuning with limited data for specialized tasks."
    360     },
    361     {
    362       "title": "Who Evaluates the Evaluators? On Automatic Metrics for Assessing AI-based Offensive Code Generators",
    363       "authors": ["P. Liguori", "C. Improta", "R. Natella", "B. Cukic", "D. Cotroneo"],
    364       "year": 2023,
    365       "doi": "10.1016/j.eswa.2023.120073",
    366       "relevance": "Evaluates similarity metrics for AI-generated security code, directly relevant to methodology of evaluating LLM code generation."
    367     },
    368     {
    369       "title": "IntelliCode Compose: Code Generation Using Transformer",
    370       "authors": ["A. Svyatkovskiy", "S. K. Deng", "S. Fu", "N. Sundaresan"],
    371       "year": 2020,
    372       "doi": "10.1145/3368089.3417058",
    373       "relevance": "Foundational work on transformer-based code generation, providing the 86.7% baseline referenced in this paper."
    374     },
    375     {
    376       "title": "ExploitGen: Template-Augmented Exploit Code Generation Based on CodeBERT",
    377       "authors": ["G. Yang", "Y. Zhou", "X. Chen", "X. Zhang", "T. Han", "T. Chen"],
    378       "year": 2023,
    379       "doi": "10.1016/j.jss.2022.111577",
    380       "relevance": "Uses CodeBERT for exploit generation with template augmentation, directly comparable methodology for security code generation."
    381     },
    382     {
    383       "title": "Harnessing GPT-4 for Generation of Cybersecurity GRC Policies: A Focus on Ransomware Attack Mitigation",
    384       "authors": ["T. R. McIntosh", "T. Liu", "T. Susnjak", "H. Alavizadeh", "A. Ng", "R. Nowrozy", "P. A. Watters"],
    385       "year": 2023,
    386       "doi": "10.1016/j.cose.2023.103424",
    387       "relevance": "Evaluates GPT-4 for generating cybersecurity policies, relevant to LLM applications in security."
    388     },
    389     {
    390       "title": "An Attacker's Dream? Exploring the Capabilities of ChatGPT for Developing Malware",
    391       "authors": ["Y. M. P. Pa", "S. Tanizaki", "T. Kou", "M. van Eeten", "K. Yoshioka", "T. Matsumoto"],
    392       "year": 2023,
    393       "doi": "10.1145/3607505.3607513",
    394       "relevance": "Studies ChatGPT's capability for malware development, directly relevant to LLM security misuse evaluation."
    395     },
    396     {
    397       "title": "From ChatGPT to ThreatGPT: Impact of Generative AI in Cybersecurity and Privacy",
    398       "authors": ["M. Gupta", "C. Akiri", "K. Aryal", "E. Parker", "L. Praharaj"],
    399       "year": 2023,
    400       "doi": "10.1109/ACCESS.2023.3300381",
    401       "relevance": "Survey on generative AI impact on cybersecurity including attack generation, directly relevant to the survey scope."
    402     },
    403     {
    404       "title": "GPThreats-3: Is Automatic Malware Generation a Threat?",
    405       "authors": ["M. Botacin"],
    406       "year": 2023,
    407       "doi": "10.1109/SPW59333.2023.00027",
    408       "relevance": "Evaluates AI code generators for malware generation, a closely related benchmark evaluation study."
    409     }
    410   ]
    411 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs