scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (21019B)
      1 {
      2   "paper": {
      3     "title": "Rethinking the Evaluation of Secure Code Generation",
      4     "authors": ["Shih-Chieh Dai", "Jun Xu", "Guanhong Tao"],
      5     "year": 2026,
      6     "venue": "ICSE 2026",
      7     "arxiv_id": "2503.15554",
      8     "doi": "10.1145/3744916.3773217"
      9   },
     10   "checklist": {
     11     "artifacts": {
     12       "code_released": {
     13         "applies": true,
     14         "answer": true,
     15         "justification": "The enhanced SecCodePLT+ dataset is released at https://github.com/Utah-SaLT-Lab/RethinkSecCodeEval (reference [9])."
     16       },
     17       "data_released": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "The study uses two public datasets (BigCodeBench and SecCodePLT) and releases their enhanced SecCodePLT+ with unit tests at the GitHub repository."
     21       },
     22       "environment_specified": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "No requirements.txt, Dockerfile, or detailed environment setup section is provided. The paper mentions specific model names and tools but not library versions or environment details."
     26       },
     27       "reproduction_instructions": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link is given but no README with commands or reproduction guide is described in the paper itself."
     31       }
     32     },
     33     "statistical_methodology": {
     34       "confidence_intervals_or_error_bars": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "Results are reported as point estimates without confidence intervals or error bars. Figures show bar charts without error bars."
     38       },
     39       "significance_tests": {
     40         "applies": true,
     41         "answer": true,
     42         "justification": "T-tests are conducted to assess statistical significance between base models and secure code generation methods (Table 4), with p-values reported. Two-way ANOVA is also used (Section 4.3)."
     43       },
     44       "effect_sizes_reported": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "Cohen's d is reported as effect size in Table 4, with interpretation guidance ('values around 0.2, 0.5, and 0.8 representing small, medium, and large effects')."
     48       },
     49       "sample_size_justified": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "No power analysis or other justification is given for why these particular dataset sizes or number of models were chosen. The manual inspection sample of 44 cases is described as using a fixed random seed, but no justification is given for N=44."
     53       },
     54       "variance_reported": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "The paper acknowledges not repeating every experiment due to resource limits. The authors repeated the Qwen experiments three times and state 'consistent results' but do not report standard deviations or variance across those runs."
     58       }
     59     },
     60     "evaluation_design": {
     61       "baselines_included": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "Base models without secure code generation techniques serve as baselines throughout the study (shown as 'Base' in all figures)."
     65       },
     66       "baselines_contemporary": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "The four evaluated techniques (SVEN, SafeCoder, CodeGuard+, PromSec) are described as 'state-of-the-art' methods from 2023-2024, including a CCS 2023 Distinguished Paper."
     70       },
     71       "ablation_study": {
     72         "applies": false,
     73         "answer": false,
     74         "justification": "This is an evaluation study of existing techniques, not a new system with components to ablate."
     75       },
     76       "multiple_metrics": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Multiple metrics are used: Secure@1, Pass@1 (task-level and case-level), Secure-Pass@1, and the proposed SAFE@1."
     80       },
     81       "human_evaluation": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Manual inspection is performed: 44 sampled cases for vulnerability scanner validation (Table 3), and manual classification of failure categories in Table 7 with disagreements resolved by majority voting among authors."
     85       },
     86       "held_out_test_set": {
     87         "applies": false,
     88         "answer": false,
     89         "justification": "This is not a training/learning study; the paper evaluates existing techniques on benchmark datasets."
     90       },
     91       "per_category_breakdown": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Results are broken down per model (5 open-source + 2 commercial), per scanner (CodeQL, Bearer, Bandit), per dataset (BigCodeBench, SecCodePLT+), per CWE type (Figure 5), and per failure category (Table 7)."
     95       },
     96       "failure_cases_discussed": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Extensive failure analysis in RQ3 (Section 4.3) with five failure categories (Removed Code, Junk Code, NFI, FN, Other) and concrete examples in Figures 1, 2, and 9."
    100       },
    101       "negative_results_reported": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The paper's central finding is negative: existing secure code generation techniques often degrade functionality and show limited effectiveness when security and functionality are evaluated together."
    105       }
    106     },
    107     "claims_and_evidence": {
    108       "abstract_claims_supported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Abstract claims about techniques compromising functionality (>50% degradation), CodeQL missing vulnerabilities, and 'garbage code' are all supported by results in Sections 4.1-4.4."
    112       },
    113       "causal_claims_justified": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The paper makes causal claims about techniques causing functionality degradation. These are supported by controlled comparisons (same model with and without technique applied) and manual inspection of failure cases."
    117       },
    118       "generalization_bounded": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "Section 5 explicitly bounds scope: Python-only, specific LLMs and scanners, and acknowledges tasks 'may not be representative.' The scope section (3.1) states focus on function-level code generation."
    122       },
    123       "alternative_explanations_discussed": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The paper discusses alternative explanations: data contamination possibility, LLM-generated unit tests being potentially limited, static analyzer false negatives as confounds. Section 5 addresses multiple threats."
    127       }
    128     },
    129     "setup_transparency": {
    130       "model_versions_specified": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "Specific model names with sizes are given: CodeLlama-7B, Qwen2.5-Coder-7B, DeepSeek-Coder-V2-Lite, Mistral-7B, StarCoder-1B, GPT-3.5-Turbo, GPT-4o. However, exact API snapshot dates for GPT models are not given."
    134       },
    135       "prompts_provided": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "The paper describes prompts conceptually (e.g., 'we prompt Qwen2.5-Coder-32B to create a set of test inputs') but does not provide the actual prompt text used in experiments."
    139       },
    140       "hyperparameters_reported": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "No temperature, top-p, or other sampling parameters are reported for the LLM experiments."
    144       },
    145       "scaffolding_described": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "The four techniques' mechanisms are described in detail in Section 3.2 and Table 1 (fine-tuning, decoding control, prompt modification, external tool usage)."
    149       },
    150       "data_preprocessing_documented": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "The unit test generation process for SecCodePLT+ is documented (Section 3.2), including how ground truth code was used, pre-existing inputs were included, and indentation issues were corrected via Python syntax parser."
    154       }
    155     },
    156     "limitations_and_scope": {
    157       "limitations_section_present": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 5 'Limitations and Threats to Validity' provides a dedicated, substantive discussion."
    161       },
    162       "threats_to_validity_specific": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Specific threats discussed: static analyzer false negatives, Python-only evaluation, LLM-generated unit tests may be limited, manual calibration of problematic tests, resource constraints limiting repeated experiments (430 GPU-hours per full run)."
    166       },
    167       "scope_boundaries_stated": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Section 3.1 explicitly states: 'we focus on function-level code generation,' 'does not aim at the performance of the most advanced models,' and Section 5 states 'these datasets focus mainly on Python, our findings may not generalize to other programming languages.'"
    171       }
    172     },
    173     "data_integrity": {
    174       "raw_data_available": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "The SecCodePLT+ dataset with unit tests is released at the GitHub repository. BigCodeBench is a publicly available dataset."
    178       },
    179       "data_collection_described": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "The unit test generation procedure for SecCodePLT+ is described in detail (Section 3.2): LLM-generated test inputs, ground truth code execution for expected outputs, average 7.5 test cases per task."
    183       },
    184       "recruitment_methods_described": {
    185         "applies": false,
    186         "answer": false,
    187         "justification": "No human participants were recruited. The study uses benchmark datasets and automated tools."
    188       },
    189       "data_pipeline_documented": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "The pipeline is documented: datasets selected, code generated by LLMs with/without techniques, scanned by three static analyzers, results aggregated. Unit test generation process for SecCodePLT+ is also documented."
    193       }
    194     },
    195     "conflicts_of_interest": {
    196       "funding_disclosed": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Acknowledgments section lists NVIDIA Academic Grant Award, NSF awards #2340198, #2319880, #2213727, and Cisco University Research Program Fund #71858473."
    200       },
    201       "affiliations_disclosed": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "All three authors are affiliated with University of Utah, clearly stated in the paper header. No evaluated product is from their institution."
    205       },
    206       "funder_independent_of_outcome": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "Funders are NVIDIA, NSF, and Cisco. The paper evaluates open-source models and third-party techniques. None of the funders have a direct financial stake in the outcome of the evaluation."
    210       },
    211       "financial_interests_declared": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No competing interests or financial interests statement is present in the paper."
    215       }
    216     },
    217     "contamination": {
    218       "training_cutoff_stated": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "The paper does not state training data cutoff dates for any of the evaluated models."
    222       },
    223       "train_test_overlap_discussed": {
    224         "applies": true,
    225         "answer": true,
    226         "justification": "Section 5 discusses: 'the samples used in our evaluation might have been included in the training data of LLMs, potentially causing data contamination. Our employed two datasets are generally released after the evaluated models.'"
    227       },
    228       "benchmark_contamination_addressed": {
    229         "applies": true,
    230         "answer": true,
    231         "justification": "The paper addresses this for SecurityEval (excluded because 'LLMs may have already been trained on these examples') and notes their datasets were 'generally released after the evaluated models' (Section 5)."
    232       }
    233     },
    234     "human_studies": {
    235       "pre_registered": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "No human participants in this study."
    239       },
    240       "irb_or_ethics_approval": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants in this study."
    244       },
    245       "demographics_reported": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants in this study."
    249       },
    250       "inclusion_exclusion_criteria": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in this study."
    254       },
    255       "randomization_described": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in this study."
    259       },
    260       "blinding_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in this study."
    264       },
    265       "attrition_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in this study."
    269       }
    270     },
    271     "cost_and_practicality": {
    272       "inference_cost_reported": {
    273         "applies": true,
    274         "answer": true,
    275         "justification": "Section 5 states: 'a full run costs 430 GPU-hours (≈18 days) on a single NVIDIA A100' and the repeated Qwen experiment totaled '1,290 GPU-hours (≈54 days) on one A100.'"
    276       },
    277       "compute_budget_stated": {
    278         "applies": true,
    279         "answer": true,
    280         "justification": "Total compute budget is stated: 430 GPU-hours per full run on NVIDIA A100, with 1,290 GPU-hours for repeated experiments."
    281       }
    282     }
    283   },
    284   "claims": [
    285     {
    286       "claim": "CodeQL fails to detect more than 20% of vulnerabilities in generated code",
    287       "evidence": "Table 3 shows CodeQL has 19 false negatives out of 44 manually inspected samples (43% FN rate). Section 4.1 and Figure 5 show CodeQL identifies far fewer CWEs than Bearer and Bandit.",
    288       "supported": "strong"
    289     },
    290     {
    291       "claim": "Existing secure code generation techniques often compromise functionality to enhance security, with many degrading base LLM performance by more than 50%",
    292       "evidence": "Figures 7-8 show SAFE@1 and Secure-Pass@1 scores. PromSec shows >50% drop on Qwen2.5-Coder and DeepSeek-Coder. Table 4 confirms statistical significance (p<0.05) with Cohen's d values.",
    293       "supported": "strong"
    294     },
    295     {
    296       "claim": "Techniques often remove vulnerable code entirely or generate garbage code rather than truly fixing vulnerabilities",
    297       "evidence": "Table 7 shows Removed Code accounts for 60-92% of failure cases, with concrete examples in Figures 1, 2, and 9.",
    298       "supported": "strong"
    299     },
    300     {
    301       "claim": "LLMs for vulnerability detection have high false positive rates (~50%) that hinder security assessment",
    302       "evidence": "Table 3 shows Qwen has 18/44 false positives and Llama has 21/44 false positives in manual validation.",
    303       "supported": "strong"
    304     },
    305     {
    306       "claim": "Security improvement by existing techniques is not monotonic; they may introduce new vulnerabilities into previously secure code",
    307       "evidence": "Table 6 shows 0.5-24.39% of originally secure code becomes insecure after applying the techniques. The average is 4.11% on BigCodeBench and 6.87% on SecCodePLT+.",
    308       "supported": "strong"
    309     }
    310   ],
    311   "methodology_tags": ["benchmark-eval"],
    312   "key_findings": "This ICSE 2026 paper demonstrates that existing secure code generation techniques (SVEN, SafeCoder, CodeGuard+, PromSec) show limited effectiveness when security and functionality are evaluated together on the same code. The techniques often sacrifice functionality to appear secure, primarily by removing vulnerability-related code or generating garbage output. CodeQL alone misses many vulnerabilities, and different static analyzers produce inconsistent results. The proposed SAFE metric and enhanced SecCodePLT+ dataset provide a more rigorous evaluation framework.",
    313   "red_flags": [
    314     {
    315       "flag": "Limited repeated experiments",
    316       "detail": "Due to resource constraints (430 GPU-hours per run), only the Qwen model experiments were repeated (3 times). All other results are single-run, though the authors acknowledge this limitation."
    317     },
    318     {
    319       "flag": "LLM-generated unit tests",
    320       "detail": "Unit tests for SecCodePLT+ were generated by Qwen2.5-Coder-32B rather than written by humans. While the tests were manually calibrated, edge-case coverage was not verified, which could affect functionality assessment accuracy."
    321     }
    322   ],
    323   "cited_papers": [
    324     {
    325       "title": "Large language models for code: Security hardening and adversarial testing",
    326       "authors": ["Jingxuan He", "Martin Vechev"],
    327       "year": 2023,
    328       "relevance": "SVEN - one of the four secure code generation techniques evaluated in this study, CCS 2023 Distinguished Paper."
    329     },
    330     {
    331       "title": "Instruction Tuning for Secure Code Generation",
    332       "authors": ["Jingxuan He", "Mark Vero", "Gabriela Krasnopolska", "Martin Vechev"],
    333       "year": 2024,
    334       "relevance": "SafeCoder - fine-tuning-based secure code generation technique evaluated in this study."
    335     },
    336     {
    337       "title": "Constrained decoding for secure code generation",
    338       "authors": ["Yanjun Fu", "Ethan Baker", "Yu Ding", "Yizheng Chen"],
    339       "year": 2024,
    340       "arxiv_id": "2405.00218",
    341       "relevance": "CodeGuard+ - decoding-control-based secure code generation technique evaluated in this study."
    342     },
    343     {
    344       "title": "PromSec: Prompt Optimization for Secure Generation of Functional Source Code with Large Language Models (LLMs)",
    345       "authors": ["Mahmoud Nazzal", "Issa Khalil", "Abdallah Khreishah", "NhatHai Phan"],
    346       "year": 2024,
    347       "relevance": "PromSec - prompt-engineering-based secure code generation technique evaluated, CCS 2024."
    348     },
    349     {
    350       "title": "Bigcodebench: Benchmarking code generation with diverse function calls and complex instructions",
    351       "authors": ["Terry Yue Zhuo"],
    352       "year": 2024,
    353       "arxiv_id": "2406.15877",
    354       "relevance": "One of the two main evaluation datasets used in this study for code generation benchmarking."
    355     },
    356     {
    357       "title": "SecCodePLT: A Unified Platform for Evaluating the Security of Code GenAI",
    358       "authors": ["Yu Yang", "Yuzhou Nie", "Zhun Wang"],
    359       "year": 2024,
    360       "arxiv_id": "2410.11096",
    361       "relevance": "The other main dataset used, extended by the authors into SecCodePLT+ with unit tests."
    362     },
    363     {
    364       "title": "Purple llama cyberseceval: A secure coding benchmark for language models",
    365       "authors": ["Manish Bhatt"],
    366       "year": 2023,
    367       "arxiv_id": "2312.04724",
    368       "relevance": "CyberSecEval benchmark for LLM code security evaluation, discussed as related work."
    369     },
    370     {
    371       "title": "Understanding the effectiveness of large language models in detecting security vulnerabilities",
    372       "authors": ["Avishree Khare", "Saikat Dutta"],
    373       "year": 2025,
    374       "relevance": "Study on LLMs for vulnerability detection, findings referenced regarding LLM detection capabilities."
    375     },
    376     {
    377       "title": "Evaluating Large Language Models Trained on Code",
    378       "authors": ["Mark Chen"],
    379       "year": 2021,
    380       "arxiv_id": "2107.03374",
    381       "relevance": "HumanEval benchmark paper, foundational code generation evaluation benchmark discussed as insufficient for security assessment."
    382     },
    383     {
    384       "title": "CWEval: Outcome-driven Evaluation on Functionality and Security of LLM Code Generation",
    385       "authors": ["Jinjun Peng", "Leyi Cui"],
    386       "year": 2025,
    387       "arxiv_id": "2501.08200",
    388       "relevance": "Related benchmark that evaluates both functionality and security, discussed as limited by small dataset size."
    389     },
    390     {
    391       "title": "Asleep at the Keyboard? Assessing the Security of GitHub Copilot's Code Contributions",
    392       "authors": ["Hammond Pearce", "Baleegh Ahmad", "Benjamin Tan"],
    393       "year": 2022,
    394       "relevance": "Pioneering work on security assessment of LLM-generated code using CodeQL, established the single-scanner evaluation practice this paper critiques."
    395     },
    396     {
    397       "title": "SecurityEval dataset: mining vulnerability examples to evaluate machine learning-based code generation techniques",
    398       "authors": ["Mohammed Latif Siddiq", "Joanna CS Santos"],
    399       "year": 2022,
    400       "relevance": "Security evaluation dataset discussed as potentially contaminated since its data comes from the MITRE CWE web page."
    401     }
    402   ]
    403 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs