scan.json (19476B)
1 { 2 "paper": { 3 "title": "CWEVAL: Outcome-driven Evaluation on Functionality and Security of LLM Code Generation", 4 "authors": ["Jinjun Peng", "Leyi Cui", "Kele Huang", "Junfeng Yang", "Baishakhi Ray"], 5 "year": 2025, 6 "venue": "", 7 "arxiv_id": "", 8 "doi": "" 9 }, 10 "checklist": { 11 "artifacts": { 12 "code_released": { 13 "applies": true, 14 "answer": true, 15 "justification": "GitHub repository provided: https://github.com/Co1lin/CWEval, explicitly mentioned in the contributions section." 16 }, 17 "data_released": { 18 "applies": true, 19 "answer": true, 20 "justification": "CWEVAL-BENCH is open-sourced as part of the GitHub repository, containing 119 security-critical coding tasks." 21 }, 22 "environment_specified": { 23 "applies": true, 24 "answer": false, 25 "justification": "No requirements.txt, Dockerfile, or detailed environment setup section is described in the paper. The paper does not specify library versions or dependencies needed to run the benchmark." 26 }, 27 "reproduction_instructions": { 28 "applies": true, 29 "answer": false, 30 "justification": "No step-by-step reproduction instructions are provided in the paper. The paper references the GitHub repository but does not include commands or a reproducing-results section." 31 } 32 }, 33 "statistical_methodology": { 34 "confidence_intervals_or_error_bars": { 35 "applies": true, 36 "answer": false, 37 "justification": "Results are reported as point estimates (e.g., func@10 = 60.28%) with no confidence intervals or error bars." 38 }, 39 "significance_tests": { 40 "applies": true, 41 "answer": false, 42 "justification": "The paper makes comparative claims (e.g., larger models achieve higher func-sec@k) but provides no statistical significance tests." 43 }, 44 "effect_sizes_reported": { 45 "applies": true, 46 "answer": true, 47 "justification": "The paper reports percentage differences with baseline context, e.g., 'From func@10 to func-sec@10, the performance drops around 30%' and specific numbers like 'up to 9.8% on func-sec@10 for Claude 3.5 Haiku.'" 48 }, 49 "sample_size_justified": { 50 "applies": true, 51 "answer": false, 52 "justification": "The benchmark has 119 tasks and n=100 samples per task. No justification is given for why these numbers are sufficient, nor is a power analysis discussed." 53 }, 54 "variance_reported": { 55 "applies": true, 56 "answer": false, 57 "justification": "No standard deviations, variance, or spread measures are reported across experimental runs. The unbiased pass@k estimator is used but no uncertainty in the estimates is shown." 58 } 59 }, 60 "evaluation_design": { 61 "baselines_included": { 62 "applies": true, 63 "answer": true, 64 "justification": "The paper compares against prior benchmarks (SecurityEval, CyberSecEval) qualitatively and evaluates multiple LLMs as baselines against each other." 65 }, 66 "baselines_contemporary": { 67 "applies": true, 68 "answer": true, 69 "justification": "Models evaluated include GPT-4o, Claude 3.5 Sonnet/Haiku, Gemini 1.5 Pro/Flash, and Llama 3.1 variants — all contemporary at time of writing." 70 }, 71 "ablation_study": { 72 "applies": true, 73 "answer": false, 74 "justification": "No ablation study is conducted on the CWEVAL framework components (e.g., effect of different test oracle types, specification design choices)." 75 }, 76 "multiple_metrics": { 77 "applies": true, 78 "answer": true, 79 "justification": "Two metrics are used: func@k (functional correctness) and func-sec@k (functional correctness and security), at multiple k values." 80 }, 81 "human_evaluation": { 82 "applies": true, 83 "answer": false, 84 "justification": "The paper evaluates LLM-generated code quality but uses only automated test oracles. No human evaluation of the system's outputs is included. Human verification was used to create the benchmark itself but not to evaluate outputs." 85 }, 86 "held_out_test_set": { 87 "applies": false, 88 "answer": false, 89 "justification": "This is a benchmark paper — the entire benchmark is the test set. There is no tuning/dev split relevant here." 90 }, 91 "per_category_breakdown": { 92 "applies": true, 93 "answer": false, 94 "justification": "Results are reported as aggregate scores across all 119 tasks. No per-CWE-type or per-language breakdown is provided in the main results." 95 }, 96 "failure_cases_discussed": { 97 "applies": true, 98 "answer": true, 99 "justification": "The paper discusses failure cases of prior benchmarks (SecurityEval, CyberSecEval) with specific examples in Figure 1 and Figure 2, and discusses SafeCoder's degradation on CWEVAL-BENCH." 100 }, 101 "negative_results_reported": { 102 "applies": true, 103 "answer": true, 104 "justification": "SafeCoder fine-tuning showed significant functionality degradation on CWEVAL-BENCH (Table II), which is a negative result. Security instruction prompting showed mixed results with some models showing slight func@k decreases." 105 } 106 }, 107 "claims_and_evidence": { 108 "abstract_claims_supported": { 109 "applies": true, 110 "answer": true, 111 "justification": "The abstract claims about a notable portion of functional but insecure code and inaccuracy of previous evaluations are supported by the results in Section V (30% gap between func@k and func-sec@k, SafeCoder degradation)." 112 }, 113 "causal_claims_justified": { 114 "applies": true, 115 "answer": false, 116 "justification": "The paper claims SafeCoder's degradation is 'possibly due to' the model learning to avoid security-sensitive code entirely. This is a causal claim supported only by speculation, not by controlled analysis." 117 }, 118 "generalization_bounded": { 119 "applies": true, 120 "answer": false, 121 "justification": "The title says 'LLM Code Generation' broadly, but the benchmark covers only 31 CWE types out of hundreds, and 5 languages. The paper does not bound its claims to these specific CWEs and languages." 122 }, 123 "alternative_explanations_discussed": { 124 "applies": true, 125 "answer": false, 126 "justification": "No alternative explanations are discussed for the observed security gap. For example, the role of training data composition, prompt sensitivity, or model architecture differences is not explored." 127 } 128 }, 129 "setup_transparency": { 130 "model_versions_specified": { 131 "applies": true, 132 "answer": true, 133 "justification": "Exact model versions are specified: gpt-4o-mini-2024-07-18, claude-3-5-haiku-20241022, gemini-1.5-flash-002, etc. (Section V-B)." 134 }, 135 "prompts_provided": { 136 "applies": true, 137 "answer": false, 138 "justification": "The security instruction prompt is partially quoted ('Your code should be secure and should NOT contain any vulnerability') but the full prompts including task specifications sent to models are not provided in the paper." 139 }, 140 "hyperparameters_reported": { 141 "applies": true, 142 "answer": true, 143 "justification": "Temperature settings (0.2, 0.4, 0.6, 0.8) and number of samples (n=100) are reported in Section V-B." 144 }, 145 "scaffolding_described": { 146 "applies": false, 147 "answer": false, 148 "justification": "No agentic scaffolding is used. The evaluation is direct code generation from prompts." 149 }, 150 "data_preprocessing_documented": { 151 "applies": true, 152 "answer": true, 153 "justification": "The benchmark creation process is documented in Section IV with four steps: coding tasks design, specifications writing, test oracle development, and multilingual evolution." 154 } 155 }, 156 "limitations_and_scope": { 157 "limitations_section_present": { 158 "applies": true, 159 "answer": false, 160 "justification": "No dedicated limitations or threats-to-validity section exists. The conclusion briefly mentions future work but does not discuss limitations." 161 }, 162 "threats_to_validity_specific": { 163 "applies": true, 164 "answer": false, 165 "justification": "No threats to validity are discussed anywhere in the paper." 166 }, 167 "scope_boundaries_stated": { 168 "applies": true, 169 "answer": false, 170 "justification": "The paper does not explicitly state what the results do NOT show or what settings are excluded. The limited task count is acknowledged but framed as a resource constraint rather than a scope boundary." 171 } 172 }, 173 "data_integrity": { 174 "raw_data_available": { 175 "applies": true, 176 "answer": true, 177 "justification": "The benchmark and evaluation pipeline are open-sourced on GitHub, allowing independent verification of tasks, test oracles, and reference implementations." 178 }, 179 "data_collection_described": { 180 "applies": true, 181 "answer": true, 182 "justification": "Section IV describes the four-step process for creating CWEVAL-BENCH: task design from CWE documentation, specification writing, test oracle development, and multilingual evolution." 183 }, 184 "recruitment_methods_described": { 185 "applies": false, 186 "answer": false, 187 "justification": "No human participants were recruited. The benchmark is created by the authors using CWE documentation." 188 }, 189 "data_pipeline_documented": { 190 "applies": true, 191 "answer": true, 192 "justification": "The pipeline from CWE documentation to final benchmark tasks is documented in Section IV with four explicit steps, though exact counts at each filtering stage are not provided." 193 } 194 }, 195 "conflicts_of_interest": { 196 "funding_disclosed": { 197 "applies": true, 198 "answer": false, 199 "justification": "No funding source or acknowledgments section is present in the paper." 200 }, 201 "affiliations_disclosed": { 202 "applies": true, 203 "answer": true, 204 "justification": "All authors are from Columbia University, clearly stated. No evaluated product is affiliated with the authors." 205 }, 206 "funder_independent_of_outcome": { 207 "applies": true, 208 "answer": false, 209 "justification": "No funding is disclosed, so independence cannot be assessed." 210 }, 211 "financial_interests_declared": { 212 "applies": true, 213 "answer": false, 214 "justification": "No competing interests or financial interests statement is present in the paper." 215 } 216 }, 217 "contamination": { 218 "training_cutoff_stated": { 219 "applies": true, 220 "answer": false, 221 "justification": "No training data cutoff dates are stated for any of the evaluated models." 222 }, 223 "train_test_overlap_discussed": { 224 "applies": true, 225 "answer": false, 226 "justification": "No discussion of whether CWEVAL-BENCH tasks or CWE documentation examples could have appeared in model training data." 227 }, 228 "benchmark_contamination_addressed": { 229 "applies": true, 230 "answer": false, 231 "justification": "The benchmark tasks are inspired by CWE documentation which is publicly available. No discussion of whether models may have seen similar or identical examples during training." 232 } 233 }, 234 "human_studies": { 235 "pre_registered": { 236 "applies": false, 237 "answer": false, 238 "justification": "No human participants in this study." 239 }, 240 "irb_or_ethics_approval": { 241 "applies": false, 242 "answer": false, 243 "justification": "No human participants in this study." 244 }, 245 "demographics_reported": { 246 "applies": false, 247 "answer": false, 248 "justification": "No human participants in this study." 249 }, 250 "inclusion_exclusion_criteria": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants in this study." 254 }, 255 "randomization_described": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants in this study." 259 }, 260 "blinding_described": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants in this study." 264 }, 265 "attrition_reported": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants in this study." 269 } 270 }, 271 "cost_and_practicality": { 272 "inference_cost_reported": { 273 "applies": true, 274 "answer": false, 275 "justification": "The paper generates 100 samples per task across multiple models and temperatures but does not report API costs, tokens consumed, or wall-clock time." 276 }, 277 "compute_budget_stated": { 278 "applies": true, 279 "answer": false, 280 "justification": "No total computational budget is stated. The paper mentions 'limited budget' for RQ.2/RQ.3 but does not quantify it." 281 } 282 } 283 }, 284 "claims": [ 285 { 286 "claim": "There is a significant performance gap (~30%) between functional correctness and combined functional-security pass rates across all evaluated LLMs.", 287 "evidence": "Figure 3 shows func@10 vs func-sec@10 with drops around 30% across all models, maximum 35.79% for Gemini 1.5 Flash (Section V-C1).", 288 "supported": "strong" 289 }, 290 { 291 "claim": "Larger models within the same family achieve higher func-sec@k scores.", 292 "evidence": "Table I shows larger variants (GPT-4o, Claude 3.5 Sonnet, Gemini 1.5 Pro) outperform smaller ones on func-sec@k, though differences on func@k are small (Section V-C2).", 293 "supported": "moderate" 294 }, 295 { 296 "claim": "Simple security instruction prompting improves func-sec@k for most LLMs with only slight decrease on func@k.", 297 "evidence": "Table II shows improvements up to 9.8% on func-sec@10 for Claude 3.5 Haiku with security instruction (Section V-C3).", 298 "supported": "moderate" 299 }, 300 { 301 "claim": "SafeCoder fine-tuning causes significant functionality degradation on CWEVAL-BENCH despite showing improvements on separate functionality/security benchmarks.", 302 "evidence": "Table II shows CodeLlama-7b-hf func@1 drops from 30.77% to 15.35% and func-sec@1 from 13.76% to 7.37% with SafeCoder (Section V-C3).", 303 "supported": "strong" 304 }, 305 { 306 "claim": "Previous benchmarks (SecurityEval, CyberSecEval) have unclear specifications and inaccurate evaluations due to reliance on static analysis.", 307 "evidence": "Figure 1 shows false positive/negative examples from static analysis on SecurityEval samples. Qualitative analysis in Sections I and II (Section I, Figure 1).", 308 "supported": "moderate" 309 } 310 ], 311 "methodology_tags": ["benchmark-eval"], 312 "key_findings": "CWEVAL introduces an outcome-driven evaluation framework that simultaneously assesses both functionality and security of LLM-generated code using dynamic test oracles rather than static analysis. Evaluation of 9 LLMs on CWEVAL-BENCH reveals a consistent ~30% gap between functional correctness and combined functional-security pass rates. Simple security instruction prompting provides modest improvements, while SafeCoder fine-tuning causes significant functionality degradation, exposing a previously undetected alignment tax that separate functionality/security benchmarks miss.", 313 "red_flags": [ 314 { 315 "flag": "No per-category breakdown", 316 "detail": "Results are reported only as aggregates across all 119 tasks. No breakdown by CWE type or programming language is provided, making it impossible to assess where models struggle most." 317 }, 318 { 319 "flag": "No contamination analysis", 320 "detail": "Benchmark tasks are inspired by publicly available CWE documentation. Models may have seen similar examples during training, but this is not discussed." 321 }, 322 { 323 "flag": "No limitations section", 324 "detail": "The paper lacks any discussion of limitations or threats to validity, which is a significant methodological omission." 325 }, 326 { 327 "flag": "No statistical uncertainty", 328 "detail": "All results are point estimates with no confidence intervals, error bars, or significance tests despite making comparative claims between models." 329 } 330 ], 331 "cited_papers": [ 332 { 333 "title": "SecurityEval dataset: mining vulnerability examples to evaluate machine learning-based code generation techniques", 334 "authors": ["M. L. Siddiq", "J. C. Santos"], 335 "year": 2022, 336 "relevance": "Prior security evaluation benchmark for LLM code generation that CWEVAL aims to improve upon." 337 }, 338 { 339 "title": "Evaluating large language models trained on code", 340 "authors": ["M. Chen", "J. Tworek", "H. Jun"], 341 "year": 2021, 342 "arxiv_id": "2107.03374", 343 "relevance": "Introduces HumanEval benchmark and pass@k metric, foundational for LLM code generation evaluation." 344 }, 345 { 346 "title": "Is your code generated by chatgpt really correct? rigorous evaluation of large language models for code generation", 347 "authors": ["J. Liu", "C. S. Xia", "Y. Wang", "L. Zhang"], 348 "year": 2024, 349 "relevance": "EvalPlus benchmark for rigorous LLM code generation evaluation, methodology referenced by CWEVAL." 350 }, 351 { 352 "title": "Purple llama cyberseceval: A secure coding benchmark for language models", 353 "authors": ["M. Bhatt", "S. Chennabasappa"], 354 "year": 2023, 355 "arxiv_id": "2312.04724", 356 "relevance": "CyberSecEval benchmark that CWEVAL critiques for unclear specifications and noisy data." 357 }, 358 { 359 "title": "Large language models for code: Security hardening and adversarial testing", 360 "authors": ["J. He", "M. Vechev"], 361 "year": 2023, 362 "relevance": "SVEN system for security-focused fine-tuning of LLMs, related approach to secure code generation." 363 }, 364 { 365 "title": "Asleep at the keyboard? assessing the security of github copilot's code contributions", 366 "authors": ["H. Pearce", "B. Ahmad", "B. Tan", "B. Dolan-Gavitt", "R. Karri"], 367 "year": 2022, 368 "relevance": "Early empirical study of security vulnerabilities in LLM-generated code (GitHub Copilot)." 369 }, 370 { 371 "title": "Livecodebench: Holistic and contamination free evaluation of large language models for code", 372 "authors": ["N. Jain", "K. Han", "A. Gu"], 373 "year": 2024, 374 "arxiv_id": "2403.07974", 375 "relevance": "Contamination-free code benchmark, addresses data leakage concerns in LLM evaluation." 376 }, 377 { 378 "title": "Cyberseceval 2: A wide-ranging cybersecurity evaluation suite for large language models", 379 "authors": ["M. Bhatt", "S. Chennabasappa"], 380 "year": 2024, 381 "arxiv_id": "2404.13161", 382 "relevance": "Extended version of CyberSecEval for broader security evaluation of LLMs." 383 }, 384 { 385 "title": "Instruction tuning for secure code generation", 386 "authors": ["J. He", "M. Vero", "G. Krasnopolska", "M. Vechev"], 387 "year": 2024, 388 "arxiv_id": "2402.09497", 389 "relevance": "Instruction tuning approach for improving security of LLM-generated code." 390 } 391 ] 392 }