scan.json (31549B)
1 { 2 "paper": { 3 "title": "SECODEPLT: A Unified Benchmark for Evaluating the Security Risks and Capabilities of Code Agents", 4 "authors": [ 5 "Yuzhou Nie", 6 "Zhun Wang", 7 "Yu Yang", 8 "Ruizhe Jiang", 9 "Yuheng Tang", 10 "Xander Davies", 11 "Yarin Gal", 12 "Bo Li", 13 "Wenbo Guo", 14 "Dawn Song" 15 ], 16 "year": 2024, 17 "venue": "arXiv", 18 "arxiv_id": "2410.11096" 19 }, 20 "scan_version": 3, 21 "active_modules": ["experimental_rigor", "data_leakage"], 22 "methodology_tags": ["benchmark-eval"], 23 "key_findings": "SeCodePLT introduces a two-stage benchmark construction pipeline yielding 5.9k samples across 44 CWE categories in Python, C/C++, and Java, achieving nearly 100% security relevance compared to 68% for CYBERSECEVAL. Evaluations of six SOTA models reveal that C/C++ and Java are significantly harder than Python (>50% failure on C/C++ secure coding, <20% patch generation pass@5), reasoning models outperform non-reasoning ones, and security reminders consistently improve performance. Claude-3.7-Sonnet shows much stronger safety alignment than GPT-4o on cyberattack assistance tasks (refusal rates of 90% or higher for Claude-3.7-Sonnet vs. 10% or lower for GPT-4o on critical steps).", 24 "checklist": { 25 "artifacts": { 26 "code_released": { 27 "applies": true, 28 "answer": true, 29 "justification": "Code is released at https://github.com/ucsb-mlsec/SecCodePLT, stated in the abstract footnote." 30 }, 31 "data_released": { 32 "applies": true, 33 "answer": true, 34 "justification": "Dataset released on HuggingFace at https://huggingface.co/datasets/secmlr/SecCodePLT, stated in the abstract footnote." 35 }, 36 "environment_specified": { 37 "applies": true, 38 "answer": false, 39 "justification": "No requirements.txt, Dockerfile, or detailed environment setup is described in the paper. Only the cyberattack environment setup in Appendix G.3.1 describes server configurations, not the experimental evaluation environment." 
40 }, 41 "reproduction_instructions": { 42 "applies": true, 43 "answer": false, 44 "justification": "While code and data are released, the paper does not include step-by-step reproduction instructions or describe how to replicate the main experimental results." 45 } 46 }, 47 "statistical_methodology": { 48 "confidence_intervals_or_error_bars": { 49 "applies": true, 50 "answer": false, 51 "justification": "Results in Figure 5 and Table 2 are point estimates (percentages, pass@1, F1 scores) without confidence intervals or error bars." 52 }, 53 "significance_tests": { 54 "applies": true, 55 "answer": false, 56 "justification": "Comparative claims such as 'Claude-3.7-Sonnet and O4-Mini have the best performance' and 'Qwen2.5-Coder has the worst performance' are made by comparing raw numbers without any statistical significance tests." 57 }, 58 "effect_sizes_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "The paper reports raw percentages and rates for each model but does not compute formal effect sizes or frame differences as relative improvements with standardized measures." 62 }, 63 "sample_size_justified": { 64 "applies": true, 65 "answer": false, 66 "justification": "No justification for why 5.9k samples or specific seed counts (130 Python, 1k C/C++, 509 Java) were chosen. No power analysis is presented." 67 }, 68 "variance_reported": { 69 "applies": true, 70 "answer": false, 71 "justification": "Results appear to be from single runs. No standard deviations, variance across seeds, or spread measures are reported for the main experiments." 72 } 73 }, 74 "evaluation_design": { 75 "baselines_included": { 76 "applies": true, 77 "answer": true, 78 "justification": "Table 1 compares SeCodePLT against eight existing benchmarks including CYBERSECEVAL, PrimeVul, SecLLMHolmes, SVEN, CodeLMSec, BaxBench, and AutoPatchBench on multiple dimensions." 
79 }, 80 "baselines_contemporary": { 81 "applies": true, 82 "answer": true, 83 "justification": "Comparison benchmarks include recent work: BaxBench (2025), AutoPatchBench (2025), CYBERSECEVAL (2023-2024). Models evaluated include O4-Mini (2025) and Claude-3.7-Sonnet (2025)." 84 }, 85 "ablation_study": { 86 "applies": true, 87 "answer": true, 88 "justification": "Multiple controlled comparisons serve as ablations: with/without security reminder (Fig 5), with/without CWE policy (Table 2), different context levels (Table 5, Appendix J.2), and different prompting strategies (Table 4, Appendix J.1)." 89 }, 90 "multiple_metrics": { 91 "applies": true, 92 "answer": true, 93 "justification": "Multiple metrics are used: security relevance rate, prompt faithfulness, pass@1, pass@5, F1 score for CWE identification, rule-based test pass rate, dynamic safety test pass rate, and line coverage." 94 }, 95 "human_evaluation": { 96 "applies": true, 97 "answer": false, 98 "justification": "Model outputs are evaluated via automated dynamic testing and rule-based checks. The only human evaluation of outputs is a brief spot check: 'We manually inspected a subset of valid patches and found that nearly all were correct patches' (Section 4.4), which is not a systematic human evaluation." 99 }, 100 "held_out_test_set": { 101 "applies": true, 102 "answer": true, 103 "justification": "The benchmark is a novel test set that evaluated models were not trained or tuned on. The mutation pipeline explicitly creates novel samples to prevent memorization (Section 3.1)." 104 }, 105 "per_category_breakdown": { 106 "applies": true, 107 "answer": true, 108 "justification": "Results are broken down by CWE category (Figs 4, 7, 8), by programming language (Python, C/C++, Java in Fig 5 and Table 2), and by task type (secure coding, vulnerability detection, patch generation)." 
109 }, 110 "failure_cases_discussed": { 111 "applies": true, 112 "answer": true, 113 "justification": "Appendix J.3 provides a detailed error analysis of SOTA models (Claude-3.7-Sonnet and O4-Mini) with a specific code example showing where models fail. Appendix H shows Cursor failure examples." 114 }, 115 "negative_results_reported": { 116 "applies": true, 117 "answer": true, 118 "justification": "Several negative results are reported: Manual CoT performs worse than default prompting (Table 4), removing context significantly degrades performance (Table 5), and all models show poor C/C++ patch generation (<20% even at pass@5)." 119 } 120 }, 121 "claims_and_evidence": { 122 "abstract_claims_supported": { 123 "applies": true, 124 "answer": true, 125 "justification": "Abstract claims about broader coverage (Table 1 shows 44 CWEs vs competitors), higher data fidelity (Section 4.1 shows ~100% security relevance), greater scale (5.9k vs competitors), and revealing model strengths/weaknesses (Sections 4.2-4.4) are all supported by experimental results." 126 }, 127 "causal_claims_justified": { 128 "applies": true, 129 "answer": true, 130 "justification": "Causal claims are made through controlled ablation-like experiments: 'providing a security reminder can consistently improve the selected models' performance' is justified by with/without comparisons on the same models and data (Fig 5). Context retrieval experiments (Table 5) similarly use controlled single-variable manipulation." 131 }, 132 "generalization_bounded": { 133 "applies": true, 134 "answer": false, 135 "justification": "The title claims 'Code Agents' but only one agent (Cursor) is tested in an appendix; main experiments evaluate six standalone LLMs. The benchmark covers three languages and 44 CWEs, which is well-specified in the text, but the title overpromises relative to agent coverage." 
136 }, 137 "alternative_explanations_discussed": { 138 "applies": true, 139 "answer": false, 140 "justification": "The paper does not substantively discuss alternative explanations for observed results. For example, the finding that C/C++ is harder is attributed to real-world codebase complexity, but confounds like data source differences (manual Python vs. extracted C/C++), model training data composition, or task formatting differences are not explored." 141 }, 142 "proxy_outcome_distinction": { 143 "applies": true, 144 "answer": true, 145 "justification": "The paper clearly defines what it measures (pass@1 on dynamic tests, F1 for CWE identification, rule-based detection rates) and frames claims at the matching level of granularity. Dynamic test passage is a direct measure of code security, and the paper acknowledges the measurement limitation: 'dynamic testing is by nature unsound' (Section 4.4)." 146 } 147 }, 148 "setup_transparency": { 149 "model_versions_specified": { 150 "applies": true, 151 "answer": false, 152 "justification": "Main experiments use marketing names: 'DeepSeek-R1', 'QwQ-32B', 'GPT-4o', 'O4-Mini', 'Claude-3.7-Sonnet', 'Qwen2.5-Coder-7B' without API snapshot dates or version identifiers. Judge experiments in Appendix C.3 do specify 'GPT-4o-2024-08-06' and 'Claude-3.5-Sonnet-20240620', but main evaluation models lack these details." 153 }, 154 "prompts_provided": { 155 "applies": true, 156 "answer": true, 157 "justification": "Full prompt templates and examples are provided: Appendix A shows complete data format/prompts for coding tasks, Appendix B shows mutation prompts, Appendix C shows judge prompts, and Appendix G.3.4 shows the full cyberattack instruction prompt. Figure 3 illustrates task prompts for all three evaluation tasks." 
158 }, 159 "hyperparameters_reported": { 160 "applies": true, 161 "answer": false, 162 "justification": "The paper mentions 'We set a large temperature to enable more variations' for mutation (Section 3.3) without specifying the value. No temperature, top-p, or max token settings are reported for model evaluations. The maximum conversation round for cyberattack evaluation is 40 (Appendix G.1)." 163 }, 164 "scaffolding_described": { 165 "applies": true, 166 "answer": true, 167 "justification": "The cyberattack evaluation in Appendix G uses a multi-turn agentic setup that is described in detail: goal provision, shell environment management, command execution feedback loop, shell switching protocol, and maximum conversation rounds (40). The full instruction prompt is provided in Appendix G.3.4." 168 }, 169 "data_preprocessing_documented": { 170 "applies": true, 171 "answer": true, 172 "justification": "Section 3 documents the full data creation pipeline in detail: seed generation methodology per language, mutation procedures, dynamic validation, Levenshtein distance filtering (threshold 0.8), and sample counts at each stage (Figure 1)." 173 } 174 }, 175 "limitations_and_scope": { 176 "limitations_section_present": { 177 "applies": true, 178 "answer": false, 179 "justification": "There is no dedicated limitations section. Section 5 'Conclusion and Future Work' lists future directions that implicitly acknowledge limitations, and Section 4.4 notes 'dynamic testing is by nature unsound,' but these are scattered mentions, not a substantive limitations discussion." 180 }, 181 "threats_to_validity_specific": { 182 "applies": true, 183 "answer": false, 184 "justification": "No systematic threats-to-validity discussion. Section 4.4 acknowledges 'dynamic testing is by nature unsound, even if a patch passes all of our test cases, there is still no guarantee that it is correct,' which is a specific threat, but it is isolated. 
No discussion of construct validity threats, data quality threats, or external validity threats." 185 }, 186 "scope_boundaries_stated": { 187 "applies": true, 188 "answer": false, 189 "justification": "The paper does not explicitly state what results do NOT show. Section 5 frames limitations as future work opportunities (extending to more languages, enriching PoC tests, repository-level evaluation) rather than explicitly bounding the current claims." 190 } 191 }, 192 "data_integrity": { 193 "raw_data_available": { 194 "applies": true, 195 "answer": true, 196 "justification": "Raw benchmark data is available on HuggingFace at https://huggingface.co/datasets/secmlr/SecCodePLT, and code is available on GitHub." 197 }, 198 "data_collection_described": { 199 "applies": true, 200 "answer": true, 201 "justification": "Section 3.2 describes seed generation in detail for each language: Python (manual CWE analysis and scenario creation), C/C++ (Arvo dataset extraction with clangd context, GPT-4.1 task description generation, fuzzing for functionality tests), and Java (Juliet Test Suite with obfuscation and JavaParser processing)." 202 }, 203 "recruitment_methods_described": { 204 "applies": false, 205 "answer": false, 206 "justification": "No human participants in the study. Data sources are CWE databases, the Arvo dataset, and the Juliet Test Suite — all publicly available benchmark/test collections." 207 }, 208 "data_pipeline_documented": { 209 "applies": true, 210 "answer": true, 211 "justification": "Figure 1 documents the full pipeline with sample counts at each stage. Section 3.3 details mutation (3 text mutations × 3 code mutations = up to 10 per seed), validation via dynamic testing, and filtering via Levenshtein distance (threshold 0.8). Appendix I provides additional pipeline charts for C/C++ and Java." 
212 } 213 }, 214 "conflicts_of_interest": { 215 "funding_disclosed": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding sources or acknowledgments section is present in the paper." 219 }, 220 "affiliations_disclosed": { 221 "applies": true, 222 "answer": true, 223 "justification": "Author affiliations are clearly listed: UC Santa Barbara, UC Berkeley, VirtueAI, UIUC, UChicago, University of Oxford, UK AI Safety Institute." 224 }, 225 "funder_independent_of_outcome": { 226 "applies": true, 227 "answer": false, 228 "justification": "No funding is disclosed, making independence impossible to assess. Some authors are affiliated with VirtueAI (a commercial entity), and the paper evaluates commercial models without disclosing potential conflicts." 229 }, 230 "financial_interests_declared": { 231 "applies": true, 232 "answer": false, 233 "justification": "No competing interests or financial interests statement is present. Authors include affiliations with VirtueAI but no declaration of financial interests related to the findings." 234 } 235 }, 236 "contamination": { 237 "training_cutoff_stated": { 238 "applies": true, 239 "answer": false, 240 "justification": "No training data cutoff dates are stated for any of the six evaluated models, despite this being crucial for assessing whether benchmark seeds (especially from public CWE databases and Arvo/Juliet datasets) could be in training data." 241 }, 242 "train_test_overlap_discussed": { 243 "applies": true, 244 "answer": true, 245 "justification": "Section 3.1 states the mutation pipeline 'can also create unseen samples for LLMs, ensuring the model cannot rely on memorization to complete the tasks,' directly addressing potential train/test overlap through data novelty." 
246 }, 247 "benchmark_contamination_addressed": { 248 "applies": true, 249 "answer": true, 250 "justification": "The two-stage pipeline with mutation (variable/function name changes, instruction rephrasing) and Levenshtein distance filtering (Section 3.3) is explicitly designed to create novel samples that prevent contamination from memorized training data." 251 } 252 }, 253 "human_studies": { 254 "pre_registered": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in the study. The paper evaluates LLMs on a benchmark." 258 }, 259 "irb_or_ethics_approval": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants. The study involves automated evaluation of LLMs on coding tasks." 263 }, 264 "demographics_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in the study." 268 }, 269 "inclusion_exclusion_criteria": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in the study." 273 }, 274 "randomization_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants or experimental conditions requiring randomization." 278 }, 279 "blinding_described": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants in the study." 283 }, 284 "attrition_reported": { 285 "applies": false, 286 "answer": false, 287 "justification": "No human participants in the study." 288 } 289 }, 290 "cost_and_practicality": { 291 "inference_cost_reported": { 292 "applies": true, 293 "answer": false, 294 "justification": "No API costs, token counts, or latency measures are reported for the model evaluations despite querying six models across thousands of samples." 295 }, 296 "compute_budget_stated": { 297 "applies": true, 298 "answer": false, 299 "justification": "No computational budget, GPU hours, or total API spend is stated for benchmark construction or model evaluation." 
300 } 301 }, 302 "experimental_rigor": { 303 "seed_sensitivity_reported": { 304 "applies": true, 305 "answer": false, 306 "justification": "Results appear to be from single runs. No analysis of sensitivity to random seeds or stochastic variation in model outputs." 307 }, 308 "number_of_runs_stated": { 309 "applies": true, 310 "answer": false, 311 "justification": "The number of runs for main experiments (secure coding, vulnerability detection) is not stated. The cyberattack evaluation specifies 50 experiments per category and 500 for end-to-end, but main experiments lack this detail." 312 }, 313 "hyperparameter_search_budget": { 314 "applies": true, 315 "answer": false, 316 "justification": "No hyperparameter search is described. The paper tests three prompting strategies (Table 4) but does not describe any systematic hyperparameter tuning." 317 }, 318 "best_config_selection_justified": { 319 "applies": true, 320 "answer": false, 321 "justification": "The default prompting strategy is used for main results without justification for why it was selected. Table 4 shows different strategies perform similarly, but no formal selection process is described." 322 }, 323 "multiple_comparison_correction": { 324 "applies": true, 325 "answer": false, 326 "justification": "Many comparisons are made across 6 models, 3 languages, multiple CWEs, and 3 tasks without any correction for multiple comparisons." 327 }, 328 "self_comparison_bias_addressed": { 329 "applies": true, 330 "answer": false, 331 "justification": "The authors built SeCodePLT and evaluate other benchmarks (CYBERSECEVAL) as competitors. They do not acknowledge the inherent bias of evaluating their own benchmark's advantages." 332 }, 333 "compute_budget_vs_performance": { 334 "applies": true, 335 "answer": false, 336 "justification": "Models of vastly different sizes are compared (Qwen2.5-Coder-7B vs. GPT-4o, DeepSeek-R1) without any discussion of compute budget differences or performance normalized by compute." 
337 }, 338 "benchmark_construct_validity": { 339 "applies": true, 340 "answer": true, 341 "justification": "Section 4.1 evaluates benchmark construct validity through security relevance assessment (nearly 100%), prompt faithfulness evaluation, and test case coverage analysis (90.92% line coverage). The two LLM-judged metrics (security relevance and prompt faithfulness) are validated with two different LLM judges showing consistency (Appendix C.3)." 342 }, 343 "scaffold_confound_addressed": { 344 "applies": false, 345 "answer": false, 346 "justification": "Main model comparisons use direct LLM queries without scaffolding. The Cursor evaluation in Appendix D tests it as a bundled product. No scaffold confound exists in the main comparisons." 347 } 348 }, 349 "data_leakage": { 350 "temporal_leakage_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "The paper does not discuss temporal leakage. Seed data from Arvo and Juliet Test Suite may predate model training. While mutations create variants, the underlying vulnerability patterns and solutions may be in training data. No temporal analysis is provided." 354 }, 355 "feature_leakage_addressed": { 356 "applies": true, 357 "answer": false, 358 "justification": "No discussion of whether the evaluation setup leaks answer information. The security reminder/policy deliberately provides hints, but unintentional feature leakage through task formatting or context is not discussed." 359 }, 360 "non_independence_addressed": { 361 "applies": true, 362 "answer": false, 363 "justification": "No discussion of whether mutated samples from the same seed are independent enough, or whether seed data patterns overlap with models' training distributions." 364 }, 365 "leakage_detection_method": { 366 "applies": true, 367 "answer": false, 368 "justification": "No concrete leakage detection method (canary strings, membership inference, n-gram overlap analysis) is used. 
The mutation pipeline changes surface features (variable names, instruction wording) but does not detect or prevent leakage of underlying vulnerability patterns." 369 } 370 } 371 }, 372 "claims": [ 373 { 374 "claim": "SeCodePLT achieves nearly 100% in both security relevance and instruction faithfulness, significantly outperforming CYBERSECEVAL's 67.81% security relevance rate.", 375 "evidence": "Figure 4a shows nearly 100% security relevance across all CWEs for SeCodePLT vs. varied rates for CYBERSECEVAL (e.g., CWE-338: 4/30, CWE-798: 20/37). Section 4.1 reports CYBERSECEVAL overall rate of 67.81%.", 376 "supported": "strong" 377 }, 378 { 379 "claim": "Python is consistently less challenging than C/C++ and Java across all three evaluation tasks and all models.", 380 "evidence": "Figure 5 shows higher correct rates on Python (e.g., Claude: 66-80% Python vs. 24-30% C/C++). Table 2 shows higher vulnerability detection and patch generation rates for Python across all models.", 381 "supported": "strong" 382 }, 383 { 384 "claim": "Large and reasoning models perform better than small non-reasoning models on security tasks.", 385 "evidence": "Figure 5 and Table 2 consistently show Qwen2.5-Coder-7B performing worst while Claude-3.7-Sonnet and O4-Mini perform best. 
QwQ-32B (reasoning) outperforms Qwen2.5-Coder (non-reasoning) from the same family.", 386 "supported": "moderate" 387 }, 388 { 389 "claim": "Security reminders consistently improve model performance on secure coding tasks across all languages.", 390 "evidence": "Figure 5 shows hatched bars (with reminder) consistently higher than solid bars (without) for all models and languages.", 391 "supported": "strong" 392 }, 393 { 394 "claim": "SOTA models still perform poorly on C/C++ and Java vulnerability detection and patch generation, with less than 20% pass@5 for C/C++ patching.", 395 "evidence": "Table 2 shows C/C++ patch generation pass@5 rates: Claude 0.193, O4-Mini 0.129, DeepSeek-R1 0.091, QwQ-32B 0.113, GPT-4o 0.174, Qwen2.5-Coder 0.", 396 "supported": "strong" 397 }, 398 { 399 "claim": "GPT-4o poses significantly higher risk than Claude in assisting cyberattack implementation, with Claude showing a refusal rate of at least 90% on critical attack steps.", 400 "evidence": "Figure 13 shows for Initial Access: GPT-4o refusal 8% vs. Claude 94%; for C2 & Execution: GPT-4o refusal 10% vs. Claude 90%. Based on 50 experiments per category.", 401 "supported": "moderate" 402 } 403 ], 404 "red_flags": [ 405 { 406 "flag": "No error bars or variance reporting", 407 "detail": "All main experimental results (Fig 5, Table 2) are point estimates without confidence intervals, standard deviations, or any uncertainty quantification. Given stochastic model outputs, this undermines the reliability of comparative claims." 408 }, 409 { 410 "flag": "No statistical significance tests", 411 "detail": "Comparative claims ('Claude-3.7-Sonnet and O4-Mini have the best performance,' 'Qwen2.5-Coder has the worst performance') are based solely on comparing raw numbers without any significance testing." 412 }, 413 { 414 "flag": "Missing funding and conflict disclosures", 415 "detail": "No funding sources, competing interests, or financial interest declarations are provided. 
Authors include an affiliation with VirtueAI (a commercial entity), and the paper evaluates commercial LLMs without disclosing potential conflicts." 416 }, 417 { 418 "flag": "Unequal data provenance across languages", 419 "detail": "Python seeds are manually handcrafted while C/C++ and Java seeds are extracted from existing databases (Arvo, Juliet). The difficulty difference attributed to 'language difficulty' may be substantially driven by this data source difference, which is acknowledged but not controlled for." 420 }, 421 { 422 "flag": "No dedicated limitations section", 423 "detail": "The paper lacks a structured discussion of limitations, threats to validity, or scope boundaries. Limitations are scattered across sections or framed as future work." 424 }, 425 { 426 "flag": "Compute-unmatched model comparisons", 427 "detail": "Models of vastly different sizes (Qwen2.5-Coder-7B vs. GPT-4o/DeepSeek-R1) are compared on the same tasks without discussing compute differences, making it unsurprising that larger models perform better." 428 } 429 ], 430 "cited_papers": [ 431 { 432 "title": "Asleep at the keyboard? Assessing the security of GitHub Copilot's code contributions", 433 "authors": ["Hammond Pearce", "Baleegh Ahmad", "Benjamin Tan", "Brendan Dolan-Gavitt", "Ramesh Karri"], 434 "year": 2022, 435 "relevance": "Foundational benchmark for evaluating security of AI-generated code, directly comparable to SeCodePLT." 436 }, 437 { 438 "title": "Purple llama cyberseceval: A secure coding benchmark for language models", 439 "authors": ["Manish Bhatt", "Sahana Chennabasappa", "Cyrus Nikolaidis"], 440 "year": 2023, 441 "arxiv_id": "2312.04724", 442 "relevance": "Primary baseline benchmark that SeCodePLT directly compares against and critiques for low data quality." 
443 }, 444 { 445 "title": "Evaluating large language models trained on code", 446 "authors": ["Mark Chen", "Jerry Tworek"], 447 "year": 2021, 448 "arxiv_id": "2107.03374", 449 "relevance": "Seminal work on code generation LLM evaluation (Codex/HumanEval), foundational to code security evaluation." 450 }, 451 { 452 "title": "SWE-bench: Can language models resolve real-world GitHub issues?", 453 "authors": ["Carlos E Jimenez", "John Yang", "Alexander Wettig"], 454 "year": 2023, 455 "arxiv_id": "2310.06770", 456 "relevance": "Major code agent benchmark that SeCodePLT explicitly contrasts with as not security-specific." 457 }, 458 { 459 "title": "LLMs cannot reliably identify and reason about security vulnerabilities (yet?)", 460 "authors": ["Saad Ullah", "Mingji Han", "Saurabh Pujar"], 461 "year": 2024, 462 "relevance": "Comprehensive evaluation of LLM vulnerability detection capabilities, directly comparable benchmark (SecLLMHolmes)." 463 }, 464 { 465 "title": "Vulnerability detection with code language models: How far are we?", 466 "authors": ["Yangruibo Ding", "Yanjun Fu", "Omniyyah Ibrahim"], 467 "year": 2024, 468 "arxiv_id": "2403.18624", 469 "relevance": "Largest vulnerability detection benchmark (PrimeVul) that SeCodePLT compares against for data quality." 470 }, 471 { 472 "title": "BaxBench: Can LLMs generate correct and secure backends?", 473 "authors": ["Mark Vero", "Niels Mündler"], 474 "year": 2025, 475 "arxiv_id": "2502.11844", 476 "relevance": "Contemporary benchmark for evaluating LLM-generated backend security with dynamic testing." 477 }, 478 { 479 "title": "RedCode: Risky code execution and generation benchmark for code agents", 480 "authors": ["Chengquan Guo", "Xun Liu", "Chulin Xie"], 481 "year": 2024, 482 "relevance": "Evaluates LLM risks in generating malware, complementary to SeCodePLT's security evaluation scope." 
483 }, 484 { 485 "title": "Large language models for code: Security hardening and adversarial testing", 486 "authors": ["Jingxuan He", "Martin Vechev"], 487 "year": 2023, 488 "relevance": "Proposes SVEN dataset for secure code generation evaluation, directly compared in Table 1." 489 }, 490 { 491 "title": "Cybench: A framework for evaluating cybersecurity capabilities and risk of language models", 492 "authors": ["Andy K Zhang", "Neil Perry"], 493 "year": 2024, 494 "arxiv_id": "2408.08926", 495 "relevance": "CTF benchmark for evaluating LLM cybersecurity capabilities, complementary to SeCodePLT's approach." 496 }, 497 { 498 "title": "DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning", 499 "authors": ["DeepSeek-AI"], 500 "year": 2025, 501 "arxiv_id": "2501.12948", 502 "relevance": "One of the evaluated SOTA reasoning models, showing reasoning capability impacts on security tasks." 503 }, 504 { 505 "title": "CWEval: Outcome-driven evaluation on functionality and security of LLM code generation", 506 "authors": ["Jinjun Peng", "Leyi Cui", "Kele Huang"], 507 "year": 2025, 508 "arxiv_id": "2501.08200", 509 "relevance": "Contemporary secure code generation evaluation benchmark using outcome-driven metrics." 510 } 511 ], 512 "engagement_factors": { 513 "practical_relevance": { 514 "score": 2, 515 "justification": "Released benchmark with code and data on GitHub/HuggingFace that practitioners can use to evaluate code LLM security, though it requires significant setup effort." 516 }, 517 "surprise_contrarian": { 518 "score": 1, 519 "justification": "Results largely confirm expected patterns (larger models are better, security coding is hard) rather than challenging conventional wisdom." 520 }, 521 "fear_safety": { 522 "score": 2, 523 "justification": "Demonstrates that SOTA models generate insecure code at high rates and GPT-4o can assist cyberattacks with low refusal rate, raising concrete AI safety concerns." 
524 }, 525 "drama_conflict": { 526 "score": 1, 527 "justification": "Shows CYBERSECEVAL has only 68% security relevance, mildly critical of Meta's benchmark quality, but framed constructively." 528 }, 529 "demo_ability": { 530 "score": 2, 531 "justification": "Code repository and HuggingFace dataset are publicly available for anyone to run evaluations." 532 }, 533 "brand_recognition": { 534 "score": 2, 535 "justification": "Evaluates well-known models (GPT-4o, Claude, DeepSeek) and authored by researchers from prominent institutions (UC Berkeley, Oxford) including Dawn Song and Bo Li." 536 } 537 } 538 }