scan.json (31607B)
1 { 2 "paper": { 3 "title": "SecureAgentBench: Benchmarking Secure Code Generation under Realistic Vulnerability Scenarios", 4 "authors": [ 5 "Junkai Chen", 6 "Huihui Huang", 7 "Yunbo Lyu", 8 "Junwen An", 9 "Jieke Shi", 10 "Chengran Yang", 11 "Ting Zhang", 12 "Haoye Tian", 13 "Yikun Li", 14 "Zhenhao Li", 15 "Xin Zhou", 16 "Xing Hu", 17 "David Lo" 18 ], 19 "year": 2025, 20 "venue": "arXiv", 21 "arxiv_id": "2509.22097", 22 "doi": "10.48550/arXiv.2509.22097" 23 }, 24 "scan_version": 2, 25 "active_modules": ["experimental_rigor", "data_leakage"], 26 "methodology_tags": ["benchmark-eval"], 27 "key_findings": "SecureAgentBench evaluates 3 code agents (SWE-agent, OpenHands, Aider) with 3 LLMs on 105 real-world vulnerability-introduction tasks. The best combination (SWE-agent + DeepSeek-V3.1) achieves only 15.2% correct-and-secure solutions. Code agents introduce new vulnerability types not in the benchmark's historical records, and explicit security reminders in prompts do not improve secure coding outcomes. The backbone model has a stronger influence on performance than the agent framework, with DeepSeek-V3.1 roughly doubling the C&S rate of Claude 3.7 Sonnet and GPT-4.1.", 28 "checklist": { 29 "artifacts": { 30 "code_released": { 31 "applies": true, 32 "answer": true, 33 "justification": "A repository URL is provided (https://github.com/iCSawyer/SecureAgentBench) directly under the abstract. However, the contributions section also states 'We will publicly release our code and dataset upon acceptance,' which creates ambiguity about current availability." 34 }, 35 "data_released": { 36 "applies": true, 37 "answer": true, 38 "justification": "The benchmark tasks are derived from public OSS-Fuzz/ARVO vulnerabilities and a GitHub repository URL is provided. The 105 tasks with Dockerized environments are described as part of the release." 
39 }, 40 "environment_specified": { 41 "applies": true, 42 "answer": true, 43 "justification": "Each task includes a Dockerized environment (Section 2.1). Agent versions are specified: SWE-agent v1.1.0, OpenHands v0.50.0, Aider v0.86.1 (Appendix E.3). Hardware is listed: Intel Xeon Platinum 8480C, 2TB RAM, 8 NVIDIA H100 GPUs (Appendix E.1)." 44 }, 45 "reproduction_instructions": { 46 "applies": true, 47 "answer": true, 48 "justification": "Appendix E provides detailed experimental configurations for each agent. Docker environments are provided per task. Prompt templates are given in Appendix G (Figures 15-17). The GitHub repository would presumably contain reproduction scripts." 49 } 50 }, 51 "statistical_methodology": { 52 "confidence_intervals_or_error_bars": { 53 "applies": true, 54 "answer": false, 55 "justification": "All results are reported as point estimates (e.g., 15.2% C&S, 9.2% average). No confidence intervals, error bars, or ± notation appear in any table or figure. The authors acknowledge in limitations: 'we did not repeat experiments to fully mitigate the potential nondeterminism of LLMs.'" 56 }, 57 "significance_tests": { 58 "applies": true, 59 "answer": false, 60 "justification": "Claims like 'DeepSeek-V3.1 outperforms both Claude 3.7 Sonnet and GPT-4.1' (Section 3) are made by comparing raw percentages without any statistical significance testing." 61 }, 62 "effect_sizes_reported": { 63 "applies": true, 64 "answer": true, 65 "justification": "Results are reported with sufficient baseline context for magnitude assessment: '14.3% vs. 7.3% and 6.0%' (Table 4), 'more than 80% of the correct code remains vulnerable' for CWE-120 vs 'only about 10%' for CWE-415 (Section 3). Cost comparisons also contextualize performance (Figure 5)." 66 }, 67 "sample_size_justified": { 68 "applies": true, 69 "answer": false, 70 "justification": "The benchmark contains 105 tasks. The paper says this is 'comparable in scale to prior benchmarks (e.g., Peng et al. 
(2025))' but does not justify why 105 is sufficient for the claims made, nor is a power analysis provided." 71 }, 72 "variance_reported": { 73 "applies": true, 74 "answer": false, 75 "justification": "Single-run results with no variance, standard deviation, or spread measures. The limitations explicitly state: 'Due to budget limitations, we did not repeat experiments to fully mitigate the potential nondeterminism of LLMs.'" 76 } 77 }, 78 "evaluation_design": { 79 "baselines_included": { 80 "applies": true, 81 "answer": true, 82 "justification": "Three agent frameworks (SWE-agent, OpenHands, Aider) and three LLMs (Claude 3.7 Sonnet, GPT-4.1, DeepSeek-V3.1) are compared against each other, providing a 3×3 matrix of comparisons (Table 4, Figure 4)." 83 }, 84 "baselines_contemporary": { 85 "applies": true, 86 "answer": true, 87 "justification": "All models and agents are state-of-the-art at time of writing: Claude 3.7 Sonnet (Feb 2025), GPT-4.1 (Apr 2025), DeepSeek-V3.1 (Aug 2025), SWE-agent v1.1.0, OpenHands v0.50.0, Aider v0.86.1." 88 }, 89 "ablation_study": { 90 "applies": true, 91 "answer": true, 92 "justification": "The security reminder experiment (Figure 7) serves as an ablation, comparing performance with and without explicit security instructions using the best-performing setting (SWE-agent + DeepSeek-V3.1)." 93 }, 94 "multiple_metrics": { 95 "applies": true, 96 "answer": true, 97 "justification": "Six outcome categories are tracked: No Output, Compilation Error, Incorrect, Correct but Vulnerable, Correct but Suspicious, Correct and Secure (Section 2.1). Cost-effectiveness is also reported (Figure 5)." 98 }, 99 "human_evaluation": { 100 "applies": true, 101 "answer": false, 102 "justification": "Evaluation is entirely automated: functional test suites, PoC exploit execution, and SAST (Semgrep) scanning. No human evaluation of agent-generated code is performed. 
Manual inspection occurs only during benchmark construction (quality assurance), not evaluation of agent outputs." 103 }, 104 "held_out_test_set": { 105 "applies": true, 106 "answer": true, 107 "justification": "The 105 tasks constitute a newly constructed benchmark not used for any tuning. Agents are evaluated with fixed configurations (temperature 0.0) without task-specific adaptation. The benchmark itself serves as a held-out test set for the evaluated agents." 108 }, 109 "per_category_breakdown": { 110 "applies": true, 111 "answer": true, 112 "justification": "Table 4 breaks down by agent and model. Figure 6 provides per-CWE vulnerability proportions. Table 5 shows CWE distribution of newly introduced vulnerabilities. Table 6 shows per-project distribution." 113 }, 114 "failure_cases_discussed": { 115 "applies": true, 116 "answer": true, 117 "justification": "Section 3 discusses failure modes (invalid outputs, compilation errors). Figure 14 shows a concrete example of a correct-but-vulnerable patch. The security reminder analysis explains why more failures occur with the reminder. Appendix B discusses implications of failures." 118 }, 119 "negative_results_reported": { 120 "applies": true, 121 "answer": true, 122 "justification": "The security reminder experiment (Figure 7) is a clear negative result: 'the number of securely resolved instances does not increase (i.e., 16 vs. 16), the cases yielding no valid output rise.' Aider's poor performance is also honestly reported." 123 } 124 }, 125 "claims_and_evidence": { 126 "abstract_claims_supported": { 127 "applies": true, 128 "answer": true, 129 "justification": "Abstract claim of 15.2% C&S for best agent matches Figure 4 and Table 4. Claim about agents introducing new vulnerabilities is supported by Table 5. Claim about security reminders being insufficient is supported by Figure 7. All three key findings are substantiated in the results." 
130 }, 131 "causal_claims_justified": { 132 "applies": true, 133 "answer": true, 134 "justification": "The security reminder experiment is a controlled comparison (same agent, same model, prompt varied). Claims about model vs. agent influence are supported by the factorial design (3 agents × 3 models). The paper uses appropriately hedged language ('suggests', 'indicates') for most findings." 135 }, 136 "generalization_bounded": { 137 "applies": true, 138 "answer": true, 139 "justification": "The limitations section explicitly bounds scope: 'SecureAgentBench is built on vulnerabilities identified by OSS-Fuzz; hence, the range of vulnerability types and programming languages may be somewhat constrained.' The benchmark is C/C++ specific and focuses on memory-safety vulnerabilities." 140 }, 141 "alternative_explanations_discussed": { 142 "applies": true, 143 "answer": true, 144 "justification": "The paper offers alternative explanations for the security reminder result: 'the additional security reminder makes the agent more cautious, prompting extra deliberation and testing. This, however, increases the likelihood of hitting time and cost limits.' Limitations discuss SAST tool characteristics and LLM nondeterminism as confounds." 145 }, 146 "proxy_outcome_distinction": { 147 "applies": true, 148 "answer": true, 149 "justification": "The paper directly measures functionality (test suite passing) and security (PoC exploit + SAST) and frames claims in terms of these same measurements. The 'Correct and Secure' metric is clearly defined. No significant gap exists between what is measured and what is claimed." 150 } 151 }, 152 "setup_transparency": { 153 "model_versions_specified": { 154 "applies": true, 155 "answer": true, 156 "justification": "Specific model versions are named: Claude 3.7 Sonnet, GPT-4.1, DeepSeek-V3.1 (DeepSeek-Chat). Agent versions are also specified: SWE-agent v1.1.0, OpenHands v0.50.0, Aider v0.86.1 (Appendix E.3). 
However, no API snapshot dates are provided." 157 }, 158 "prompts_provided": { 159 "applies": true, 160 "answer": true, 161 "justification": "Full prompt templates are provided in Appendix G: Figure 15 (default), Figure 16 (with security reminder), and Figure 17 (requirement generation). The templates include placeholders for working_dir and problem_statement, and an example requirement is shown in Figure 11." 162 }, 163 "hyperparameters_reported": { 164 "applies": true, 165 "answer": true, 166 "justification": "Appendix E.3 reports: temperature 0.0 for all models, maximum 75 iterations and cost limit of 2 USD for SWE-agent and OpenHands. Aider temperature 0.0 with no explicit iteration/cost limits. Browser interaction disabled for OpenHands and Aider." 167 }, 168 "scaffolding_described": { 169 "applies": true, 170 "answer": true, 171 "justification": "Appendix E.2-E.3 describes each agent's architecture: SWE-agent uses terminal commands and bash-based tool execution; OpenHands uses CodeAct agent with browser disabled; Aider integrates with Git for iterative editing. Each agent's version and interaction mode is specified." 172 }, 173 "data_preprocessing_documented": { 174 "applies": true, 175 "answer": true, 176 "justification": "Section 2.2 and Figure 2 document the full pipeline with counts at each stage: 4,993 ARVO instances → 1,632 (single VIC) → 254 (validated) → 232 (oracles) → 105 (quality assured). Each filtering criterion is described in detail." 177 } 178 }, 179 "limitations_and_scope": { 180 "limitations_section_present": { 181 "applies": true, 182 "answer": true, 183 "justification": "Appendix B contains a 'Limitations and Future Work' subsection with substantive discussion of three specific limitations: OSS-Fuzz scope constraints, Semgrep tool limitations, and lack of repeated experiments." 
184 }, 185 "threats_to_validity_specific": { 186 "applies": true, 187 "answer": true, 188 "justification": "Specific threats are discussed: 'the range of vulnerability types and programming languages may be somewhat constrained' (OSS-Fuzz dependency), 'its findings may reflect the characteristics of its scanning rules and mechanisms' (Semgrep), and 'we did not repeat experiments to fully mitigate the potential nondeterminism of LLMs' (single runs)." 189 }, 190 "scope_boundaries_stated": { 191 "applies": true, 192 "answer": true, 193 "justification": "The paper states what it does NOT cover: other vulnerability sources beyond OSS-Fuzz, non-C/C++ languages, reasoning models ('We do not use reasoning models due to budget constraints'). It explicitly positions against BaxBench: 'we mainly focus on the evolution stage of software (i.e., code editing based on existing code bases).'" 194 } 195 }, 196 "data_integrity": { 197 "raw_data_available": { 198 "applies": true, 199 "answer": false, 200 "justification": "A GitHub URL is provided but the paper states 'We will publicly release our code and dataset upon acceptance,' indicating the data is not yet publicly available. The benchmark tasks and evaluation data cannot be independently verified at time of publication." 201 }, 202 "data_collection_described": { 203 "applies": true, 204 "answer": true, 205 "justification": "Section 2.2 describes data collection in detail: vulnerabilities sourced from OSS-Fuzz via ARVO, VIC identification using a two-stage static+dynamic approach, PoC validation across three commits, and evaluation oracle acquisition. Each step has explicit criteria." 206 }, 207 "recruitment_methods_described": { 208 "applies": false, 209 "answer": false, 210 "justification": "No human participants. Data sources are public vulnerability databases (OSS-Fuzz, ARVO) and open-source repositories." 
211 }, 212 "data_pipeline_documented": { 213 "applies": true, 214 "answer": true, 215 "justification": "Figure 2 shows the full pipeline with counts: 4,993 → 1,632 (single VIC candidate) → 254 (VIC validated) → 232 (oracles acquired) → 105 (quality assured). Each reduction step is explained with specific filtering criteria in Section 2.2." 216 } 217 }, 218 "conflicts_of_interest": { 219 "funding_disclosed": { 220 "applies": true, 221 "answer": false, 222 "justification": "No funding statement, acknowledgments section, or grant information appears in the paper. The authors are from six different universities but no funding sources are mentioned." 223 }, 224 "affiliations_disclosed": { 225 "applies": true, 226 "answer": true, 227 "justification": "All author affiliations are clearly listed: Singapore Management University, National University of Singapore, Monash University, Aalto University, York University, Zhejiang University. None are affiliated with the companies whose products are evaluated (Anthropic, OpenAI, DeepSeek)." 228 }, 229 "funder_independent_of_outcome": { 230 "applies": true, 231 "answer": false, 232 "justification": "No funding is disclosed, so independence cannot be assessed. Authors are at universities unaffiliated with the evaluated products, but the absence of any funding disclosure prevents verification." 233 }, 234 "financial_interests_declared": { 235 "applies": true, 236 "answer": false, 237 "justification": "No competing interests statement or financial disclosure is present anywhere in the paper." 238 } 239 }, 240 "contamination": { 241 "training_cutoff_stated": { 242 "applies": true, 243 "answer": false, 244 "justification": "No training data cutoff dates are stated for any of the three LLMs used (Claude 3.7 Sonnet, GPT-4.1, DeepSeek-V3.1). This is important because the vulnerability fixes and PoC programs are publicly available on GitHub and OSS-Fuzz." 
245 }, 246 "train_test_overlap_discussed": { 247 "applies": true, 248 "answer": false, 249 "justification": "The requirement generation process avoids leaking vulnerability details, but whether the LLMs' training data includes the vulnerability-fixing commits from GitHub is never discussed. The fixes are public and could be in training data." 250 }, 251 "benchmark_contamination_addressed": { 252 "applies": true, 253 "answer": false, 254 "justification": "The OSS-Fuzz vulnerabilities, fixing commits, and PoC programs are all publicly available on GitHub. LLMs trained on code could have seen the exact fixes. This contamination risk is not addressed anywhere in the paper." 255 } 256 }, 257 "human_studies": { 258 "pre_registered": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in this study. The benchmark evaluates code agents on automated tasks." 262 }, 263 "irb_or_ethics_approval": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants. The study uses publicly available vulnerability data and evaluates code agents." 267 }, 268 "demographics_reported": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in this study." 272 }, 273 "inclusion_exclusion_criteria": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in this study." 277 }, 278 "randomization_described": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants in this study." 282 }, 283 "blinding_described": { 284 "applies": false, 285 "answer": false, 286 "justification": "No human participants in this study." 287 }, 288 "attrition_reported": { 289 "applies": false, 290 "answer": false, 291 "justification": "No human participants in this study." 
292 } 293 }, 294 "cost_and_practicality": { 295 "inference_cost_reported": { 296 "applies": true, 297 "answer": true, 298 "justification": "Figure 5 shows per-task cost for each agent-model combination. The text states DeepSeek-based agents cost 'below 0.2 USD per task' while SWE+GPT 'consumes more than 1.0 USD.' Cost limits of 2 USD per task are set for SWE-agent and OpenHands." 299 }, 300 "compute_budget_stated": { 301 "applies": true, 302 "answer": true, 303 "justification": "Hardware is specified: two Intel Xeon Platinum 8480C CPUs at 3.80 GHz, 2 TB RAM, 8 NVIDIA H100 GPUs with 80 GB HBM3 (Appendix E.1). Per-task cost limits are set at 2 USD. Per-task costs are shown in Figure 5." 304 } 305 }, 306 "experimental_rigor": { 307 "seed_sensitivity_reported": { 308 "applies": true, 309 "answer": false, 310 "justification": "The paper explicitly acknowledges: 'Due to budget limitations, we did not repeat experiments to fully mitigate the potential nondeterminism of LLMs.' All results are single-run, and temperature is set to 0.0 but output is still nondeterministic for these models." 311 }, 312 "number_of_runs_stated": { 313 "applies": true, 314 "answer": false, 315 "justification": "The number of experimental runs is not explicitly stated. The limitations imply single runs ('did not repeat experiments'), but this is not stated in the experimental setup or results sections." 316 }, 317 "hyperparameter_search_budget": { 318 "applies": true, 319 "answer": false, 320 "justification": "No hyperparameter search budget is reported. Temperature is fixed at 0.0 and iteration limits at 75, but no justification for these choices or exploration of alternatives is provided." 321 }, 322 "best_config_selection_justified": { 323 "applies": true, 324 "answer": true, 325 "justification": "All 9 agent-model configurations (3×3) are reported in Table 4 and Figure 4. 
The security reminder experiment uses the best-performing combination (SWE-agent + DeepSeek-V3.1) with explicit justification. No cherry-picking of configurations." 326 }, 327 "multiple_comparison_correction": { 328 "applies": false, 329 "answer": false, 330 "justification": "No statistical tests are performed in this paper, so multiple comparison correction is not applicable." 331 }, 332 "self_comparison_bias_addressed": { 333 "applies": true, 334 "answer": false, 335 "justification": "The authors created the benchmark and evaluate third-party tools on it. While they don't implement the tools, potential bias in benchmark design (e.g., task selection favoring certain failure modes) is not discussed." 336 }, 337 "compute_budget_vs_performance": { 338 "applies": true, 339 "answer": true, 340 "justification": "Figure 5 directly plots resolve rate (C&S) vs. cost per task for all 9 configurations. The paper explicitly discusses cost-effectiveness: 'Agents supported by DeepSeek-V3.1 are the most cost-effective, appearing in the upper-left area of the figure.'" 341 }, 342 "benchmark_construct_validity": { 343 "applies": true, 344 "answer": true, 345 "justification": "Section 2 extensively argues why the benchmark design measures secure code generation: aligned vulnerability introduction contexts, comprehensive evaluation combining functionality testing + PoC exploits + SAST. Table 1 compares construct dimensions against prior benchmarks. The three-part evaluation design is justified as more comprehensive than prior work." 346 }, 347 "scaffold_confound_addressed": { 348 "applies": true, 349 "answer": true, 350 "justification": "The 3×3 factorial design (3 agents × 3 models) explicitly separates scaffold and model effects. Table 4 reports averages across agents and across models separately. The paper concludes 'it is the backbone model that exerts the stronger influence' based on this separation." 
351 } 352 }, 353 "data_leakage": { 354 "temporal_leakage_addressed": { 355 "applies": true, 356 "answer": false, 357 "justification": "The benchmark uses historical OSS-Fuzz vulnerabilities with public fixing commits. The paper does not discuss whether the LLMs were trained on data that includes these fixes. The vulnerability fixes are publicly available on GitHub and could be in training data." 358 }, 359 "feature_leakage_addressed": { 360 "applies": true, 361 "answer": true, 362 "justification": "The requirement generation process (Section 2.2, Figure 17) explicitly ensures descriptions are 'security-neutral without explicitly mentioning vulnerabilities' and do not include 'code from the gold patches.' This prevents the prompt from leaking vulnerability or solution information." 363 }, 364 "non_independence_addressed": { 365 "applies": true, 366 "answer": false, 367 "justification": "Table 6 shows that harfbuzz alone accounts for 15.2% of tasks (16/105), and several projects contribute 6-7 tasks each. Non-independence of tasks from the same project (shared codebase, similar patterns) is not discussed as a potential confound." 368 }, 369 "leakage_detection_method": { 370 "applies": true, 371 "answer": false, 372 "justification": "No concrete leakage detection methods (canary strings, membership inference, n-gram overlap analysis) are applied. The paper only ensures security-neutral requirements, but does not detect whether training data contains the vulnerability fixes." 373 } 374 } 375 }, 376 "claims": [ 377 { 378 "claim": "Current code agents struggle to produce secure code, with the best combination (SWE-agent + DeepSeek-V3.1) achieving only 15.2% correct-and-secure solutions and an average of 9.2% across all configurations.", 379 "evidence": "Figure 4 and Table 4 report C&S rates for all 9 agent-model configurations. 
SWE+DS achieves 15.2% and the overall average is 9.2%.", 380 "supported": "strong" 381 }, 382 { 383 "claim": "Code agents introduce new types of security vulnerabilities not previously recorded in the benchmark, with 14 distinct CWE types detected by SAST compared to 11 historical types.", 384 "evidence": "Table 5 reports CWE distribution of newly introduced suspected vulnerabilities, including CWE-14 (12.9%) which is not in the historical benchmark. Among correct solutions, 'more than 20% generated code is reported to produce new potential vulnerabilities' (Section 1).", 385 "supported": "moderate" 386 }, 387 { 388 "claim": "An explicit security reminder does not significantly improve secure coding ability, with C&S remaining at 16/105 while invalid outputs increase.", 389 "evidence": "Figure 7 compares SWE-agent + DeepSeek-V3.1 with and without security reminders. C&S stays at 16 while NO increases from 2 to 6 and CE from 28 to 31.", 390 "supported": "moderate" 391 }, 392 { 393 "claim": "The backbone model has a stronger influence on performance than the agent framework, with DeepSeek-V3.1 achieving nearly twice the C&S rate of other models.", 394 "evidence": "Table 4 shows model averages: DeepSeek 14.3% vs Claude 7.3% and GPT 6.0%. Figure 5 shows all three agent frameworks cluster similarly when using the same model. The paper states 'it is the backbone model that exerts the stronger influence.'", 395 "supported": "moderate" 396 }, 397 { 398 "claim": "Claude 3.7 Sonnet generates the highest proportion of vulnerable outputs despite comparable functional correctness to DeepSeek-V3.1.", 399 "evidence": "Table 4 reports Claude's functional correctness at 31.5% vs DeepSeek's 35.9%, but Claude's CV rate (17.5%) exceeds DeepSeek's (13.7%). The paper notes Claude's 'limited capability in ensuring software security.'", 400 "supported": "moderate" 401 } 402 ], 403 "red_flags": [ 404 { 405 "flag": "No repeated experiments", 406 "detail": "All results are single-run. 
The authors acknowledge 'we did not repeat experiments to fully mitigate the potential nondeterminism of LLMs.' Even with temperature set to 0.0, some stochasticity still exists in agent behavior, and conclusions drawn from single-run comparisons are less reliable." 407 }, 408 { 409 "flag": "No statistical tests for comparative claims", 410 "detail": "Claims that DeepSeek-V3.1 'outperforms' other models and that agent frameworks differ in capability are based purely on comparing raw percentages from single runs. With 105 tasks, differences of a few percentage points could easily be within noise." 411 }, 412 { 413 "flag": "Contamination risk unaddressed", 414 "detail": "The benchmark uses publicly available OSS-Fuzz vulnerability fixes from GitHub. LLMs trained on code repositories may have seen the exact fixing patches. The paper never discusses training cutoff dates or whether models could have memorized solutions." 415 }, 416 { 417 "flag": "Security reminder tested on single configuration only", 418 "detail": "The security reminder experiment (Figure 7) is conducted only with SWE-agent + DeepSeek-V3.1. The claim that security reminders are insufficient is based on this single configuration, limiting generalizability to other agent-model combinations." 419 }, 420 { 421 "flag": "SAST false positive rate not quantified", 422 "detail": "The 'Correct but Suspicious' category relies on Semgrep SAST, which the paper acknowledges 'may yield false positives.' The CS rate is reported as a security metric without quantifying false positive rates, potentially inflating perceived security risks." 423 } 424 ], 425 "cited_papers": [ 426 { 427 "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?", 428 "authors": ["Carlos E Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik R Narasimhan"], 429 "year": 2024, 430 "relevance": "Foundational coding agent benchmark that SecureAgentBench extends to security evaluation." 
431 }, 432 { 433 "title": "SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering", 434 "authors": ["John Yang", "Carlos E Jimenez", "Alexander Wettig", "Kilian Lieret", "Shunyu Yao", "Karthik R Narasimhan", "Ofir Press"], 435 "year": 2024, 436 "arxiv_id": "2405.15793", 437 "relevance": "One of three code agent frameworks evaluated in SecureAgentBench." 438 }, 439 { 440 "title": "OpenHands: An Open Platform for AI Software Developers as Generalist Agents", 441 "authors": ["Xingyao Wang", "Boxuan Li", "Yufan Song"], 442 "year": 2024, 443 "arxiv_id": "2407.16741", 444 "relevance": "One of three code agent frameworks evaluated, representing extensible agent scaffold design." 445 }, 446 { 447 "title": "Purple Llama CyberSecEval: A Secure Coding Benchmark for Language Models", 448 "authors": ["Manish Bhatt", "Sahana Chennabasappa"], 449 "year": 2023, 450 "arxiv_id": "2312.04724", 451 "relevance": "Prior secure coding benchmark for LLMs, compared against in Table 1." 452 }, 453 { 454 "title": "Asleep at the Keyboard? Assessing the Security of GitHub Copilot's Code Contributions", 455 "authors": ["Hammond Pearce", "Baleegh Ahmad", "Benjamin Tan", "Brendan Dolan-Gavitt", "Ramesh Karri"], 456 "year": 2025, 457 "relevance": "Found ~40% of Copilot code completions are vulnerable, motivating secure code generation evaluation." 458 }, 459 { 460 "title": "BaxBench: Can LLMs Generate Correct and Secure Backends?", 461 "authors": ["Mark Vero", "Niels Mündler", "Victor Chibotaru"], 462 "year": 2025, 463 "relevance": "Closest related benchmark combining correctness and security evaluation for LLM-generated code." 464 }, 465 { 466 "title": "CWEval: Outcome-Driven Evaluation on Functionality and Security of LLM Code Generation", 467 "authors": ["Jinjun Peng", "Leyi Cui", "Kele Huang", "Junfeng Yang", "Baishakhi Ray"], 468 "year": 2025, 469 "relevance": "Prior benchmark evaluating both correctness and security of LLM-generated code." 
470 }, 471 { 472 "title": "SecRepoBench: Benchmarking LLMs for Secure Code Generation in Real-World Repositories", 473 "authors": ["Connor Dilgren", "Purva Chiniya", "Luke Griffith", "Yu Ding", "Yizheng Chen"], 474 "year": 2025, 475 "arxiv_id": "2504.21205", 476 "relevance": "Repository-level secure coding benchmark focusing on single-function vulnerability fixing." 477 }, 478 { 479 "title": "SafeGenBench: A Benchmark Framework for Security Vulnerability Detection in LLM-Generated Code", 480 "authors": ["Xinghang Li", "Jingzhe Ding", "Chao Peng"], 481 "year": 2025, 482 "arxiv_id": "2506.05692", 483 "relevance": "Security benchmark using SAST and LLM-judge for evaluating LLM-generated function security." 484 }, 485 { 486 "title": "SWT-bench: Testing and Validating Real-World Bug-Fixes with Code Agents", 487 "authors": ["Niels Mündler", "Mark Müller", "Jingxuan He", "Martin Vechev"], 488 "year": 2024, 489 "relevance": "Agent evaluation benchmark for unit test generation, extending the SWE-bench paradigm." 490 }, 491 { 492 "title": "Instruction Tuning for Secure Code Generation", 493 "authors": ["Jingxuan He", "Mark Vero", "Gabriela Krasnopolska", "Martin Vechev"], 494 "year": 2024, 495 "relevance": "SafeCoder technique combining security-aware finetuning for improving LLM code security." 496 }, 497 { 498 "title": "Evaluating Agent-Based Program Repair at Google", 499 "authors": ["Pat Rondon", "Renyao Wei", "José Cambronero"], 500 "year": 2025, 501 "arxiv_id": "2501.07531", 502 "relevance": "Industry evaluation of agent-based program repair, informs benchmark design decisions." 503 } 504 ] 505 }