scan.json (32830B)
1 { 2 "paper": { 3 "title": "SEC-bench: Automated Benchmarking of LLM Agents on Real-World Software Security Tasks", 4 "authors": [ 5 "Hwiwon Lee", 6 "Ziqi Zhang", 7 "Hanxiao Lu", 8 "Lingming Zhang" 9 ], 10 "year": 2025, 11 "venue": "NeurIPS 2025", 12 "arxiv_id": "2506.11791", 13 "doi": "10.48550/arXiv.2506.11791" 14 }, 15 "scan_version": 3, 16 "active_modules": ["experimental_rigor", "data_leakage"], 17 "methodology_tags": ["benchmark-eval"], 18 "key_findings": "SEC-bench introduces a multi-agent framework (SECVERIFIER) that automatically verifies 200 real-world CVE instances from 898 candidates at $0.87/instance; on a 50-instance comparison its verification success rate (26.0%) is an 85.7% relative improvement over the single-agent CODEACT baseline (14.0%). State-of-the-art code agents (SWE-agent, OpenHands, Aider) achieve at most 18.0% success on PoC generation and 34.0% on vulnerability patching, far below the >60% resolve rates seen on general software engineering benchmarks like SWE-bench. Failure analysis reveals distinct failure modes: compilation errors and lingering vulnerabilities dominate SWE-agent failures, while improper patch formatting dominates OpenHands failures.", 19 "checklist": { 20 "artifacts": { 21 "code_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "Code is released at https://github.com/SEC-bench/SEC-bench and dataset at https://hf.co/datasets/SEC-bench/SEC-bench, with a leaderboard at https://sec-bench.github.io. These URLs are provided on page 1." 25 }, 26 "data_released": { 27 "applies": true, 28 "answer": true, 29 "justification": "The dataset is released on HuggingFace at https://hf.co/datasets/SEC-bench/SEC-bench. Docker images for each verified instance are also provided as part of the framework." 30 }, 31 "environment_specified": { 32 "applies": true, 33 "answer": true, 34 "justification": "Section B.3 specifies agent versions (SWE-agent 1.0.1, OpenHands 0.33.0, Aider 0.82.0), Docker container configurations, and the evaluation environment setup. 
Each benchmark instance is packaged as a Docker image with all dependencies." 35 }, 36 "reproduction_instructions": { 37 "applies": true, 38 "answer": true, 39 "justification": "The paper provides full prompt templates (Appendix B.4, B.5), agent configurations (Section B.3), Docker-based evaluation environments, and the released code repository includes the full framework for reproducing the benchmark and evaluations." 40 } 41 }, 42 "statistical_methodology": { 43 "confidence_intervals_or_error_bars": { 44 "applies": true, 45 "answer": false, 46 "justification": "Main results in Table 4 are reported as point estimates without confidence intervals or error bars. One stability check reports std dev (30.0% ± 7.9% for SWE-agent + o3-mini over 5 runs), but this covers only one of the nine agent-model configurations." 47 }, 48 "significance_tests": { 49 "applies": true, 50 "answer": false, 51 "justification": "The Wilcoxon signed-rank test (p=0.27) is used for the contamination analysis (Section 3.2), but no significance tests are applied to the main comparative claims between agents and models in Table 4. Claims like 'SWE-agent achieves 33.8% vs Aider at 20.0%' are based on raw number comparisons." 52 }, 53 "effect_sizes_reported": { 54 "applies": true, 55 "answer": true, 56 "justification": "Effect sizes are reported with baseline context throughout: '85.7% improvement over CODEACT' (26.0% vs 14.0%), success rates broken down by component (Builder 81.7%, Exploiter 39.4%, Fixer 69.2%), and percentage-point differences in contamination analysis (Table 5)." 57 }, 58 "sample_size_justified": { 59 "applies": true, 60 "answer": false, 61 "justification": "The 200 verified instances result from the automated pipeline, not from a pre-specified target. The 80-instance subset for agent comparison is motivated by 'budget constraints' (Section 3.1) with no power analysis or justification that 80 instances provide sufficient statistical power." 
62 }, 63 "variance_reported": { 64 "applies": true, 65 "answer": false, 66 "justification": "Only one of nine agent-model configurations was repeated (SWE-agent + o3-mini, 5 runs, 30.0% ± 7.9%). All main results in Table 4 are single-run point estimates with no variance measures." 67 } 68 }, 69 "evaluation_design": { 70 "baselines_included": { 71 "applies": true, 72 "answer": true, 73 "justification": "SECVERIFIER is compared against CODEACT (single-agent baseline) in Table 3. For agent evaluation, three agent scaffolds (SWE-agent, OpenHands, Aider) are compared against each other across three models." 74 }, 75 "baselines_contemporary": { 76 "applies": true, 77 "answer": true, 78 "justification": "All baselines are contemporary: Claude 3.7 Sonnet, GPT-4o, o3-mini for models; SWE-agent v1.0.1, OpenHands v0.33.0, Aider v0.82.0 for agent scaffolds. CODEACT is built on the same OpenHands framework for controlled comparison." 79 }, 80 "ablation_study": { 81 "applies": true, 82 "answer": true, 83 "justification": "Table 3 presents an ablation comparing multi-agent SECVERIFIER vs single-agent CODEACT on 50 random instances, isolating the impact of the multi-agent decomposition. Per-component success rates (Builder, Exploiter, Fixer) further decompose the pipeline." 84 }, 85 "multiple_metrics": { 86 "applies": true, 87 "answer": true, 88 "justification": "Results report resolved rate (%), submitted rate (%), average cost ($), and average steps. Failure analysis (Section 3.3) adds four failure type categories. Per-component success rates are also reported." 89 }, 90 "human_evaluation": { 91 "applies": true, 92 "answer": false, 93 "justification": "Agent outputs are evaluated entirely by automated sanitizer verdicts. The manual verification in Section 2.5 is for benchmark construction quality (inspecting bug reports and patches), not for evaluating agent performance on the security tasks." 
94 }, 95 "held_out_test_set": { 96 "applies": true, 97 "answer": false, 98 "justification": "The 80-instance subset used for model/agent comparison (Table 4) is drawn from the same 200-instance dataset used for full evaluation. Claude 3.7 Sonnet was selected as the best model on the 80-instance subset, then evaluated on the full 200 instances that include those same 80." 99 }, 100 "per_category_breakdown": { 101 "applies": true, 102 "answer": true, 103 "justification": "Results are broken down by project (Table 1, 29 projects), by agent scaffold and model (Table 4), by task type (PoC generation vs patching), by failure category (Figure 2: NP, IF, CE, SV), and by pre/post-cutoff (Table 5)." 104 }, 105 "failure_cases_discussed": { 106 "applies": true, 107 "answer": true, 108 "justification": "Section 3.3 provides detailed failure analysis with four categories (No Patch, Improper Format, Compilation Error, Still Vulnerable). Specific failure examples are given, such as OpenHands modifying ~7,000 lines in gpac.cve-2023-0358 and SWE-agent misidentifying root cause in mruby.cve-2022-1201." 109 }, 110 "negative_results_reported": { 111 "applies": true, 112 "answer": true, 113 "justification": "The overall low success rates (18.0% PoC, 34.0% patching) are themselves negative results showing current LLM limitations. Aider's consistently poor performance is reported. The exploiter agent's low from-scratch PoC generation rate (3/289 in Appendix E) is also reported." 114 } 115 }, 116 "claims_and_evidence": { 117 "abstract_claims_supported": { 118 "applies": true, 119 "answer": true, 120 "justification": "Abstract claims are supported: $0.87/instance matches Table 1 average; 18.0% PoC and 34.0% patching match full-dataset evaluation in Section 3.2; the 85.7% improvement over CODEACT matches Table 3 (26.0% vs 14.0%)." 
121 }, 122 "causal_claims_justified": { 123 "applies": true, 124 "answer": true, 125 "justification": "The main causal claim—that the multi-agent framework improves verification—is supported by a controlled comparison against CODEACT using the same underlying framework (OpenHands), same 50 random instances, isolating the multi-agent variable (Table 3)." 126 }, 127 "generalization_bounded": { 128 "applies": true, 129 "answer": false, 130 "justification": "The title claims 'Real-World Software Security Tasks' and abstract says 'software security tasks' generally, but the benchmark only covers C/C++ memory safety vulnerabilities detectable by sanitizers. While Section 5 acknowledges the C/C++ focus, the title and abstract overstate the scope relative to the actual evaluation." 131 }, 132 "alternative_explanations_discussed": { 133 "applies": true, 134 "answer": false, 135 "justification": "The paper attributes low agent performance to inherent task difficulty without discussing alternative explanations: whether prompt design, cost limits ($1.0-$1.5), iteration caps (75), or the evaluation setup itself contribute to low scores. The contamination analysis partially addresses one alternative but the main results lack such discussion." 136 }, 137 "proxy_outcome_distinction": { 138 "applies": true, 139 "answer": true, 140 "justification": "The paper explicitly justifies sanitizer verdicts as the evaluation oracle (Section 2.1, Appendix B.2), discussing why this provides 'objective, deterministic verification without subjective judgment.' Section 5 acknowledges this only covers memory safety vulnerabilities, not all security engineering capabilities." 141 } 142 }, 143 "setup_transparency": { 144 "model_versions_specified": { 145 "applies": true, 146 "answer": false, 147 "justification": "Models are identified by marketing names only: 'Claude 3.7 Sonnet', 'GPT-4o', 'o3-mini', 'Claude 3 Haiku'. 
No snapshot dates, API version identifiers, or model build dates are provided for any model." 148 }, 149 "prompts_provided": { 150 "applies": true, 151 "answer": true, 152 "justification": "Full prompt text is provided in Appendix B.4 (PoC generation, Figure 4), B.5 (vulnerability patching, Figure 5), and Appendix D (all SECVERIFIER agent prompts: Builder D.1, Exploiter D.2, Fixer D.3, single-agent D.4)." 153 }, 154 "hyperparameters_reported": { 155 "applies": true, 156 "answer": true, 157 "justification": "Section B.3 reports temperature (0.0 for all models), maximum iterations (75 for SWE-agent/OpenHands), and cost limits ($1.5 for Claude 3.7 Sonnet, $1.0 for GPT-4o and o3-mini). Agent versions are also specified." 158 }, 159 "scaffolding_described": { 160 "applies": true, 161 "answer": true, 162 "justification": "SECVERIFIER's multi-agent architecture is described in detail (Section 2.3): Manager, Builder, Exploiter, Fixer agents with specific roles, coordination logic, and iterative refinement. Evaluation agent configurations are described in Section B.3 (SWE-agent terminal interface, OpenHands CODEACT scaffold, Aider Git integration)." 163 }, 164 "data_preprocessing_documented": { 165 "applies": true, 166 "answer": true, 167 "justification": "Section 2.2 documents the full pipeline with counts: 38,201 OSV instances → 4,836 with sufficient documentation → 898 with sanitizer reports → 200 verified instances. Each filtering step's criteria are described. Section 2.5 documents the manual inspection process." 168 } 169 }, 170 "limitations_and_scope": { 171 "limitations_section_present": { 172 "applies": true, 173 "answer": true, 174 "justification": "Section 5 'Limitations and Future Work' provides substantive discussion of two specific limitations: the C/C++ language restriction and the memory safety sanitizer subset of vulnerability types." 
175 }, 176 "threats_to_validity_specific": { 177 "applies": true, 178 "answer": true, 179 "justification": "Section 5 discusses specific threats: 'we focus on C/C++ projects due to the reliability of memory safety sanitizers,' 'our current implementation covers a specific subset of vulnerability types detectable by memory safety sanitizers.' These are specific to this study's design choices." 180 }, 181 "scope_boundaries_stated": { 182 "applies": true, 183 "answer": true, 184 "justification": "Section 5 explicitly states what is not covered: other programming languages, vulnerability types beyond memory safety, web applications, operating system kernels, and distributed systems. Future work directions to address these gaps are described." 185 } 186 }, 187 "data_integrity": { 188 "raw_data_available": { 189 "applies": true, 190 "answer": true, 191 "justification": "The full dataset is released on HuggingFace (https://hf.co/datasets/SEC-bench/SEC-bench) including verified instances with Docker images, PoC artifacts, and patches. The source code for the framework is on GitHub." 192 }, 193 "data_collection_described": { 194 "applies": true, 195 "answer": true, 196 "justification": "Section 2.2 describes collection from the OSV database, customized web scraping of bug reports from diverse platforms (GitHub Issues, RedHat Bugzilla, Chromium Issue Tracker), and adaptation of OSS-FUZZ configurations. Inclusion criteria at each stage are specified." 197 }, 198 "recruitment_methods_described": { 199 "applies": false, 200 "answer": false, 201 "justification": "No human participants. Data sources are standard public vulnerability databases (OSV, NVD) and associated bug tracking platforms." 
202 }, 203 "data_pipeline_documented": { 204 "applies": true, 205 "answer": true, 206 "justification": "The pipeline is documented with counts at each stage: 38,201 initial OSV instances → 4,836 with documentation → 898 candidates with sanitizer reports → 200 verified instances across 29 projects. Manual verification (Section 2.5) documents three rounds of inspection with specific criteria." 207 } 208 }, 209 "conflicts_of_interest": { 210 "funding_disclosed": { 211 "applies": true, 212 "answer": false, 213 "justification": "No funding acknowledgments or grant information is provided anywhere in the paper." 214 }, 215 "affiliations_disclosed": { 216 "applies": true, 217 "answer": true, 218 "justification": "Author affiliations are listed: University of Illinois Urbana-Champaign and Purdue University. The paper evaluates third-party models and agents, not products from the authors' institutions." 219 }, 220 "funder_independent_of_outcome": { 221 "applies": true, 222 "answer": false, 223 "justification": "No funding is disclosed, so independence cannot be assessed. The absence of a funding disclosure is itself the issue." 224 }, 225 "financial_interests_declared": { 226 "applies": true, 227 "answer": false, 228 "justification": "No competing interests statement or financial disclosure is present in the paper." 229 } 230 }, 231 "contamination": { 232 "training_cutoff_stated": { 233 "applies": true, 234 "answer": false, 235 "justification": "Training cutoffs are stated only for the contamination analysis models: GPT-4o (KC: Sep 2023) and Claude 3 Haiku (KC: Aug 2023) in Section 3.2. The main evaluation model (Claude 3.7 Sonnet) and o3-mini do not have their training cutoffs stated." 
236 }, 237 "train_test_overlap_discussed": { 238 "applies": true, 239 "answer": true, 240 "justification": "Section 3.2 presents a dedicated data contamination analysis comparing 15 pre-cutoff and 15 post-cutoff instances based on CVE reserved dates, with Wilcoxon signed-rank test (p=0.27) showing no significant difference." 241 }, 242 "benchmark_contamination_addressed": { 243 "applies": true, 244 "answer": true, 245 "justification": "The contamination analysis (Table 5) uses CVE reserved dates to split instances relative to model knowledge cutoffs and tests for performance differences. The Wilcoxon test finds no significant contamination effect. However, this analysis covers only GPT-4o and Claude 3 Haiku, not the main evaluation models." 246 } 247 }, 248 "human_studies": { 249 "pre_registered": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study. The benchmark evaluates automated LLM agents on security tasks." 253 }, 254 "irb_or_ethics_approval": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants. The study uses public vulnerability databases and automated agent evaluation." 258 }, 259 "demographics_reported": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in the evaluation. The manual verification involves two authors but this is quality assurance, not a human subjects study." 263 }, 264 "inclusion_exclusion_criteria": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants." 268 }, 269 "randomization_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants." 273 }, 274 "blinding_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants." 278 }, 279 "attrition_reported": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants." 
283 } 284 }, 285 "cost_and_practicality": { 286 "inference_cost_reported": { 287 "applies": true, 288 "answer": true, 289 "justification": "Per-instance costs are reported throughout: $0.87 average for SECVERIFIER (Table 1), per-agent average costs in Table 4 (e.g., SWE-agent + Claude 3.7 Sonnet: $1.29 for patching, $1.52 for PoC), and SECVERIFIER vs CODEACT costs in Table 3." 290 }, 291 "compute_budget_stated": { 292 "applies": true, 293 "answer": false, 294 "justification": "Per-instance costs and average steps are reported, but no total computational budget (total API spend, total GPU hours, or aggregate cost for the full evaluation campaign) is stated." 295 } 296 }, 297 "experimental_rigor": { 298 "seed_sensitivity_reported": { 299 "applies": true, 300 "answer": true, 301 "justification": "Section 3.2 reports a stability analysis: 'we select SWE-agent and o3-mini as the representative agent and model, and repeat the experiments five times. The average success rate is 30.0% with a standard deviation of 7.9%.' However, this covers only one of nine configurations." 302 }, 303 "number_of_runs_stated": { 304 "applies": true, 305 "answer": false, 306 "justification": "The stability check explicitly states 5 runs for one configuration (SWE-agent + o3-mini). The main results in Table 4 do not state the number of runs, implying single-run evaluations." 307 }, 308 "hyperparameter_search_budget": { 309 "applies": true, 310 "answer": false, 311 "justification": "No hyperparameter search is described. Agent configurations (temperature, iteration limits, cost caps) appear to be set without any reported search process or justification for the specific values chosen." 
312 }, 313 "best_config_selection_justified": { 314 "applies": true, 315 "answer": true, 316 "justification": "Section 3.1 explains: 'we evaluate the best-performing agent on the full dataset' and 'The reason to select Claude 3.7 Sonnet is that it has better performance than other models in our evaluation over a random selected 80-instance subset.'" 317 }, 318 "multiple_comparison_correction": { 319 "applies": false, 320 "answer": false, 321 "justification": "Only one formal statistical test is performed (Wilcoxon signed-rank for contamination). Main agent comparisons use raw percentages without statistical tests, so no multiple comparison correction is applicable." 322 }, 323 "self_comparison_bias_addressed": { 324 "applies": true, 325 "answer": false, 326 "justification": "The authors compare their SECVERIFIER framework against CODEACT without acknowledging self-evaluation bias. No independent evaluation or discussion of author-implementation bias in the baseline comparison." 327 }, 328 "compute_budget_vs_performance": { 329 "applies": true, 330 "answer": false, 331 "justification": "Cost limits differ by model ($1.5 for Claude 3.7 Sonnet vs $1.0 for GPT-4o and o3-mini) but performance is not analyzed as a function of compute budget. No performance-vs-cost curves are provided." 332 }, 333 "benchmark_construct_validity": { 334 "applies": true, 335 "answer": true, 336 "justification": "Section 2.1 and Appendix B.2 discuss why sanitizer verdicts are appropriate evaluation oracles. Section 5 explicitly acknowledges that the benchmark covers only memory safety vulnerabilities in C/C++, not all security engineering. Comparison with SWE-bench statistics (Table 2) contextualizes the benchmark." 337 }, 338 "scaffold_confound_addressed": { 339 "applies": true, 340 "answer": true, 341 "justification": "Table 4 presents a full 3×3 grid of agent scaffolds × models, allowing separation of scaffold effects from model effects. 
Section 3.2 analyzes both dimensions independently, noting that SWE-agent and OpenHands achieve comparable performance while Aider is consistently lower." 342 } 343 }, 344 "data_leakage": { 345 "temporal_leakage_addressed": { 346 "applies": true, 347 "answer": true, 348 "justification": "Section 3.2 uses CVE reserved dates to split instances into pre/post knowledge-cutoff groups, directly addressing temporal leakage. The Wilcoxon test (p=0.27) finds no significant temporal effect." 349 }, 350 "feature_leakage_addressed": { 351 "applies": true, 352 "answer": true, 353 "justification": "Section 2.5 addresses feature leakage through manual inspection of bug reports: 'We inspect all bug reports and remove directly provided patches while preserving essential context.' This prevents agents from simply copying patch code from the task description." 354 }, 355 "non_independence_addressed": { 356 "applies": true, 357 "answer": false, 358 "justification": "Multiple instances come from the same project (e.g., 43 from gpac, 31 from imagemagick, 21 from mruby). The paper does not discuss whether performance on instances from the same project is correlated or whether this violates independence assumptions in the evaluation." 359 }, 360 "leakage_detection_method": { 361 "applies": true, 362 "answer": true, 363 "justification": "A concrete detection method is applied: temporal splitting of 30 instances by CVE reserved date relative to model knowledge cutoffs, with Wilcoxon signed-rank test for statistical comparison (Table 5, p=0.27)." 364 } 365 } 366 }, 367 "claims": [ 368 { 369 "claim": "SECVERIFIER successfully verifies 200 real-world CVE instances, an 85.7% improvement over the single-agent CODEACT baseline.", 370 "evidence": "Table 1 shows 200 verified from 898 seed instances (22.3% overall). 
Table 3 compares SECVERIFIER (26.0%) vs CODEACT (14.0%) on 50 random instances across 23 projects.", 371 "supported": "moderate" 372 }, 373 { 374 "claim": "State-of-the-art code agents achieve at most 18.0% success in PoC generation and 34.0% in vulnerability patching.", 375 "evidence": "Section 3.2 reports full-dataset (200 instances) evaluation results. Table 4 shows detailed 80-instance results across 3 agents × 3 models. Stability check on one config shows 30.0% ± 7.9% over 5 runs.", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "SEC-bench automatically creates benchmark instances at $0.87 per verified instance.", 380 "evidence": "Table 1 reports average cost across 29 projects, ranging from $0.49 (readstat) to $1.20 (matio), with an overall average of $0.87.", 381 "supported": "strong" 382 }, 383 { 384 "claim": "No significant data contamination effect exists in the benchmark (Wilcoxon p=0.27).", 385 "evidence": "Table 5 compares performance on 15 pre-cutoff and 15 post-cutoff instances for GPT-4o and Claude 3 Haiku. Results show inconsistent patterns across tasks.", 386 "supported": "weak" 387 }, 388 { 389 "claim": "Security tasks are significantly more challenging than general software engineering tasks, with agents scoring >60% on SWE-bench but only 18-34% on SEC-bench.", 390 "evidence": "Section 3.2 compares against SWE-bench verified leaderboard scores. Claude 3.7 Sonnet achieves >60% on SWE-bench verified but 18-34% on SEC-bench tasks.", 391 "supported": "weak" 392 } 393 ], 394 "red_flags": [ 395 { 396 "flag": "High result variance with single-run main results", 397 "detail": "The one repeated experiment (SWE-agent + o3-mini) shows 30.0% ± 7.9% std dev—a coefficient of variation of 26%. Yet all main results in Table 4 are single-run point estimates. This level of variance means the reported rankings between agents/models may not be stable." 
398 }, 399 { 400 "flag": "Selection and evaluation data overlap", 401 "detail": "Claude 3.7 Sonnet was selected as the best model based on the 80-instance subset, then evaluated on the full 200-instance dataset that includes those same 80 instances. Because the model was chosen for scoring best on those 80 instances, the reported full-dataset performance is likely optimistically biased." 402 }, 403 { 404 "flag": "Very small contamination analysis sample", 405 "detail": "The data contamination analysis uses only 30 instances (15 pre-cutoff + 15 post-cutoff) with secondary models (GPT-4o, Claude 3 Haiku), not the main evaluation model (Claude 3.7 Sonnet). The sample is too small to reliably detect contamination effects." 406 }, 407 { 408 "flag": "Unequal cost limits across models", 409 "detail": "Claude 3.7 Sonnet is given a $1.5 cost limit while GPT-4o and o3-mini are capped at $1.0. Since Claude is the best-performing model, the higher budget may contribute to its advantage rather than model capability alone." 410 }, 411 { 412 "flag": "Cross-benchmark comparison is uncontrolled", 413 "detail": "The claim that security tasks are harder than SE tasks is based on comparing SEC-bench scores (C/C++ memory safety, sanitizer evaluation) with SWE-bench scores (Python, test-suite evaluation). These differ in language, evaluation method, task type, and domain, making the comparison largely anecdotal." 414 } 415 ], 416 "cited_papers": [ 417 { 418 "title": "SWE-bench: Can Language Models Resolve Real-world Github Issues?", 419 "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik R. Narasimhan"], 420 "year": 2024, 421 "relevance": "Foundational software engineering benchmark for evaluating LLM agents on real-world GitHub issues; SEC-bench positions itself as the security counterpart." 422 }, 423 { 424 "title": "SWE-agent: Agent-computer interfaces enable automated software engineering", 425 "authors": ["John Yang", "Carlos E. 
Jimenez", "Alexander Wettig", "Kilian Lieret", "Shunyu Yao", "Karthik Narasimhan", "Ofir Press"], 426 "year": 2024, 427 "arxiv_id": "2405.15793", 428 "relevance": "One of the three agent scaffolds evaluated in the benchmark; introduces agent-computer interface concept for code agents." 429 }, 430 { 431 "title": "OpenHands: An open platform for AI software developers as generalist agents", 432 "authors": ["Xingyao Wang", "Boxuan Li", "Yufan Song"], 433 "year": 2024, 434 "arxiv_id": "2407.16741", 435 "relevance": "Open-source agent framework evaluated in SEC-bench; SECVERIFIER is built on top of the OpenHands platform." 436 }, 437 { 438 "title": "Executable code actions elicit better LLM agents", 439 "authors": ["Xingyao Wang", "Yangyi Chen", "Lifan Yuan", "Yizhe Zhang", "Yunzhu Li", "Hao Peng", "Heng Ji"], 440 "year": 2024, 441 "relevance": "CODEACT single-agent framework used as the baseline comparison for SECVERIFIER's multi-agent approach." 442 }, 443 { 444 "title": "Cybench: A framework for evaluating cybersecurity capabilities and risks of language models", 445 "authors": ["Andy K Zhang", "Neil Perry", "Riya Dulepet"], 446 "year": 2024, 447 "arxiv_id": "2408.08926", 448 "relevance": "CTF-based cybersecurity benchmark for LLMs; SEC-bench contrasts its automated approach against Cybench's manual CTF construction." 449 }, 450 { 451 "title": "CVE-Bench: Benchmarking LLM-based Software Engineering Agent's Ability to Repair Real-World CVE Vulnerabilities", 452 "authors": ["Peiran Wang", "Xiaogeng Liu", "Chaowei Xiao"], 453 "year": 2025, 454 "relevance": "Vulnerability repair benchmark based on CVEFixes dataset; SEC-bench critiques its 51% label accuracy due to lack of verification." 
455 }, 456 { 457 "title": "CVE-Bench: A Benchmark for AI Agents' Ability to Exploit Real-World Web Application Vulnerabilities", 458 "authors": ["Yuxuan Zhu", "Antony Kellermann", "Dylan Bowman"], 459 "year": 2025, 460 "arxiv_id": "2503.17332", 461 "relevance": "Web application vulnerability benchmark for AI agents; SEC-bench contrasts its general C/C++ approach against CVE-Bench's web-framework-specific design." 462 }, 463 { 464 "title": "ARVO: Atlas of reproducible vulnerabilities for open source software", 465 "authors": ["Xiang Mei", "Pulkit Singh Singaria"], 466 "year": 2024, 467 "arxiv_id": "2408.02153", 468 "relevance": "Reproducible vulnerability dataset from OSS-FUZZ; SEC-bench extends beyond structured bug datasets to handle in-the-wild CVEs." 469 }, 470 { 471 "title": "Agentless: Demystifying LLM-based software engineering agents", 472 "authors": ["Chunqiu Steven Xia", "Yinlin Deng", "Soren Dunn", "Lingming Zhang"], 473 "year": 2024, 474 "arxiv_id": "2407.01489", 475 "relevance": "Two-stage agentless framework for SE tasks, representing an alternative architecture to agent-based approaches evaluated in SEC-bench." 476 }, 477 { 478 "title": "Enigma: Enhanced interactive generative model agent for CTF challenges", 479 "authors": ["Talor Abramovich", "Meet Udeshi", "Minghao Shao"], 480 "year": 2024, 481 "arxiv_id": "2409.16165", 482 "relevance": "Agent framework specifically designed for CTF security challenges, using agent-computer interfaces relevant to security task evaluation." 483 }, 484 { 485 "title": "SWE-RL: Advancing LLM reasoning via reinforcement learning on open software evolution", 486 "authors": ["Yuxiang Wei", "Olivier Duchenne", "Jade Copet"], 487 "year": 2025, 488 "arxiv_id": "2502.18449", 489 "relevance": "Applies GRPO reinforcement learning to improve agent reasoning for software engineering, a potential approach for improving security task performance." 
490 }, 491 { 492 "title": "LLMs Cannot Reliably Identify and Reason About Security Vulnerabilities (Yet?)", 493 "authors": ["Saad Ullah", "Mingji Han", "Saurabh Pujar", "Hammond Pearce", "Ayse K. Coskun", "Gianluca Stringhini"], 494 "year": 2024, 495 "relevance": "Comprehensive evaluation of LLMs on vulnerability identification showing fundamental limitations, corroborating SEC-bench's findings of low agent performance." 496 } 497 ], 498 "engagement_factors": { 499 "practical_relevance": { 500 "score": 2, 501 "justification": "Released framework, dataset, and Docker images that security researchers and tool developers can use to evaluate LLM agents on real vulnerability tasks." 502 }, 503 "surprise_contrarian": { 504 "score": 1, 505 "justification": "Low LLM performance on security tasks (18-34%) is expected and confirms the conventional view that security engineering remains beyond current agent capabilities." 506 }, 507 "fear_safety": { 508 "score": 1, 509 "justification": "Touches on AI-security intersection by testing PoC generation, but results show LLMs are poor at exploit generation, which is somewhat reassuring rather than alarming." 510 }, 511 "drama_conflict": { 512 "score": 1, 513 "justification": "Criticizes existing benchmarks for quality issues (up to 71% inaccurate data) but the tone is constructive and academic rather than provocative." 514 }, 515 "demo_ability": { 516 "score": 2, 517 "justification": "Code on GitHub, dataset on HuggingFace, and a public leaderboard are available, though running the full benchmark requires significant Docker infrastructure." 518 }, 519 "brand_recognition": { 520 "score": 1, 521 "justification": "From UIUC (Lingming Zhang's group, well-known in SE/testing) published at NeurIPS 2025, but not from a headline AI lab." 522 } 523 } 524 }