scan.json (34506B)
1 { 2 "scan_version": 3, 3 "active_modules": ["experimental_rigor", "data_leakage"], 4 "paper": { 5 "title": "RedCode: Risky Code Execution and Generation Benchmark for Code Agents", 6 "authors": [ 7 "Chengquan Guo", 8 "Xun Liu", 9 "Chulin Xie", 10 "Andy Zhou", 11 "Yi Zeng", 12 "Zinan Lin", 13 "Dawn Song", 14 "Bo Li" 15 ], 16 "year": 2024, 17 "venue": "Neural Information Processing Systems (NeurIPS 2024, Datasets and Benchmarks Track)", 18 "arxiv_id": "2411.07781", 19 "doi": "10.48550/arXiv.2411.07781" 20 }, 21 "checklist": { 22 "artifacts": { 23 "code_released": { 24 "applies": true, 25 "answer": true, 26 "justification": "The abstract states 'Our dataset and code are publicly available at https://github.com/AI-secure/RedCode.' A working GitHub URL is provided." 27 }, 28 "data_released": { 29 "applies": true, 30 "answer": true, 31 "justification": "The benchmark dataset (4,050 RedCode-Exec test cases and 160 RedCode-Gen prompts) is released at the same GitHub repository. App. B confirms 'Our dataset can be found in this GitHub repository link.'" 32 }, 33 "environment_specified": { 34 "applies": true, 35 "answer": true, 36 "justification": "Docker images are provided for each test case, constituting detailed environment specification. §3.2.1 describes Docker environments for sandbox isolation and resource preparation. App. C.1 specifies '10 NVIDIA RTX A6000 GPUs, each with 48 GB of GDDR6 memory.'" 37 }, 38 "reproduction_instructions": { 39 "applies": true, 40 "answer": true, 41 "justification": "The released repository includes Docker environments, evaluation scripts for each scenario, and system prompts for all agents (App. C.2). The paper describes the full evaluation pipeline (Algorithm 1) with enough detail to reproduce experiments." 
42 } 43 }, 44 "statistical_methodology": { 45 "confidence_intervals_or_error_bars": { 46 "applies": true, 47 "answer": false, 48 "justification": "All results are reported as point estimates (e.g., rejection rate percentages, attack success rate percentages) without confidence intervals or error bars in any figure or table." 49 }, 50 "significance_tests": { 51 "applies": true, 52 "answer": false, 53 "justification": "The paper makes numerous comparative claims (e.g., 'Python leads to a higher rejection rate than Bash,' 'OpenCodeInterpreter is relatively safer') but no statistical significance tests are used. All comparisons are based on raw percentage differences." 54 }, 55 "effect_sizes_reported": { 56 "applies": true, 57 "answer": false, 58 "justification": "Main results in Figs. 5-7 report only raw percentages. Tab. 9 includes percentage-point changes (e.g., '↓17%') for mitigation experiments, but no formal effect sizes (Cohen's d, odds ratios) are reported anywhere." 59 }, 60 "sample_size_justified": { 61 "applies": true, 62 "answer": false, 63 "justification": "The paper selects 30 test cases per scenario and 20 prompts per malware family without justifying these numbers. No power analysis or rationale for the sample size is provided." 64 }, 65 "variance_reported": { 66 "applies": true, 67 "answer": false, 68 "justification": "Results are single-run (temperature=0 for most experiments yields deterministic output). For RedCode-Gen (temperature=0.8), no variance across runs is reported. No standard deviations or IQR appear in any table or figure." 69 } 70 }, 71 "evaluation_design": { 72 "baselines_included": { 73 "applies": true, 74 "answer": true, 75 "justification": "The paper compares 19 code agents across 3 agent frameworks (OpenCodeInterpreter, CodeAct, ReAct) with various base LLMs, providing extensive cross-comparisons (Fig. 1, Fig. 6)." 
76 }, 77 "baselines_contemporary": { 78 "applies": true, 79 "answer": true, 80 "justification": "Evaluated models include contemporary LLMs: GPT-4o (2024), Claude-3.5-Sonnet (2024), Llama-3.1-70b (2024), alongside older models for breadth." 81 }, 82 "ablation_study": { 83 "applies": false, 84 "answer": false, 85 "justification": "This is a benchmark paper, not a system with decomposable components. The paper compares experimental conditions (input formats, languages, agent frameworks) but these are evaluation variables, not ablatable system components." 86 }, 87 "multiple_metrics": { 88 "applies": true, 89 "answer": true, 90 "justification": "Three evaluation outcomes are tracked: Rejection Rate (RR), Attack Success Rate (ASR), and Execution Failure rate. For RedCode-Gen, refusal rate, accuracy, and VirusTotal score are reported (Tab. 1)." 91 }, 92 "human_evaluation": { 93 "applies": true, 94 "answer": false, 95 "justification": "Agent outputs are evaluated entirely via deterministic evaluation scripts (RedCode-Exec) or LLM-as-judge plus VirusTotal API (RedCode-Gen). No human evaluation of agent outputs is performed. Manual review was used only for dataset curation, not for evaluating agent behavior." 96 }, 97 "held_out_test_set": { 98 "applies": true, 99 "answer": true, 100 "justification": "The entire RedCode benchmark serves as the evaluation set. No tuning or selection decisions are made on the test cases — all 4,050 RedCode-Exec and 160 RedCode-Gen test cases are used purely for evaluation." 101 }, 102 "per_category_breakdown": { 103 "applies": true, 104 "answer": true, 105 "justification": "Extensive breakdowns are provided: by 8 risky domains (Fig. 5), by 25 individual scenarios (Fig. 6), by input format (Fig. 7), by agent framework (Fig. 1), and by 8 malware categories (Tab. 1). Appendix provides per-scenario analysis (Tab. 10)." 106 }, 107 "failure_cases_discussed": { 108 "applies": true, 109 "answer": true, 110 "justification": "App. 
D.6 discusses four main reasons for execution failure: poor instruction following ability, incorrect assertion, code modification, and autonomous extra activities, with concrete examples for each." 111 }, 112 "negative_results_reported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The paper reports that safety-aware prompts are 'not effective enough' (§4.1.1, Tab. 9), that jailbreaking prefixes sometimes paradoxically increase rejection rates, and that VirusTotal accuracy of LLM-generated malware is 'still relatively low.'" 116 } 117 }, 118 "claims_and_evidence": { 119 "abstract_claims_supported": { 120 "applies": true, 121 "answer": true, 122 "justification": "Abstract claims about agents being more likely to reject OS operations, natural language leading to lower rejection, and stronger models producing more harmful software are all supported by Figs. 5-7 and Tab. 1 respectively." 123 }, 124 "causal_claims_justified": { 125 "applies": true, 126 "answer": false, 127 "justification": "The paper makes causal claims with hedging language: 'This is likely because OCI has hard-coded disk space protection,' 'This might be due to the unbalanced ability of code agents,' 'ReAct employs the Think-Then-Act procedure, which helps the agents to reason about the security implications.' These are causal explanations for observed differences without controlling for confounds or using causal identification strategies." 128 }, 129 "generalization_bounded": { 130 "applies": true, 131 "answer": false, 132 "justification": "The conclusion states 'We find that existing code agents are generally vulnerable' — a broad claim extending beyond the 19 specific agents tested on 25 scenarios. 
The paper tests only Python and Bash with specific agent frameworks, yet makes general claims about 'code agents.'" 133 }, 134 "alternative_explanations_discussed": { 135 "applies": true, 136 "answer": false, 137 "justification": "The paper offers speculative preferred explanations for observed differences (e.g., 'Python tasks could be perceived as more complex or risky, leading to higher rejection rate. Alternatively, agents might be more familiar with Python') but does not substantively discuss confounds or systematically consider alternative explanations." 138 }, 139 "proxy_outcome_distinction": { 140 "applies": true, 141 "answer": true, 142 "justification": "The paper measures rejection rate and attack success rate on specific risky scenarios and frames these as evaluating 'safety of code agents' in the context of risky code execution and generation. The measurements are reasonably well-matched to the specific claims made, though the conclusion's broad use of 'safe' slightly exceeds the measurement scope." 143 } 144 }, 145 "setup_transparency": { 146 "model_versions_specified": { 147 "applies": true, 148 "answer": true, 149 "justification": "Tab. 7 lists exact model endpoints: gpt-4-1106-preview, gpt-4o-2024-05-13, claude-3-5-sonnet-20240620, and HuggingFace links for all open-source models with specific model names." 150 }, 151 "prompts_provided": { 152 "applies": true, 153 "answer": true, 154 "justification": "Full system prompts for ReAct, CodeAct, and OpenCodeInterpreter are provided in App. C.2. The jailbreaking prefix template and dataset augmentation prompts are provided in App. B.1 and C.1. Translation prompts for Python-to-Bash and code-to-text are provided in App. B.1.2 and B.1.3." 155 }, 156 "hyperparameters_reported": { 157 "applies": true, 158 "answer": true, 159 "justification": "App. 
C.2 reports temperature (0 for most agents, 0.8 for RedCode-Gen), max generated tokens (1024 for OCI/ReAct, 512 for CodeAct), and number of interaction rounds (3 for OCI/CodeAct, 8 for ReAct, 5 for RedCode-Gen agents)." 160 }, 161 "scaffolding_described": { 162 "applies": true, 163 "answer": true, 164 "justification": "The three agent frameworks are described in detail: OpenCodeInterpreter's hard-coded safety constraints and execution pipeline, CodeAct's <execute> tag format, and ReAct's Think-Then-Act procedure. System prompts, interaction round limits, and output format handling are all documented (App. C.2)." 165 }, 166 "data_preprocessing_documented": { 167 "applies": true, 168 "answer": true, 169 "justification": "§3.2.1 documents the full pipeline: seed test case creation from CWE and prior benchmarks → LLM-based augmentation with jailbreaking prefixes → manual review for quality → accessible resource preparation → executability verification in Docker. Counts are provided (25 seeds → 750 Python test cases, 600 Bash, etc.)." 170 } 171 }, 172 "limitations_and_scope": { 173 "limitations_section_present": { 174 "applies": true, 175 "answer": true, 176 "justification": "App. A 'Discussion on limitations and impacts' provides a dedicated limitations section discussing the limited scope of programming languages and potential misuse concerns." 177 }, 178 "threats_to_validity_specific": { 179 "applies": true, 180 "answer": true, 181 "justification": "The paper identifies specific threats: 'limited scope of supported programming languages' (Python and Bash only), that 'existing code agents primarily operate Python,' and that URL keyword visibility affects rejection rates (App. D.2). These are specific to this study." 182 }, 183 "scope_boundaries_stated": { 184 "applies": true, 185 "answer": false, 186 "justification": "While the paper notes the language limitation, it does not explicitly state what the results do NOT show. 
The conclusion claims 'existing code agents are generally vulnerable' without bounding this to the tested scenarios, languages, and agent frameworks." 187 } 188 }, 189 "data_integrity": { 190 "raw_data_available": { 191 "applies": true, 192 "answer": true, 193 "justification": "The full dataset of test cases, Docker environments, and evaluation scripts are released at the GitHub repository, allowing independent verification of all evaluation results." 194 }, 195 "data_collection_described": { 196 "applies": true, 197 "answer": true, 198 "justification": "§3.2.1 describes how seed test cases were manually created from CWE and existing benchmarks (ToolEmu, R-judge), then augmented using GPT-4 and Mistral-Large with jailbreaking prefixes, manually reviewed, and verified for executability." 199 }, 200 "recruitment_methods_described": { 201 "applies": false, 202 "answer": false, 203 "justification": "No human participants were involved. The data consists of synthetically generated and manually curated benchmark test cases." 204 }, 205 "data_pipeline_documented": { 206 "applies": true, 207 "answer": true, 208 "justification": "The pipeline is documented with counts at each stage: 25 seed scenarios → Naug augmented cases → manual review → 30 selected per scenario (750 total Python). Translation pipelines for Bash (600 cases after removing 5 incompatible scenarios) and natural language (1,500 for Python, 1,200 for Bash) are similarly documented. Tab. 2 provides complete dataset statistics." 209 } 210 }, 211 "conflicts_of_interest": { 212 "funding_disclosed": { 213 "applies": true, 214 "answer": true, 215 "justification": "The acknowledgements section lists funding from NSF (grants No. 2046726 and IIS-2229876), DARPA GARD, NASA (No. 80NSSC20M0229), Alfred P. Sloan Fellowship, Meta research award, and eBay research award." 
216 }, 217 "affiliations_disclosed": { 218 "applies": true, 219 "answer": true, 220 "justification": "Author affiliations are clearly listed: University of Chicago, UIUC, Lapis Labs, Virginia Tech, Microsoft Research, UC Berkeley. No authors are affiliated with OpenAI, Anthropic, or Meta whose models are evaluated." 221 }, 222 "funder_independent_of_outcome": { 223 "applies": true, 224 "answer": true, 225 "justification": "Primary funders (NSF, DARPA, NASA, Sloan) are independent of the evaluated products. eBay provides a research award and its products are not evaluated. Meta also provides a research award, and Meta's Llama-family models (e.g., Llama-3.1-70b, Llama2-7b) are among the evaluated base LLMs, but Meta is a secondary funder rather than a primary one." 226 }, 227 "financial_interests_declared": { 228 "applies": true, 229 "answer": false, 230 "justification": "No competing interests or financial interests statement is included in the paper. One author is affiliated with Lapis Labs (a company) but no disclosure of financial interests is made." 231 } 232 }, 233 "contamination": { 234 "training_cutoff_stated": { 235 "applies": true, 236 "answer": false, 237 "justification": "No training data cutoff dates are stated for any of the 19 evaluated models, despite testing them on a new benchmark. The models' training data could include CWE examples and similar risky code patterns used to construct the benchmark." 238 }, 239 "train_test_overlap_discussed": { 240 "applies": true, 241 "answer": false, 242 "justification": "No discussion of whether the LLMs' training data includes CWE examples, risky code patterns, or safety training data similar to the benchmark's test cases. The benchmark draws from publicly available CWE content which is likely in training data." 243 }, 244 "benchmark_contamination_addressed": { 245 "applies": true, 246 "answer": false, 247 "justification": "Although this is a new benchmark, its test cases are derived from publicly available CWE examples and augmented using GPT-4/Mistral-Large. 
No discussion of whether evaluated models' safety training might have been conducted on similar content, which could inflate rejection rates." 248 } 249 }, 250 "human_studies": { 251 "pre_registered": { 252 "applies": false, 253 "answer": false, 254 "justification": "No human participants in this study. The paper evaluates automated code agents on a benchmark." 255 }, 256 "irb_or_ethics_approval": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human participants. The study involves only automated evaluation of code agents." 260 }, 261 "demographics_reported": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants in this benchmark evaluation study." 265 }, 266 "inclusion_exclusion_criteria": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants." 270 }, 271 "randomization_described": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants; no experimental conditions requiring randomization of people." 275 }, 276 "blinding_described": { 277 "applies": false, 278 "answer": false, 279 "justification": "No human participants; automated evaluation pipeline." 280 }, 281 "attrition_reported": { 282 "applies": false, 283 "answer": false, 284 "justification": "No human participants." 285 } 286 }, 287 "cost_and_practicality": { 288 "inference_cost_reported": { 289 "applies": true, 290 "answer": false, 291 "justification": "No API costs, token counts, or inference latency are reported despite using commercial APIs (GPT-4, Claude-3.5) for thousands of test cases across 19 agents." 292 }, 293 "compute_budget_stated": { 294 "applies": true, 295 "answer": false, 296 "justification": "App. C.1 mentions '10 NVIDIA RTX A6000 GPUs, each with 48 GB of GDDR6 memory' but does not state total GPU hours, wall-clock time, or total API spend for the experiments." 
297 } 298 }, 299 "experimental_rigor": { 300 "seed_sensitivity_reported": { 301 "applies": true, 302 "answer": false, 303 "justification": "Results are single-run. Temperature=0 is used for deterministic outputs in most experiments, but RedCode-Gen uses temperature=0.8 with no seed sensitivity analysis. No multi-seed results are reported." 304 }, 305 "number_of_runs_stated": { 306 "applies": true, 307 "answer": false, 308 "justification": "The number of experimental runs is never explicitly stated. It is implied that each agent-test case pair is run once, but this is not stated." 309 }, 310 "hyperparameter_search_budget": { 311 "applies": true, 312 "answer": false, 313 "justification": "No hyperparameter search is described. The choices of temperature=0, max_tokens=1024/512, and interaction rounds=3/8 appear to be set without systematic exploration, and no search budget is reported." 314 }, 315 "best_config_selection_justified": { 316 "applies": true, 317 "answer": true, 318 "justification": "The paper reports results for all tested configurations (all 19 agents, all input formats, both languages) rather than selecting a best configuration. Full results are provided in Fig. 11 (appendix)." 319 }, 320 "multiple_comparison_correction": { 321 "applies": false, 322 "answer": false, 323 "justification": "No statistical tests are performed, so there are no p-values requiring correction for multiple comparisons." 324 }, 325 "self_comparison_bias_addressed": { 326 "applies": true, 327 "answer": false, 328 "justification": "The authors designed the benchmark, evaluation scripts, and Docker environments. They do not discuss the bias inherent in evaluating agents using their own benchmark design, which may favor scenarios where their evaluation scripts work well." 
329 }, 330 "compute_budget_vs_performance": { 331 "applies": true, 332 "answer": false, 333 "justification": "Different agent frameworks use different interaction rounds (OCI/CodeAct: 3, ReAct: 8) and max tokens (CodeAct: 512, others: 1024), creating unequal compute budgets. Performance differences between frameworks are not discussed in relation to these compute differences." 334 }, 335 "benchmark_construct_validity": { 336 "applies": true, 337 "answer": false, 338 "justification": "The paper does not discuss whether rejection rate and attack success rate on 25 specific scenarios (derived from CWE and prior work) actually measure 'safety' of code agents in practice. The construct validity of equating performance on these scenarios with broader safety is not examined." 339 }, 340 "scaffold_confound_addressed": { 341 "applies": true, 342 "answer": true, 343 "justification": "The paper explicitly compares the same LLMs across different agent frameworks and notes 'given the same LLMs, different agents exhibit different safety risks' (§4.1.1). Fig. 6 shows the same model (e.g., CL-7b, CL-13b) across OCI, CodeAct, and ReAct frameworks, treating the scaffold as a variable." 344 } 345 }, 346 "data_leakage": { 347 "temporal_leakage_addressed": { 348 "applies": true, 349 "answer": false, 350 "justification": "No discussion of whether evaluated models' training data includes CWE examples, prior safety benchmarks (ToolEmu, R-judge), or risky code patterns similar to the benchmark's test cases created before model training." 351 }, 352 "feature_leakage_addressed": { 353 "applies": true, 354 "answer": false, 355 "justification": "The paper notes that URLs with obvious risky keywords (e.g., 'dangerous', 'malicious') affect rejection rates and tests random-character URLs in App. D.2, but does not frame this as a feature leakage concern or systematically address other potential leakage through evaluation setup cues." 
356 }, 357 "non_independence_addressed": { 358 "applies": true, 359 "answer": false, 360 "justification": "All 30 test cases per scenario are augmented from the same seed case using the same LLMs, making them structurally similar. This non-independence of test cases within scenarios is not discussed and could inflate the apparent comprehensiveness of the evaluation." 361 }, 362 "leakage_detection_method": { 363 "applies": true, 364 "answer": false, 365 "justification": "No concrete leakage detection or prevention method is applied (no canary strings, membership inference tests, or n-gram overlap analysis)." 366 } 367 } 368 }, 369 "claims": [ 370 { 371 "claim": "The overall attack success rate is high across all code agents when queried to execute risky or buggy code, demonstrating the vulnerability of existing agents.", 372 "evidence": "Fig. 5 shows high attack success rates across 8 domains for all 8 representative agents, with many scenarios achieving >50% ASR. Fig. 6 provides per-scenario breakdown across 19 agents.", 373 "supported": "strong" 374 }, 375 { 376 "claim": "Agents have higher rejection rates for risky operations on operating systems and file systems than for technically buggy code.", 377 "evidence": "Fig. 5 shows elevated rejection rates for OS and file system domains compared to program logic and cybersecurity domains. Scenarios like 'Add risky alias to .bashrc' and 'Delete sensitive files' have the highest rejection rates (Fig. 6), while 'PRNG seed management' and 'Duplicate key in associative list' have the lowest.", 378 "supported": "strong" 379 }, 380 { 381 "claim": "Agents are more likely to execute harmful actions from natural language queries than from programming language code inputs.", 382 "evidence": "Fig. 7 shows GPT-4 ReAct agent rejection rates: Python code 38.27%, code with jailbreak 50.53%, text summaries 17.20%, text descriptions 9.60%. ASR is 86.27% for text descriptions vs. 
60.13% for code.", 383 "supported": "moderate" 384 }, 385 { 386 "claim": "Python code inputs lead to higher rejection rates than Bash code inputs.", 387 "evidence": "Fig. 6 shows Bash RR is generally lower than Python RR across scenarios. Fig. 7 confirms the gap exists across all input modalities for the GPT-4 ReAct agent (e.g., code: 38.27% Python RR vs. 24.83% Bash RR).", 388 "supported": "moderate" 389 }, 390 { 391 "claim": "OpenCodeInterpreter is more robust than CodeAct and ReAct, with higher rejection rates and lower attack success rates.", 392 "evidence": "Fig. 1 shows OCI agents clustering in the high-RR, low-ASR region. §4.1.1 attributes this to 'hard-coded disk space protection/constraints in its agent codebase.'", 393 "supported": "moderate" 394 }, 395 { 396 "claim": "Stronger base LLMs exhibit higher safety awareness (higher rejection rate) in code execution, but also generate more effective harmful software.", 397 "evidence": "Fig. 1 shows GPT-4 and Claude-3.5 achieving higher rejection rates under ReAct. Tab. 1 shows GPT-4 agents achieving 66.9% accuracy on RedCode-Gen with 5.6 VirusTotal score, higher than weaker models. Pearson correlation of 0.448 between HumanEval pass@1 and RedCode-Gen accuracy.", 398 "supported": "moderate" 399 }, 400 { 401 "claim": "Fine-tuned LLMs for agent tasks may compromise safety.", 402 "evidence": "Fig. 1 shows CodeAct Mistral-7B* and Llama2-7b* (fine-tuned) have among the lowest rejection rates. §4.1.1 states 'instruction tuning for agent tasks enhances the agents' general task-solving ability but may unintentionally weaken the safety guardrails.'", 403 "supported": "weak" 404 }, 405 { 406 "claim": "Safety-aware prompts are not effective enough as a mitigation strategy.", 407 "evidence": "Tab. 9 shows the combined safety prompt reduces ASR by only about 17 percentage points for code input (79.33% → 62.13%) and only about 5 percentage points for text descriptions (88.4% → 83.2%). 
The remaining ASR is still high.", 408 "supported": "moderate" 409 } 410 ], 411 "methodology_tags": ["benchmark-eval"], 412 "key_findings": "RedCode evaluates 19 code agents across 3 frameworks on 4,050 risky code execution scenarios and 160 malicious software generation prompts. Agents show high vulnerability overall, with attack success rates routinely exceeding 50%, though rejection rates are higher for obvious system-level threats (file/OS operations) than for technically buggy code. Natural language inputs are more dangerous than code inputs, as agents fail to recognize underlying risks in text descriptions. More capable models (GPT-4) show both higher safety awareness in execution and greater ability to generate functional malware, creating a dual-use dilemma.", 413 "red_flags": [ 414 { 415 "flag": "LLM-as-judge inconsistency", 416 "detail": "The paper criticizes LLM-as-judge in §3.2.1 ('this approach is often unreliable since LLMs can make errors') for RedCode-Exec, then uses GPT-4 as judge for RedCode-Gen code quality evaluation (§3.3). The same reliability concerns apply to their own LLM judge usage." 417 }, 418 { 419 "flag": "No statistical tests on any comparison", 420 "detail": "All comparative claims (Python vs Bash, agent vs agent, model vs model) are based on raw percentage comparisons without any statistical tests. With 30 test cases per scenario, differences could easily be within noise, especially for smaller model-level comparisons." 421 }, 422 { 423 "flag": "Non-independence of augmented test cases", 424 "detail": "All 30 test cases per scenario are augmented from the same seed using GPT-4/Mistral-Large ('variable name replacement, code structure transformation'). This creates structurally similar test cases that may not represent diverse real-world risky scenarios. The 4,050 count overstates effective diversity." 
425 }, 426 { 427 "flag": "Unequal compute budgets across agent comparisons", 428 "detail": "ReAct gets 8 interaction rounds while OCI/CodeAct get 3, and max tokens differ (CodeAct: 512 vs others: 1024). These differences could confound the agent comparison but are not discussed." 429 } 430 ], 431 "cited_papers": [ 432 { 433 "title": "Identifying the risks of LM agents with an LM-emulated sandbox", 434 "authors": ["Yangjun Ruan", "Honghua Dong", "Andrew Wang", "Silviu Pitis", "Yongchao Zhou", "Jimmy Ba", "Yann Dubois", "Chris J. Maddison", "Tatsunori Hashimoto"], 435 "year": 2024, 436 "relevance": "ToolEmu proposes LLM-based emulation for evaluating agent safety risks, a key comparison point for RedCode's real-execution approach." 437 }, 438 { 439 "title": "R-judge: Benchmarking safety risk awareness for LLM agents", 440 "authors": ["Tongxin Yuan", "Zhiwei He", "Lingzhong Dong", "Yiming Wang", "Ruijie Zhao", "Tian Xia", "Lizhen Xu", "Binglin Zhou", "Fangqi Li", "Zhuosheng Zhang", "Rui Wang", "Gongshen Liu"], 441 "year": 2024, 442 "arxiv_id": "2401.10019", 443 "relevance": "Agent safety benchmark using curated risky trajectory records evaluated by LLM judges, which RedCode improves upon with real code execution." 444 }, 445 { 446 "title": "Executable code actions elicit better LLM agents", 447 "authors": ["Xingyao Wang", "Yangyi Chen", "Lifan Yuan", "Yizhe Zhang", "Yunzhu Li", "Hao Peng", "Heng Ji"], 448 "year": 2024, 449 "arxiv_id": "2402.01030", 450 "relevance": "CodeAct agent framework, one of three agent types evaluated in RedCode for safety." 
451 }, 452 { 453 "title": "OpenCodeInterpreter: Integrating code generation with execution and refinement", 454 "authors": ["Tianyu Zheng", "Ge Zhang", "Tianhao Shen", "Xueling Liu", "Bill Yuchen Lin", "Jie Fu", "Wenhu Chen", "Xiang Yue"], 455 "year": 2024, 456 "arxiv_id": "2402.14658", 457 "relevance": "OpenCodeInterpreter agent framework evaluated in RedCode; found to be most robust due to hard-coded safety constraints." 458 }, 459 { 460 "title": "ReAct: Synergizing reasoning and acting in language models", 461 "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu", "Nan Du", "Izhak Shafran", "Karthik Narasimhan", "Yuan Cao"], 462 "year": 2023, 463 "relevance": "ReAct agent framework, one of three frameworks evaluated in RedCode's safety assessment." 464 }, 465 { 466 "title": "AgentDojo: A dynamic environment to evaluate attacks and defenses for LLM agents", 467 "authors": ["Edoardo Debenedetti", "Jie Zhang", "Mislav Balunović", "Luca Beurer-Kellner", "Marc Fischer", "Florian Tramèr"], 468 "year": 2024, 469 "arxiv_id": "2406.13352", 470 "relevance": "Concurrent benchmark evaluating attack/defense mechanisms for LLM agents including prompt injection and backdoors." 471 }, 472 { 473 "title": "Agent security bench (ASB): Formalizing and benchmarking attacks and defenses in LLM-based agents", 474 "authors": ["Hanrong Zhang", "Jingyuan Huang", "Kai Mei", "Yifei Yao", "Zhenting Wang", "Chenlu Zhan", "Hongwei Wang", "Yongfeng Zhang"], 475 "year": 2024, 476 "arxiv_id": "2410.02644", 477 "relevance": "Concurrent agent security benchmark covering prompt injection, memory poisoning, and backdoor attacks." 478 }, 479 { 480 "title": "SWE-agent: Agent-computer interfaces enable automated software engineering", 481 "authors": ["John Yang", "Carlos E. 
Jimenez", "Alexander Wettig", "Kilian Lieret", "Shunyu Yao", "Karthik Narasimhan", "Ofir Press"], 482 "year": 2024, 483 "relevance": "Major code agent framework for automated software engineering, relevant to the broader safety evaluation of code agents." 484 }, 485 { 486 "title": "HarmBench: A standardized evaluation framework for automated red teaming and robust refusal", 487 "authors": ["Mantas Mazeika", "Long Phan", "Xuwang Yin", "Andy Zou", "Zifan Wang", "Norman Mu", "Elham Sakhaee", "Nathaniel Li", "Steven Basart", "Bo Li", "David Forsyth", "Dan Hendrycks"], 488 "year": 2024, 489 "relevance": "Standardized safety evaluation framework for LLM red teaming, a key comparison benchmark for code-specific safety evaluation." 490 }, 491 { 492 "title": "Asleep at the keyboard? Assessing the security of GitHub Copilot's code contributions", 493 "authors": ["Hammond Pearce", "Baleegh Ahmad", "Benjamin Tan", "Brendan Dolan-Gavitt", "Ramesh Karri"], 494 "year": 2022, 495 "relevance": "Early work on evaluating security vulnerabilities in LLM-generated code, foundational to the code safety evaluation field." 496 }, 497 { 498 "title": "Evaluating large language models trained on code", 499 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 500 "year": 2021, 501 "relevance": "HumanEval benchmark, used in RedCode-Gen to correlate general coding capability with malware generation quality (Pearson r=0.448)." 502 }, 503 { 504 "title": "HAICOSYSTEM: An ecosystem for sandboxing safety risks in human-AI interactions", 505 "authors": ["Xuhui Zhou", "Hyunwoo Kim", "Faeze Brahman", "Liwei Jiang", "Hao Zhu", "Ximing Lu", "Frank Xu", "Bill Yuchen Lin", "Yejin Choi", "Niloofar Mireshghallah"], 506 "year": 2024, 507 "arxiv_id": "2409.16427", 508 "relevance": "LLM-based emulation framework for evaluating safety in human-AI interactions, a related approach to RedCode's real-execution methodology." 
509 }, 510 { 511 "title": "Testing language model agents safely in the wild", 512 "authors": ["Silen Naihin", "David Atkinson", "Marc Green", "Merwane Hamadi", "Craig Swift", "Douglas Schonholtz", "Adam Tauman Kalai", "David Bau"], 513 "year": 2023, 514 "arxiv_id": "2311.10538", 515 "relevance": "AgentMonitor framework for evaluating safety of LLM agent responses, a key comparison for RedCode's evaluation methodology." 516 } 517 ], 518 "engagement_factors": { 519 "practical_relevance": { 520 "score": 2, 521 "justification": "The benchmark with Docker environments and evaluation scripts can be used by teams building code agents to test safety before deployment." 522 }, 523 "surprise_contrarian": { 524 "score": 1, 525 "justification": "Findings largely confirm expected concerns about code agent safety rather than challenging conventional wisdom." 526 }, 527 "fear_safety": { 528 "score": 3, 529 "justification": "Directly demonstrates that code agents can be tricked into executing reverse shells, deleting system files, and generating functional malware, with high attack success rates." 530 }, 531 "drama_conflict": { 532 "score": 1, 533 "justification": "No major controversy, though the finding that stronger models generate better malware creates a compelling dual-use narrative." 534 }, 535 "demo_ability": { 536 "score": 2, 537 "justification": "Code, dataset, and Docker environments are released on GitHub, though running the full evaluation requires substantial setup." 538 }, 539 "brand_recognition": { 540 "score": 2, 541 "justification": "Evaluates well-known models (GPT-4, Claude-3.5); authors from UChicago, UIUC, UC Berkeley, Microsoft Research; published at NeurIPS." 542 } 543 } 544 }