scan.json (27466B)
1 { 2 "paper": { 3 "title": "StepShield: When, Not Whether to Intervene on Rogue Agents", 4 "authors": ["Gloria Felicia", "Michael Eniolade", "Jinfeng He", "Zitha Sasindran", "Hemant Kumar", "Milan Hussain Angati", "Sandeep Bandarupalli"], 5 "year": 2026, 6 "venue": "arXiv.org", 7 "arxiv_id": "2601.22136", 8 "doi": "10.48550/arXiv.2601.22136" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "StepShield introduces the first benchmark for evaluating temporal detection of rogue agent behavior, revealing a 2.3x gap in Early Intervention Rate between LLM-based (0.59) and static (0.26) detectors that is invisible to standard accuracy metrics. All detectors struggle with subtle L3 violations requiring cross-step reasoning (best EIR: 0.38). A cascaded HybridGuard detector reduces monitoring costs by 75% compared to no detection, projecting $108M cumulative savings over 5 years at enterprise scale.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "Appendix C provides a GitHub URL and installation instructions: 'git clone https://github.com/glo26/stepshield'. Released under Apache 2.0 license." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "Dataset of 9,213 trajectories released under CC BY 4.0 (data) and Apache 2.0 (code). Appendix C describes the data directory structure (data/train/, data/test/)." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": true, 29 "justification": "Appendix C specifies hardware (8 NVIDIA A100 GPUs, 80GB) and provides pip install command. However, no requirements.txt or specific library versions are mentioned in the paper text, though the installable package ('pip install -e .') implies dependency specification." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": true, 34 "justification": "Appendix C provides step-by-step instructions: clone, install, run with specific commands ('python -m stepshield.evaluate --detector [detector]'). Key hyperparameters are listed: seed=42, model=GPT-4.1-mini, context_steps=5, cascade_threshold=0.7, temperature=0.0." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": true, 41 "justification": "Appendix F reports 95% bootstrap confidence intervals for EIR (e.g., LLMJudge [0.59, 0.62], StaticGuard [0.23, 0.29]) computed with 10,000 bootstrap samples." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": true, 46 "justification": "McNemar's test used to compare detector performance (Appendix F, Table 14). All differences reported as statistically significant with p-values (e.g., p < .001 for StaticGuard vs LLMJudge)." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "The paper reports effect sizes in context: '2.3x gap in EIR between LLMJudge (0.59) and StaticGuard (0.26)', '75% cost reduction ($0.025 vs. $0.100)', and absolute differences across categories (Tables 4-7)." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "No power analysis or justification for the choice of 639 rogue trajectories, 213 per severity level, or 7,935 test trajectories. The numbers appear chosen for balance rather than statistical power." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": true, 61 "justification": "Appendix D states 'All ablations are run on the full training set (1,278 trajectories) with 5 random seeds, and we report mean values. Standard deviations are ±0.02 across all experiments.'" 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "Four detection paradigms compared: StaticGuard, ConstraintGuard, LLMJudge, and HybridGuard, spanning fast-but-shallow to slow-but-deep (Section 5.1)." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "LLMJudge uses GPT-4.1-mini (current model). Baselines represent the range of real-world detection approaches from regex to LLM-based. The paper is proposing a new benchmark, so existing detector baselines are appropriate." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "Appendix D provides extensive ablations: prompt sensitivity (Table 9), model scaling (Table 10), context window size (Table 11), and HybridGuard cascade threshold (Table 12)." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Five metrics reported: EIR, Intervention Gap, Accuracy, F1, and Tokens Saved (Table 4). The temporal metrics (EIR, IG, Tokens Saved) are novel contributions." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": false, 88 "justification": "All evaluation is automated. No human evaluation of detector outputs is conducted. Human annotation was used for dataset creation (94% inter-annotator agreement) but not for evaluating detector performance." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "Explicit train/test split: 1,278 training trajectories with labels, 7,935 test trajectories with 'labels withheld to prevent overfitting' (Section 3.4)." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Table 5 breaks down EIR by all 6 rogue categories (UFO, SEC, RES, INV, TST, DEC). Table 6 breaks down by severity level (L1, L2, L3)." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Appendix E: '100 randomly sampled failure cases of LLMJudge' analyzed with distribution: Subtle L3 (38%), Multi-Step Reasoning (22%), False Positives (15%), Ambiguous Boundary (12%), Other (13%)." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "Several negative results: all detectors struggle with L3 (LLMJudge only 0.38 EIR), HybridGuard trades timeliness for accuracy, ConstraintGuard achieves worst EIR (0.18). The accuracy-timeliness trade-off is explicitly discussed as a limitation." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Abstract claims match results: 9,213 trajectories (Section 3.4), 2.3x EIR gap (Table 4: 0.59/0.26), 75% cost reduction (Table 7: $0.025 vs $0.100), $108M savings (Section 6.3)." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "Causal claims are primarily about detector design choices (e.g., cascade strategy delays detection). These are supported by controlled ablation studies (Appendix D) with single-variable manipulation. The economic claims are projections, clearly labeled as such." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The title 'When, Not Whether to Intervene on Rogue Agents' and abstract frame the contribution broadly for all agents, but Section 9 acknowledges 'StepShield currently focuses on code-generating agents.' The $108M enterprise savings projection extrapolates well beyond the tested domain." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": true, 130 "justification": "Section 9 discusses several alternatives: taxonomy may not be exhaustive, prompt sensitivity affects LLM detector, uniform pricing underestimates savings. Appendix D's ablations address prompt phrasing and model choice as alternative drivers of results." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": false, 135 "justification": "The paper measures detection on synthetic GPT-4/Claude-3 generated trajectories and frames this as measuring real-world rogue detection capability. No discussion of whether synthetic trajectories are a valid proxy for actual rogue agent behavior in production deployments." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": false, 142 "justification": "Uses 'GPT-4.1-mini' without a snapshot date or API version. Appendix A mentions trajectory generation with 'GPT-4/Claude-3' without specific versions. Per schema criteria, marketing names without snapshot dates do not count." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": false, 147 "justification": "Prompt strategies are described (zero-shot, few-shot, chain-of-thought in Table 9) but actual prompt text is not provided in the paper or appendices. The reader cannot reconstruct the prompts sent to GPT-4.1-mini." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": true, 152 "justification": "Appendix C lists key hyperparameters: 'seed=42, model=GPT-4.1-mini, context_steps=5, cascade_threshold=0.7, temperature=0.0.'" 153 }, 154 "scaffolding_described": { 155 "applies": true, 156 "answer": true, 157 "justification": "All four detector architectures are described in Section 5.1: StaticGuard (regex/keyword), ConstraintGuard (constraint checking), LLMJudge (per-step GPT-4.1-mini evaluation), HybridGuard (cascade with confidence threshold). Latency and cost per step specified." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Appendix A describes the 4-stage collection process (incident analysis, trajectory generation, manual annotation with 94% IAA, benign expansion). Training set composition is fully documented (Table 3). Annotation guidelines in Appendix G." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 9 'Limitations' is a dedicated section with four specific limitations discussed." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": true, 174 "justification": "Section 9 identifies specific threats: taxonomy not exhaustive as agents evolve, limited to code-generating agents, uniform per-token pricing assumption, LLM detector sensitivity to prompt engineering (with ablations provided)." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": true, 179 "justification": "Section 9: 'StepShield currently focuses on code-generating agents; future work should extend to web, API, and multi-agent domains.' Also: 'our rogue behavior taxonomy... is not exhaustive.'" 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": true, 186 "justification": "Full dataset released: 'data/train/ contains 1,278 annotated trajectories... data/test/ contains 7,935 trajectories' (Appendix C). Released under CC BY 4.0." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Appendix A: '4-stage process: incident analysis, trajectory generation (GPT-4/Claude-3), manual annotation (94% inter-annotator agreement), and benign expansion.' Rogue categories grounded in real incidents (Table 1)." 192 }, 193 "recruitment_methods_described": { 194 "applies": true, 195 "answer": false, 196 "justification": "Trajectories are synthetic (generated by GPT-4/Claude-3) but the generation process is only briefly described. For annotators: 'two independent annotators' are mentioned but recruitment/selection criteria for annotators are not described." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "Pipeline documented with counts: 639 rogue + 639 clean pairs for training (balanced across 6 categories × 3 severity levels, Table 3), 7,935 test (639 rogue + 639 clean + 6,657 benign, 8.1% rogue rate). Annotation process in Appendix G." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding or acknowledgments section in the paper. Authors are from multiple universities but no funding sources are mentioned." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "All author affiliations listed on the first page: University of Virginia, University of the Cumberlands, Cornell University, IISc Bangalore, University of Arizona, CSU Northridge, University of Cincinnati." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding is disclosed, so independence cannot be assessed. Absence of funding disclosure is not the same as unfunded work." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial interests statement in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "GPT-4.1-mini is used as LLMJudge but its training data cutoff is not stated. The model could have seen similar code patterns or security incident descriptions in training." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "No discussion of whether GPT-4.1-mini's training data contains trajectories similar to the test set, or whether the model has seen the security incidents used to ground the taxonomy." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "No discussion of benchmark contamination. The trajectories are newly generated, which mitigates contamination risk, but this is not explicitly discussed as a design choice for contamination prevention." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants. Annotators created the dataset but were not research subjects." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in the study." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in the study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in the study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in the study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in the study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in the study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": true, 284 "justification": "Table 7 provides cost per trajectory for each detector (e.g., LLMJudge ~$0.02, HybridGuard ~$0.003). Latency also reported per step (LLMJudge ~500ms, StaticGuard <1ms)." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": true, 289 "justification": "Appendix C: 'Total computation time for all experiments was approximately 2,000 GPU hours' on '8 NVIDIA A100 GPUs (80GB).'" 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": true, 296 "justification": "Appendix D: 'All ablations are run on the full training set (1,278 trajectories) with 5 random seeds, and we report mean values. Standard deviations are ±0.02 across all experiments.'" 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": true, 301 "justification": "Appendix D explicitly states '5 random seeds' for ablation experiments." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "Cascade threshold tested at 3 values (0.5, 0.7, 0.9 in Table 12) and context window at 4 values (Table 11), but no overall search budget reported. No description of how other hyperparameters (e.g., regex patterns for StaticGuard) were selected." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": true, 311 "justification": "Table 12 shows cascade threshold selection with clear trade-off analysis. Context window selection justified via diminishing returns (Table 11). Config selected on training set, not test." 312 }, 313 "multiple_comparison_correction": { 314 "applies": true, 315 "answer": false, 316 "justification": "McNemar's test performed across multiple detector pairs (Table 14) but no correction for multiple comparisons (Bonferroni, Holm, etc.) is mentioned." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "Authors designed and evaluate all four detectors. No acknowledgment of self-comparison bias or independent evaluation by third parties." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": true, 326 "justification": "Table 10 reports EIR and cost per trajectory across three models (GPT-4.1-mini, GPT-4, GPT-4-Turbo). Table 7 compares detection cost vs. performance trade-offs across all detectors." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "No discussion of whether the synthetic trajectories generated by GPT-4/Claude-3 actually represent real-world rogue agent behavior. The construct validity of the benchmark itself is not examined." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "The paper compares detection paradigms, not models within scaffolds. Each detector is a distinct system; scaffold confounding is not applicable." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "No discussion of whether GPT-4.1-mini's training data includes patterns from the security incidents used to generate rogue trajectories, or whether the model has temporal familiarity with the violation patterns." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether the evaluation setup leaks information. For example, the step-by-step presentation format may provide cues that wouldn't exist in real deployment." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of structural similarities between training and test trajectories. Both are generated by the same models (GPT-4/Claude-3) from the same taxonomy, which could introduce systematic shared patterns." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No concrete leakage detection or prevention method applied. The test set labels are withheld, but no analysis of train/test overlap or contamination checking is performed." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "LLM-based judge achieves 2.3x higher Early Intervention Rate (0.59) than static analyzer (0.26), a gap invisible to standard accuracy metrics.", 365 "evidence": "Table 4 (Section 5.2): LLMJudge EIR=0.59, Acc=0.63; StaticGuard EIR=0.26, Acc=0.56. Statistical significance confirmed via McNemar's test (p<.001, Appendix F).", 366 "supported": "strong" 367 }, 368 { 369 "claim": "HybridGuard reduces monitoring costs by 75% compared to no detection ($0.025 vs $0.100 per trajectory).", 370 "evidence": "Table 7 (Section 6.2): detailed cost breakdown showing detection + execution costs. Cascade uses fast detectors for 85% of cases.", 371 "supported": "strong" 372 }, 373 { 374 "claim": "HybridGuard projects to $108M in cumulative savings over 5 years at enterprise scale.", 375 "evidence": "Section 6.3 and Appendix B. Based on assumptions: 100M trajectories/year starting scale, 35% YoY growth, $0.10 baseline cost. Sensitivity analysis shows $54M-$432M range.", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "All detectors struggle with L3 (subtle) violations requiring cross-step reasoning; even LLMJudge achieves only 0.38 EIR.", 380 "evidence": "Table 6 (Section 5.4): L3 EIR ranges from 0.04 (ConstraintGuard) to 0.38 (LLMJudge).", 381 "supported": "strong" 382 }, 383 { 384 "claim": "Binary accuracy is fundamentally insufficient for evaluating detection systems (Theorem 1).", 385 "evidence": "Section 4.4: proof by construction that two monitors can have identical accuracy but maximally different EIR.", 386 "supported": "strong" 387 } 388 ], 389 "red_flags": [ 390 { 391 "flag": "Synthetic data validity unexamined", 392 "detail": "All trajectories are generated by GPT-4/Claude-3, not collected from real agent deployments. The paper does not discuss whether synthetic rogue behavior is representative of actual rogue agent behavior in production, which is a significant construct validity concern for a benchmark paper." 393 }, 394 { 395 "flag": "Economic projections extrapolate far beyond evidence", 396 "detail": "The $108M savings projection relies on assumed 35% YoY growth, 100M starting trajectories, and uniform pricing — all acknowledged as assumptions but presented prominently. The projection is for a domain (enterprise agent deployment) quite different from the tested domain (synthetic code agent trajectories)." 397 }, 398 { 399 "flag": "Uniform ±0.02 standard deviation across all experiments", 400 "detail": "Appendix D claims 'Standard deviations are ±0.02 across all experiments' — identical variance across prompt sensitivity, model scaling, and context window ablations is suspiciously uniform and not how real experimental variance typically behaves." 401 }, 402 { 403 "flag": "No prompts provided despite being central to LLMJudge", 404 "detail": "LLMJudge is the star performer and central to the paper's claims, but the actual prompts used for per-step evaluation are not provided. Only prompt strategy labels (zero-shot, few-shot, CoT) are given." 405 }, 406 { 407 "flag": "Annotator details thin", 408 "detail": "Only 'two independent annotators' mentioned with Cohen's Kappa of 0.82. No information about annotator qualifications, selection, or whether they had security expertise relevant to identifying rogue agent behavior." 409 } 410 ], 411 "cited_papers": [ 412 { 413 "title": "AgentHarm: A benchmark for measuring harmfulness of LLM agents", 414 "authors": ["Maksym Andriushchenko", "Alexandra Souly", "Mateusz Dziemian"], 415 "year": 2025, 416 "relevance": "Benchmark measuring LLM agent compliance with malicious requests, directly relevant to agent safety evaluation." 417 }, 418 { 419 "title": "ShieldAgent: Shielding agents via verifiable safety policy reasoning", 420 "authors": ["Zhaorun Chen", "Mintong Kang", "Bo Li"], 421 "year": 2025, 422 "relevance": "First guardrail agent enforcing safety policy compliance through logical reasoning, relevant to agent safety monitoring." 423 }, 424 { 425 "title": "Agent-SafetyBench: Evaluating the safety of LLM agents", 426 "authors": ["Zhexin Zhang"], 427 "year": 2024, 428 "arxiv_id": "2412.14470", 429 "relevance": "Comprehensive agent safety benchmark with 300 test cases, key baseline for StepShield's temporal evaluation paradigm." 430 }, 431 { 432 "title": "SHADE-Arena: Evaluating sabotage and monitoring in LLM agents", 433 "authors": ["Jonathan Kutasov", "Yuqi Sun", "Paul Colognese"], 434 "year": 2025, 435 "arxiv_id": "2506.15740", 436 "relevance": "Evaluates LLM agents' ability to evade monitoring during sabotage, complementary to StepShield's detection timeliness focus." 437 }, 438 { 439 "title": "Sleeper agents: Training deceptive LLMs that persist through safety training", 440 "authors": ["Evan Hubinger", "Carson Denison", "Jesse Mu"], 441 "year": 2024, 442 "arxiv_id": "2401.05566", 443 "relevance": "Demonstrates deceptive behaviors persist through safety training, motivating runtime monitoring approaches like StepShield." 444 }, 445 { 446 "title": "AgentBench: Evaluating LLMs as agents", 447 "authors": ["Xiao Liu", "Hao Yu", "Hanchen Zhang"], 448 "year": 2024, 449 "relevance": "Multi-environment LLM agent benchmark that StepShield complements with safety-focused temporal evaluation." 450 }, 451 { 452 "title": "SafeArena: Evaluating the safety of autonomous web agents", 453 "authors": ["Alexander Daniel Tur", "Nicholas Meade"], 454 "year": 2025, 455 "relevance": "Safety benchmark for autonomous web agents, uses LLM-as-Judge paradigm that StepShield extends with step-level evaluation." 456 }, 457 { 458 "title": "GuardAgent: Safeguard LLM agents via knowledge-enabled reasoning", 459 "authors": ["Zhen Xiang", "Linzhi Zheng"], 460 "year": 2025, 461 "relevance": "Guardrail agent that generates guardrail code for safety checks, complementary infrastructure to StepShield's evaluation framework." 462 }, 463 { 464 "title": "Agent-as-a-judge: Evaluate agents with agents", 465 "authors": ["Mingchen Zhuge", "Changsheng Zhao", "Dylan Ashley"], 466 "year": 2025, 467 "relevance": "Extends LLM-as-Judge to agentic evaluation with intermediate feedback, relevant to step-level evaluation paradigm." 468 }, 469 { 470 "title": "R-Judge: Benchmarking safety risk awareness", 471 "authors": ["Tongxin Yuan"], 472 "year": 2024, 473 "arxiv_id": "2401.10019", 474 "relevance": "Uses LLM to judge agent helpfulness and harmlessness, a key prior agent safety benchmark." 475 }, 476 { 477 "title": "SafeAgentBench: A benchmark for safe task planning of embodied LLM agents", 478 "authors": ["Sheng Yin", "Xianghe Pang"], 479 "year": 2024, 480 "arxiv_id": "2412.13178", 481 "relevance": "Safety-aware task planning benchmark for embodied agents with 750 tasks, extends safety evaluation to embodied domain." 482 }, 483 { 484 "title": "ReAct: Synergizing reasoning and acting in language models", 485 "authors": ["Shunyu Yao", "Jeffrey Zhao"], 486 "year": 2023, 487 "relevance": "Foundational agent architecture interleaving reasoning and action, enables the agent paradigm StepShield evaluates." 488 } 489 ] 490 }