calibration.json (22801B)
1 { 2 "paper_slug": "agentdojo-dynamic-environment-2024", 3 "calibration_date": "2026-02-28", 4 "calibrator": "opus", 5 "total_questions": 50, 6 "agreement_count": 45, 7 "disagreement_count": 5, 8 "agreement_rate": 0.90, 9 "disagreements": [ 10 { 11 "category": "evaluation_design", 12 "question": "held_out_test_set", 13 "sonnet": {"applies": true, "answer": true}, 14 "opus": {"applies": false, "answer": false}, 15 "direction": "applies_boundary", 16 "explanation": "This paper introduces a new benchmark framework and evaluates pre-trained LLMs via API on it. There is no training involved and no dev/test split. The concept of a 'held-out test set' is structurally inapplicable because the LLMs are not tuned on any portion of the AgentDojo data. The benchmark IS the evaluation set. Sonnet treated the benchmark as a held-out test set, but the schema asks about separation of dev/test splits which does not apply here." 17 }, 18 { 19 "category": "setup_transparency", 20 "question": "model_versions_specified", 21 "sonnet": {"applies": true, "answer": true}, 22 "opus": {"applies": true, "answer": false}, 23 "direction": "sonnet_generous", 24 "explanation": "The code in Figure 12 shows 'gpt-4o-2024-05-13' for one model, but the paper text refers to models by marketing names only: 'Claude 3 Sonnet', 'Claude 3 Opus', 'Claude 3.5 Sonnet', 'GPT-3.5 Turbo', 'GPT-4 Turbo', 'GPT-4o', 'Gemini 1.5 Flash', 'Gemini 1.5 Pro', 'Llama 3 70B', 'Command R+'. The schema requires 'specific model names with version (e.g., gpt-4-0613)' and states 'marketing names like Gemini-2.5 or GPT-4o without a snapshot date or API version do NOT count.' One snapshot date in a code figure for one model out of ten does not satisfy this criterion. Most models lack version/snapshot identifiers." 
25 }, 26 { 27 "category": "contamination", 28 "question": "training_cutoff_stated", 29 "sonnet": {"applies": true, "answer": false}, 30 "opus": {"applies": false, "answer": false}, 31 "direction": "applies_boundary", 32 "explanation": "The schema states NA if 'the paper does not evaluate a pre-trained model's capability on any benchmark' and includes 'studies that test defenses/tools rather than model knowledge.' AgentDojo evaluates prompt injection attacks and defenses — it tests security properties and task-completion utility in a stateful tool-calling environment, not model knowledge or capability on a benchmark in the traditional sense. The tasks are newly designed by the authors and require dynamic interaction, not recall of training data. Contamination is largely irrelevant here." 33 }, 34 { 35 "category": "contamination", 36 "question": "train_test_overlap_discussed", 37 "sonnet": {"applies": true, "answer": false}, 38 "opus": {"applies": false, "answer": false}, 39 "direction": "applies_boundary", 40 "explanation": "Same reasoning as training_cutoff_stated. The paper evaluates security properties (prompt injection robustness) in a dynamic tool-calling environment, not model knowledge on a static benchmark. Train/test overlap is not a meaningful concern for a security benchmark where tasks require multi-step stateful tool interaction. The NA rule states 'if the paper does not evaluate a pre-trained model on any benchmark (same NA rule as training_cutoff_stated).' While one could argue this is a benchmark evaluation, the nature of the tasks (dynamic tool calling, security testing) makes contamination structurally irrelevant." 41 }, 42 { 43 "category": "contamination", 44 "question": "benchmark_contamination_addressed", 45 "sonnet": {"applies": true, "answer": false}, 46 "opus": {"applies": false, "answer": false}, 47 "direction": "applies_boundary", 48 "explanation": "Same reasoning as the other contamination items. 
The benchmark tasks were newly created in 2024, involve dynamic stateful tool interactions, and test security properties rather than model knowledge. Even if a model had 'seen' similar task descriptions, this would not meaningfully inflate its scores on the actual benchmark since performance depends on correct multi-step tool orchestration in a specific environment state. The contamination concern is structurally inapplicable." 49 } 50 ], 51 "opus_checklist": { 52 "artifacts": { 53 "code_released": { 54 "applies": true, 55 "answer": true, 56 "justification": "Code is released under MIT license at https://github.com/ethz-spylab/agentdojo (Section 1, Appendix E.1). A Zenodo DOI (10.5281/zenodo.12528188) is also provided in Appendix E.6." 57 }, 58 "data_released": { 59 "applies": true, 60 "answer": true, 61 "justification": "The benchmark tasks, tools, and environment state are released via the GitHub repository. Model outputs and conversations are released as JSON files on Google Drive (Appendix E.3). The benchmark is installable via pip." 62 }, 63 "environment_specified": { 64 "applies": true, 65 "answer": true, 66 "justification": "Appendix E.3 explicitly states: 'we additionally include a requirements.txt file that can be used to install the exact dependencies we used for the experimental results in the paper.'" 67 }, 68 "reproduction_instructions": { 69 "applies": true, 70 "answer": true, 71 "justification": "Appendix E.3 states the GitHub README provides 'extensive documentation on how to use our framework (including how to run the existing benchmark, create new tools, task, etc.)' and includes Jupyter Notebooks to reproduce all figures and tables." 72 } 73 }, 74 "statistical_methodology": { 75 "confidence_intervals_or_error_bars": { 76 "applies": true, 77 "answer": true, 78 "justification": "95% confidence intervals are reported for all main results using statsmodels.stats.proportion.proportion_confint. Tables 3, 4, and 5 all include CI values. 
The NeurIPS checklist confirms this explicitly." 79 }, 80 "significance_tests": { 81 "applies": false, 82 "answer": false, 83 "justification": "This is a benchmark evaluation paper that reports performance metrics across systems. No formal claims of statistical superiority are made that would require significance tests. The paper reports CIs but does not claim one model is statistically significantly better than another." 84 }, 85 "effect_sizes_reported": { 86 "applies": true, 87 "answer": true, 88 "justification": "Absolute and relative differences are reported throughout. Table 2 shows percentage changes from baseline (e.g., '-22.6%'). Table 5 shows targeted ASR dropping from 57.69% to 6.84% with tool filter. These provide sufficient context for assessing practical magnitude." 89 }, 90 "sample_size_justified": { 91 "applies": true, 92 "answer": false, 93 "justification": "The benchmark uses 97 user tasks and 629 security test cases, but no power analysis or explicit justification for why these numbers are sufficient is provided. The tasks were designed for scenario coverage, not statistical power." 94 }, 95 "variance_reported": { 96 "applies": true, 97 "answer": true, 98 "justification": "95% confidence intervals are reported for all major results in Tables 3, 4, and 5, providing spread information. This is a deterministic benchmark (no random seeds involved in the benchmark itself), and CIs are computed from the binomial proportion, which is appropriate." 99 } 100 }, 101 "evaluation_design": { 102 "baselines_included": { 103 "applies": true, 104 "answer": true, 105 "justification": "Multiple attack baselines are compared: 'Ignore previous instructions', 'TODO', InjecAgent, and 'Important message' (Section 4.2, Figure 8, Table 4). Multiple defense baselines are compared (Section 4.3, Figure 9, Table 5). Multiple LLMs serve as agent baselines." 
106 }, 107 "baselines_contemporary": { 108 "applies": true, 109 "answer": true, 110 "justification": "Baselines include contemporary methods from 2023-2024: InjecAgent (2024), Spotlighting (2024), ProtectAI (2024). LLM evaluations include Claude 3.5 Sonnet, GPT-4o, Gemini 1.5 Pro — all state-of-the-art at time of publication." 111 }, 112 "ablation_study": { 113 "applies": true, 114 "answer": true, 115 "justification": "Section 4.2 presents ablations on attacker knowledge (Table 2) showing the impact of knowing user name and model name on attack success rate. Section 4.3 ablates multiple defense strategies independently. These are controlled single-variable manipulations." 116 }, 117 "multiple_metrics": { 118 "applies": true, 119 "answer": true, 120 "justification": "Three metrics are explicitly defined in Section 3.4: Benign Utility, Utility Under Attack, and Targeted Attack Success Rate (ASR). Results are reported across all three throughout the paper." 121 }, 122 "human_evaluation": { 123 "applies": false, 124 "answer": false, 125 "justification": "The paper uses deterministic utility functions to evaluate agent task completion. Human evaluation is not relevant here — in fact, the paper explicitly argues against LLM-based evaluation because the injection might fool the evaluator too. Deterministic checks are more appropriate." 126 }, 127 "held_out_test_set": { 128 "applies": false, 129 "answer": false, 130 "justification": "This paper introduces a benchmark framework and evaluates pre-trained LLMs via API. No model training or tuning is performed on any portion of the data. There is no dev/test split because there is no development phase using the benchmark data. The concept of a held-out test set is structurally inapplicable." 
131 }, 132 "per_category_breakdown": { 133 "applies": true, 134 "answer": true, 135 "justification": "Results are broken down by task suite (Workspace, Slack, Travel, Banking) in Figure 7 showing attack success rates per injection task and suite. Figure 20 shows utility by suite for different attacks. Figure 6 breaks down by model." 136 }, 137 "failure_cases_discussed": { 138 "applies": true, 139 "answer": true, 140 "justification": "Specific failure cases are discussed: travel task 6 succeeds in 0% of cases because the injection requires two unrelated actions. The Slack suite's 92% success rate is analyzed. Defense failure modes are discussed in Section 4.3 (tool filter fails when planning cannot be done in advance)." 141 }, 142 "negative_results_reported": { 143 "applies": true, 144 "answer": true, 145 "justification": "Multiple negative results reported: incorrect attacker knowledge weakens attacks significantly (-22.6%, Table 2). Prompt injection detector has too many false positives and degrades utility (Section 4.3). Denial-of-service attacks are less effective than targeted attacks (Figure 20, Appendix D)." 146 } 147 }, 148 "claims_and_evidence": { 149 "abstract_claims_supported": { 150 "applies": true, 151 "answer": true, 152 "justification": "Abstract claims are well-supported: '97 realistic tasks' and '629 security test cases' confirmed in Table 1. 'LLMs fail at many tasks' confirmed by benign utility scores in Table 3. 'Attacks break some security properties but not all' confirmed by per-task ASR breakdowns in Figure 7." 153 }, 154 "causal_claims_justified": { 155 "applies": true, 156 "answer": true, 157 "justification": "Causal claims about the effect of attacker knowledge (Table 2) and defense mechanisms (Table 5) are made via controlled single-variable manipulations. The ablation design is adequate: one variable is changed at a time while holding others constant." 
158 }, 159 "generalization_bounded": { 160 "applies": true, 161 "answer": true, 162 "justification": "Results are reported for specific models on the AgentDojo benchmark. The paper explicitly notes 'settings which AgentDojo does not cover yet' (Section 4.3, Section 5). Claims are bounded to the tested environments and attacks. The title accurately names the specific framework rather than claiming general results." 163 }, 164 "alternative_explanations_discussed": { 165 "applies": true, 166 "answer": true, 167 "justification": "Alternative explanations are discussed: the inverse scaling finding is explained by 'models with low utility often fail at correctly executing the attacker's goal, even when the prompt injection succeeds.' The high Slack ASR is attributed to the attacker controlling a significant fraction of tool output (Figure 21b). Tool filter failures are explained by task structure." 168 } 169 }, 170 "setup_transparency": { 171 "model_versions_specified": { 172 "applies": true, 173 "answer": false, 174 "justification": "The paper refers to most models by marketing names only: 'Claude 3 Sonnet', 'GPT-4o', 'Gemini 1.5 Pro', etc. Only one code figure (Figure 12) shows 'gpt-4o-2024-05-13'. The schema explicitly states 'marketing names like Gemini-2.5 or GPT-4o without a snapshot date or API version do NOT count as specified versions.' Nine out of ten models lack snapshot dates or API version identifiers in the paper text." 175 }, 176 "prompts_provided": { 177 "applies": true, 178 "answer": true, 179 "justification": "Full prompt text is provided in Appendix B: the default system prompt (Figure 14), additional Claude Sonnet prompt (Figure 15), Llama 3 70B prompt (Figure 16), defense prompts (Figures 17-18), and all four attack prompts (Figure 19) are reproduced verbatim." 
180 }, 181 "hyperparameters_reported": { 182 "applies": true, 183 "answer": false, 184 "justification": "The paper does not report temperature, top-p, max tokens, or other sampling hyperparameters for any of the LLM API calls. Section 4 states models are queried 'following the respective documentation' but specific settings are not stated." 185 }, 186 "scaffolding_described": { 187 "applies": true, 188 "answer": true, 189 "justification": "The agentic scaffolding is described in detail: Section 3.2 explains the pipeline interface, Figure 11 shows the base component, Figure 12 shows the pipeline combining LLM with injection detector, and Section 3.3 explains how attacks integrate with the scaffold. Code examples are provided." 190 }, 191 "data_preprocessing_documented": { 192 "applies": true, 193 "answer": true, 194 "justification": "Section 3.1 describes how environment state data was generated: 'manually or assisted by GPT-4o and Claude 3 Opus, by providing the models with the expected schema of the data and a few examples. For LLM-generated test data we manually inspected all outputs to ensure high quality.' The cross-product method for generating security test cases is explained in Section 3." 195 } 196 }, 197 "limitations_and_scope": { 198 "limitations_section_present": { 199 "applies": true, 200 "answer": true, 201 "justification": "Section 5 (Conclusion) contains substantive discussion of limitations with five explicitly labeled avenues for improvement. Section 4.3 discusses specific failure modes of defenses. This goes beyond a single sentence." 202 }, 203 "threats_to_validity_specific": { 204 "applies": true, 205 "answer": true, 206 "justification": "Specific threats are identified: tool filter defense fails when task planning cannot be done before observing untrusted data; defense fails when needed tools also suffice for attacks (17% of test cases); multi-session attack scenarios not yet covered; limitations of manual task specification for scaling. 
These are specific to THIS study." 207 }, 208 "scope_boundaries_stated": { 209 "applies": true, 210 "answer": true, 211 "justification": "The paper explicitly states 'settings which AgentDojo does not cover yet' including multi-session scenarios (Section 4.3). It notes attacks are 'general-purpose and not designed specifically for any given tasks or security scenarios' (Section 1). It acknowledges tasks are synthetically generated, not from real deployments." 212 } 213 }, 214 "data_integrity": { 215 "raw_data_available": { 216 "applies": true, 217 "answer": true, 218 "justification": "All model outputs and conversations are released as JSON files on Google Drive (Appendix E.3). The benchmark code and environment state data are on GitHub under MIT license. Zenodo DOI 10.5281/zenodo.12528188 provides archival access." 219 }, 220 "data_collection_described": { 221 "applies": true, 222 "answer": true, 223 "justification": "Task creation process is described in Section 3.1: tasks were manually designed to cover diverse scenarios, dummy data was generated manually or via GPT-4o/Claude 3 Opus with manual inspection. The four environments and their contents are described in detail in Table 1." 224 }, 225 "recruitment_methods_described": { 226 "applies": false, 227 "answer": false, 228 "justification": "No human participants or crowd-sourced contributors are involved. The benchmark uses synthetically generated data and tasks designed by the paper authors. The data card confirms 'No real data. Only dummy data.' The NeurIPS checklist marks human subjects items as N/A." 229 }, 230 "data_pipeline_documented": { 231 "applies": true, 232 "answer": true, 233 "justification": "The pipeline from task design to evaluation is documented: task design process (Section 3.1), cross-product combination to obtain security test cases (Section 3), environment state management (Section 3.1), and deterministic utility function evaluation (Section 3.1, with code examples in Figures 4-5)." 
234 } 235 }, 236 "conflicts_of_interest": { 237 "funding_disclosed": { 238 "applies": true, 239 "answer": true, 240 "justification": "Acknowledgments section states: 'E.D. is supported by armasuisse Science and Technology. J.Z. is funded by the Swiss National Science Foundation (SNSF) project grant 214838.' The data card (Appendix F.2.3) adds: 'No institution provided explicit funding for the creation of this benchmark.'" 241 }, 242 "affiliations_disclosed": { 243 "applies": true, 244 "answer": true, 245 "justification": "Author affiliations are disclosed on the title page: ETH Zurich (all authors) and Invariant Labs (Balunovic, Beurer-Kellner, Fischer). The data card (Appendix F.2.2) repeats these affiliations." 246 }, 247 "funder_independent_of_outcome": { 248 "applies": true, 249 "answer": false, 250 "justification": "Three authors (Balunovic, Beurer-Kellner, Fischer) are affiliated with Invariant Labs, a company building AI security tools including prompt injection defenses. They evaluate defense concepts that are closely related to Invariant Labs' commercial interests. While not direct funders, the commercial affiliation creates a non-independent interest in the outcomes." 251 }, 252 "financial_interests_declared": { 253 "applies": true, 254 "answer": false, 255 "justification": "No competing interests or financial interests statement appears anywhere in the paper. Three authors are affiliated with Invariant Labs (a commercial AI security company), but this financial interest is not explicitly declared beyond the affiliation listing. The absence of a competing interests declaration is NO per the schema." 256 } 257 }, 258 "contamination": { 259 "training_cutoff_stated": { 260 "applies": false, 261 "answer": false, 262 "justification": "AgentDojo tests prompt injection attack/defense robustness in a dynamic tool-calling environment, not model knowledge on a static benchmark. 
The schema states applies=false for 'studies that test defenses/tools rather than model knowledge.' The tasks require multi-step stateful tool interaction where contamination is not a meaningful concern — knowing the task description would not help a model execute the correct tool calls in the correct order on the specific environment state." 263 }, 264 "train_test_overlap_discussed": { 265 "applies": false, 266 "answer": false, 267 "justification": "Same reasoning as training_cutoff_stated. This is a security benchmark testing defenses and attack robustness, not model knowledge. Train/test overlap is structurally irrelevant for evaluating whether a model follows injected instructions versus resisting them." 268 }, 269 "benchmark_contamination_addressed": { 270 "applies": false, 271 "answer": false, 272 "justification": "Same reasoning as the other contamination items. The benchmark was newly created in 2024 and tests dynamic security properties in stateful environments. Even theoretical contamination would not meaningfully affect the measured outcomes (attack success rates, defense effectiveness)." 273 } 274 }, 275 "human_studies": { 276 "pre_registered": { 277 "applies": false, 278 "answer": false, 279 "justification": "No human participants. The NeurIPS checklist marks human subjects items as N/A." 280 }, 281 "irb_or_ethics_approval": { 282 "applies": false, 283 "answer": false, 284 "justification": "No human participants. Only synthetically generated data and automated LLM evaluations." 285 }, 286 "demographics_reported": { 287 "applies": false, 288 "answer": false, 289 "justification": "No human participants involved." 290 }, 291 "inclusion_exclusion_criteria": { 292 "applies": false, 293 "answer": false, 294 "justification": "No human participants involved." 295 }, 296 "randomization_described": { 297 "applies": false, 298 "answer": false, 299 "justification": "No human participants involved." 
300 }, 301 "blinding_described": { 302 "applies": false, 303 "answer": false, 304 "justification": "No human participants involved." 305 }, 306 "attrition_reported": { 307 "applies": false, 308 "answer": false, 309 "justification": "No human participants involved." 310 } 311 }, 312 "cost_and_practicality": { 313 "inference_cost_reported": { 314 "applies": true, 315 "answer": true, 316 "justification": "Appendix D explicitly states: 'We estimate that running the full suite of 629 security test cases on GPT-4o costs around US$35, and running the suite of 97 utility test cases costs US$4.'" 317 }, 318 "compute_budget_stated": { 319 "applies": true, 320 "answer": true, 321 "justification": "Total API costs are reported ($35 for the full security test suite on GPT-4o, $4 for utility cases). No GPU compute is reported: every evaluated model except Llama 3 70B is accessed via API, and no GPU details are given for Llama 3 70B." 322 } 323 } 324 } 325 }